# Retrieve annotations 

### XML Parsing Function
This function takes an XML file as input and parses it using the `lxml` library. It returns the parsed XML object for further processing.

In [35]:
from lxml import etree as ET

import os
import sys
path=os.getcwd()

def get_xml(xml_input):
    parser = ET.XMLParser(remove_comments=False)
    xml = ET.parse(xml_input, parser=parser)
    return xml 

### Load and Process XMI Files
This section of the notebook loads XMI files and processes the annotations contained within them. Specifically, it extracts and stores token annotations, which will be mapped to unique identifiers for further analysis.


In [36]:
import csv  
from cassis import *  

# Function to load a Common Analysis Structure (CAS) from an XMI file
def load_cas(file_input):
    """
    This function loads a Common Analysis Structure (CAS) from an XMI file.
    
    Arguments:
    file_input : str : Path to the XMI file to be loaded
    
    Returns:
    cassis.cas.Cas : Loaded CAS object
    """
    # Open typesystem.xml to load the type system
    f = open('typesystem.xml', 'rb')  
    typesystem = load_typesystem(f)  # Load the type system from the XML file
    
    # Open the XMI file to load the CAS
    fxmi = open(file_input, 'rb')  
    cas = load_cas_from_xmi(fxmi, typesystem=typesystem)  # Load the CAS from the XMI file
    return cas  # Return the loaded CAS

# List of all input XMI files
all_files = ['Caesar, De bello Gallico 1-4.xmi', 'Virgil, Aeneid.xmi']

# Abbreviated names for the input files
files_abbreviated = ['Caes', 'Virg']

id2tok = dict()  # Dictionary to store IDs mapped to tokens
count = 0  # Initialize count variable for tracking iterations

for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    print(file_input)  # Printing the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    # Looping through each 'Actionality' annotation in the CAS
    for relation in cas.select('webanno.custom.Actionality'):
        # Looping through each token covered by the 'Actionality' annotation
        for token in cas.select_covered('webanno.custom.Actionality', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            
            # Option to limit output: print every 100th token
            # To print all tokens, comment out the line below and uncomment the one after
            if count % 100 == 0:  # Print every 100th token (can be removed if you want all tokens)
                print(f"ID: {id}, Token: {tok}")  # Print the token ID and its text
            
            count = count + 1  # Incrementing the count
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            id2tok[id] = tok  # Storing the token text in the dictionary with the token ID as the key

            #Uncomment the following line to print all tokens if needed (instead of limiting to 100th token)
            #print(f"ID: {id}, Token: {tok}")  # Print every token (this line will show all tokens)

Caesar, De bello Gallico 1-4.xmi
ID: 1498, Token: exirent
ID: 78841, Token: circumventas
Virgil, Aeneid.xmi
ID: 28254, Token: succurrere
ID: 171793, Token: inibo
ID: 331311, Token: egressisque
ID: 436869, Token: succurrere


### Import pandas

In [37]:
import pandas as pd

### Creating a DataFrame from the items in id2tok dictionary



In [38]:
tokenid_df = pd.DataFrame([(k, v) for k, v in id2tok.items()], columns=["ID", "VERB TOKEN"]) #where 'tokenid_df includes the token id and the token'

tokenid_df


Unnamed: 0,ID,VERB TOKEN
0,1498Caes,exirent
1,4296Caes,exeant
2,4578Caes,subeunda
3,4854Caes,transierant
4,4978Caes,exire
...,...,...
499,435595Virg,procurrit
500,436869Virg,succurrere
501,438653Virg,occurrere
502,440736Virg,subirent


## Morphological features

#### Morphological Features Extraction and Storage


In [39]:
list_morphological_features = []  # Initializing an empty list to store morphological features
pred2_morphological_features = dict()  # Initializing an empty dictionary to store predicted morphological features mapped to IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'MorphologicalFeatures' annotation in the CAS
    for relation in cas.select('de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures'):
        # Looping through each token covered by the 'MorphologicalFeatures' annotation
        for token in cas.select_covered('de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            morphological_feature = relation.value  # Getting the morphological feature value
            
            # Checking if the ID already exists in pred2mf dictionary
            if id in pred2_morphological_features:
                # If ID exists, append the new morphological feature to the existing list of features
                list_morphological_features = pred2_morphological_features[id] + morphological_feature
                pred2_morphological_features[id] = list_morphological_features
            else:
                # If ID doesn't exist, store the morphological feature directly
                pred2_morphological_features[id] = morphological_feature

print(pred2_morphological_features)


{'1498Caes': 'Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin|Voice=Act', '4296Caes': 'Mood=Subj|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act', '4578Caes': 'Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|VerbFor=Gdv|Voice=Act', '4854Caes': 'Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbForm=Fin|Voice=Act', '4978Caes': 'Tense=Pres|VerbForm=Inf|Voice=Act', '5366Caes': 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Pass', '5732Caes': 'Mood=Subj|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act', '5997Caes': 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act', '6831Caes': 'Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin|Voice=Act', '7012Caes': 'Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbForm=Fin|Voice=Act', '7293Caes': 'Tense=Pres|VerbForm=Inf|Voice=Act', '8498Caes': 'Mood=Subj|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act', '9426Caes': 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act', '9649Caes': 'Mood=Ind|Number

#### Creating DataFrame for Morphological Features

In [40]:
morphological_features_df = pd.DataFrame([(k,v) for k,v in pred2_morphological_features.items()], columns=["ID", "MORPHOLOGICAL FEATURES"]) #where 'morphological_features_df' is a dataframe containing token IDs and morphological features
morphological_features_df

Unnamed: 0,ID,MORPHOLOGICAL FEATURES
0,1498Caes,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...
1,4296Caes,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...
2,4578Caes,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...
3,4854Caes,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...
4,4978Caes,Tense=Pres|VerbForm=Inf|Voice=Act
...,...,...
499,435595Virg,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...
500,436869Virg,Tense=Pres|VerbForm=Inf|Voice=Act
501,438653Virg,Tense=Pres|VerbForm=Inf|Voice=Act
502,440736Virg,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...


#### Merging Token IDs with Morphological Features

In [41]:
id_morphological_features_df = tokenid_df.merge(morphological_features_df, on='ID', how='left') #where 'id_morphological_features_df' merges 'tokenid_df' and 'morphological_features_df'
id_morphological_features_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act
...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...


## Lemma

#### Extracting Lemmas from CAS Annotations

In [42]:
list_lemmas = []  # Initializing an empty list to store lemmas
pred2_lemmas = dict()  # Initializing an empty dictionary to store predicted lemmas mapped to IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Lemma' annotation in the CAS
    for relation in cas.select('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma'):
        # Looping through each token covered by the 'Lemma' annotation
        for token in cas.select_covered('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            lemma = relation.value  # Getting the lemma value
            
            # Checking if the ID already exists in pred2_lemmas dictionary
            if id in pred2_lemmas:
                # If ID exists, append the new lemma to the existing list of lemmas
                list_lemmas = pred2_lemmas[id] + lemma
                pred2_lemmas[id] = list_lemmas
            else:
                # If ID doesn't exist, store the lemma directly
                pred2_lemmas[id] = lemma

print(pred2_lemmas)


{'1463Caes': 'de', '1466Caes': 'finis', '1498Caes': 'exeo', '4281Caes': 'ex', '4283Caes': 'finis', '4296Caes': 'exeo', '4560Caes': 'ad', '4569Caes': 'periculum', '4578Caes': 'subeo', '4587Caes': 'sum', '4805Caes': 'qui', '4837Caes': 'in', '4840Caes': 'ager', '4854Caes': 'transeo', '4962Caes': 'iter', '4973Caes': 'domus', '4978Caes': 'exeo', '5323Caes': 'Rhodanus', '5366Caes': 'transeo', '5709Caes': 'ad', '5712Caes': 'ripa', '5732Caes': 'convenio', '5986Caes': 'ad', '5989Caes': 'Genava', '5997Caes': 'pervenio', '6806Caes': 'miles', '6831Caes': 'convenio', '6995Caes': 'qui', '6999Caes': 'ex', '7002Caes': 'provincia', '7012Caes': 'convenio', '7293Caes': 'transeo', '8498Caes': 'transeo', '9371Caes': 'in', '9374Caes': 'finis', '9426Caes': 'pervenio', '9630Caes': 'in', '9643Caes': 'finis', '9649Caes': 'pervenio', '10386Caes': 'in', '10389Caes': 'Santoni', '10398Caes': 'Helvetii', '10407Caes': 'pervenio', '10576Caes': 'is', '10579Caes': 'Helvetii', '10617Caes': 'transeo', '10848Caes': 'ad', '

#### Creating DataFrame for Lemmas

In [43]:
lemmas_df = pd.DataFrame([(k,v) for k,v in pred2_lemmas.items()], columns=["ID", "LEMMA"]) #where 'lemmas_df' is a dataframe containing token IDs and lemmas
lemmas_df

Unnamed: 0,ID,LEMMA
0,1463Caes,de
1,1466Caes,finis
2,1498Caes,exeo
3,4281Caes,ex
4,4283Caes,finis
...,...,...
1178,440708Virg,ille
1179,440736Virg,subeo
1180,441917Virg,per
1181,441937Virg,transeo


#### Merging Token IDs, Morphological Features, and Lemmas

In [44]:
id_lemmas_df = id_morphological_features_df.merge(lemmas_df, on='ID', how='left') #where 'id_lemmas_df' merges 'id_morphological_features_df' and 'lemmas_df'
id_lemmas_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,exeo
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...,exeo
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...,subeo
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...,transeo
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act,exeo
...,...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo


## Sentence

#### Extracting Sentences from CAS Annotations


In [45]:
list_sentences = []  # Initializing an empty list to store sentences
pred2_sentences = dict()  # Initializing an empty dictionary to store predicted sentences mapped to IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Includes' relation in the CAS
    for relation in cas.select('webanno.custom.Includes'):
        dep = relation.Dependent  # Getting the dependent token of the relation
        tokdep = dep.get_covered_text()  # Getting the text covered by the dependent token
        id = str(dep.begin) + file_input_abbr  # Creating a unique ID for the dependent token based on its offset and file abbreviation
        list_sentences.append(str(tokdep))  # Appending the text of the dependent token to the list of sentences
        
        gov = relation.Governor  # Getting the governor token of the relation
        sentence = gov.get_covered_text()  # Getting the text covered by the governor token
        
        # Checking if the ID already exists in pred2_sentences dictionary
        if id in pred2_sentences:
            # If ID exists, append the new token text to the existing list of tokens representing the sentence
            list_sentences = pred2_sentences[id] + sentence
            pred2_sentences[id] = list_sentences
        else:
            # If ID doesn't exist, store the token text directly
            pred2_sentences[id] = sentence

print(pred2_sentences)


{'1498Caes': 'Pisone consulibus regni cupiditate inductus coniurationem nobilitatis fecit et civitati persuasit ut de finibus suis cum omnibus copiis exirent', '4296Caes': 'Post eius mortem nihilo minus Helvetii id quod constituerant facere conantur, ut e finibus suis exeant', '4578Caes': 'frumentum omne, praeter quod secum portaturi erant, comburunt, ut domum reditionis spe sublata paratiores ad omnia pericula subeunda essent', '4854Caes': 'Persuadent Rauracis et Tulingis et Latobrigis finitimis, uti eodem usi consilio oppidis suis vicisque exustis una cum iis proficiscantur, Boiosque, qui trans Rhenum incoluerant et in agrum Noricum transierant Noreiamque oppugnabant, receptos ad se socios sibi adsciscunt', '4978Caes': 'Erant omnino itinera duo, quibus itineribus domo exire possent', '5366Caes': 'alterum per provinciam nostram, multo facilius atque expeditius, propterea quod inter fines Helvetiorum et Allobrogum, qui nuper pacati erant, Rhodanus fluit isque non nullis locis vado tran

#### Creating DataFrame for Sentences


In [46]:
sentences_df = pd.DataFrame([(k,v) for k,v in pred2_sentences.items()], columns=["ID", "SENTENCE"]) #where 'sentences_df' is a dataframe containing token IDs and the whole sentences including the verb tokens
sentences_df

Unnamed: 0,ID,SENTENCE
0,1498Caes,Pisone consulibus regni cupiditate inductus co...
1,4296Caes,Post eius mortem nihilo minus Helvetii id quod...
2,4578Caes,"frumentum omne, praeter quod secum portaturi e..."
3,4854Caes,Persuadent Rauracis et Tulingis et Latobrigis ...
4,4978Caes,"Erant omnino itinera duo, quibus itineribus do..."
...,...,...
499,435595Virg,"Dum nititur acer et instat, rursus in aurigae ..."
500,436869Virg,"Iuturnam misero, fateor, succurrere fratri sua..."
501,438653Virg,Harum unam celerem demisit ab aethere summo Iu...
502,440736Virg,"Vix illud lecti bis sex cervice subirent, qual..."


In [47]:
id_sentences_df = id_lemmas_df.merge(sentences_df, on='ID', how='left') #where 'id_sentences_df' merges 'id_lemmas_df' and 'sentences_df'
id_sentences_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,exeo,Pisone consulibus regni cupiditate inductus co...
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...,exeo,Post eius mortem nihilo minus Helvetii id quod...
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...,subeo,"frumentum omne, praeter quod secum portaturi e..."
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...,transeo,Persuadent Rauracis et Tulingis et Latobrigis ...
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act,exeo,"Erant omnino itinera duo, quibus itineribus do..."
...,...,...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ..."
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua..."
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual..."


#### Merging Token IDs, Lemmas, Morphological Features, and Sentences

## Actionality

#### Extracting Actionality from CAS Annotations

In [48]:
list_actionality = []  # Initializing an empty list to store actionality values
pred2_actionality = dict()  # Initializing an empty dictionary to store predicted actionality values mapped to IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Actionality' annotation in the CAS
    for relation in cas.select('webanno.custom.Actionality'):
        # Looping through each token covered by the 'Actionality' annotation
        for token in cas.select_covered('webanno.custom.Actionality', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            actionality = relation.Actionality  # Getting the actionality value
            
            # Checking if the ID already exists in pred2_actionality dictionary
            if id in pred2_actionality:
                # If ID exists, append the new actionality to the existing list of actionalities
                list_actionality = pred2_actionality[id] + actionality
                pred2_actionality[id] = list_actionality
            else:
                # If ID doesn't exist, store the actionality directly
                pred2_actionality[id] = actionality

print(pred2_actionality)


{'1498Caes': 'Activity', '4296Caes': 'Activity', '4578Caes': 'Achievement', '4854Caes': 'Achievement', '4978Caes': 'Activity', '5366Caes': 'Accomplishment', '5732Caes': 'Accomplishment', '5997Caes': 'Accomplishment', '6831Caes': 'Activity', '7012Caes': 'Accomplishment', '7293Caes': 'Accomplishment', '8498Caes': 'Activity', '9426Caes': 'Accomplishment', '9649Caes': 'Accomplishment', '10407Caes': 'Accomplishment', '10617Caes': 'Activity', '10862Caes': 'Accomplishment', '10890Caes': 'Accomplishment', '10931Caes': 'Accomplishment', '11152Caes': 'Achievement', '11818Caes': 'Accomplishment', '12323Caes': 'Accomplishment', '21567Caes': 'Accomplishment', '24504Caes': 'Accomplishment', '24513Caes': 'Accomplishment', '25610Caes': 'Accomplishment', '26036Caes': 'Achievement', '26225Caes': 'Accomplishment', '26268Caes': 'Accomplishment', '26640Caes': 'Achievement', '27386Caes': 'Accomplishment', '27831Caes': 'Achievement', '28359Caes': 'Accomplishment', '29918Caes': 'Accomplishment', '30767Caes': 

#### Creating DataFrame for Actionality

In [49]:
actionality_df = pd.DataFrame([(k,v) for k,v in pred2_actionality.items()], columns=["ID", "ACTIONALITY"]) #where 'actionality_df' is a dataframe containing token IDs and actionality values of the verb tokens
actionality_df

Unnamed: 0,ID,ACTIONALITY
0,1498Caes,Activity
1,4296Caes,Activity
2,4578Caes,Achievement
3,4854Caes,Achievement
4,4978Caes,Activity
...,...,...
499,435595Virg,Achievement
500,436869Virg,Activity
501,438653Virg,Accomplishment
502,440736Virg,Achievement


#### Merging Sentences and Actionality DataFrames

In [50]:
id_actionality_df = id_sentences_df.merge(actionality_df, on='ID', how='left') #where 'id_actionality_df' merges 'id_sentences_df' and 'actionality_df'
id_actionality_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,exeo,Pisone consulibus regni cupiditate inductus co...,Activity
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...,exeo,Post eius mortem nihilo minus Helvetii id quod...,Activity
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...,subeo,"frumentum omne, praeter quod secum portaturi e...",Achievement
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...,transeo,Persuadent Rauracis et Tulingis et Latobrigis ...,Achievement
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act,exeo,"Erant omnino itinera duo, quibus itineribus do...",Activity
...,...,...,...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement


## Verb class

#### Extracting Verb Class from CAS Annotations

In [51]:
list_verbclass = []  # Initializing an empty list to store verb classes
pred2_verbclass = dict()  # Initializing an empty dictionary to store predicted verb classes mapped to IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Motionclass' annotation in the CAS
    for relation in cas.select('webanno.custom.Motionclass'):
        # Looping through each token covered by the 'Motionclass' annotation
        for token in cas.select_covered('webanno.custom.Motionclass', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            verbclass = relation.Motionclass  # Getting the verb class value
            
            # Checking if the ID already exists in pred2_verbclass dictionary
            if id in pred2_verbclass:
                # If ID exists, append the new verb class to the existing list of verb classes
                list_verbclass = pred2_verbclass[id] + verbclass
                pred2_verbclass[id] = list_verbclass
            else:
                # If ID doesn't exist, store the verb class directly
                pred2_verbclass[id] = verbclass

print(pred2_verbclass)


{'1498Caes': 'ESCAPE-51.1', '4296Caes': 'ESCAPE-51.1', '4578Caes': 'MARVEL-31.3', '4854Caes': 'ESCAPE-51.1', '4978Caes': 'ESCAPE-51.1', '5366Caes': 'ESCAPE-51.1', '5732Caes': 'HERD-47.5.2', '5997Caes': 'ESCAPE-51.1', '6831Caes': 'HERD-47.5.2', '7012Caes': 'HERD-47.5.2', '7293Caes': 'ESCAPE-51.1', '8498Caes': 'ESCAPE-51.1', '9426Caes': 'ESCAPE-51.1', '9649Caes': 'ESCAPE-51.1', '10407Caes': 'ESCAPE-51.1', '10617Caes': 'ESCAPE-51.1', '10862Caes': 'ESCAPE-51.1', '10890Caes': 'ESCAPE-51.1', '10931Caes': 'ATTACK-60.1', '11152Caes': 'REMOVE-10.1', '11818Caes': 'ESCAPE-51.1', '12323Caes': 'ESCAPE-51.1', '21567Caes': 'RUN-51.3.2', '24504Caes': 'ATTACK-60.1', '24513Caes': 'ESCAPE-51.1', '25610Caes': 'ESCAPE-51.1', '26036Caes': 'MEEET-36.3', '26225Caes': 'ESCAPE-51.1', '26268Caes': 'LEAVE-51.2', '26640Caes': 'ESCAPE-51.1', '27386Caes': 'ESCAPE-51.1', '27831Caes': 'ESCAPE-51.1', '28359Caes': 'HERD-47.5.2', '29918Caes': 'ESCAPE-51.1', '30767Caes': 'ESCAPE-51.1', '31365Caes': 'ESCAPE-51.1', '33967Ca

#### Creating DataFrame for Verb Class

In [52]:
verbclass_df = pd.DataFrame([(k,v) for k,v in pred2_verbclass.items()], columns=["ID", "VERB CLASS"]) #where 'verbclass_df' is a dataframe containing token IDs and motion clases of the verb tokens
verbclass_df

Unnamed: 0,ID,VERB CLASS
0,1498Caes,ESCAPE-51.1
1,4296Caes,ESCAPE-51.1
2,4578Caes,MARVEL-31.3
3,4854Caes,ESCAPE-51.1
4,4978Caes,ESCAPE-51.1
...,...,...
499,435595Virg,RUN-51.3.2
500,436869Virg,HELP-72.1
501,438653Virg,RUN-51.3.2
502,440736Virg,PUT_DIRECTION-9.4


#### Merging Actionality and Verb Class DataFrames

In [53]:
id_verbclass_df = id_actionality_df.merge(verbclass_df, on='ID', how='left') #where 'id_verbclass_df' merges 'id_actionality_df' and 'verbclass_df'
id_verbclass_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,exeo,Pisone consulibus regni cupiditate inductus co...,Activity,ESCAPE-51.1
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...,exeo,Post eius mortem nihilo minus Helvetii id quod...,Activity,ESCAPE-51.1
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...,subeo,"frumentum omne, praeter quod secum portaturi e...",Achievement,MARVEL-31.3
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...,transeo,Persuadent Rauracis et Tulingis et Latobrigis ...,Achievement,ESCAPE-51.1
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act,exeo,"Erant omnino itinera duo, quibus itineribus do...",Activity,ESCAPE-51.1
...,...,...,...,...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4


## Literal meaning

#### Extracting Literal Meaning Values


In [54]:
list_literalmeanings = []  # Initializing an empty list to store literal meanings
pred2_literalmeaning = dict()  # Initializing an empty dictionary to store predicted literal meanings mapped to IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Literalmeaning' annotation in the CAS
    for relation in cas.select('webanno.custom.Literalmeaning'):
        # Looping through each token covered by the 'Literalmeaning' annotation
        for token in cas.select_covered('webanno.custom.Literalmeaning', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            literalmeaning = relation.Literalmeaning  # Getting the literal meaning value
            
            # Checking if the ID already exists in pred2_literalmeaning dictionary
            if id in pred2_literalmeaning:
                # If ID exists, append the new literal meaning to the existing list of literal meanings
                list_literalmeanings = pred2_literalmeaning[id] + literalmeaning
                pred2_literalmeaning[id] = list_literalmeanings
            else:
                # If ID doesn't exist, store the literal meaning directly
                pred2_literalmeaning[id] = literalmeaning

print(pred2_literalmeaning)


{'1498Caes': True, '4296Caes': True, '4578Caes': False, '4854Caes': True, '4978Caes': True, '5366Caes': True, '5732Caes': True, '5997Caes': True, '6831Caes': True, '7012Caes': True, '7293Caes': True, '8498Caes': True, '9426Caes': True, '9649Caes': True, '10407Caes': True, '10617Caes': True, '10862Caes': True, '10890Caes': True, '10931Caes': False, '11152Caes': False, '11818Caes': True, '12323Caes': True, '21567Caes': True, '24504Caes': True, '24513Caes': True, '25610Caes': True, '26036Caes': True, '26225Caes': True, '26268Caes': True, '26640Caes': True, '27386Caes': True, '27831Caes': True, '28359Caes': True, '29918Caes': True, '30767Caes': True, '31365Caes': True, '33967Caes': True, '34210Caes': True, '34348Caes': False, '36546Caes': False, '36804Caes': False, '37047Caes': False, '37157Caes': True, '37517Caes': True, '38986Caes': False, '41443Caes': False, '46744Caes': True, '46852Caes': True, '48088Caes': True, '50312Caes': True, '52574Caes': True, '53622Caes': True, '56162Caes': Tru

#### Creating DataFrame for Literal Meanings


In [55]:
literal_meaning_df = pd.DataFrame([(k,v) for k,v in pred2_literalmeaning.items()], columns=["ID", "LITERAL MEANING"])  #where 'literal_meaning_df' is a dataframe containing token IDs and (Boolean) literal meanings of the verb tokens
literal_meaning_df

Unnamed: 0,ID,LITERAL MEANING
0,1498Caes,True
1,4296Caes,True
2,4578Caes,False
3,4854Caes,True
4,4978Caes,True
...,...,...
499,435595Virg,True
500,436869Virg,False
501,438653Virg,False
502,440736Virg,False


#### Merging Verb Class and Literal Meaning DataFrames


In [56]:
id_literal_meaning_df = id_verbclass_df.merge(literal_meaning_df, on='ID', how='left') #where 'id_literal_meaning_df' merges 'id_verbclass_df' and 'literal_meaning_df'
id_literal_meaning_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS,LITERAL MEANING
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,exeo,Pisone consulibus regni cupiditate inductus co...,Activity,ESCAPE-51.1,True
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...,exeo,Post eius mortem nihilo minus Helvetii id quod...,Activity,ESCAPE-51.1,True
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...,subeo,"frumentum omne, praeter quod secum portaturi e...",Achievement,MARVEL-31.3,False
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...,transeo,Persuadent Rauracis et Tulingis et Latobrigis ...,Achievement,ESCAPE-51.1,True
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act,exeo,"Erant omnino itinera duo, quibus itineribus do...",Activity,ESCAPE-51.1,True
...,...,...,...,...,...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2,True
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1,False
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2,False
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4,False


## Verb stem

#### Extracting Verb Stem Values


In [57]:
list_verbstem = []  # Initializing an empty list to store verb stems
pred2_verbstem = dict()  # Initializing an empty dictionary to store predicted verb stems mapped to IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Verbstem' annotation in the CAS
    for relation in cas.select('webanno.custom.Verbstem'):
        # Looping through each token covered by the 'Verbstem' annotation
        for token in cas.select_covered('webanno.custom.Verbstem', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            verbstem = relation.Verbstem  # Getting the verb stem value
            
            # Checking if the ID already exists in pred2_verbstem dictionary
            if id in pred2_verbstem:
                # If ID exists, append the new verb stem to the existing list of verb stems
                list_verbstem = pred2_verbstem[id] + verbstem
                pred2_verbstem[id] = list_verbstem
            else:
                # If ID doesn't exist, store the verb stem directly
                pred2_verbstem[id] = verbstem

print(pred2_verbstem)


{'1498Caes': 'present stem', '4296Caes': 'present stem', '4578Caes': 'present stem', '4854Caes': 'perfect stem', '4978Caes': 'present stem', '5366Caes': 'present stem', '5732Caes': 'present stem', '5997Caes': 'present stem', '6831Caes': 'present stem', '7012Caes': 'perfect stem', '7293Caes': 'present stem', '8498Caes': 'present stem', '9426Caes': 'present stem', '9649Caes': 'perfect stem', '10407Caes': 'present stem', '10617Caes': 'present stem', '10862Caes': 'perfect stem', '10890Caes': 'perfect stem', '10931Caes': 'supine stem', '11152Caes': 'perfect stem', '11818Caes': 'present stem', '12323Caes': 'perfect stem', '21567Caes': 'present stem', '24504Caes': 'supine stem', '24513Caes': 'present stem', '25610Caes': 'perfect stem', '26036Caes': 'perfect stem', '26225Caes': 'perfect stem', '26268Caes': 'perfect stem', '26640Caes': 'supine stem', '27386Caes': 'present stem', '27831Caes': 'perfect stem', '28359Caes': 'perfect stem', '29918Caes': 'perfect stem', '30767Caes': 'perfect stem', '

#### Creating DataFrame for Verb Stem

In [58]:
verbstem_df = pd.DataFrame([(k,v) for k,v in pred2_verbstem.items()], columns=["ID", "VERB STEM"]) #where 'verbstem_df' is a dataframe containing token IDs and verb stems of the verb tokens
verbstem_df

Unnamed: 0,ID,VERB STEM
0,1498Caes,present stem
1,4296Caes,present stem
2,4578Caes,present stem
3,4854Caes,perfect stem
4,4978Caes,present stem
...,...,...
499,435595Virg,present stem
500,436869Virg,present stem
501,438653Virg,present stem
502,440736Virg,present stem


#### Merging Literal Meaning and Verb Stem DataFrames


In [59]:
id_verbstem_df = id_literal_meaning_df.merge(verbstem_df, on='ID', how='left') #where 'id_verbstem_df' merges 'id_literal_meaning_df' and 'verbstem_df'
id_verbstem_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS,LITERAL MEANING,VERB STEM
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,exeo,Pisone consulibus regni cupiditate inductus co...,Activity,ESCAPE-51.1,True,present stem
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...,exeo,Post eius mortem nihilo minus Helvetii id quod...,Activity,ESCAPE-51.1,True,present stem
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...,subeo,"frumentum omne, praeter quod secum portaturi e...",Achievement,MARVEL-31.3,False,present stem
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...,transeo,Persuadent Rauracis et Tulingis et Latobrigis ...,Achievement,ESCAPE-51.1,True,perfect stem
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act,exeo,"Erant omnino itinera duo, quibus itineribus do...",Activity,ESCAPE-51.1,True,present stem
...,...,...,...,...,...,...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2,True,present stem
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1,False,present stem
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2,False,present stem
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4,False,present stem


## Preverb

#### Extracting Preverb Values

In [60]:
list_preverbs = []  # Initializing an empty list to store preverbs
pred2_preverbs = dict()  # Initializing an empty dictionary to store predicted preverbs mapped to IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Preverb' annotation in the CAS
    for relation in cas.select('webanno.custom.Preverb'):
        # Looping through each token covered by the 'Preverb' annotation
        for token in cas.select_covered('webanno.custom.Preverb', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            preverb = relation.Prev  # Getting the preverb value
            
            # Checking if the ID already exists in pred2_preverbs dictionary
            if id in pred2_preverbs:
                # If ID exists, append the new preverb to the existing list of preverbs
                list_preverbs = pred2_preverbs[id] + preverb
                pred2_preverbs[id] = list_preverbs
            else:
                # If ID doesn't exist, store the preverb directly
                pred2_preverbs[id] = preverb

print(pred2_preverbs)


{'1498Caes': 'ex', '4296Caes': 'ex', '4578Caes': 'sub', '4854Caes': 'trans', '4978Caes': 'ex', '5366Caes': 'trans', '5732Caes': 'cum', '5997Caes': 'per', '6831Caes': 'cum', '7012Caes': 'cum', '7293Caes': 'trans', '8498Caes': 'trans', '9426Caes': 'per', '9649Caes': 'per', '10407Caes': 'per', '10617Caes': 'trans', '10862Caes': 'per', '10890Caes': 'trans', '10931Caes': 'ad', '11152Caes': 'ex', '11818Caes': 'trans', '12323Caes': 'trans', '21567Caes': 'ad', '24504Caes': 'ad', '24513Caes': 'circum', '25610Caes': 'per', '26036Caes': 'cum', '26225Caes': 'per', '26268Caes': 'per', '26640Caes': 'ex', '27386Caes': 'trans', '27831Caes': 'ex', '28359Caes': 'cum', '29918Caes': 'trans', '30767Caes': 'pro', '31365Caes': 'trans', '33967Caes': 'trans', '34210Caes': 'ex', '34348Caes': 'ob', '36546Caes': 'cum', '36804Caes': 'cum', '37047Caes': 'cum', '37157Caes': 'sub', '37517Caes': 'trans', '38986Caes': 'cum', '41443Caes': 'cum', '46744Caes': 'trans', '46852Caes': 'trans', '48088Caes': 'ex', '50312Caes':

#### Creating DataFrame for Preverb


In [61]:
preverbs_df = pd.DataFrame([(k,v) for k,v in pred2_preverbs.items()], columns=["ID", "PREVERB"]) #where 'preverbs_df' is a dataframe containing token IDs and preverbs of the verb tokens
preverbs_df

Unnamed: 0,ID,PREVERB
0,1498Caes,ex
1,4296Caes,ex
2,4578Caes,sub
3,4854Caes,trans
4,4978Caes,ex
...,...,...
499,435595Virg,pro
500,436869Virg,sub
501,438653Virg,ob
502,440736Virg,sub


#### Merging Verb Stem and Preverb DataFrames


In [62]:
id_preverbs_df = id_verbstem_df.merge(preverbs_df, on='ID', how='left') #where 'id_preverbs_df' merges 'id_verbstem_df' and 'preverbs_df'
id_preverbs_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS,LITERAL MEANING,VERB STEM,PREVERB
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,exeo,Pisone consulibus regni cupiditate inductus co...,Activity,ESCAPE-51.1,True,present stem,ex
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...,exeo,Post eius mortem nihilo minus Helvetii id quod...,Activity,ESCAPE-51.1,True,present stem,ex
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...,subeo,"frumentum omne, praeter quod secum portaturi e...",Achievement,MARVEL-31.3,False,present stem,sub
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...,transeo,Persuadent Rauracis et Tulingis et Latobrigis ...,Achievement,ESCAPE-51.1,True,perfect stem,trans
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act,exeo,"Erant omnino itinera duo, quibus itineribus do...",Activity,ESCAPE-51.1,True,present stem,ex
...,...,...,...,...,...,...,...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2,True,present stem,pro
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1,False,present stem,sub
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2,False,present stem,ob
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4,False,present stem,sub


## Preverb semantics

#### Extracting Preverb Semantics values


In [63]:
list_preverb_semantics = []  # Initializing an empty list to store preverb semantics
pred2_preverb_semantics = dict()  # Initializing an empty dictionary to store predicted preverb semantics mapped to IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'SemPrev' annotation in the CAS
    for relation in cas.select('webanno.custom.SemPrev'):
        # Looping through each token covered by the 'SemPrev' annotation
        for token in cas.select_covered('webanno.custom.SemPrev', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            preverb_semantics = relation.Preverbsemantics  # Getting the preverb semantics value
            
            # Checking if the ID already exists in pred2_preverb_semantics dictionary
            if id in pred2_preverb_semantics:
                # If ID exists, append the new preverb semantics to the existing list of preverb semantics
                list_preverb_semantics = pred2_preverb_semantics[id] + preverb_semantics 
                pred2_preverb_semantics[id] = list_preverb_semantics
            else:
                # If ID doesn't exist, store the preverb semantics directly
                pred2_preverb_semantics[id] = preverb_semantics

print(pred2_preverb_semantics)


{'1498Caes': uima_cas_StringArray(xmiID=None, elements=['out'], type=Type(name=uima.cas.StringArray)), '4296Caes': uima_cas_StringArray(xmiID=None, elements=['out'], type=Type(name=uima.cas.StringArray)), '4578Caes': uima_cas_StringArray(xmiID=None, elements=['under'], type=Type(name=uima.cas.StringArray)), '4854Caes': uima_cas_StringArray(xmiID=None, elements=['across'], type=Type(name=uima.cas.StringArray)), '4978Caes': uima_cas_StringArray(xmiID=None, elements=['out'], type=Type(name=uima.cas.StringArray)), '5366Caes': uima_cas_StringArray(xmiID=None, elements=['across'], type=Type(name=uima.cas.StringArray)), '5732Caes': uima_cas_StringArray(xmiID=None, elements=['together'], type=Type(name=uima.cas.StringArray)), '5997Caes': uima_cas_StringArray(xmiID=None, elements=['completely'], type=Type(name=uima.cas.StringArray)), '6831Caes': uima_cas_StringArray(xmiID=None, elements=['together'], type=Type(name=uima.cas.StringArray)), '7012Caes': uima_cas_StringArray(xmiID=None, elements=['

#### Extracting Preverb Semantics Values Removing Unwanted Characters

In [64]:
list_preverb_semantics = []  # Initializing an empty list to store preverb semantics

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'SemPrev' annotation in the CAS
    for relation in cas.select('webanno.custom.SemPrev'):
        preverb_semantics = relation.Preverbsemantics  # Getting the preverb semantics value
        # Append the preverb semantics to the list after removing unwanted characters
        preverb_semantics_str = ', '.join(preverb_semantics.elements)  # Joining the elements of StringArray
        list_preverb_semantics.append(preverb_semantics_str)  # Appending the preverb semantics to the list

print(list_preverb_semantics)


['out', 'out', 'under', 'across', 'out', 'across', 'together', 'completely', 'together', 'together', 'across', 'across', 'completely', 'completely', 'completely', 'across', 'completely', 'across', '(malefactive), to', 'out', 'across', 'across', 'to', '(malefactive), to', 'around', 'completely', 'together with', 'completely', 'completely', 'away', 'across', 'out', 'together', 'across', 'forth', 'across', 'across', 'away', 'against', '(malefactive), together', 'together', '(malefactive), together', 'under', 'across', '(malefactive), together', '(malefactive), together', 'across', 'across', 'away', 'around', 'together', 'onwards', 'completely', 'across', 'into', 'forward', '(idea of destruction/death), across', 'into', 'completely', 'into', 'together', 'to', 'around', 'across', 'across', '(malefactive), to', 'across', '(malefactive), to', 'across', 'across', 'around', 'across', 'onwards', 'together', 'away', 'completely', 'together', 'away', 'forth', 'completely', 'into', 'completely', 'a

#### Extracting Token IDs from Actionality Annotations
Note that any other span layer can be used.


In [65]:
list_token_ids = []  # Initializing an empty list to store token IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Actionality' annotation in the CAS
    for relation in cas.select('webanno.custom.Actionality'):
        # Looping through each token covered by the 'Actionality' annotation
        for token in cas.select_covered('webanno.custom.Actionality', relation):
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            list_token_ids.append(id)  # Appending the token ID to the list

print(list_token_ids)


['1498Caes', '4296Caes', '4578Caes', '4854Caes', '4978Caes', '5366Caes', '5732Caes', '5997Caes', '6831Caes', '7012Caes', '7293Caes', '8498Caes', '9426Caes', '9649Caes', '10407Caes', '10617Caes', '10862Caes', '10890Caes', '10931Caes', '11152Caes', '11818Caes', '12323Caes', '21567Caes', '24504Caes', '24513Caes', '25610Caes', '26036Caes', '26225Caes', '26268Caes', '26640Caes', '27386Caes', '27831Caes', '28359Caes', '29918Caes', '30767Caes', '31365Caes', '33967Caes', '34210Caes', '34348Caes', '36546Caes', '36804Caes', '37047Caes', '37157Caes', '37517Caes', '38986Caes', '41443Caes', '46744Caes', '46852Caes', '48088Caes', '50312Caes', '52574Caes', '53622Caes', '56162Caes', '56208Caes', '56243Caes', '56345Caes', '56583Caes', '58654Caes', '59161Caes', '60301Caes', '61670Caes', '63933Caes', '64997Caes', '65326Caes', '65391Caes', '65423Caes', '65528Caes', '66243Caes', '66324Caes', '66389Caes', '66411Caes', '66489Caes', '66567Caes', '66805Caes', '67219Caes', '67317Caes', '68853Caes', '69478Caes',

#### Creating a dictionary where each token ID from idlist is mapped to its corresponding preverb semantics from list_preverb_semantics

In [66]:
pred2_preverb_semantics = {list_token_ids[i]: list_preverb_semantics[i] for i in range(len(list_token_ids))}
pred2_preverb_semantics

{'1498Caes': 'out',
 '4296Caes': 'out',
 '4578Caes': 'under',
 '4854Caes': 'across',
 '4978Caes': 'out',
 '5366Caes': 'across',
 '5732Caes': 'together',
 '5997Caes': 'completely',
 '6831Caes': 'together',
 '7012Caes': 'together',
 '7293Caes': 'across',
 '8498Caes': 'across',
 '9426Caes': 'completely',
 '9649Caes': 'completely',
 '10407Caes': 'completely',
 '10617Caes': 'across',
 '10862Caes': 'completely',
 '10890Caes': 'across',
 '10931Caes': '(malefactive), to',
 '11152Caes': 'out',
 '11818Caes': 'across',
 '12323Caes': 'across',
 '21567Caes': 'to',
 '24504Caes': '(malefactive), to',
 '24513Caes': 'around',
 '25610Caes': 'completely',
 '26036Caes': 'together with',
 '26225Caes': 'completely',
 '26268Caes': 'completely',
 '26640Caes': 'away',
 '27386Caes': 'across',
 '27831Caes': 'out',
 '28359Caes': 'together',
 '29918Caes': 'across',
 '30767Caes': 'forth',
 '31365Caes': 'across',
 '33967Caes': 'across',
 '34210Caes': 'away',
 '34348Caes': 'against',
 '36546Caes': '(malefactive), tog

#### Creating DataFrame for Preverb Semantics

In [67]:
preverb_semantics_df = pd.DataFrame([(k,v) for k,v in pred2_preverb_semantics.items()], columns=["ID", "PREVERB SEMANTICS"]) #where 'preverb_semantics_df' is a dataframe containing token IDs and preverb semantics of the verb tokens
preverb_semantics_df

Unnamed: 0,ID,PREVERB SEMANTICS
0,1498Caes,out
1,4296Caes,out
2,4578Caes,under
3,4854Caes,across
4,4978Caes,out
...,...,...
499,435595Virg,forward
500,436869Virg,under
501,438653Virg,to
502,440736Virg,under


#### Merging Preverb Semantics and Verb Stem DataFrames


In [68]:
id_preverb_semantics_df = id_preverbs_df.merge(preverb_semantics_df, on='ID', how='left') #where 'id_preverb_semantics_df' merges 'id_preverbs_df' and 'preverb_semantics_df'
id_preverb_semantics_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS,LITERAL MEANING,VERB STEM,PREVERB,PREVERB SEMANTICS
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,exeo,Pisone consulibus regni cupiditate inductus co...,Activity,ESCAPE-51.1,True,present stem,ex,out
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...,exeo,Post eius mortem nihilo minus Helvetii id quod...,Activity,ESCAPE-51.1,True,present stem,ex,out
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...,subeo,"frumentum omne, praeter quod secum portaturi e...",Achievement,MARVEL-31.3,False,present stem,sub,under
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...,transeo,Persuadent Rauracis et Tulingis et Latobrigis ...,Achievement,ESCAPE-51.1,True,perfect stem,trans,across
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act,exeo,"Erant omnino itinera duo, quibus itineribus do...",Activity,ESCAPE-51.1,True,present stem,ex,out
...,...,...,...,...,...,...,...,...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2,True,present stem,pro,forward
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1,False,present stem,sub,under
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2,False,present stem,ob,to
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4,False,present stem,sub,under


## Verb semantics

#### Extracting Verb Semantics Values


In [69]:
list_synsets = []  # Initializing an empty list to store semantic classes
tok2_synsets = dict()  # Initializing an empty dictionary to store predicted semantic classes mapped to IDs
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'SemClass' annotation in the CAS
    for relation in cas.select('webanno.custom.SemClass'):
        # Looping through each token covered by the 'SemClass' annotation
        for token in cas.select_covered('webanno.custom.SemClass', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            synset = relation.Synsets  # Getting the semantic class value
            
            # Checking if the ID already exists in tok2_synsets dictionary
            if id in tok2_synsets:
                # If ID exists, append the new semantic class to the existing list of semantic classes
                list_synsets = tok2_synsets[id] + synset 
                tok2_synsets[id] = list_synsets
            else:
                # If ID doesn't exist, store the semantic class directly
                tok2_synsets[id] = synset

print(tok2_synsets)


{'1466Caes': uima_cas_StringArray(xmiID=None, elements=['n#06299747 the territory occupied by a nation'], type=Type(name=uima.cas.StringArray)), '1498Caes': uima_cas_StringArray(xmiID=None, elements=['v#01376117 move out of; as of a room, a country, a bus, etc.'], type=Type(name=uima.cas.StringArray)), '4283Caes': uima_cas_StringArray(xmiID=None, elements=['n#06299747 the territory occupied by a nation'], type=Type(name=uima.cas.StringArray)), '4296Caes': uima_cas_StringArray(xmiID=None, elements=['v#01376117 move out of; as of a room, a country, a bus, etc.'], type=Type(name=uima.cas.StringArray)), '4569Caes': uima_cas_StringArray(xmiID=None, elements=['n#10428324 a source of danger'], type=Type(name=uima.cas.StringArray)), '4578Caes': uima_cas_StringArray(xmiID=None, elements=['v#01444459 undergo or suffer'], type=Type(name=uima.cas.StringArray)), '4840Caes': uima_cas_StringArray(xmiID=None, elements=['n#06299747 the territory occupied by a nation'], type=Type(name=uima.cas.StringArr

#### Extracting Verb Semantics Values Removing Unwanted Characters

In [70]:
list_synsets = []  # Initializing an empty list to store synsets

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'SemClass' annotation in the CAS
    for relation in cas.select('webanno.custom.SemClass'):
        synset = relation.Synsets  # Getting the synset value
        # Extracting the useful information from the StringArray elements
        synset_values = [element.strip().strip("'") for element in synset.elements]
        # Appending the synset values as a sublist
        list_synsets.append(synset_values)

print(list_synsets)


[['n#06299747 the territory occupied by a nation'], ['v#01376117 move out of; as of a room, a country, a bus, etc.'], ['n#06299747 the territory occupied by a nation'], ['v#01376117 move out of; as of a room, a country, a bus, etc.'], ['n#10428324 a source of danger'], ['v#01444459 undergo or suffer'], ['n#06299747 the territory occupied by a nation'], ['v#01253107 change location; move, travel, or proceed'], ['n#03243979 an open way (generally public) for travel or transportation'], ['n#06277165 the country where you were born'], ['v#01371248 go away from a place'], ['n#06789983 a large natural stream of water (larger than a creek)'], ['v#01401176 go across or through'], ['n#06800223 sloping land (especially the slope beside a body of water)'], ['v#01654097 collect in one place'], ['n#06588141 a city in southwestern Switzerland at the western end of Lake Geneva; the Palace of Nations originally housed the League of Nations and is now the European headquarters for the United Nations', 

#### Extracting Token IDs from Synset Annotations (Verbs and Nouns)


In [71]:
list_token_synsets_ids = []  # Initializing an empty list to store IDs of tokens annotated with a synset (i.e., verbs and nouns)
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'SemClass' annotation in the CAS
    for relation in cas.select('webanno.custom.SemClass'):
        # Looping through each token covered by the 'SemClass' annotation
        for token in cas.select_covered('webanno.custom.SemClass', relation):
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            list_token_synsets_ids.append(id)  # Appending the token ID to the list

print(list_token_synsets_ids)


['1466Caes', '1498Caes', '4283Caes', '4296Caes', '4569Caes', '4578Caes', '4840Caes', '4854Caes', '4962Caes', '4973Caes', '4978Caes', '5323Caes', '5366Caes', '5712Caes', '5732Caes', '5989Caes', '5997Caes', '6806Caes', '6831Caes', '6995Caes', '7002Caes', '7012Caes', '7293Caes', '8498Caes', '9374Caes', '9426Caes', '9643Caes', '9649Caes', '10389Caes', '10398Caes', '10407Caes', '10576Caes', '10579Caes', '10617Caes', '10855Caes', '10862Caes', '10883Caes', '10890Caes', '10902Caes', '10931Caes', '11131Caes', '11147Caes', '11152Caes', '11811Caes', '11818Caes', '12312Caes', '12316Caes', '12323Caes', '21537Caes', '21563Caes', '21567Caes', '24479Caes', '24490Caes', '24504Caes', '24513Caes', '25584Caes', '25610Caes', '26021Caes', '26036Caes', '26206Caes', '26225Caes', '26250Caes', '26257Caes', '26264Caes', '26268Caes', '26620Caes', '26640Caes', '27313Caes', '27357Caes', '27380Caes', '27386Caes', '27818Caes', '27826Caes', '27831Caes', '28307Caes', '28339Caes', '28359Caes', '29911Caes', '29918Caes', 

#### Creating a dictionary where each token ID from syn_idlist is mapped to its corresponding synset from list_synsets

In [72]:
pred2_synset = {list_token_synsets_ids[i]: list_synsets[i] for i in range(len(list_token_synsets_ids))}

pred2_synset


{'1466Caes': ['n#06299747 the territory occupied by a nation'],
 '1498Caes': ['v#01376117 move out of; as of a room, a country, a bus, etc.'],
 '4283Caes': ['n#06299747 the territory occupied by a nation'],
 '4296Caes': ['v#01376117 move out of; as of a room, a country, a bus, etc.'],
 '4569Caes': ['n#10428324 a source of danger'],
 '4578Caes': ['v#01444459 undergo or suffer'],
 '4840Caes': ['n#06299747 the territory occupied by a nation'],
 '4854Caes': ['v#01253107 change location; move, travel, or proceed'],
 '4962Caes': ['n#03243979 an open way (generally public) for travel or transportation'],
 '4973Caes': ['n#06277165 the country where you were born'],
 '4978Caes': ['v#01371248 go away from a place'],
 '5323Caes': ['n#06789983 a large natural stream of water (larger than a creek)'],
 '5366Caes': ['v#01401176 go across or through'],
 '5712Caes': ['n#06800223 sloping land (especially the slope beside a body of water)'],
 '5732Caes': ['v#01654097 collect in one place'],
 '5989Caes': 

#### Creating DataFrame for Token Synsets

In [73]:
synsets_df = pd.DataFrame([(k,v) for k,v in pred2_synset.items()], columns=["ID", "VERB SEMANTICS"]) #where 'synsets_df' is a dataframe containing token IDs and synsets of all the tokens (verbs and nouns) annotated with a synset
synsets_df

Unnamed: 0,ID,VERB SEMANTICS
0,1466Caes,[n#06299747 the territory occupied by a nation]
1,1498Caes,"[v#01376117 move out of; as of a room, a count..."
2,4283Caes,[n#06299747 the territory occupied by a nation]
3,4296Caes,"[v#01376117 move out of; as of a room, a count..."
4,4569Caes,[n#10428324 a source of danger]
...,...,...
1025,438653Virg,[v#01410345 run or move very quickly or hastily]
1026,440708Virg,[n#06669293 a lump of hard consolidated minera...
1027,440736Virg,[v#01343923 raise from a lower to a higher pos...
1028,441937Virg,[v#00988556 penetrate or cut through with a sh...


#### Merging Preverb Semantics with Synsets DataFrames


In [74]:
id_synsets_df = id_preverb_semantics_df.merge(synsets_df, on='ID', how='left') #where 'id_synsets_df' merges 'id_preverb_semantics_df' and 'synsets_df', basically adding the synset to the other verb parameters contained in 'id_preverb_semantics_df'
id_synsets_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS,LITERAL MEANING,VERB STEM,PREVERB,PREVERB SEMANTICS,VERB SEMANTICS
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,exeo,Pisone consulibus regni cupiditate inductus co...,Activity,ESCAPE-51.1,True,present stem,ex,out,"[v#01376117 move out of; as of a room, a count..."
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...,exeo,Post eius mortem nihilo minus Helvetii id quod...,Activity,ESCAPE-51.1,True,present stem,ex,out,"[v#01376117 move out of; as of a room, a count..."
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...,subeo,"frumentum omne, praeter quod secum portaturi e...",Achievement,MARVEL-31.3,False,present stem,sub,under,[v#01444459 undergo or suffer]
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...,transeo,Persuadent Rauracis et Tulingis et Latobrigis ...,Achievement,ESCAPE-51.1,True,perfect stem,trans,across,"[v#01253107 change location; move, travel, or ..."
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act,exeo,"Erant omnino itinera duo, quibus itineribus do...",Activity,ESCAPE-51.1,True,present stem,ex,out,[v#01371248 go away from a place]
...,...,...,...,...,...,...,...,...,...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2,True,present stem,pro,forward,[v#01410345 run or move very quickly or hastily]
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1,False,present stem,sub,under,[v#01737682 help in a difficult situation]
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2,False,present stem,ob,to,[v#01410345 run or move very quickly or hastily]
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4,False,present stem,sub,under,[v#01343923 raise from a lower to a higher pos...


## Figure 

#### Extracting and Storing Figure Synsets


In [75]:
list_figure_synsets = []  # Initializing an empty list to store figure synsets
tok2_figure_synsets = dict()  # Initializing an empty dictionary to store predicted figure synsets mapped to IDs
count= 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping fthrough each 'Figuresynset' annotation in the CAS
    for relation in cas.select('webanno.custom.Figuresynset'):
        # Looping through each token covered by the 'Figuresynset' annotation
        for token in cas.select_covered('webanno.custom.Figuresynset', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            figure_synset = relation.FigSyn  # Getting the figure synset value
            
            # Checking if the ID already exists in tok2_figure_synsets dictionary
            if id in tok2_figure_synsets:
                # If ID exists, append the new figure synset to the existing list of figure synsets
                list_figure_synsets = tok2_figure_synsets[id] + figure_synset
                tok2_figure_synsets[id] = list_figure_synsets
            else:
                # If ID doesn't exist, store the figure synset directly
                tok2_figure_synsets[id] = figure_synset 

print(tok2_figure_synsets)


{'1498Caes': uima_cas_StringArray(xmiID=None, elements=['n#06080139 the people who inhabit a territory or state'], type=Type(name=uima.cas.StringArray)), '4296Caes': uima_cas_StringArray(xmiID=None, elements=['n#00004123 a human being'], type=Type(name=uima.cas.StringArray)), '4578Caes': uima_cas_StringArray(xmiID=None, elements=['n#00004123 a human being'], type=Type(name=uima.cas.StringArray)), '4854Caes': uima_cas_StringArray(xmiID=None, elements=['n#00004123 a human being'], type=Type(name=uima.cas.StringArray)), '4978Caes': uima_cas_StringArray(xmiID=None, elements=['n#00004123 a human being'], type=Type(name=uima.cas.StringArray)), '5366Caes': uima_cas_StringArray(xmiID=None, elements=['n#06789983 a large natural stream of water (larger than a creek)'], type=Type(name=uima.cas.StringArray)), '5732Caes': uima_cas_StringArray(xmiID=None, elements=['n#00004123 a human being'], type=Type(name=uima.cas.StringArray)), '5997Caes': uima_cas_StringArray(xmiID=None, elements=['n#00004123 a

#### Extracting and Cleaning Figure Synsets


In [76]:
list2_figure_synsets = []  # Initializing an empty list to store figure synsets
count= 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Figuresynset' annotation in the CAS
    for relation in cas.select('webanno.custom.Figuresynset'):
        figure_synset = relation.FigSyn  # Getting the figure synset value
        list2_figure_synsets.append(str(figure_synset))  # Appending the figure synset to the list

# Removing unwanted characters from each element in list2_figure_synsets
list2_figure_synsets = [s.replace('u.c.StringArray(elements=', '') for s in list2_figure_synsets]

print(list2_figure_synsets)


["['n#06080139 the people who inhabit a territory or state'])", "['n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#06789983 a large natural stream of water (larger than a creek)'])", "['n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#07591125 an enlisted man or woman who serves in an army', 'n#00004123 a human being'])", "['n#07591125 an enlisted man or woman who serves in an army', 'n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#06095787 an army unit large enough to sustain combat', 'n#00004123 a human being'])", "['n#00004123 a human being'])", "['n#06296283 a small administrative division of a country'])", "['n#00004123 a human being'])", "['n#00004123 a human bei

In [77]:
# Removing '])' characters from each string in fslist2 and storing the modified strings in list_figure_synsets
list_figure_synsets = [l.replace(')', '') for l in list2_figure_synsets]

print(list_figure_synsets)

["['n#06080139 the people who inhabit a territory or state']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#06789983 a large natural stream of water (larger than a creek']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#07591125 an enlisted man or woman who serves in an army', 'n#00004123 a human being']", "['n#07591125 an enlisted man or woman who serves in an army', 'n#00004123 a human being']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#06095787 an army unit large enough to sustain combat', 'n#00004123 a human being']", "['n#00004123 a human being']", "['n#06296283 a small administrative division of a country']", "['n#00004123 a human being']", "['n#00004123 a human being']", "['n#00004123 a

#### Extracting Token IDs for Figure Synsets


In [78]:
list_token_figure_synset_ids = []  # Initializing an empty list to store IDs of tokens annotated with a figure
count= 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Figuresynset' annotation in the CAS
    for relation in cas.select('webanno.custom.Figuresynset'):
        # Looping through each token covered by the 'Figuresynset' annotation
        for token in cas.select_covered('webanno.custom.Figuresynset', relation):
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            list_token_figure_synset_ids.append(id)  # Appending the token ID to the list

print(list_token_figure_synset_ids)


['1498Caes', '4296Caes', '4578Caes', '4854Caes', '4978Caes', '5366Caes', '5732Caes', '5997Caes', '6831Caes', '7012Caes', '7293Caes', '8498Caes', '9426Caes', '9649Caes', '10407Caes', '10617Caes', '10862Caes', '10890Caes', '10931Caes', '11152Caes', '11818Caes', '12323Caes', '21567Caes', '24504Caes', '24513Caes', '25610Caes', '26036Caes', '26225Caes', '26268Caes', '26640Caes', '27386Caes', '27831Caes', '28359Caes', '29918Caes', '30767Caes', '31365Caes', '33967Caes', '34210Caes', '34348Caes', '36546Caes', '36804Caes', '37047Caes', '37157Caes', '37517Caes', '38986Caes', '41443Caes', '46744Caes', '46852Caes', '48088Caes', '50312Caes', '52574Caes', '53622Caes', '56162Caes', '56208Caes', '56243Caes', '56345Caes', '56583Caes', '58654Caes', '59161Caes', '60301Caes', '61670Caes', '63933Caes', '64997Caes', '65326Caes', '65391Caes', '65423Caes', '65528Caes', '66243Caes', '66324Caes', '66389Caes', '66411Caes', '66489Caes', '66567Caes', '66805Caes', '67219Caes', '67317Caes', '68853Caes', '69478Caes',

#### Mapping Token IDs to Figure Synsets


In [79]:
# Create a dictionary where each token ID from list_token_figure_synset_ids is mapped to its corresponding figure synset from list_figure_synsets
tok2_figure_synsets = {list_token_figure_synset_ids[i]: list_figure_synsets[i] for i in range(len(list_token_figure_synset_ids))}

tok2_figure_synsets



{'1498Caes': "['n#06080139 the people who inhabit a territory or state']",
 '4296Caes': "['n#00004123 a human being']",
 '4578Caes': "['n#00004123 a human being']",
 '4854Caes': "['n#00004123 a human being']",
 '4978Caes': "['n#00004123 a human being']",
 '5366Caes': "['n#06789983 a large natural stream of water (larger than a creek']",
 '5732Caes': "['n#00004123 a human being']",
 '5997Caes': "['n#00004123 a human being']",
 '6831Caes': "['n#07591125 an enlisted man or woman who serves in an army', 'n#00004123 a human being']",
 '7012Caes': "['n#07591125 an enlisted man or woman who serves in an army', 'n#00004123 a human being']",
 '7293Caes': "['n#00004123 a human being']",
 '8498Caes': "['n#00004123 a human being']",
 '9426Caes': "['n#00004123 a human being']",
 '9649Caes': "['n#00004123 a human being']",
 '10407Caes': "['n#00004123 a human being']",
 '10617Caes': "['n#00004123 a human being']",
 '10862Caes': "['n#00004123 a human being']",
 '10890Caes': "['n#06095787 an army unit 

#### Creating Figure Synsets DataFrame


In [80]:
figure_synsets_df = pd.DataFrame([(k,v) for k,v in tok2_figure_synsets.items()], columns=["ID", "FIGURE SEMANTICS"]) #where 'figure_synsets_df' is a dataframe containing token IDs and figure synsets of all the tokens annotated with a figure 
figure_synsets_df

Unnamed: 0,ID,FIGURE SEMANTICS
0,1498Caes,['n#06080139 the people who inhabit a territor...
1,4296Caes,['n#00004123 a human being']
2,4578Caes,['n#00004123 a human being']
3,4854Caes,['n#00004123 a human being']
4,4978Caes,['n#00004123 a human being']
...,...,...
499,435595Virg,['n#06888584 a female deity']
500,436869Virg,['n#06888584 a female deity']
501,438653Virg,['n#10169961 a serious (sometimes fatal infect...
502,440736Virg,['n#00004123 a human being']


#### Merging Synsets and Figure Synsets DataFrame


In [81]:
id_figure_synsets_df = id_synsets_df.merge(figure_synsets_df, on='ID', how='left') #where 'id_figure_synsets_df' merges 'id_synsets_df' and 'figure_synsets_df', basically adding the figure synset to the other parameters contained in 'id_synsets_df'
id_figure_synsets_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS,LITERAL MEANING,VERB STEM,PREVERB,PREVERB SEMANTICS,VERB SEMANTICS,FIGURE SEMANTICS
0,1498Caes,exirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,exeo,Pisone consulibus regni cupiditate inductus co...,Activity,ESCAPE-51.1,True,present stem,ex,out,"[v#01376117 move out of; as of a room, a count...",['n#06080139 the people who inhabit a territor...
1,4296Caes,exeant,Mood=Subj|Number=Plur|Person=3|Tense=Pres|Verb...,exeo,Post eius mortem nihilo minus Helvetii id quod...,Activity,ESCAPE-51.1,True,present stem,ex,out,"[v#01376117 move out of; as of a room, a count...",['n#00004123 a human being']
2,4578Caes,subeunda,Case=Nom|Gender=Neut|Number=Plur|Tense=Pres|Ve...,subeo,"frumentum omne, praeter quod secum portaturi e...",Achievement,MARVEL-31.3,False,present stem,sub,under,[v#01444459 undergo or suffer],['n#00004123 a human being']
3,4854Caes,transierant,Mood=Ind|Number=Plur|Person=3|Tense=Pqp|VerbFo...,transeo,Persuadent Rauracis et Tulingis et Latobrigis ...,Achievement,ESCAPE-51.1,True,perfect stem,trans,across,"[v#01253107 change location; move, travel, or ...",['n#00004123 a human being']
4,4978Caes,exire,Tense=Pres|VerbForm=Inf|Voice=Act,exeo,"Erant omnino itinera duo, quibus itineribus do...",Activity,ESCAPE-51.1,True,present stem,ex,out,[v#01371248 go away from a place],['n#00004123 a human being']
...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,435595Virg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2,True,present stem,pro,forward,[v#01410345 run or move very quickly or hastily],['n#06888584 a female deity']
500,436869Virg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1,False,present stem,sub,under,[v#01737682 help in a difficult situation],['n#06888584 a female deity']
501,438653Virg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2,False,present stem,ob,to,[v#01410345 run or move very quickly or hastily],['n#10169961 a serious (sometimes fatal infect...
502,440736Virg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4,False,present stem,sub,under,[v#01343923 raise from a lower to a higher pos...,['n#00004123 a human being']


## Ground synset

#### Extracting Ground Synsets 


In [82]:
list_ground_synsets = []  # Initializing an empty list to store ground synsets
tok2_ground_synsets = dict()  # Initializing an empty dictionary to store predicted ground synsets mapped to IDs
count= 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Groundsynset' annotation in the CAS
    for relation in cas.select('webanno.custom.Groundsynset'):
        # Looping through each token covered by the 'Groundsynset' annotation
        for token in cas.select_covered('webanno.custom.Groundsynset', relation):
            tok = token.get_covered_text()  # Getting the text covered by the current token
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            ground_synset = relation.GroundSyn  # Getting the ground synset value
            
            # Checking if the ID already exists in tok2_ground_synsets dictionary
            if id in tok2_ground_synsets:
                # If ID exists, append the new ground synset to the existing list of ground synsets
                list_ground_synsets = tok2_ground_synsets[id] + [ground_synset]
                tok2_ground_synsets[id] = list_ground_synsets
            else:
                # If ID doesn't exist, store the ground synset directly
                tok2_ground_synsets[id] = [ground_synset]

print(tok2_ground_synsets)


{'1498Caes': [uima_cas_StringArray(xmiID=None, elements=['n#06299747 the territory occupied by a nation'], type=Type(name=uima.cas.StringArray))], '4296Caes': [uima_cas_StringArray(xmiID=None, elements=['n#06299747 the territory occupied by a nation'], type=Type(name=uima.cas.StringArray))], '4578Caes': [uima_cas_StringArray(xmiID=None, elements=['n#10428324 a source of danger'], type=Type(name=uima.cas.StringArray))], '4854Caes': [uima_cas_StringArray(xmiID=None, elements=['n#06299747 the territory occupied by a nation'], type=Type(name=uima.cas.StringArray))], '4978Caes': [uima_cas_StringArray(xmiID=None, elements=['n#06277165 the country where you were born'], type=Type(name=uima.cas.StringArray))], '5732Caes': [uima_cas_StringArray(xmiID=None, elements=['n#06800223 sloping land (especially the slope beside a body of water)'], type=Type(name=uima.cas.StringArray))], '5997Caes': [uima_cas_StringArray(xmiID=None, elements=['n#06287351 a large and densely populated urban area; may incl

#### Extracting and Cleaning Ground Synsets



In [83]:
list_ground_synsets = []  # Initializing an empty list to store ground synsets

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Groundsynset' annotation in the CAS
    for relation in cas.select('webanno.custom.Groundsynset'):
        ground_synset = relation.GroundSyn  # Getting the ground synset value
        # Extracting the useful information from the StringArray elements
        ground_synset_values = [element.strip().strip("'") for element in ground_synset.elements]
        # Appending the ground synset values as a sublist
        list_ground_synsets.append(ground_synset_values)

print(list_ground_synsets)


[['n#06299747 the territory occupied by a nation'], ['n#06299747 the territory occupied by a nation'], ['n#10428324 a source of danger'], ['n#06299747 the territory occupied by a nation'], ['n#06277165 the country where you were born'], ['n#06800223 sloping land (especially the slope beside a body of water)'], ['n#06287351 a large and densely populated urban area; may include several independent administrative districts'], [], ['n#06299747 the territory occupied by a nation'], ['n#00004123 a human being'], ['n#06789983 a large natural stream of water (larger than a creek)'], ['n#06095787 an army unit large enough to sustain combat', 'n#00004123 a human being'], ['n#06789983 a large natural stream of water (larger than a creek)'], ['n#00004123 a human being'], ['n#06277165 the country where you were born'], ['n#06789983 a large natural stream of water (larger than a creek)'], ['n#06789983 a large natural stream of water (larger than a creek)'], ['n#00004123 a human being'], ['n#00004123

#### Extracting and Mapping Ground Synset Annotations from CAS Files

In [84]:
list_token_ground_synset_ids = []  # Initializing an empty list to store IDs of tokens annotated with a ground 
count= 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Groundsynset' annotation in the CAS
    for relation in cas.select('webanno.custom.Groundsynset'):
        # Looping through each token covered by the 'Groundsynset' annotation
        for token in cas.select_covered('webanno.custom.Groundsynset', relation):
            id = token.begin  # Getting the beginning offset of the token
            id = str(id) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            list_token_ground_synset_ids.append(id)  # Appending the token ID to the list

print(list_token_ground_synset_ids)


['1498Caes', '4296Caes', '4578Caes', '4854Caes', '4978Caes', '5732Caes', '5997Caes', '9426Caes', '9649Caes', '10407Caes', '10617Caes', '10862Caes', '10890Caes', '10931Caes', '11152Caes', '11818Caes', '12323Caes', '21567Caes', '24504Caes', '25610Caes', '26036Caes', '26268Caes', '26640Caes', '27386Caes', '27831Caes', '28359Caes', '29918Caes', '30767Caes', '31365Caes', '33967Caes', '34210Caes', '37157Caes', '37517Caes', '46744Caes', '46852Caes', '48088Caes', '53622Caes', '56162Caes', '59161Caes', '60301Caes', '61670Caes', '65326Caes', '66243Caes', '66324Caes', '66489Caes', '66567Caes', '67219Caes', '67317Caes', '68853Caes', '69478Caes', '70242Caes', '70802Caes', '72211Caes', '74081Caes', '74785Caes', '75094Caes', '75926Caes', '76251Caes', '76487Caes', '77316Caes', '77401Caes', '77454Caes', '78212Caes', '78345Caes', '81306Caes', '81803Caes', '81859Caes', '85712Caes', '86330Caes', '89385Caes', '90684Caes', '91683Caes', '92706Caes', '93203Caes', '97054Caes', '97169Caes', '97624Caes', '102818

#### Mapping Token IDs to Ground Synsets


In [85]:
# Creating a dictionary where each token ID from gs_idlist is mapped to its corresponding ground synset from gslist
tok2_ground_synset = {list_token_ground_synset_ids[i]: list_ground_synsets[i] for i in range(len(list_token_ground_synset_ids))}

tok2_ground_synset


{'1498Caes': ['n#06299747 the territory occupied by a nation'],
 '4296Caes': ['n#06299747 the territory occupied by a nation'],
 '4578Caes': ['n#10428324 a source of danger'],
 '4854Caes': ['n#06299747 the territory occupied by a nation'],
 '4978Caes': ['n#06277165 the country where you were born'],
 '5732Caes': ['n#06800223 sloping land (especially the slope beside a body of water)'],
 '5997Caes': ['n#06287351 a large and densely populated urban area; may include several independent administrative districts'],
 '9426Caes': [],
 '9649Caes': ['n#06299747 the territory occupied by a nation'],
 '10407Caes': ['n#00004123 a human being'],
 '10617Caes': ['n#06789983 a large natural stream of water (larger than a creek)'],
 '10862Caes': ['n#06095787 an army unit large enough to sustain combat',
  'n#00004123 a human being'],
 '10890Caes': ['n#06789983 a large natural stream of water (larger than a creek)'],
 '10931Caes': ['n#00004123 a human being'],
 '11152Caes': ['n#06277165 the country whe

#### Creating Token IDs and Ground Synsets DataFrame

In [86]:
ground_synsets_df = pd.DataFrame([(k,v) for k,v in tok2_ground_synset.items()], columns=["ID", "GROUND SEMANTICS"]) #where 'ground_synsets_df' is a dataframe containing token IDs and ground synsets of all the tokens annotated with a ground 
ground_synsets_df

Unnamed: 0,ID,GROUND SEMANTICS
0,1498Caes,[n#06299747 the territory occupied by a nation]
1,4296Caes,[n#06299747 the territory occupied by a nation]
2,4578Caes,[n#10428324 a source of danger]
3,4854Caes,[n#06299747 the territory occupied by a nation]
4,4978Caes,[n#06277165 the country where you were born]
...,...,...
267,428401Virg,[n#00004123 a human being]
268,436869Virg,[n#07127521 a male with the same parents as so...
269,438653Virg,[n#06381267 a point located with respect to su...
270,440736Virg,[n#06669293 a lump of hard consolidated minera...


#### Merging Ground Semantics with Figure Semantics DataFrames



In [54]:
id_ground_synsets_df = id_figure_synsets_df.merge(ground_synsets_df, on='ID', how='left') #where 'id_ground_synsets_df' merges 'id_figure_synsets_df' and 'ground_synsets_df', basically adding the ground synset to the other parameters contained in 'id_figure_synsets_df'
id_ground_synsets_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS,LITERAL MEANING,VERB STEM,PREVERB,PREVERB SEMANTICS,VERB SEMANTICS,FIGURE SEMANTICS,GROUND SEMANTICS
0,7504Aen,perire,Tense=Pres|VerbForm=Inf|Voice=Act,pereo,"Quae neque Dardaniis campis potuere perire, ...",Achievement,DIE-42.4.1,False,present stem,per,"(idea of destruction/death), across",[v#00250254 pass from physical life and lose a...,['n#00004123 a human being'],
1,7614Aen,Concurrunt,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,concurro,Concurrunt vel uti venti cum spiritus austri ...,Achievement,BUMP-18.4-1,True,present stem,cum,together,[v#01075789 crash together with violent impact],['n#00004123 a human being'],
2,7807Aen,conveniunt,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,convenio,Undique conveniunt vel ut imber tela tribuno,Activity,HERD-47.5.2,True,present stem,cum,together,[v#01654097 collect in one place],['n#03176413 a body that is thrown or projected'],[n#07168973 an officer in command of a militar...
3,627Ap,praeeunte,Case=Abl|Gender=Masc|Number=Sing|Tense=Pres|Ve...,praeeo,"Mox in urbe Latia advena studiorum, Quiritium ...",State,ACCOMPANY-51.7,False,present stem,prae,before,"[v#01661609 be a guiding force, as with direct...",['n#07632177 a person whose occupation is teac...,
4,638Ap,aggressus,Case=Nom|Gender=Masc|Number=Sing|Tense=Past|Ve...,adgredior,"Mox in urbe Latia advena studiorum, Quiritium ...",Accomplishment,BEGIN-55.1,False,supine stem,ad,to,[v#01661230 begin to deal with],['n#00004123 a human being'],[n#05167497 a human written or spoken language...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,435595Verg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2,True,present stem,pro,forward,[v#01410345 run or move very quickly or hastily],['n#06888584 a female deity'],
1479,436869Verg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1,False,present stem,sub,under,[v#01737682 help in a difficult situation],['n#06888584 a female deity'],[n#07127521 a male with the same parents as so...
1480,438653Verg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2,False,present stem,ob,to,[v#01410345 run or move very quickly or hastily],['n#10169961 a serious (sometimes fatal infect...,[n#06381267 a point located with respect to su...
1481,440736Verg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4,False,present stem,sub,under,[v#01343923 raise from a lower to a higher pos...,['n#00004123 a human being'],[n#06669293 a lump of hard consolidated minera...


In [55]:
#Exporting id_ground_synsets_df to csv
id_ground_synsets_df.to_csv('title.csv')

### Obtaining participants (Figure, Ground) lemmas

In [56]:
#Mapping verb token IDs to participants IDs

list_participants = []  # Initializing an empty list to store participants
tok2_participants = dict()  # Initializing an empty dictionary to store tokens mapped to their corresponding participants
count = 0  # Initializing a count variable

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Participants' annotation in the CAS
    for relation in cas.select('webanno.custom.Paticipants'):
        dep = relation.Dependent  # Dependent token of the participant relation
        tokdep = dep.get_covered_text()  # Covered text of the dependent token
        id = str(dep.begin) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
        list_participants.append(str(tokdep))  # Appending the covered text of the dependent token to srlist

        gov = relation.Governor  # Governor token of the participant relation
        toksr = gov.get_covered_text()  # Covered text of the governor token
        id2 = str(gov.begin) + file_input_abbr  # Creating a unique identifier by appending the file abbreviation to the offset of the governor token
        
        # Checking if the ID already exists in tok2_participants
        if id in tok2_participants:
            list_participants = tok2_participants[id] + [id2]  # If so, append the participant token ID to the existing list
            tok2_participants[id] = list_participants  # Update the dictionary with the appended list
        else:
            tok2_participants[id] = [id2]  # If not, create a new entry with a list containing the participant token ID

print(tok2_participants)  



{'7807Aen': ['7831Aen', '7836Aen', 'praeeunte', 'aggressus', 'perveneritis', 'obiturus', 'provenire', 'convenire', 'transeas', 'aufugi', 'abierunt', 'pervenimus', 'exire', 'evolo', 'occurrit', 'occurrit'], '627Ap': ['618Ap'], '638Ap': ['583Ap'], '4001Ap': ['3991Ap'], '6965Ap': ['6953Ap'], '11895Ap': ['11872Ap'], '13304Ap': ['13271Ap'], '14102Ap': ['14094Ap'], '20808Ap': ['20796Ap'], '21919Ap': ['21891Ap'], '31740Ap': ['31723Ap'], '32491Ap': ['32474Ap'], '34900Ap': ['34879Ap'], '37524Ap': ['37451Ap', '37514Ap', 'occurrens', 'advenit', 'subit', 'adire', 'aufugere', 'obeas', 'abis', 'accurro', 'Occurrit', 'convenissent', 'advenit', 'involo', 'procurrens', 'aggressus', 'decurrit', 'decurrit'], '38710Ap': ['38703Ap'], '40602Ap': ['40596Ap'], '46705Ap': ['46697Ap'], '51135Ap': ['51105Ap'], '51515Ap': ['51501Ap'], '53490Ap': ['53484Ap'], '55177Ap': ['55198Ap'], '55702Ap': ['55694Ap'], '57275Ap': ['57348Ap'], '61144Ap': ['61136Ap'], '62018Ap': ['61963Ap'], '63203Ap': ['63194Ap'], '65487Ap': ['

In [57]:
participants_df = pd.DataFrame([(k,v) for k,v in tok2_participants.items()], columns=["ID", "PARTICIPANT ID"])  #where 'part_df' is a dataframe containing token IDs and IDs of all the tokens annotated with a participant relation (i.e., Figure/Ground)
participants_df

Unnamed: 0,ID,PARTICIPANT ID
0,7807Aen,"[7831Aen, 7836Aen, praeeunte, aggressus, perve..."
1,627Ap,[618Ap]
2,638Ap,[583Ap]
3,4001Ap,[3991Ap]
4,6965Ap,[6953Ap]
...,...,...
906,435595Verg,[435621Verg]
907,436869Verg,"[436844Verg, 436880Verg, occurrere, subirent, ..."
908,438653Verg,[438644Verg]
909,440736Verg,[440708Verg]


In [58]:
list_participant_lemmas = []  # Initializing an empty list to store participant lemmas
tok2_participant_lemmas = dict()  # Initializing an empty dictionary to map tokens to their participant lemmas
count = 0  # Initializing a count variable

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Lemma' annotation in the CAS
    for relation in cas.select('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma'):
        # Looping through each token covered by 'Lemma' annotation
        for token in cas.select_covered('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma', relation):
            tok = token.get_covered_text()  # Covered text of the token
            id = str(token.begin) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            lemma = relation.value  # Value of the lemma relation
            
            # Checking if the ID already exists in tok2_participant_lemmas
            if id in tok2_participant_lemmas:
                list_lemmas = tok2_participant_lemmas[id] + lemma  # If so, append the value to the existing list of lemma fillers
                tok2_participant_lemmas[id] = list_lemmas  # Update the dictionary with the appended list
            else:
                tok2_participant_lemmas[id] = lemma  # If not, create a new entry with the lemma filler value

print(tok2_participant_lemmas)  


{'7489Aen': 'campus', '7504Aen': 'pereo', '7614Aen': 'concurro', '7799Aen': 'undique', '7807Aen': 'convenio', '7831Aen': 'telum', '7836Aen': 'tribunus', '583Ap': 'sermo', '618Ap': 'magister', '627Ap': 'praeeo', '638Ap': 'adgredior', '2954Ap': 'intereo', '3256Ap': 'bacillum', '3278Ap': 'ad', '3281Ap': 'occipitium', '3292Ap': 'per', '3296Ap': 'ingluvies', '3306Ap': 'subeo', '3991Ap': 'civitas', '4001Ap': 'pervenio', '6953Ap': 'spectaculum', '6965Ap': 'obeo', '9077Ap': 'provenio', '11123Ap': 'aufugio', '11134Ap': 'istinc', '11872Ap': 'affectus', '11881Ap': 'in', '11884Ap': 'contrarius', '11895Ap': 'provenio', '13271Ap': 'nomen', '13304Ap': 'convenio', '14090Ap': 'per', '14094Ap': 'fluvius', '14102Ap': 'transeo', '14123Ap': 'abeo', '16724Ap': 'adgredior', '20774Ap': 'per', '20796Ap': 'solitudo', '20808Ap': 'aufugio', '21324Ap': 'provenio', '21888Ap': 'ad', '21891Ap': 'villula', '21919Ap': 'abeo', '22995Ap': 'progredior', '23699Ap': 'invenio', '27261Ap': 'abeo', '29687Ap': 'circumeo', '3172

In [59]:
participant_lemmas_df = pd.DataFrame([(k,v) for k,v in tok2_participant_lemmas.items()], columns=["PARTICIPANT ID", "PARTICIPANT LEMMA"]) #where 'part_lem_df' is a dataframe containing IDs of all the tokens annotated with a participant relation (i.e., Figure/Ground) and their lemma
participant_lemmas_df

Unnamed: 0,PARTICIPANT ID,PARTICIPANT LEMMA
0,7489Aen,campus
1,7504Aen,pereo
2,7614Aen,concurro
3,7799Aen,undique
4,7807Aen,convenio
...,...,...
3282,440708Verg,ille
3283,440736Verg,subeo
3284,441917Verg,per
3285,441937Verg,transeo


In [60]:
# Initialize a new dictionary to store the mapping of IDs to lemmas
pred2_participant_lemma = {}

# Iterate over the keys of pred2_lemmas
for id, lemma in pred2_lemmas.items():
    # Check if the ID exists in tok2_participants
    if id in tok2_participants:
        # If so, find the corresponding participant IDs
        participant_ids = tok2_participants[id]
        # Check if there are participant IDs
        if participant_ids:
            # Initialize a list to store the lemmas corresponding to the participant IDs
            participant_lemmas = []
            for participant_id in participant_ids:
                # Lookup the lemma corresponding to the participant ID
                if participant_id in pred2_lemmas:
                    participant_lemma = pred2_lemmas[participant_id]
                    participant_lemmas.append(participant_lemma)
            # Store the list of lemmas corresponding to the participant IDs
            pred2_participant_lemma[id] = participant_lemmas

print(pred2_participant_lemma)


{'7807Aen': ['telum', 'tribunus'], '627Ap': ['magister'], '638Ap': ['sermo'], '4001Ap': ['civitas'], '6965Ap': ['spectaculum'], '11895Ap': ['affectus'], '13304Ap': ['nomen'], '14102Ap': ['fluvius'], '20808Ap': ['solitudo'], '21919Ap': ['villula'], '31740Ap': ['domus'], '32491Ap': ['fauces'], '34900Ap': ['hospitium'], '37524Ap': ['pars', 'lumen'], '38710Ap': ['oculus'], '40602Ap': ['Liber'], '46705Ap': ['fluctus'], '51135Ap': ['locus'], '51515Ap': ['mortuus'], '53490Ap': ['munus'], '55177Ap': ['bestia'], '55702Ap': ['cadaver'], '57275Ap': ['senex'], '61144Ap': ['reliquus'], '62018Ap': ['dies'], '63203Ap': ['latro'], '65487Ap': ['populus'], '69546Ap': ['ego'], '71331Ap': ['mulier', 'theatrum'], '71779Ap': ['fortuna'], '72278Ap': ['puer'], '74642Ap': ['domus', 'magistratus'], '79760Ap': ['is'], '84767Ap': ['ego', 'Fotis'], '87247Ap': ['iste', 'cogitatio'], '89491Ap': ['asinus'], '90315Ap': ['asinus'], '93188Ap': ['auxilium'], '95922Ap': ['scaevitas', 'conatus'], '96697Ap': ['iuvenis'], '9

In [61]:
id_pred2_token_participant_df = pd.DataFrame([(k,v) for k,v in pred2_participant_lemma.items()], columns=["ID", "PARTICIPANT LEMMA"]) #where 'id_pred2_token_participant_df' is a dataframe containing token IDs and IDs of all the tokens annotated with a Spatial relation
id_pred2_token_participant_df

Unnamed: 0,ID,PARTICIPANT LEMMA
0,7807Aen,"[telum, tribunus]"
1,627Ap,[magister]
2,638Ap,[sermo]
3,4001Ap,[civitas]
4,6965Ap,[spectaculum]
...,...,...
906,435595Verg,[dea]
907,436869Verg,"[Iuturna, frater]"
908,438653Verg,[Iuturna]
909,440736Verg,[ille]


### Participant role

In [63]:
list_participant_role = []  # Initializing an empty list to store frames
tok2_participant_role = dict()  # Initializing an empty dictionary to map tokens to their frames
count = 0  # Initializing a count variable

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Participants' annotation in the CAS
    for relation in cas.select('webanno.custom.Paticipants'):
        dep = relation.Dependent  # Dependent token of the participant relation
        tokdep = dep.get_covered_text()  # Covered text of the dependent token
        id = str(dep.begin) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
        list_participants.append(str(tokdep))  # Appending the covered text of the dependent token to sptlist

        role = relation.Frame  # Frame associated with the participant relation
        
        # Checking if the ID already exists in tok2_participant_role
        if id in tok2_participant_role:
            list_participant_role = tok2_participant_role[id] + [role]  # If so, append the frame to the existing list of frames
            tok2_participant_role[id] = list_participant_role  # Update the dictionary with the appended list
        else:
            tok2_participant_role[id] = [role]  # If not, create a new entry with a list containing the frame

print(tok2_participant_role)  


{'7807Aen': ['Ground', 'Figure'], '627Ap': ['Figure'], '638Ap': ['Ground'], '4001Ap': ['Ground'], '6965Ap': ['Ground'], '11895Ap': ['Figure'], '13304Ap': ['Figure'], '14102Ap': ['Ground'], '20808Ap': ['Ground'], '21919Ap': ['Ground'], '31740Ap': ['Ground'], '32491Ap': ['Figure'], '34900Ap': ['Ground'], '37524Ap': ['Ground', 'Figure'], '38710Ap': ['Ground'], '40602Ap': ['Figure'], '46705Ap': ['Ground'], '51135Ap': ['Ground'], '51515Ap': ['Figure'], '53490Ap': ['Ground'], '55177Ap': ['Figure'], '55702Ap': ['Ground'], '57275Ap': ['Figure'], '61144Ap': ['Figure'], '62018Ap': ['Figure'], '63203Ap': ['Ground'], '65487Ap': ['Figure'], '69546Ap': ['Ground'], '71331Ap': ['Ground', 'Figure'], '71779Ap': ['Ground'], '72278Ap': ['Figure'], '74642Ap': ['Figure', 'Ground'], '79760Ap': ['Ground'], '84767Ap': ['Ground', 'Figure'], '87247Ap': ['Ground', 'Figure'], '89491Ap': ['Ground'], '90315Ap': ['Ground'], '93188Ap': ['Ground'], '95922Ap': ['Figure', 'Ground'], '96697Ap': ['Figure'], '99825Ap': ['Gr

In [64]:
participant_role_df = pd.DataFrame([(k,v) for k,v in tok2_participant_role.items()], columns=["ID", "PARTICIPANT ROLE"]) #where 'participant_role_df' is a dataframe containing IDs of all the tokens annotated with a participant relation (i.e., Figure/Ground) and their role
participant_role_df

Unnamed: 0,ID,PARTICIPANT ROLE
0,7807Aen,"[Ground, Figure]"
1,627Ap,[Figure]
2,638Ap,[Ground]
3,4001Ap,[Ground]
4,6965Ap,[Ground]
...,...,...
906,435595Verg,[Figure]
907,436869Verg,"[Ground, Figure]"
908,438653Verg,[Ground]
909,440736Verg,[Ground]


In [65]:
id_participant_role_lemma_df = id_pred2_token_participant_df.merge(participant_role_df, on='ID', how='left')
id_participant_role_lemma_df

Unnamed: 0,ID,PARTICIPANT LEMMA,PARTICIPANT ROLE
0,7807Aen,"[telum, tribunus]","[Ground, Figure]"
1,627Ap,[magister],[Figure]
2,638Ap,[sermo],[Ground]
3,4001Ap,[civitas],[Ground]
4,6965Ap,[spectaculum],[Ground]
...,...,...,...
906,435595Verg,[dea],[Figure]
907,436869Verg,"[Iuturna, frater]","[Ground, Figure]"
908,438653Verg,[Iuturna],[Ground]
909,440736Verg,[ille],[Ground]


In [66]:
verb_with_participant_roles_lemmas_df = id_ground_synsets_df.merge(id_participant_role_lemma_df, on='ID', how='left')
verb_with_participant_roles_lemmas_df

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS,LITERAL MEANING,VERB STEM,PREVERB,PREVERB SEMANTICS,VERB SEMANTICS,FIGURE SEMANTICS,GROUND SEMANTICS,PARTICIPANT LEMMA,PARTICIPANT ROLE
0,7504Aen,perire,Tense=Pres|VerbForm=Inf|Voice=Act,pereo,"Quae neque Dardaniis campis potuere perire, ...",Achievement,DIE-42.4.1,False,present stem,per,"(idea of destruction/death), across",[v#00250254 pass from physical life and lose a...,['n#00004123 a human being'],,,
1,7614Aen,Concurrunt,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,concurro,Concurrunt vel uti venti cum spiritus austri ...,Achievement,BUMP-18.4-1,True,present stem,cum,together,[v#01075789 crash together with violent impact],['n#00004123 a human being'],,,
2,7807Aen,conveniunt,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,convenio,Undique conveniunt vel ut imber tela tribuno,Activity,HERD-47.5.2,True,present stem,cum,together,[v#01654097 collect in one place],['n#03176413 a body that is thrown or projected'],[n#07168973 an officer in command of a militar...,"[telum, tribunus]","[Ground, Figure]"
3,627Ap,praeeunte,Case=Abl|Gender=Masc|Number=Sing|Tense=Pres|Ve...,praeeo,"Mox in urbe Latia advena studiorum, Quiritium ...",State,ACCOMPANY-51.7,False,present stem,prae,before,"[v#01661609 be a guiding force, as with direct...",['n#07632177 a person whose occupation is teac...,,[magister],[Figure]
4,638Ap,aggressus,Case=Nom|Gender=Masc|Number=Sing|Tense=Past|Ve...,adgredior,"Mox in urbe Latia advena studiorum, Quiritium ...",Accomplishment,BEGIN-55.1,False,supine stem,ad,to,[v#01661230 begin to deal with],['n#00004123 a human being'],[n#05167497 a human written or spoken language...,[sermo],[Ground]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,435595Verg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2,True,present stem,pro,forward,[v#01410345 run or move very quickly or hastily],['n#06888584 a female deity'],,[dea],[Figure]
1479,436869Verg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1,False,present stem,sub,under,[v#01737682 help in a difficult situation],['n#06888584 a female deity'],[n#07127521 a male with the same parents as so...,"[Iuturna, frater]","[Ground, Figure]"
1480,438653Verg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2,False,present stem,ob,to,[v#01410345 run or move very quickly or hastily],['n#10169961 a serious (sometimes fatal infect...,[n#06381267 a point located with respect to su...,[Iuturna],[Ground]
1481,440736Verg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4,False,present stem,sub,under,[v#01343923 raise from a lower to a higher pos...,['n#00004123 a human being'],[n#06669293 a lump of hard consolidated minera...,[ille],[Ground]


In [67]:
verb_with_participant_roles_lemmas_df.to_csv('Latin_with_participants_without_spatial_relations.csv')

### Mapping IDs of verb tokens onto IDs of spatial relations

In [68]:
# Initializing an empty list to store spatial relations covered text
list_spatial_relations = []

# Initializing an empty dictionary to map tokens to their spatial relations with unique identifiers
tok2_spatial_relations = dict()

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Current file name
    file_input_abbr = files_abbreviated[i]  # Abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file

    # Looping through each 'Spatiality' annotation in the CAS
    for relation in cas.select('webanno.custom.Spatiality'):
        dep = relation.Dependent  # Dependent token of the spatial relation
        tokdep = dep.get_covered_text()  # Covered text of the dependent token
        id = str(dep.begin) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
        list_spatial_relations.append(str(tokdep))  # Appending the covered text of the dependent token to list_spatial_relations

        gov = relation.Governor  # Governor token of the spatial relation
        tokgov = gov.get_covered_text()  # Covered text of the governor token
        id2 = str(gov.begin) + file_input_abbr # Creating a unique identifier by appending the file abbreviation to the offset of the governor token
        
        if id in tok2_spatial_relations:  # Checking if the ID already exists in tok2_spatial_relations
            tok2_spatial_relations[id].append(id2)  # If so, append the unique identifier to the existing list
        else:
            tok2_spatial_relations[id] = [id2]  # If not, create a new entry with a list containing the unique identifier

print(tok2_spatial_relations)


{'7504Aen': ['7489Aen'], '7807Aen': ['7836Aen', '7799Aen'], '638Ap': ['583Ap'], '3306Ap': ['3296Ap', '3281Ap'], '4001Ap': ['3991Ap'], '6965Ap': ['6953Ap'], '11123Ap': ['11134Ap'], '11895Ap': ['11884Ap'], '14102Ap': ['14094Ap'], '20808Ap': ['20796Ap'], '21919Ap': ['21891Ap'], '31740Ap': ['31723Ap'], '32491Ap': ['32474Ap'], '34900Ap': ['34879Ap'], '37524Ap': ['37514Ap'], '38710Ap': ['38703Ap'], '46705Ap': ['46697Ap'], '51135Ap': ['51105Ap'], '55702Ap': ['55694Ap'], '63203Ap': ['63194Ap'], '69546Ap': ['69535Ap'], '71331Ap': ['71247Ap'], '71779Ap': ['71770Ap'], '74642Ap': ['74628Ap'], '84767Ap': ['84764Ap'], '87247Ap': ['87232Ap'], '93188Ap': ['93172Ap'], '99825Ap': ['99839Ap'], '105746Ap': ['105738Ap'], '110168Ap': ['110161Ap'], '111432Ap': ['111361Ap'], '113591Ap': ['113574Ap'], '116038Ap': ['116024Ap'], '117021Ap': ['117034Ap'], '125471Ap': ['125451Ap'], '133136Ap': ['133128Ap'], '134636Ap': ['134653Ap'], '140607Ap': ['140619Ap'], '149618Ap': ['149592Ap'], '150834Ap': ['150825Ap'], '158

In [69]:
tok2_spatial_relations_id_df = pd.DataFrame([(k,v) for k,v in tok2_spatial_relations.items()], columns=["ID", "SPATIAL RELATION ID"]) #where 'tok2_spatial_relations_id_df' is a dataframe containing token IDs and IDs of all the tokens annotated with a Spatial relation
tok2_spatial_relations_id_df

Unnamed: 0,ID,SPATIAL RELATION ID
0,7504Aen,[7489Aen]
1,7807Aen,"[7836Aen, 7799Aen]"
2,638Ap,[583Ap]
3,3306Ap,"[3296Ap, 3281Ap]"
4,4001Ap,[3991Ap]
...,...,...
683,424445Verg,[424413Verg]
684,428401Verg,[428307Verg]
685,436869Verg,[436880Verg]
686,438653Verg,[438644Verg]


In [70]:
pred2_spatial_relation_lemmas = dict()  # Initialize an empty dictionary to store mappings between predicate IDs and their spatial relation lemmas

# Iterate through each predicate ID in the dictionary tok2_spatial_relations
for pred_id in tok2_spatial_relations:
    spatial_relation_ids = tok2_spatial_relations[pred_id]  # Retrieve the list of spatial relation IDs associated with the current predicate ID
    spatial_relation_lemmas = list()  # Initialize an empty list to store lemmas corresponding to spatial relations
    
    # Iterate through each spatial relation ID in the list of IDs
    for spatial_relation in spatial_relation_ids:
        spatial_relation_lemma = pred2_lemmas[spatial_relation]  # Retrieve the lemma associated with the current spatial relation ID from pred2_lemmas dictionary
        spatial_relation_lemmas.append(spatial_relation_lemma)  # Append the retrieved lemma to the list of lemmas for the current predicate ID
        pred2_spatial_relation_lemmas[pred_id] = spatial_relation_lemmas  # Assign the list of lemmas to the current predicate ID in the pred2_spatial_relation_lemmas dictionary

print(pred2_spatial_relation_lemmas)  


{'7504Aen': ['campus'], '7807Aen': ['tribunus', 'undique'], '638Ap': ['sermo'], '3306Ap': ['ingluvies', 'occipitium'], '4001Ap': ['civitas'], '6965Ap': ['spectaculum'], '11123Ap': ['istinc'], '11895Ap': ['contrarius'], '14102Ap': ['fluvius'], '20808Ap': ['solitudo'], '21919Ap': ['villula'], '31740Ap': ['domus'], '32491Ap': ['fauces'], '34900Ap': ['hospitium'], '37524Ap': ['lumen'], '38710Ap': ['oculus'], '46705Ap': ['fluctus'], '51135Ap': ['locus'], '55702Ap': ['cadaver'], '63203Ap': ['latro'], '69546Ap': ['ego'], '71331Ap': ['theatrum'], '71779Ap': ['fortuna'], '74642Ap': ['domus'], '84767Ap': ['ego'], '87247Ap': ['cogitatio'], '93188Ap': ['auxilium'], '99825Ap': ['locus'], '105746Ap': ['humerus'], '110168Ap': ['epulae'], '111432Ap': ['confinium'], '113591Ap': ['porta'], '116038Ap': ['meta'], '117021Ap': ['domus'], '125471Ap': ['voluptas'], '133136Ap': ['nuptiae'], '134636Ap': ['os'], '140607Ap': ['domus'], '149618Ap': ['unde'], '150834Ap': ['scopulum'], '158305Ap': ['tu'], '161942Ap'

In [71]:
spatial_relation_lemma_df = pd.DataFrame([(k,v) for k,v in pred2_spatial_relation_lemmas.items()], columns=["ID", "SPATIAL RELATION LEMMA"]) #where 'spatial_relation_lemma_df' is a dataframe containing token IDs and IDs of all the tokens annotated with a Spatial relation
spatial_relation_lemma_df

Unnamed: 0,ID,SPATIAL RELATION LEMMA
0,7504Aen,[campus]
1,7807Aen,"[tribunus, undique]"
2,638Ap,[sermo]
3,3306Ap,"[ingluvies, occipitium]"
4,4001Ap,[civitas]
...,...,...
683,424445Verg,[Hyllus]
684,428401Verg,[hic]
685,436869Verg,[frater]
686,438653Verg,[Iuturna]


In [72]:
list_spatial_relation_roles = []  # Initializing an empty list to store frames
tok2_spatial_relation_roles = dict()  # Initializing an empty dictionary to map tokens to their frames
count = 0  # Initializing a count variable

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Getting the current file name
    file_input_abbr = files_abbreviated[i]  # Getting the abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Participants' annotation in the CAS
    for relation in cas.select('webanno.custom.Spatiality'):
        dep = relation.Dependent  # Dependent token of the participant relation
        tokdep = dep.get_covered_text()  # Covered text of the dependent token
        id = str(dep.begin) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
        list_participants.append(str(tokdep))  # Appending the covered text of the dependent token to sptlist

        spatial_relation = relation.Spatials  # Frame associated with the participant relation
        
        # Checking if the ID already exists in tok2_spatial_relation_roles
        if id in tok2_spatial_relation_roles:
            list_spatial_relation_roles = tok2_spatial_relation_roles[id] + [spatial_relation]  # If so, append the frame to the existing list of frames
            tok2_spatial_relation_roles[id] = list_spatial_relation_roles  # Update the dictionary with the appended list
        else:
            tok2_spatial_relation_roles[id] = [spatial_relation]  # If not, create a new entry with a list containing the frame

print(tok2_spatial_relation_roles)  



{'7504Aen': ['LOCATION'], '7807Aen': ['GOAL', 'SOURCE'], '638Ap': ['GOAL'], '3306Ap': ['PATH', 'GOAL'], '4001Ap': ['GOAL'], '6965Ap': ['GOAL'], '11123Ap': ['SOURCE'], '11895Ap': ['GOAL'], '14102Ap': ['PATH'], '20808Ap': ['PATH'], '21919Ap': ['GOAL'], '31740Ap': ['GOAL'], '32491Ap': ['SOURCE'], '34900Ap': ['GOAL'], '37524Ap': ['GOAL'], '38710Ap': ['GOAL'], '46705Ap': ['GOAL'], '51135Ap': ['GOAL'], '55702Ap': ['GOAL'], '63203Ap': ['GOAL'], '69546Ap': ['GOAL'], '71331Ap': ['PATH'], '71779Ap': ['GOAL'], '74642Ap': ['GOAL'], '84767Ap': ['GOAL'], '87247Ap': ['GOAL'], '93188Ap': ['GOAL'], '99825Ap': ['GOAL'], '105746Ap': ['GOAL'], '110168Ap': ['GOAL'], '111432Ap': ['GOAL'], '113591Ap': ['GOAL'], '116038Ap': ['GOAL'], '117021Ap': ['SOURCE'], '125471Ap': ['GOAL'], '133136Ap': ['GOAL'], '134636Ap': ['GOAL'], '140607Ap': ['SOURCE'], '149618Ap': ['SOURCE'], '150834Ap': ['GOAL'], '158305Ap': ['GOAL'], '161942Ap': ['GOAL'], '162189Ap': ['GOAL'], '162465Ap': ['PATH'], '1498Caes': ['SOURCE'], '4296Cae

In [73]:
spatial_relation_role_df = pd.DataFrame([(k,v) for k,v in tok2_spatial_relation_roles.items()], columns=["ID", "SPATIAL RELATION ROLE"]) #where 'spatial_relation_role_df' is a dataframe containing token IDs and IDs of all the tokens annotated with a Spatial relation
spatial_relation_role_df

Unnamed: 0,ID,SPATIAL RELATION ROLE
0,7504Aen,[LOCATION]
1,7807Aen,"[GOAL, SOURCE]"
2,638Ap,[GOAL]
3,3306Ap,"[PATH, GOAL]"
4,4001Ap,[GOAL]
...,...,...
683,424445Verg,[GOAL]
684,428401Verg,[GOAL]
685,436869Verg,[GOAL]
686,438653Verg,[GOAL]


In [74]:
id_spatial_relation_roles_lemmas_df = spatial_relation_role_df.merge(spatial_relation_lemma_df, on='ID', how='left') #where 'id_spatial_relation_roles_lemmas_df' merges 'spatial_relation_lemma_df' and 'spatial_relation_role_df'
id_spatial_relation_roles_lemmas_df

Unnamed: 0,ID,SPATIAL RELATION ROLE,SPATIAL RELATION LEMMA
0,7504Aen,[LOCATION],[campus]
1,7807Aen,"[GOAL, SOURCE]","[tribunus, undique]"
2,638Ap,[GOAL],[sermo]
3,3306Ap,"[PATH, GOAL]","[ingluvies, occipitium]"
4,4001Ap,[GOAL],[civitas]
...,...,...,...
683,424445Verg,[GOAL],[Hyllus]
684,428401Verg,[GOAL],[hic]
685,436869Verg,[GOAL],[frater]
686,438653Verg,[GOAL],[Iuturna]


In [75]:
pred2_spatial_relation_synsets = dict()  # Initialize an empty dictionary to store mappings between predicate IDs and their spatial relation lemmas

# Iterate through each predicate ID in the dictionary tok2_spatial_relations
for pred_id in tok2_spatial_relations:
    spatial_relation_ids = tok2_spatial_relations[pred_id]  # Retrieve the list of spatial relation IDs associated with the current predicate ID
    spatial_relation_synsets = list()  # Initialize an empty list to store lemmas corresponding to spatial relations
    
    # Iterate through each spatial relation ID in the list of IDs
    for spatial_relation in spatial_relation_ids:
        spatial_relation_synset = tok2_synsets[spatial_relation]  # Retrieve the lemma associated with the current spatial relation ID from pred2_lemmas dictionary
        # Extract only the lemma string from the object and append it to the list of lemmas for the current predicate ID
        spatial_relation_synsets.append(spatial_relation_synset.elements[0])  
    pred2_spatial_relation_synsets[pred_id] = spatial_relation_synsets  # Assign the list of lemmas to the current predicate ID in the pred2_spatial_relation_synsets dictionary
        
print(pred2_spatial_relation_synsets)  


{'7504Aen': ['n#06727012 extensive tract of level open land'], '7807Aen': ['n#07168973 an officer in command of a military unit', 'r#L2533591 from any or all places'], '638Ap': ['n#05167497 a human written or spoken language used by a community; opposed to e.g. a computer language'], '3306Ap': ['n#04296952 the passage to the stomach and lungs; in the front part of the neck below the chin and above the collarbone', 'n#04292200 back part of the head or skull'], '4001Ap': ['n#06382213 an urban area with a fixed boundary that is smaller than a city'], '6965Ap': ['n#04964487 a public exhibition or entertainment'], '11123Ap': ['r#00041436 (archaic) from this place'], '11895Ap': ['n#09984290 a relation of direct opposition'], '14102Ap': ['n#06789983 a large natural stream of water (larger than a creek)'], '20808Ap': ['n#06391772 a wild and uninhabited area'], '21919Ap': ['n#02666884 a house for the farmer and family'], '31740Ap': ['n#02837386 a dwelling that serves as living quarters for one 

In [76]:
spatial_relation_synset_df = pd.DataFrame([(k,v) for k,v in pred2_spatial_relation_synsets.items()], columns=["ID", "SPATIAL RELATION SEMANTICS"]) #where 'spatial_relation_synset_df' is a dataframe containing verb IDs and semantics of all the tokens annotated with a Spatial relation
spatial_relation_synset_df

Unnamed: 0,ID,SPATIAL RELATION SEMANTICS
0,7504Aen,[n#06727012 extensive tract of level open land]
1,7807Aen,[n#07168973 an officer in command of a militar...
2,638Ap,[n#05167497 a human written or spoken language...
3,3306Ap,[n#04296952 the passage to the stomach and lun...
4,4001Ap,[n#06382213 an urban area with a fixed boundar...
...,...,...
683,424445Verg,[n#00004123 a human being]
684,428401Verg,[n#00004123 a human being]
685,436869Verg,[n#07127521 a male with the same parents as so...
686,438653Verg,[n#06381267 a point located with respect to su...


In [77]:
id_spatial_relation_roles_lemmas_synsets_df = id_spatial_relation_roles_lemmas_df.merge(spatial_relation_synset_df, on='ID', how='left') #where 'id_spatial_relation_roles_lemmas_synsets_df' merges 'id_spatial_relation_roles_lemmas_df' and 'spatial_relation_synset_df'
id_spatial_relation_roles_lemmas_synsets_df

Unnamed: 0,ID,SPATIAL RELATION ROLE,SPATIAL RELATION LEMMA,SPATIAL RELATION SEMANTICS
0,7504Aen,[LOCATION],[campus],[n#06727012 extensive tract of level open land]
1,7807Aen,"[GOAL, SOURCE]","[tribunus, undique]",[n#07168973 an officer in command of a militar...
2,638Ap,[GOAL],[sermo],[n#05167497 a human written or spoken language...
3,3306Ap,"[PATH, GOAL]","[ingluvies, occipitium]",[n#04296952 the passage to the stomach and lun...
4,4001Ap,[GOAL],[civitas],[n#06382213 an urban area with a fixed boundar...
...,...,...,...,...
683,424445Verg,[GOAL],[Hyllus],[n#00004123 a human being]
684,428401Verg,[GOAL],[hic],[n#00004123 a human being]
685,436869Verg,[GOAL],[frater],[n#07127521 a male with the same parents as so...
686,438653Verg,[GOAL],[Iuturna],[n#06381267 a point located with respect to su...


In [78]:
# Initializing an empty list to store spatial relations covered text
list_spatial_relations = []

# Initializing an empty dictionary to map tokens to their spatial relations with unique identifiers
tok2_spatial_relations = dict()

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Current file name
    file_input_abbr = files_abbreviated[i]  # Abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file

    # Looping through each 'Spatiality' annotation in the CAS
    for relation in cas.select('webanno.custom.Spatiality'):
        dep = relation.Dependent  # Dependent token of the spatial relation
        tokdep = dep.get_covered_text()  # Covered text of the dependent token
        id = str(dep.begin) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
        list_spatial_relations.append(str(tokdep))  # Appending the covered text of the dependent token to list_spatial_relations

        gov = relation.Governor  # Governor token of the spatial relation
        tokgov = gov.get_covered_text()  # Covered text of the governor token
        id2 = str(gov.begin) + file_input_abbr # Creating a unique identifier by appending the file abbreviation to the offset of the governor token
        
        if id in tok2_spatial_relations:  # Checking if the ID already exists in tok2_spatial_relations
            tok2_spatial_relations[id].append(id2)  # If so, append the unique identifier to the existing list
        else:
            tok2_spatial_relations[id] = [id2]  # If not, create a new entry with a list containing the unique identifier

print(tok2_spatial_relations)



{'7504Aen': ['7489Aen'], '7807Aen': ['7836Aen', '7799Aen'], '638Ap': ['583Ap'], '3306Ap': ['3296Ap', '3281Ap'], '4001Ap': ['3991Ap'], '6965Ap': ['6953Ap'], '11123Ap': ['11134Ap'], '11895Ap': ['11884Ap'], '14102Ap': ['14094Ap'], '20808Ap': ['20796Ap'], '21919Ap': ['21891Ap'], '31740Ap': ['31723Ap'], '32491Ap': ['32474Ap'], '34900Ap': ['34879Ap'], '37524Ap': ['37514Ap'], '38710Ap': ['38703Ap'], '46705Ap': ['46697Ap'], '51135Ap': ['51105Ap'], '55702Ap': ['55694Ap'], '63203Ap': ['63194Ap'], '69546Ap': ['69535Ap'], '71331Ap': ['71247Ap'], '71779Ap': ['71770Ap'], '74642Ap': ['74628Ap'], '84767Ap': ['84764Ap'], '87247Ap': ['87232Ap'], '93188Ap': ['93172Ap'], '99825Ap': ['99839Ap'], '105746Ap': ['105738Ap'], '110168Ap': ['110161Ap'], '111432Ap': ['111361Ap'], '113591Ap': ['113574Ap'], '116038Ap': ['116024Ap'], '117021Ap': ['117034Ap'], '125471Ap': ['125451Ap'], '133136Ap': ['133128Ap'], '134636Ap': ['134653Ap'], '140607Ap': ['140619Ap'], '149618Ap': ['149592Ap'], '150834Ap': ['150825Ap'], '158

In [79]:
#Spatial relation expression

list_spatial_relation_expressions = [] # Initializing a list to store expressions
tok2_spatial_relation_expression = dict() # Initializing a dictionary to map tokens to their expressions
count = 0 # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Current file name
    file_input_abbr = files_abbreviated[i]  # Abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Expressedby' annotation in the CAS
    for relation in cas.select('webanno.custom.Expressedby'):
        # Looping through each token covered by 'Expressedby' annotation
        for token in cas.select_covered('webanno.custom.Expressedby', relation):
            tok = token.get_covered_text()  # Covered text of the token
            id = str(token.begin) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            spatial_relation_expression = relation.preposition  # Preposition associated with the expression
            
            # Checking if the ID already exists in tok2_spatial_relation_expression
            if id in tok2_spatial_relation_expression:
                tok2_spatial_relation_expression = tok2_spatial_relation_expression[id] + [spatial_relation_expression]
                tok2_spatial_relation_expression[id] = list_spatial_relation_expressions
                #tok2_spatial_relation_expression[id].append(spatial_relation_expression)  # If so, append the preposition to the existing list of expressions
            else:
                tok2_spatial_relation_expression[id] = [spatial_relation_expression]  # If not, create a new entry with the expression list

print(tok2_spatial_relation_expression)



{'7489Aen': ['ABL'], '7799Aen': ['adverb'], '7836Aen': ['DAT'], '583Ap': ['ACC'], '3281Ap': ['ad + ACC'], '3296Ap': ['per + ACC'], '3991Ap': ['ACC'], '6953Ap': ['ACC'], '11134Ap': ['adverb'], '11884Ap': ['in + ACC'], '14094Ap': ['per + ACC'], '20796Ap': ['per + ACC'], '21891Ap': ['ad + ACC'], '31723Ap': ['ad + ACC'], '32474Ap': ['de + ABL'], '34879Ap': ['ad + ACC'], '37514Ap': ['DAT'], '38703Ap': ['DAT'], '46697Ap': ['ACC'], '51105Ap': ['ACC'], '53484Ap': ['ACC'], '55694Ap': ['ACC'], '63194Ap': ['ACC'], '69535Ap': ['ACC'], '71247Ap': ['per + ACC'], '71770Ap': ['DAT'], '74628Ap': ['ACC'], '84764Ap': ['ACC'], '87232Ap': ['DAT'], '93172Ap': ['ad + ACC'], '99839Ap': ['ad + ACC'], '105738Ap': ['ACC'], '109743Ap': ['ACC'], '110161Ap': ['ACC'], '110876Ap': ['ACC'], '111361Ap': ['ad + ACC'], '113574Ap': ['ACC'], '116024Ap': ['ACC'], '117034Ap': ['ex + ABL'], '125451Ap': ['in + ACC'], '133128Ap': ['ACC'], '134492Ap': ['ACC'], '134653Ap': ['ob + ACC'], '140619Ap': ['ex + ABL'], '149592Ap': ['adv

In [80]:
# Initialize a new dictionary to store the mapping of identifiers to expressions
pred2_spatial_relation_expression = {}

# Iterate over the keys and values of tok2_spatial_relations
for identifier, ids in tok2_spatial_relations.items():
    expressions = []  # Initialize an empty list to store expressions for the current identifier
    for id in ids:
        # Check if the id exists in tok2_spatial_relation_expression
        if id in tok2_spatial_relation_expression:
            expressions.extend(tok2_spatial_relation_expression[id])  # Extend the expressions list with expressions for the current id
    
    # Add the list of expressions to the combined dictionary only if it's not empty
    if expressions:
        pred2_spatial_relation_expression[identifier] = expressions

print(pred2_spatial_relation_expression)


{'7504Aen': ['ABL'], '7807Aen': ['DAT', 'adverb'], '638Ap': ['ACC'], '3306Ap': ['per + ACC', 'ad + ACC'], '4001Ap': ['ACC'], '6965Ap': ['ACC'], '11123Ap': ['adverb'], '11895Ap': ['in + ACC'], '14102Ap': ['per + ACC'], '20808Ap': ['per + ACC'], '21919Ap': ['ad + ACC'], '31740Ap': ['ad + ACC'], '32491Ap': ['de + ABL'], '34900Ap': ['ad + ACC'], '37524Ap': ['DAT'], '38710Ap': ['DAT'], '46705Ap': ['ACC'], '51135Ap': ['ACC'], '55702Ap': ['ACC'], '63203Ap': ['ACC'], '69546Ap': ['ACC'], '71331Ap': ['per + ACC'], '71779Ap': ['DAT'], '74642Ap': ['ACC'], '84767Ap': ['ACC'], '87247Ap': ['DAT'], '93188Ap': ['ad + ACC'], '99825Ap': ['ad + ACC'], '105746Ap': ['ACC'], '110168Ap': ['ACC'], '111432Ap': ['ad + ACC'], '113591Ap': ['ACC'], '116038Ap': ['ACC'], '117021Ap': ['ex + ABL'], '125471Ap': ['in + ACC'], '133136Ap': ['ACC'], '134636Ap': ['ob + ACC'], '140607Ap': ['ex + ABL'], '149618Ap': ['adverb'], '150834Ap': ['ACC'], '158305Ap': ['DAT'], '161942Ap': ['ad + ACC'], '162189Ap': ['ad + ACC'], '162465

In [81]:
# Find the missing key-value pair (if needed)
missing_key = None
for key in tok2_spatial_relations:
    if key not in pred2_spatial_relation_expression:
        missing_key = key
        break

print("Missing key:", missing_key)


Missing key: None


In [82]:
spatial_relation_expression_df = pd.DataFrame([(k,v) for k,v in pred2_spatial_relation_expression.items()], columns=["ID", "SPATIAL RELATION EXPRESSION"]) #where 'tok2sr_df' is a dataframe containing token IDs and IDs of all the tokens annotated with a Spatial relation
spatial_relation_expression_df

Unnamed: 0,ID,SPATIAL RELATION EXPRESSION
0,7504Aen,[ABL]
1,7807Aen,"[DAT, adverb]"
2,638Ap,[ACC]
3,3306Ap,"[per + ACC, ad + ACC]"
4,4001Ap,[ACC]
...,...,...
683,424445Verg,[DAT]
684,428401Verg,[DAT]
685,436869Verg,[DAT]
686,438653Verg,[DAT]


In [83]:
id_spatial_relation_expression_df = spatial_relation_expression_df.merge(id_spatial_relation_roles_lemmas_synsets_df, on='ID', how='left')
id_spatial_relation_expression_df

Unnamed: 0,ID,SPATIAL RELATION EXPRESSION,SPATIAL RELATION ROLE,SPATIAL RELATION LEMMA,SPATIAL RELATION SEMANTICS
0,7504Aen,[ABL],[LOCATION],[campus],[n#06727012 extensive tract of level open land]
1,7807Aen,"[DAT, adverb]","[GOAL, SOURCE]","[tribunus, undique]",[n#07168973 an officer in command of a militar...
2,638Ap,[ACC],[GOAL],[sermo],[n#05167497 a human written or spoken language...
3,3306Ap,"[per + ACC, ad + ACC]","[PATH, GOAL]","[ingluvies, occipitium]",[n#04296952 the passage to the stomach and lun...
4,4001Ap,[ACC],[GOAL],[civitas],[n#06382213 an urban area with a fixed boundar...
...,...,...,...,...,...
683,424445Verg,[DAT],[GOAL],[Hyllus],[n#00004123 a human being]
684,428401Verg,[DAT],[GOAL],[hic],[n#00004123 a human being]
685,436869Verg,[DAT],[GOAL],[frater],[n#07127521 a male with the same parents as so...
686,438653Verg,[DAT],[GOAL],[Iuturna],[n#06381267 a point located with respect to su...


In [84]:
verbs_with_participants_and_spatial_relations = verb_with_participant_roles_lemmas_df.merge(id_spatial_relation_expression_df, on='ID', how='left')
verbs_with_participants_and_spatial_relations

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS,LITERAL MEANING,VERB STEM,PREVERB,PREVERB SEMANTICS,VERB SEMANTICS,FIGURE SEMANTICS,GROUND SEMANTICS,PARTICIPANT LEMMA,PARTICIPANT ROLE,SPATIAL RELATION EXPRESSION,SPATIAL RELATION ROLE,SPATIAL RELATION LEMMA,SPATIAL RELATION SEMANTICS
0,7504Aen,perire,Tense=Pres|VerbForm=Inf|Voice=Act,pereo,"Quae neque Dardaniis campis potuere perire, ...",Achievement,DIE-42.4.1,False,present stem,per,"(idea of destruction/death), across",[v#00250254 pass from physical life and lose a...,['n#00004123 a human being'],,,,[ABL],[LOCATION],[campus],[n#06727012 extensive tract of level open land]
1,7614Aen,Concurrunt,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,concurro,Concurrunt vel uti venti cum spiritus austri ...,Achievement,BUMP-18.4-1,True,present stem,cum,together,[v#01075789 crash together with violent impact],['n#00004123 a human being'],,,,,,,
2,7807Aen,conveniunt,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,convenio,Undique conveniunt vel ut imber tela tribuno,Activity,HERD-47.5.2,True,present stem,cum,together,[v#01654097 collect in one place],['n#03176413 a body that is thrown or projected'],[n#07168973 an officer in command of a militar...,"[telum, tribunus]","[Ground, Figure]","[DAT, adverb]","[GOAL, SOURCE]","[tribunus, undique]",[n#07168973 an officer in command of a militar...
3,627Ap,praeeunte,Case=Abl|Gender=Masc|Number=Sing|Tense=Pres|Ve...,praeeo,"Mox in urbe Latia advena studiorum, Quiritium ...",State,ACCOMPANY-51.7,False,present stem,prae,before,"[v#01661609 be a guiding force, as with direct...",['n#07632177 a person whose occupation is teac...,,[magister],[Figure],,,,
4,638Ap,aggressus,Case=Nom|Gender=Masc|Number=Sing|Tense=Past|Ve...,adgredior,"Mox in urbe Latia advena studiorum, Quiritium ...",Accomplishment,BEGIN-55.1,False,supine stem,ad,to,[v#01661230 begin to deal with],['n#00004123 a human being'],[n#05167497 a human written or spoken language...,[sermo],[Ground],[ACC],[GOAL],[sermo],[n#05167497 a human written or spoken language...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,435595Verg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2,True,present stem,pro,forward,[v#01410345 run or move very quickly or hastily],['n#06888584 a female deity'],,[dea],[Figure],,,,
1479,436869Verg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1,False,present stem,sub,under,[v#01737682 help in a difficult situation],['n#06888584 a female deity'],[n#07127521 a male with the same parents as so...,"[Iuturna, frater]","[Ground, Figure]",[DAT],[GOAL],[frater],[n#07127521 a male with the same parents as so...
1480,438653Verg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2,False,present stem,ob,to,[v#01410345 run or move very quickly or hastily],['n#10169961 a serious (sometimes fatal infect...,[n#06381267 a point located with respect to su...,[Iuturna],[Ground],[DAT],[GOAL],[Iuturna],[n#06381267 a point located with respect to su...
1481,440736Verg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4,False,present stem,sub,under,[v#01343923 raise from a lower to a higher pos...,['n#00004123 a human being'],[n#06669293 a lump of hard consolidated minera...,[ille],[Ground],,,,


In [85]:
verbs_with_participants_and_spatial_relations.to_csv('Latin_with_participants_and_spatial_relations.csv')

### Places (WHG)

In [86]:
list_places = []  # Initializing an empty list to store places
pred2_place = dict()  # Initializing an empty dictionary to map tokens to their places
count = 0  # Initializing a count variable to keep track of iterations

# Looping through each file in the list of all_files
for i in range(len(all_files)):
    file_input = all_files[i]  # Current file name
    file_input_abbr = files_abbreviated[i]  # Abbreviated name for the current file
    xml = get_xml(file_input)  # Retrieving XML content from the current file
    cas = load_cas(file_input)  # Loading CAS from the current file
    
    # Looping through each 'Place' annotation in the CAS
    for relation in cas.select('webanno.custom.Place'):
        # Looping through each token covered by 'Place' annotation
        for token in cas.select_covered('webanno.custom.Place', relation):
            tok = token.get_covered_text()  # Covered text of the token
            id = str(token.begin) + file_input_abbr  # Creating a unique ID for the token based on its offset and file abbreviation
            place = relation.Places  # Places associated with the token
            
            # Checking if the ID already exists in pred2_place
            if id in pred2_place:
                list_place = pred2_place[id] + place  # If so, append the places to the existing list of places
                pred2_place[id] = list_place  # Update the dictionary with the appended list
            else:
                pred2_place[id] = place  # If not, create a new entry with a list containing the places

print(pred2_place)  



{'4840Caes': 'https://whgazetteer.org/places/14153618/portal', '5323Caes': 'https://whgazetteer.org/places/12347418/portal', '5989Caes': 'https://whgazetteer.org/places/84296/detail', '29911Caes': 'https://whgazetteer.org/places/12347419/portal', '31358Caes': 'https://whgazetteer.org/places/12347419/portal', '33960Caes': 'https://whgazetteer.org/places/12347419/portal', '37510Caes': 'https://whgazetteer.org/places/12347419/portal', '46737Caes': 'https://whgazetteer.org/places/12347419/portal', '46862Caes': 'https://whgazetteer.org/places/12347419/portal', '56119Caes': 'https://whgazetteer.org/places/12347419/portal', '70231Caes': 'https://whgazetteer.org/places/12636696/portal', '106021Caes': 'https://whgazetteer.org/places/6465620/detail', '111957Caes': 'https://whgazetteer.org/places/81460/detail', '111967Caes': 'https://whgazetteer.org/places/82552/detail', '114147Caes': 'https://whgazetteer.org/places/12347419/portal', '117025Caes': 'https://whgazetteer.org/places/12347419/portal',

In [87]:
places_df = pd.DataFrame([(k,v) for k,v in pred2_place.items()], columns=["PLACE ID", "WHG URL"]) #where 'tok2sr_df' is a dataframe containing token IDs and IDs of all the tokens annotated with a Spatial relation
places_df

Unnamed: 0,PLACE ID,WHG URL
0,4840Caes,https://whgazetteer.org/places/14153618/portal
1,5323Caes,https://whgazetteer.org/places/12347418/portal
2,5989Caes,https://whgazetteer.org/places/84296/detail
3,29911Caes,https://whgazetteer.org/places/12347419/portal
4,31358Caes,https://whgazetteer.org/places/12347419/portal
5,33960Caes,https://whgazetteer.org/places/12347419/portal
6,37510Caes,https://whgazetteer.org/places/12347419/portal
7,46737Caes,https://whgazetteer.org/places/12347419/portal
8,46862Caes,https://whgazetteer.org/places/12347419/portal
9,56119Caes,https://whgazetteer.org/places/12347419/portal


In [88]:
pred2_url_relations = dict()  # Initialize an empty dictionary to store mappings between predicate IDs and their associated web URLs

# Iterate through each predicate ID in the dictionary tok2_spatial_relations
for pred_id in tok2_spatial_relations:
    spatial_relation_ids = tok2_spatial_relations[pred_id]  # Retrieve the list of spatial relation IDs associated with the current predicate ID
    url_relations = list()  # Initialize an empty list to store web URLs corresponding to spatial relations
    
    # Iterate through each spatial relation ID in the list of IDs
    for spatial_relation_id in spatial_relation_ids:
        # Retrieve the web URL associated with the current spatial relation ID from pred2_place dictionary
        url_relation = pred2_place.get(spatial_relation_id, None)
        if url_relation is not None:
            url_relations.append(url_relation)  # Append the web URL to the list of URLs for the current predicate ID
            
    pred2_url_relations[pred_id] = url_relations  # Assign the list of URLs to the current predicate ID in the pred2_url_relations dictionary
        
print(pred2_url_relations)  


{'7504Aen': [], '7807Aen': [], '638Ap': [], '3306Ap': [], '4001Ap': [], '6965Ap': [], '11123Ap': [], '11895Ap': [], '14102Ap': [], '20808Ap': [], '21919Ap': [], '31740Ap': [], '32491Ap': [], '34900Ap': [], '37524Ap': [], '38710Ap': [], '46705Ap': [], '51135Ap': [], '55702Ap': [], '63203Ap': [], '69546Ap': [], '71331Ap': [], '71779Ap': [], '74642Ap': [], '84767Ap': [], '87247Ap': [], '93188Ap': [], '99825Ap': [], '105746Ap': [], '110168Ap': [], '111432Ap': [], '113591Ap': [], '116038Ap': [], '117021Ap': [], '125471Ap': [], '133136Ap': [], '134636Ap': [], '140607Ap': [], '149618Ap': [], '150834Ap': [], '158305Ap': [], '161942Ap': [], '162189Ap': [], '162465Ap': [], '1498Caes': [], '4296Caes': [], '4578Caes': [], '4854Caes': ['https://whgazetteer.org/places/14153618/portal'], '4978Caes': [], '5732Caes': [], '5997Caes': ['https://whgazetteer.org/places/84296/detail'], '7012Caes': [], '9426Caes': [], '9649Caes': [], '10407Caes': [], '10617Caes': [], '10862Caes': [], '10890Caes': [], '10931C

In [89]:
pred2_url_relations = dict()  # Initialize an empty dictionary to store mappings between predicate IDs and their associated web URLs

# Iterate through each predicate ID in the dictionary tok2_spatial_relations
for pred_id in tok2_spatial_relations:
    spatial_relation_ids = tok2_spatial_relations[pred_id]  # Retrieve the list of spatial relation IDs associated with the current predicate ID
    url_relation = None  # Initialize url_relation to None
    
    # Iterate through each spatial relation ID in the list of IDs
    for spatial_relation_id in spatial_relation_ids:
        # Retrieve the web URL associated with the current spatial relation ID from pred2_place dictionary
        url_relation = pred2_place.get(spatial_relation_id, None)
        if url_relation is not None:
            # If a URL is found, break the loop
            break
    
    pred2_url_relations[pred_id] = url_relation  # Assign the URL to the current predicate ID in the pred2_url_relations dictionary
        
print(pred2_url_relations)  


{'7504Aen': None, '7807Aen': None, '638Ap': None, '3306Ap': None, '4001Ap': None, '6965Ap': None, '11123Ap': None, '11895Ap': None, '14102Ap': None, '20808Ap': None, '21919Ap': None, '31740Ap': None, '32491Ap': None, '34900Ap': None, '37524Ap': None, '38710Ap': None, '46705Ap': None, '51135Ap': None, '55702Ap': None, '63203Ap': None, '69546Ap': None, '71331Ap': None, '71779Ap': None, '74642Ap': None, '84767Ap': None, '87247Ap': None, '93188Ap': None, '99825Ap': None, '105746Ap': None, '110168Ap': None, '111432Ap': None, '113591Ap': None, '116038Ap': None, '117021Ap': None, '125471Ap': None, '133136Ap': None, '134636Ap': None, '140607Ap': None, '149618Ap': None, '150834Ap': None, '158305Ap': None, '161942Ap': None, '162189Ap': None, '162465Ap': None, '1498Caes': None, '4296Caes': None, '4578Caes': None, '4854Caes': 'https://whgazetteer.org/places/14153618/portal', '4978Caes': None, '5732Caes': None, '5997Caes': 'https://whgazetteer.org/places/84296/detail', '7012Caes': None, '9426Caes':

In [90]:
id_places_df = pd.DataFrame([(k,v) for k,v in pred2_url_relations.items()], columns=["ID", "WHG URL"]) #where 'id_places_df' is a dataframe containing verb IDs and WHG URLs
id_places_df

Unnamed: 0,ID,WHG URL
0,7504Aen,
1,7807Aen,
2,638Ap,
3,3306Ap,
4,4001Ap,
...,...,...
683,424445Verg,
684,428401Verg,
685,436869Verg,
686,438653Verg,


### Getting the final CSV for Latin

In [91]:
Latin_full = verbs_with_participants_and_spatial_relations.merge(id_places_df, on='ID', how='left')
Latin_full

Unnamed: 0,ID,VERB TOKEN,MORPHOLOGICAL FEATURES,LEMMA,SENTENCE,ACTIONALITY,VERB CLASS,LITERAL MEANING,VERB STEM,PREVERB,...,VERB SEMANTICS,FIGURE SEMANTICS,GROUND SEMANTICS,PARTICIPANT LEMMA,PARTICIPANT ROLE,SPATIAL RELATION EXPRESSION,SPATIAL RELATION ROLE,SPATIAL RELATION LEMMA,SPATIAL RELATION SEMANTICS,WHG URL
0,7504Aen,perire,Tense=Pres|VerbForm=Inf|Voice=Act,pereo,"Quae neque Dardaniis campis potuere perire, ...",Achievement,DIE-42.4.1,False,present stem,per,...,[v#00250254 pass from physical life and lose a...,['n#00004123 a human being'],,,,[ABL],[LOCATION],[campus],[n#06727012 extensive tract of level open land],
1,7614Aen,Concurrunt,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,concurro,Concurrunt vel uti venti cum spiritus austri ...,Achievement,BUMP-18.4-1,True,present stem,cum,...,[v#01075789 crash together with violent impact],['n#00004123 a human being'],,,,,,,,
2,7807Aen,conveniunt,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,convenio,Undique conveniunt vel ut imber tela tribuno,Activity,HERD-47.5.2,True,present stem,cum,...,[v#01654097 collect in one place],['n#03176413 a body that is thrown or projected'],[n#07168973 an officer in command of a militar...,"[telum, tribunus]","[Ground, Figure]","[DAT, adverb]","[GOAL, SOURCE]","[tribunus, undique]",[n#07168973 an officer in command of a militar...,
3,627Ap,praeeunte,Case=Abl|Gender=Masc|Number=Sing|Tense=Pres|Ve...,praeeo,"Mox in urbe Latia advena studiorum, Quiritium ...",State,ACCOMPANY-51.7,False,present stem,prae,...,"[v#01661609 be a guiding force, as with direct...",['n#07632177 a person whose occupation is teac...,,[magister],[Figure],,,,,
4,638Ap,aggressus,Case=Nom|Gender=Masc|Number=Sing|Tense=Past|Ve...,adgredior,"Mox in urbe Latia advena studiorum, Quiritium ...",Accomplishment,BEGIN-55.1,False,supine stem,ad,...,[v#01661230 begin to deal with],['n#00004123 a human being'],[n#05167497 a human written or spoken language...,[sermo],[Ground],[ACC],[GOAL],[sermo],[n#05167497 a human written or spoken language...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,435595Verg,procurrit,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,procurro,"Dum nititur acer et instat, rursus in aurigae ...",Achievement,RUN-51.3.2,True,present stem,pro,...,[v#01410345 run or move very quickly or hastily],['n#06888584 a female deity'],,[dea],[Figure],,,,,
1479,436869Verg,succurrere,Tense=Pres|VerbForm=Inf|Voice=Act,succurro,"Iuturnam misero, fateor, succurrere fratri sua...",Activity,HELP-72.1,False,present stem,sub,...,[v#01737682 help in a difficult situation],['n#06888584 a female deity'],[n#07127521 a male with the same parents as so...,"[Iuturna, frater]","[Ground, Figure]",[DAT],[GOAL],[frater],[n#07127521 a male with the same parents as so...,
1480,438653Verg,occurrere,Tense=Pres|VerbForm=Inf|Voice=Act,occurro,Harum unam celerem demisit ab aethere summo Iu...,Accomplishment,RUN-51.3.2,False,present stem,ob,...,[v#01410345 run or move very quickly or hastily],['n#10169961 a serious (sometimes fatal infect...,[n#06381267 a point located with respect to su...,[Iuturna],[Ground],[DAT],[GOAL],[Iuturna],[n#06381267 a point located with respect to su...,
1481,440736Verg,subirent,Mood=Subj|Number=Plur|Person=3|Tense=Imp|VerbF...,subeo,"Vix illud lecti bis sex cervice subirent, qual...",Achievement,PUT_DIRECTION-9.4,False,present stem,sub,...,[v#01343923 raise from a lower to a higher pos...,['n#00004123 a human being'],[n#06669293 a lump of hard consolidated minera...,[ille],[Ground],,,,,


In [92]:
Latin_full.to_csv('Latin_FULL.csv')