# Preliminaries

## Import statements

In [8]:
import os
import time
import pandas as pd

## Paths

In [40]:
# Directory that holds both the input token tables and the merged output.
data_dir = "data"

# Input tables read by the data check below.
spacy_file, cltk_file = "spacy_tokens.csv", "cltk_tokens.csv"

# Name of the merged table written at the end of the notebook.
merged_file = "merged.csv"

## Check for data

Make sure that the previous two notebooks have been run and that their output files (`spacy_tokens.csv` and `cltk_tokens.csv`) are present in the `data` directory.

In [33]:
# Check that both input token tables exist, report stale ones, then load them.
old = 0     # inputs that exist but were last modified more than an hour ago
failed = 0  # inputs that are missing

for filename in [spacy_file, cltk_file]:
    # Build the path from data_dir so the check inspects the same files
    # that are loaded below.
    path = os.path.join(data_dir, filename)

    print(path, "...", end=" ")
    if os.path.exists(path):
        # 3600 s = one hour since the file's last modification.
        if time.time() - os.path.getmtime(path) > 3600:
            print("exists - note: data created more than one hour ago.")
            old += 1
        else:
            print("exists")
    else:
        print("not found!")
        failed += 1
print()

if failed:
    print("Some data is missing. You must run NLP before continuing.")
else:
    if old:
        # Staleness is only advisory: still load the tables, otherwise every
        # later cell would fail on undefined names.
        print("Some data is old. Consider re-running NLP.")

    # Read every column as text, then convert `level` to int so it can be
    # compared numerically.
    spacy_tokens = pd.read_csv(os.path.join(data_dir, spacy_file), dtype=str)
    spacy_tokens["level"] = spacy_tokens["level"].astype(int)
    cltk_tokens = pd.read_csv(os.path.join(data_dir, cltk_file), dtype=str)
    cltk_tokens["level"] = cltk_tokens["level"].astype(int)

data/spacy_tokens.csv ... exists
data/cltk_tokens.csv ... exists



## Deduplicate embedded lines

In [34]:
# One row per line_id (as the index) holding the largest `level` value seen
# for that line in the CLTK token table.
max_levels = cltk_tokens.groupby('line_id')[['level']].max()

In [35]:
# Look up each token's per-line maximum level and keep only the tokens whose
# own level equals it (presumably the most deeply embedded copy of the line).
line_max = cltk_tokens['line_id'].map(max_levels['level'])
keep = cltk_tokens['level'] == line_max
cltk_no_dups = cltk_tokens.loc[keep]

# NOTE(review): this writes to the current working directory, not data_dir.
cltk_no_dups.to_csv('cltk_tokens.csv', index=False)
display(cltk_no_dups)

Unnamed: 0,speech_id,lang,author,work,urn,l_fi,l_la,nlines,spkr,addr,...,mood,tense,voice,aspect,person,number,case,gender,degree,verbform
0,1385,greek,Apollonius,Argonautica,urn:cts:greekLit:tlg0001.tlg001.perseus-grc2,1.242,1.246,5,citizens,citizens,...,,,,,,singular,vocative,masculine,,
1,1385,greek,Apollonius,Argonautica,urn:cts:greekLit:tlg0001.tlg001.perseus-grc2,1.242,1.246,5,citizens,citizens,...,,,,,,singular,vocative,feminine,,
2,1385,greek,Apollonius,Argonautica,urn:cts:greekLit:tlg0001.tlg001.perseus-grc2,1.242,1.246,5,citizens,citizens,...,,,,,,singular,nominative,masculine,,
3,1385,greek,Apollonius,Argonautica,urn:cts:greekLit:tlg0001.tlg001.perseus-grc2,1.242,1.246,5,citizens,citizens,...,,,,,,singular,genitive,masculine,,
4,1385,greek,Apollonius,Argonautica,urn:cts:greekLit:tlg0001.tlg001.perseus-grc2,1.242,1.246,5,citizens,citizens,...,,,,,,singular,nominative,masculine,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401768,1868,latin,Virgil,Aeneid,urn:cts:latinLit:phi0690.phi003.perseus-lat2,12.947,12.949,3,Aeneas,Turnus,...,,,,,,singular,accusative,feminine,,
401769,1868,latin,Virgil,Aeneid,urn:cts:latinLit:phi0690.phi003.perseus-lat2,12.947,12.949,3,Aeneas,Turnus,...,,,passive,perfective,,singular,ablative,masculine,,participle
401770,1868,latin,Virgil,Aeneid,urn:cts:latinLit:phi0690.phi003.perseus-lat2,12.947,12.949,3,Aeneas,Turnus,...,,,,,,,,,,
401771,1868,latin,Virgil,Aeneid,urn:cts:latinLit:phi0690.phi003.perseus-lat2,12.947,12.949,3,Aeneas,Turnus,...,,,,,,singular,ablative,masculine,,


In [36]:
# Keep only the spaCy tokens whose level equals the maximum level recorded for
# their line (presumably the most deeply embedded copy of the line).
#
# The per-line maximum is computed from the spaCy table itself. Looking it up
# in the CLTK-derived maxima would give NaN for any line CLTK lacks, and those
# spaCy tokens would be dropped silently.
spacy_line_max = spacy_tokens.groupby('line_id')['level'].max()
keep = spacy_tokens['level'] == spacy_tokens['line_id'].map(spacy_line_max)
spacy_no_dups = spacy_tokens.loc[keep]

# NOTE(review): this writes to the current working directory, not data_dir.
spacy_no_dups.to_csv('spacy_tokens.csv', index=False)
display(spacy_no_dups)

Unnamed: 0,speech_id,lang,author,work,urn,l_fi,l_la,nlines,spkr,addr,...,mood,tense,voice,person,number,case,gender,verbform,degree,prontype
0,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,Plur,Nom,Masc,,,
1,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,,,,,
2,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,,,,,
3,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,Plur,Nom,Masc,,,
4,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410805,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,,,,,Plur,Abl,Neut,,,
410806,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,,,,,Sing,Acc,Fem,,,
410807,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,Sub,Pres,Act,3,Sing,,,Fin,,
410808,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,,Past,Pass,,Sing,Nom,Fem,Part,,


### Alignment

In [38]:
# Columns taken from the CLTK table: the join key (tok_id) plus its
# token-level annotations.
cols = ['tok_id', 'token', 'lemma', 'pos', 'mood', 'tense', 'voice', 'aspect', 'person', 'number', 'case', 'gender', 'degree', 'verbform']

# Left join on tok_id keeps every spaCy row; columns present in both tables
# are disambiguated with the _spacy / _cltk suffixes.
merged = spacy_no_dups.merge(cltk_no_dups[cols], how='left', on='tok_id', suffixes=('_spacy', '_cltk'))

# Use the configured location instead of a hard-coded path so that this write
# always targets the same file as the rest of the notebook.
merged.to_csv(os.path.join(data_dir, merged_file), index=False)

### Output

In [41]:
# Save the merged table without the row index: writing the index would add an
# unnamed leading column that reappears as "Unnamed: 0" when the CSV is read.
merged.to_csv(os.path.join(data_dir, merged_file), index=False)
merged

Unnamed: 0,speech_id,lang,author,work,urn,l_fi,l_la,nlines,spkr,addr,...,mood_cltk,tense_cltk,voice_cltk,aspect,person_cltk,number_cltk,case_cltk,gender_cltk,degree_cltk,verbform_cltk
0,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,,,,,
1,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,,,,,
2,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,,,,,
3,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,plural,nominative,masculine,,
4,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,plural,vocative,masculine,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384379,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,,,,,,plural,dative,feminine,,
384380,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,,,,,,singular,accusative,feminine,,
384381,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,subjunctive,present,active,imperfective,third,singular,,,,finite
384382,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,,,passive,perfective,,plural,nominative,neuter,,participle
