# Preliminaries

## Import statements

In [None]:
import os
import time
import pandas as pd

## Paths

In [None]:
# Directory used for the CSV files this notebook loads and for the merged output.
data_dir = "data"
# File names of the two token tables that are checked for and loaded below.
spacy_file = "spacy_tokens.csv"
cltk_file = "cltk_tokens.csv"
# File name of the merged table written in the Output section.
merged_file = "merged.csv"

## Check for data

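Make sure that the previous two notebooks have been run and that their output files (`spacy_tokens.csv` and `cltk_tokens.csv`) are present in the `data` directory. Files last modified more than one hour ago are reported as old.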

In [None]:
# Verify that both token files exist, report stale ones, and load them.
old = 0     # number of files last modified more than one hour ago
failed = 0  # number of files that are missing

for filename in [spacy_file, cltk_file]:
    # Build the path from data_dir so the check looks at exactly the same
    # location the files are loaded from below.
    path = os.path.join(data_dir, filename)

    print(path, "...", end=" ")
    if os.path.exists(path):
        # getmtime is the last-modification time; 3600 s == one hour.
        if time.time() - os.path.getmtime(path) > 3600:
            print("exists - note: data created more than one hour ago.")
            old += 1
        else:
            print("exists")
    else:
        print("not found!")
        failed += 1
print()

if failed:
    print("Some data is missing. You must run NLP before continuing.")
else:
    if old:
        # Stale data is only advisory: warn, but still load the tables so the
        # following cells do not fail with a NameError.
        print("Some data is old. Consider re-running NLP.")

    # Read every column as text, then convert 'level' to int because it is
    # compared and aggregated numerically in the deduplication step.
    spacy_tokens = pd.read_csv(os.path.join(data_dir, spacy_file), dtype=str)
    spacy_tokens["level"] = spacy_tokens["level"].astype(int)
    cltk_tokens = pd.read_csv(os.path.join(data_dir, cltk_file), dtype=str)
    cltk_tokens["level"] = cltk_tokens["level"].astype(int)

## Deduplicate embedded lines

In [None]:
# Largest 'level' value per line_id in the CLTK tokens: a one-column
# DataFrame ('level') indexed by line_id.
max_levels = cltk_tokens.groupby('line_id')[['level']].max()

In [None]:
# Attach each line's maximum level to every CLTK token row. Both frames have
# a 'level' column, so the merge yields 'level_x' (the token's own level) and
# 'level_y' (the maximum for its line_id).
cltk_levels = cltk_tokens[['line_id', 'level']].merge(
    max_levels,
    on='line_id',
    how='left',
)

# Keep only the rows whose level equals the maximum for their line_id.
at_max_level = cltk_levels['level_x'].eq(cltk_levels['level_y'])
cltk_no_dups = cltk_tokens[at_max_level]

# NOTE(review): this relative path writes to the current working directory,
# not to data_dir -- confirm that is intended.
cltk_no_dups.to_csv('cltk_tokens.csv', index=False)
display(cltk_no_dups)

In [None]:
# Attach the per-line maximum level to every spaCy token row. The merge
# yields 'level_x' (the token's own level) and 'level_y' (from max_levels).
# NOTE(review): max_levels is computed from the CLTK tokens, so this presumably
# relies on both tables sharing line_ids and levels -- confirm. Rows whose
# line_id is absent from max_levels get NaN for 'level_y' and are dropped.
spacy_levels = spacy_tokens[['line_id', 'level']].merge(
    max_levels,
    on='line_id',
    how='left',
)

# Keep only the rows whose level equals the maximum for their line_id.
at_max_level = spacy_levels['level_x'].eq(spacy_levels['level_y'])
spacy_no_dups = spacy_tokens[at_max_level]

# NOTE(review): this relative path writes to the current working directory,
# not to data_dir -- confirm that is intended.
spacy_no_dups.to_csv('spacy_tokens.csv', index=False)
display(spacy_no_dups)

### Alignment

In [None]:
# Columns taken from the CLTK table: the join key 'tok_id' plus the token
# text and its annotations.
cols = [
    'tok_id', 'token', 'lemma', 'pos', 'mood', 'tense', 'voice', 'aspect',
    'person', 'number', 'case', 'gender', 'degree', 'verbform',
]

# Left join on tok_id keeps every spaCy row; spaCy tokens without a CLTK match
# get NaN in the CLTK columns. Column names present in both tables are
# disambiguated with the '_spacy' / '_cltk' suffixes.
merged = spacy_no_dups.merge(
    cltk_no_dups[cols],
    how='left',
    on='tok_id',
    suffixes=('_spacy', '_cltk'),
)

# Use the path constants from the Paths section instead of repeating the
# 'data' / 'merged.csv' literals.
merged.to_csv(os.path.join(data_dir, merged_file), index=False)

### Output

In [None]:
# Write the merged table without the DataFrame index, as every other CSV
# written by this notebook does. With the default index=True this call would
# overwrite the file with an extra unnamed index column.
merged.to_csv(os.path.join(data_dir, merged_file), index=False)
# Last expression of the cell: shows the merged table in the notebook.
merged