In [1]:
## Downloads the UD treebanks by running download_ud.sh
## Uncomment the line below and run this cell if the treebanks have not been downloaded yet

# !sh download_ud.sh

In [2]:
# Imports

import pandas as pd
from cltkreaders.readers import UDCorpusReader
from latintools import preprocess

In [3]:
# Set up corpus
# Build the UD corpus reader over the 'data' path.
# NOTE(review): presumably 'data' is where download_ud.sh places the treebanks — confirm.

UDR = UDCorpusReader('data')

In [4]:
# Get sentence data
# The verb-filtering cell iterates each sentence as a sequence of token dicts
# and reads their 'UPOS', 'LEMMA', 'FORM' and 'FEATS' keys.
# NOTE(review): this may be a one-shot generator — if so, re-run this cell before re-running that one.

sent_dicts = UDR.sent_dicts()

In [5]:
# Limit sentence data to verbs
# Keep one record per token tagged VERB, reduced to its lemma, surface form
# and raw feature string.

verbs = [
    {'lemma': token['LEMMA'], 'form': token['FORM'], 'feats': token['FEATS']}
    for sentence in sent_dicts
    for token in sentence
    if token['UPOS'] == 'VERB'
]

In [6]:
# Helper function to expand feats

def expand_feats(feat):
    """Turn a UD feature string into a dict of feature name -> value.

    Example input:
        Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act

    Args:
        feat: '|'-separated 'Key=Value' pairs. None, an empty string, or the
            CoNLL-U placeholder '_' all mean "no features".

    Returns:
        A dict with one entry per pair, in the order the pairs appear; empty
        when there are no features.

    Raises:
        ValueError: if a non-empty segment has no '=' separator.
    """
    # '_' is how CoNLL-U marks a token without morphological features.
    if not feat or feat == '_':
        return {}

    feats_dict = {}
    for pair in feat.split('|'):
        if not pair:
            # Tolerate a stray or trailing '|'.
            continue
        key, sep, value = pair.partition('=')
        if not sep:
            raise ValueError(f"Malformed feature {pair!r} in {feat!r}: expected 'Key=Value'")
        feats_dict[key] = value
    return feats_dict

In [7]:
# Expand feats
# Flatten each verb record: keep lemma and form, and lift every parsed
# feature to a top-level key alongside them.

expanded_verbs = [
    {'lemma': entry['lemma'], 'form': entry['form'], **expand_feats(entry['feats'])}
    for entry in verbs
]

In [8]:
# Create dataframe for streamlit app
# Keep finite verb forms only, with lemma, form and the five inflectional
# features, under lowercase headers.

# Listed in the order the output headers appear (person, number, tense, mood,
# voice). Headers are derived from these names rather than assigned by
# position, so each header always labels its own feature.
feature_columns = ['Person', 'Number', 'Tense', 'Mood', 'Voice']

df = pd.DataFrame(expanded_verbs)
# .copy() so the column assignments below act on an independent frame rather
# than on a slice of the unfiltered one.
df = df.loc[df['VerbForm'] == 'Fin', ['lemma', 'form'] + feature_columns].copy()
df = df.rename(columns=str.lower)
df['lemma'] = df['lemma'].apply(preprocess)
df['form'] = df['form'].apply(preprocess)
df['tense'] = df['tense'].astype(str)

In [9]:
# Save dataframe
# Write the verb table as tab-separated text, leaving out the index column.

df.to_csv(
    path_or_buf='data/verbs.tsv',
    sep='\t',
    index=False,
)