# Universal dependencies

For my purposes, Universal Dependencies (UD) is a project that hand-annotates many languages with a consistent set of features. The data comes in a very nice tab-separated [format](http://universaldependencies.org/docs/format.html) called CoNLL-U.

In [10]:
import os
import re
import numpy as np
import pandas as pd

In [11]:
def read(path, encoding='utf-8'):
    """Return the entire contents of a text file as one string.

    Parameters
    ----------
    path : str
        Location of the file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so the
        result does not depend on the platform's default encoding.

    Returns
    -------
    str
        The decoded file contents.
    """
    with open(path, encoding=encoding) as f:
        return f.read()

In the data files, each sentence is preceded by comment lines, each of which starts with a `#`.

In [12]:
def remove_comments(raw):
    """Remove comment lines from raw treebank text.

    A comment line is a line whose first character is ``#``. The pattern is
    anchored to the start of a line, so a ``#`` that appears later in a line
    (for example a hashtag token, or a ``# `` inside a field) is left alone
    and the line is kept intact.

    Parameters
    ----------
    raw : str
        Text of the file, with lines terminated by ``\\n``.

    Returns
    -------
    str
        ``raw`` with every newline-terminated comment line removed.
    """
    # re.MULTILINE makes ^ match at the start of every line, not just the text.
    comment_pattern = r'^#.*\n'
    return re.sub(comment_pattern, '', raw, flags=re.MULTILINE)

In [13]:
def write(path, data, encoding='utf-8'):
    """Write a string to a text file, replacing any existing contents.

    Parameters
    ----------
    path : str
        Location of the file to create or overwrite.
    data : str
        Text to write.
    encoding : str, optional
        Text encoding used to encode the file. Defaults to UTF-8 so the
        result does not depend on the platform's default encoding.
    """
    with open(path, 'w', encoding=encoding) as f:
        f.write(data)

In [14]:
# Strip the comment lines from the raw training file and save the result,
# so that the cleaned file can be loaded as a plain tab-separated table.
train_data_path = 'UD_English/en-ud-train.conllu'
raw = read(train_data_path)
cleaned = remove_comments(raw)
# Path of the cleaned copy; the loading step reads the file back from here.
path = 'UD_English/en-ud-train-cleaned.tsv'
write(path, cleaned)

In [15]:
# Names for the ten tab-separated fields of each token line.
columns = ['id', 'form', 'lemma', 'universal_pos', 'lg_pos', 'features', 'head', 'dependency_relations', 'dependencies', 'misc']
# quoting=3 is csv.QUOTE_NONE: quote characters are ordinary tokens here, not
# field delimiters.
# keep_default_na=False stops pandas from silently turning word forms such as
# "NA", "null" or "nan" into missing values.
df = pd.read_csv(path, sep='\t', names=columns, quoting=3, keep_default_na=False)
df['id'] = df['id'].astype(int)
# '_' marks an unspecified value. Replace it only in the annotation columns so
# that a token whose surface form is a literal underscore is kept.
annotation_columns = [col for col in columns if col not in ('id', 'form')]
df[annotation_columns] = df[annotation_columns].replace('_', np.nan)
df.head()

Unnamed: 0,id,form,lemma,universal_pos,lg_pos,features,head,dependency_relations,dependencies,misc
0,1,Al,Al,PROPN,NNP,Number=Sing,0,root,,SpaceAfter=No
1,2,-,-,PUNCT,HYPH,,1,punct,,SpaceAfter=No
2,3,Zaman,Zaman,PROPN,NNP,Number=Sing,1,flat,,
3,4,:,:,PUNCT,:,,1,punct,,
4,5,American,american,ADJ,JJ,Degree=Pos,6,amod,,


In [16]:
def make_feature_dict(lst):
    """Convert a list of ``'Name=Value'`` strings into a dict.

    Parameters
    ----------
    lst : list of str, float, or None
        The ``Name=Value`` pairs of one token. A float (the NaN that marks a
        missing value) or ``None`` means the token has no features.

    Returns
    -------
    dict
        Mapping from feature name to value; empty when there are no features.

    Raises
    ------
    ValueError
        If a pair contains no ``=``.
    """
    if lst is None or isinstance(lst, float):
        return {}
    # Split on the first '=' only, so a value that itself contains '=' is
    # kept whole instead of raising an unpacking error.
    return dict(pair.split('=', 1) for pair in lst)

In [17]:
# Break each token's 'A=x|B=y' feature string into its pairs, turn the pairs
# into a dict, then expand the dicts into a table with one column per feature.
split_features = df['features'].str.split('|')
feature_dicts = split_features.apply(make_feature_dict)
features = pd.DataFrame(feature_dicts.tolist())
features.head()

Unnamed: 0,Case,Definite,Degree,Foreign,Gender,Mood,NumType,Number,Person,Poss,PronType,Reflex,Tense,VerbForm,Voice
0,,,,,,,,Sing,,,,,,,
1,,,,,,,,,,,,,,,
2,,,,,,,,Sing,,,,,,,
3,,,,,,,,,,,,,,,
4,,,Pos,,,,,,,,,,,,


In [18]:
# Attach the per-feature columns to the token table row by row (matching on
# the index), then discard the columns that are not needed in the export.
unused_columns = ['id', 'features', 'head', 'dependency_relations', 'dependencies', 'misc']
merged = pd.merge(df, features, left_index=True, right_index=True)
data = merged.drop(columns=unused_columns)
data.head()

Unnamed: 0,form,lemma,universal_pos,lg_pos,Case,Definite,Degree,Foreign,Gender,Mood,NumType,Number,Person,Poss,PronType,Reflex,Tense,VerbForm,Voice
0,Al,Al,PROPN,NNP,,,,,,,,Sing,,,,,,,
1,-,-,PUNCT,HYPH,,,,,,,,,,,,,,,
2,Zaman,Zaman,PROPN,NNP,,,,,,,,Sing,,,,,,,
3,:,:,PUNCT,:,,,,,,,,,,,,,,,
4,American,american,ADJ,JJ,,,Pos,,,,,,,,,,,,


In [21]:
# Save the token table with its per-feature columns as CSV.
# index=False leaves the row index out of the file.
path = 'UD_English/en-ud-features.csv'
data.to_csv(path, index=False)