# Universal dependencies

The [Universal Dependencies project](http://universaldependencies.org/) annotates corpora with a cross-linguistically consistent notation. The data comes in a very nice tab-separated [format](http://universaldependencies.org/docs/format.html) (CoNLL-U, one token per line). In this notebook, I use the annotated corpora to extract morphosyntactic features for word tokens and write them to one `features.csv` file per treebank directory.

In [72]:
import os
import re
import glob
from io import StringIO
import numpy as np
import pandas as pd

In [2]:
def read(path):
    """Return the full contents of the text file at `path` as a string.

    The file is decoded explicitly as UTF-8 (the encoding CoNLL-U files are
    distributed in) instead of relying on the platform's default encoding,
    which would mis-decode or reject non-ASCII treebanks on some systems.
    """
    with open(path, encoding='utf-8') as f:
        return f.read()

In [3]:
def remove_comments(raw):
    """Return `raw` with every comment line removed.

    A comment line is a line whose first character is '#'. The pattern is
    anchored to the start of a line, so:

    * a '#' that appears inside a token line (e.g. a hashtag token, or
      "# " in the middle of a field) is left untouched;
    * comment lines without a space after the '#' are removed as well;
    * a final comment line that lacks a trailing newline is also removed.
    """
    # `^` only matches at line starts because of re.MULTILINE; `.` never
    # crosses a newline, so each match is exactly one comment line.
    comment_pattern = r'^#.*\n?'
    return re.sub(comment_pattern, '', raw, flags=re.MULTILINE)

In [117]:
def make_feature_dict(lst):
    """Convert a list of 'Name=Value' strings into a {name: value} dict.

    `lst` is either an iterable of 'Name=Value' strings or a missing value
    (a float such as NaN, or None), in which case an empty dict is returned.
    Each pair is split on the first '=' only, so a value that itself
    contains '=' is kept intact. A pair with no '=' raises ValueError.
    """
    if lst is None or isinstance(lst, float):
        return {}
    return dict(pair.split('=', 1) for pair in lst)

In [118]:
def which_data_split(path):
    """Return the last '-'-separated piece of the file's base name.

    The directory part and the extension of `path` are discarded first; for
    a name without any '-', the whole extension-less name is returned.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    # rpartition yields the text after the final '-', or all of `stem`
    # when there is no '-' at all.
    return stem.rpartition('-')[2]

In [119]:
def preprocess(path):
    """Returns cleaned UD data as dataframe with each row a token and each column a feature.

    The file at `path` is read, comment lines are stripped, and the remaining
    tab-separated lines are parsed into ten columns. The 'features' column
    ('Name=Value' pairs joined by '|') is expanded into one column per
    feature name.

    The returned frame holds 'form', 'lemma', 'universal_pos', 'lg_pos', one
    column per feature name, and 'set' (the value of `which_data_split(path)`).
    Missing values are NaN.
    """
    columns = ['id', 'form', 'lemma', 'universal_pos', 'lg_pos', 'features', 'head', 'dependency_relations', 'dependencies', 'misc']
    cleaned = remove_comments(read(path))
    words = pd.read_csv(
        StringIO(cleaned),
        sep='\t',
        names=columns,
        quoting=3,  # csv.QUOTE_NONE: quote characters are ordinary token text
        # Keep every field as text so numeric-looking tokens are not coerced.
        dtype=str,
        # Only empty fields and the '_' placeholder count as missing. The
        # default NA list would also swallow real tokens such as "NA",
        # "null" or "nan".
        keep_default_na=False,
        na_values=['', '_'],
    )

    # Rows without features come through as NaN and map to an empty dict.
    feature_dicts = words['features'].str.split('|').apply(make_feature_dict)
    features = pd.DataFrame(list(feature_dicts.values))
    data = pd.merge(words, features, left_index=True, right_index=True)
    data['set'] = which_data_split(path)
    return data.drop(['id', 'features', 'head', 'dependency_relations', 'dependencies', 'misc'], axis=1)

In [120]:
def combine_splits(directory):
    """Preprocess every '*.conllu' file in `directory` and concatenate the results.

    Files are processed in sorted path order so the row order of the result
    is reproducible (glob itself returns paths in arbitrary order). The
    directory name is escaped so glob metacharacters in it are matched
    literally.

    Returns the row-wise concatenation of the per-file dataframes; each
    file's original row index is kept.

    Raises ValueError if `directory` contains no '*.conllu' files.
    """
    pattern = os.path.join(glob.escape(directory), '*.conllu')
    paths = sorted(glob.glob(pattern))
    if not paths:
        # pd.concat([]) would raise ValueError too; keep the exception type
        # but say which directory was at fault.
        raise ValueError('No .conllu files found in {}'.format(directory))
    return pd.concat([preprocess(path) for path in paths])

In [122]:
def write(directory):
    """Combine the data files of `directory` and save them as 'features.csv'.

    The CSV is written inside `directory` itself, without the index column.
    """
    combined = combine_splits(directory)
    combined.to_csv(os.path.join(directory, 'features.csv'), index=False)

In [124]:
# Write a features.csv for every 'UD_*' directory in the current working
# directory, reporting each one as it completes.
for path in os.listdir():
    is_ud_directory = os.path.isdir(path) and path.startswith('UD_')
    if not is_ud_directory:
        continue
    write(path)
    print('Finished preprocessing {}'.format(path))

Finished preprocessing UD_Czech
Finished preprocessing UD_English
Finished preprocessing UD_French
Finished preprocessing UD_Japanese
Finished preprocessing UD_Italian
