# UniMorph

In [1]:
import os
import re
import logging
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from datetime import datetime
import pandas as pd

# Path of the log file; it is truncated on every run (filemode='w').
LOG_FILE = 'logs/unimorph.log'

# logging.basicConfig opens the log file immediately and does not create
# missing parent directories, so make sure the directory exists first.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logging.basicConfig(filename=LOG_FILE, filemode='w', level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Git repositories to clone, one per language.
LINKS = ['https://github.com/unimorph/eng.git',
         'https://github.com/unimorph/fra.git']

# Local directory under which the repositories are cloned and the CSVs written.
OUT_DIR = 'unimorph'

In [2]:
def download(url, path):
    """Clone the git repository at *url* into the local directory *path*.

    The command is run without a shell and with an explicit argument list,
    so spaces or shell metacharacters in *url* or *path* are passed to git
    verbatim instead of being interpreted. Failures are logged rather than
    raised, keeping the best-effort behaviour of the call.

    Args:
        url: Location of the repository to clone.
        path: Destination directory for the clone.

    Returns:
        None.
    """
    # Local import: subprocess is not imported at the top of this file.
    import subprocess

    try:
        # '--' stops git from treating a url that starts with '-' as an option.
        result = subprocess.run(['git', 'clone', '--', url, path])
    except OSError as err:
        # Raised e.g. when the git executable cannot be found.
        logging.error('Could not run git to clone %s: %s', url, err)
        return
    if result.returncode != 0:
        logging.error('git clone of %s into %s failed with exit code %d',
                      url, path, result.returncode)

def fetch_datasets(datasets):
    """Clone each repository in *datasets* unless its target directory exists.

    The target directory is OUT_DIR joined with the last path component of
    the link, with its extension removed.

    Args:
        datasets: Iterable of repository links.
    """
    for repo_url in datasets:
        repo_name, _extension = os.path.splitext(os.path.basename(repo_url))
        target_dir = os.path.join(OUT_DIR, repo_name)
        if os.path.exists(target_dir):
            # Something is already at the target path; do not clone again.
            continue
        download(repo_url, target_dir)

# Clone every repository listed in LINKS whose target directory is missing.
fetch_datasets(LINKS)

In [3]:
class UniMorphReader:
    """Read a tab-separated UniMorph file and split its features into columns.

    On construction the file ``OUT_DIR/<lg>/<lg>`` is loaded, one column per
    entry of DIMENSIONS is extracted from the feature string, and the result
    is written to ``OUT_DIR/<lg>/<lg>.csv``.
    """

    # Space-separated tag vocabularies, one string per dimension.
    AKTIONSART = 'STAT DYN TEL ATEL PCT DUR ACH ACCMP SEMEL ACTY'
    ASPECT = 'IPFV PFV PRF PROG PROSP ITER HAB'
    FINITENESS = 'FIN NFIN'
    GENDER = 'MASC FEM NEUT'
    NUMBER = 'SG PL'
    POS = 'N PROPN ADJ PRO CLF ART DET V ADV AUX V.PTCP V.MSDR V.CVB ADP COMP CONJ NUM PART INTJ'
    TENSE = 'PRS PST FUT IMMED HOD 1DAY RCT RMT'

    # Output column name -> tag vocabulary used to fill that column.
    DIMENSIONS = {'aktionsart': AKTIONSART,
                  'aspect': ASPECT,
                  'finiteness': FINITENESS,
                  'gender': GENDER,
                  'number': NUMBER,
                  'pos': POS,
                  'tense': TENSE
                 }

    def __init__(self, lg):
        """Load the data for language code *lg* and save it as CSV.

        Args:
            lg: Name of both the language directory and the data file in it.
        """
        self.lg = lg
        self.fname = os.path.join(OUT_DIR, lg, lg)
        self.data = self.load_data(list(self.DIMENSIONS))
        self.save()

    def make_regex(self, dimension):
        """Compile a pattern that captures one whole tag of *dimension*.

        A tag only matches when it is not directly preceded or followed by a
        word character or a dot, so 'V' is not extracted from 'V.PTCP' and
        'N' is not extracted from 'NUM' or 'NFIN'. Tags are escaped so that
        the dot in tags such as 'V.PTCP' is matched literally.

        Args:
            dimension: Space-separated string of tags.

        Returns:
            A compiled pattern with a single capture group holding the tag.
        """
        tags = '|'.join(re.escape(tag) for tag in dimension.split())
        return re.compile(r'(?<![\w.])(' + tags + r')(?![\w.])')

    def load_data(self, dimensions):
        """Read ``self.fname`` and add one column per requested dimension.

        Args:
            dimensions: Iterable of keys of DIMENSIONS.

        Returns:
            A DataFrame with the columns 'stem' and 'inflected' plus one
            column per dimension (NaN where no tag of that dimension is
            found), with duplicate rows removed.

        Raises:
            KeyError: If a requested dimension is not a key of DIMENSIONS.
        """
        # Local import: csv is not imported at the top of this file.
        import csv

        data = pd.read_csv(
            self.fname,
            sep='\t',
            header=None,
            names=['stem', 'inflected', 'features'],
            dtype=str,
            # Word forms such as 'null', 'nan' or 'NA' are data, not
            # missing values.
            keep_default_na=False,
            # Quote characters inside a field are ordinary characters;
            # NOTE(review): assumes the input is plain, unquoted TSV.
            quoting=csv.QUOTE_NONE,
        )
        for dimension in dimensions:
            regex = self.make_regex(self.DIMENSIONS[dimension])
            # expand=False returns a Series for the single capture group.
            data[dimension] = data['features'].str.extract(regex, expand=False)
        data = data.drop(columns='features')
        return data.drop_duplicates()

    def save(self):
        """Write ``self.data`` to ``OUT_DIR/<lg>/<lg>.csv`` without the index."""
        fname = os.path.join(OUT_DIR, self.lg, '{}.csv'.format(self.lg))
        self.data.to_csv(fname, index=False)

In [4]:
# Build (and thereby save) the table for each language, logging how long
# each one takes.
for lg in ('eng', 'fra'):
    start = datetime.now()
    data = UniMorphReader(lg)
    end = datetime.now()
    elapsed = end - start
    msg = 'Processing unimorph data for {} took {}'.format(lg, elapsed)
    logging.info(msg)