# 1. Loads Gold Standard and Schema

In [37]:
from collections import defaultdict
import pandas as pd
import yaml
import glob 
import re

DATASETS_DIR = '../datasets/csvs/'
SCHEMAS_DIR = '../datasets/schemas/'

GS_PATH = '{:}{:}'.format(DATASETS_DIR, 'gs.csv')
GS_SCHEMA_PATH = '{:}{:}'.format(SCHEMAS_DIR, 'gs.yaml')

# re_replace = re.compile(r'[\+|\-|\d]')

In [38]:
with open(GS_SCHEMA_PATH, mode='r') as f:
    dictschema = yaml.load(f)

print([ i
    for i in dictschema])


['ARG', 'CTREE', 'DTREE', 'FORM', 'FUNC', 'GPOS', 'ID', 'INDEX', 'LEMMA', 'MORF', 'P', 'PRED', 'P_S', 'S']


In [39]:
df = pd.read_csv(GS_PATH, sep=',', encoding='utf-8', index_col=0)
print(df.columns)

Index(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC',
       'CTREE', 'PRED', 'ARG'],
      dtype='object')


# 2. Loads gs_column_shifts

In [40]:
gs_column_shift = '../datasets/csvs/gs_column_shifts/*'
for file_path in glob.glob(gs_column_shift):
    _df = pd.read_csv(file_path, sep=',', encoding='utf-8', index_col=0)
    df = pd.concat((df, _df), axis=1, ignore_index=False)
print(df.columns)

Index(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC',
       'CTREE', 'PRED', 'ARG', 'FORM+1', 'FORM+2', 'FORM+3', 'FORM-1',
       'FORM-2', 'FORM-3', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3', 'LEMMA-1',
       'LEMMA-2', 'LEMMA-3', 'GPOS+1', 'GPOS+2', 'GPOS+3', 'GPOS-1', 'GPOS-2',
       'GPOS-3'],
      dtype='object')


# 3. Features per column

In [41]:

columns_helper = [(re.sub(r'[\+|\-|\d]', '', col), col) 
                  for col in df.columns.tolist()]

mapper = {rawcol:len(dictschema[basecol].get('domain',[1]))
          for basecol, rawcol in columns_helper}
          

print(mapper)

{'ID': 1, 'S': 1, 'P': 1, 'P_S': 1, 'FORM': 13290, 'LEMMA': 9071, 'GPOS': 25, 'MORF': 25, 'DTREE': 91, 'FUNC': 49, 'CTREE': 49, 'PRED': 1027, 'ARG': 60, 'FORM+1': 13290, 'FORM+2': 13290, 'FORM+3': 13290, 'FORM-1': 13290, 'FORM-2': 13290, 'FORM-3': 13290, 'LEMMA+1': 9071, 'LEMMA+2': 9071, 'LEMMA+3': 9071, 'LEMMA-1': 9071, 'LEMMA-2': 9071, 'LEMMA-3': 9071, 'GPOS+1': 25, 'GPOS+2': 25, 'GPOS+3': 25, 'GPOS-1': 25, 'GPOS-2': 25, 'GPOS-3': 25}


In [49]:
lexicons = {col : 
                dict(
                     zip(dictschema[col]['domain'], 
                         range(1, mapper[col]+1)
                        )
                    )
             for col in dictschema if 'domain' in dictschema[col]}


columns = ['FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED',
          'FORM-3', 'FORM-2', 'FORM-1', 'FORM+1', 'FORM+2', 'FORM+3',
          'LEMMA-3', 'LEMMA-2', 'LEMMA-1', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3',
          'GPOS-3', 'GPOS-2', 'GPOS-1', 'GPOS+1', 'GPOS+2', 'GPOS+3']


d = df.to_dict()

print(lexicons.keys())
print(d.keys())

dict_keys(['ARG', 'CTREE', 'DTREE', 'FORM', 'FUNC', 'GPOS', 'LEMMA', 'MORF', 'PRED'])
dict_keys(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED', 'ARG', 'FORM+1', 'FORM+2', 'FORM+3', 'FORM-1', 'FORM-2', 'FORM-3', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3', 'LEMMA-1', 'LEMMA-2', 'LEMMA-3', 'GPOS+1', 'GPOS+2', 'GPOS+3', 'GPOS-1', 'GPOS-2', 'GPOS-3'])


In [48]:
print(columns)
sparse = defaultdict(dict)
for idx in range(2):
    lb = 0 
    for col in columns:
        categorical = d[col][idx] 
        if categorical in lexicons[col]:
            sparse[idx][lb+lexicons[col][d[col][idx]]] 
        else:
            # nan set to zero
            sparse[idx][lb+lexicons[col][d[col][idx]]] 
            lb += mapper[col] 

sparse[0]

['FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED', 'FORM-3', 'FORM-2', 'FORM-1', 'FORM+1', 'FORM+2', 'FORM+3', 'LEMMA-3', 'LEMMA-2', 'LEMMA-1', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3', 'GPOS-3', 'GPOS-2', 'GPOS-1', 'GPOS+1', 'GPOS+2', 'GPOS+3']
FORM Brasília
LEMMA Brasília
GPOS PROP
MORF F|S
DTREE 5
FUNC ADVL
CTREE (FCL(NP*)
PRED -
FORM-3 nan
FORM-2 nan
FORM-1 nan
FORM+1 Pesquisa_Datafolha
FORM+2 publicada
FORM+3 hoje
LEMMA-3 nan
LEMMA-2 nan
LEMMA-1 nan
LEMMA+1 Pesquisa_Datafolha
LEMMA+2 publicar
LEMMA+3 hoje
GPOS-3 nan
GPOS-2 nan
GPOS-1 nan
GPOS+1 N
GPOS+2 V-PCP
GPOS+3 ADV
FORM Pesquisa_Datafolha
LEMMA Pesquisa_Datafolha
GPOS N
MORF F|S
DTREE 5
FUNC SUBJ
CTREE (NP*
PRED -
FORM-3 nan
FORM-2 nan
FORM-1 Brasília
FORM+1 publicada
FORM+2 hoje
FORM+3 revela
LEMMA-3 nan
LEMMA-2 nan
LEMMA-1 Brasília
LEMMA+1 publicar
LEMMA+2 hoje
LEMMA+3 revelar
GPOS-3 nan
GPOS-2 nan
GPOS-1 PROP
GPOS+1 V-PCP
GPOS+2 ADV
GPOS+3 V-FIN


{}