# 1. Loads Gold Standard and Schema

In [20]:
from collections import defaultdict
import pandas as pd
import yaml
import glob 
import re

# --- Locations of the inputs and outputs (relative to this notebook) ---
DATASETS_DIR = '../datasets/csvs/'
SCHEMAS_DIR = '../datasets/schemas/'
SVM_DIR = '../datasets/svms/hot/'

# Gold standard table and the YAML schema describing its columns.
GS_PATH = DATASETS_DIR + 'gs.csv'
GS_SCHEMA_PATH = SCHEMAS_DIR + 'gs.yaml'

# --- Split sizes; the three partitions add up to DATASET_SIZE ---
DATASET_SIZE = 5931
DATASET_TRAIN_SIZE = 5099
DATASET_VALID_SIZE = 569
DATASET_TEST_SIZE = 263


In [2]:
# Load the gold-standard schema: a mapping keyed by column name (columns with a
# closed vocabulary carry a 'domain' list that later cells turn into lexicons).
#
# yaml.safe_load is used instead of yaml.load(f): calling yaml.load without an
# explicit Loader is deprecated since PyYAML 5.1 and raises TypeError in
# PyYAML 6, and safe_load never instantiates arbitrary Python objects.
# The file is opened as UTF-8 to match how the CSVs are read.
# NOTE(review): assumes the schema is plain YAML (no python-specific tags) — confirm.
with open(GS_SCHEMA_PATH, mode='r', encoding='utf-8') as f:
    dictschema = yaml.safe_load(f)

# Show the column names declared by the schema.
print(list(dictschema))


['ARG', 'CTREE', 'DTREE', 'FORM', 'FUNC', 'GPOS', 'ID', 'INDEX', 'LEMMA', 'MORF', 'P', 'PRED', 'P_S', 'S']


In [3]:
# Read the gold standard table; its first CSV column becomes the row index.
# (',' is already the default separator of read_csv.)
df = pd.read_csv(GS_PATH, index_col=0, encoding='utf-8')
print(df.columns)

Index(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC',
       'CTREE', 'PRED', 'ARG'],
      dtype='object')


# 2. Loads gs_column_shifts

In [4]:
# Append the shifted columns stored under gs_column_shifts/ to the gold
# standard, aligning rows on the index.
gs_column_shift = '../datasets/csvs/gs_column_shifts/*'

# sorted(): glob.glob returns paths in a filesystem-dependent order, which made
# the resulting column order differ between machines.
shift_frames = [
    pd.read_csv(file_path, sep=',', encoding='utf-8', index_col=0)
    for file_path in sorted(glob.glob(gs_column_shift))
]

# A single concat instead of growing `df` once per file inside the loop.
# NOTE: this rebinds `df`; re-running the cell without re-running the cell that
# loads GS_PATH appends the same columns a second time.
df = pd.concat([df] + shift_frames, axis=1, ignore_index=False)
print(df.columns)

Index(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC',
       'CTREE', 'PRED', 'ARG', 'FORM+1', 'FORM+2', 'FORM+3', 'FORM-1',
       'FORM-2', 'FORM-3', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3', 'LEMMA-1',
       'LEMMA-2', 'LEMMA-3', 'GPOS+1', 'GPOS+2', 'GPOS+3', 'GPOS-1', 'GPOS-2',
       'GPOS-3'],
      dtype='object')


# 3. Features per column

In [5]:

# columns_mapper: column name -> schema column it derives from, obtained by
# dropping a trailing shift suffix such as '+1' or '-3' ('FORM-2' -> 'FORM').
# The pattern is anchored at the end of the name; the previous character class
# '[\+|\-|\d]' also removed any '|' and any digit anywhere in a column name.
columns_mapper = {col: re.sub(r'[+-]\d+$', '', col) for col in df.columns.tolist()}

# bounds_mapper: column name -> number of entries in the base column's 'domain';
# columns whose schema entry has no 'domain' get a bound of 1.
bounds_mapper = {colfeat: len(dictschema[colbase].get('domain', [1]))
                 for colfeat, colbase in columns_mapper.items()}

print(bounds_mapper)

{'ID': 1, 'S': 1, 'P': 1, 'P_S': 1, 'FORM': 13290, 'LEMMA': 9071, 'GPOS': 25, 'MORF': 25, 'DTREE': 91, 'FUNC': 49, 'CTREE': 49, 'PRED': 1027, 'ARG': 60, 'FORM+1': 13290, 'FORM+2': 13290, 'FORM+3': 13290, 'FORM-1': 13290, 'FORM-2': 13290, 'FORM-3': 13290, 'LEMMA+1': 9071, 'LEMMA+2': 9071, 'LEMMA+3': 9071, 'LEMMA-1': 9071, 'LEMMA-2': 9071, 'LEMMA-3': 9071, 'GPOS+1': 25, 'GPOS+2': 25, 'GPOS+3': 25, 'GPOS-1': 25, 'GPOS-2': 25, 'GPOS-3': 25}


In [7]:
# lexicons: for every schema column that declares a 'domain', map each domain
# entry to a 1-based integer code (1 .. bounds_mapper[col]).
lexicons = {}
for col, spec in dictschema.items():
    if 'domain' not in spec:
        continue
    codes = range(1, bounds_mapper[col] + 1)
    lexicons[col] = dict(zip(spec['domain'], codes))

# Feature columns, in encoding order: the base columns first, then the
# -3..+3 shifted versions of FORM, LEMMA and GPOS.
base_columns = ['FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED']
shift_suffixes = ('-3', '-2', '-1', '+1', '+2', '+3')
columns = base_columns + [
    name + suffix
    for name in ('FORM', 'LEMMA', 'GPOS')
    for suffix in shift_suffixes
]

# Column-oriented plain-dict view of the frame: d[column][index] -> value.
d = df.to_dict()

print(lexicons.keys())
print(d.keys())

dict_keys(['ARG', 'CTREE', 'DTREE', 'FORM', 'FUNC', 'GPOS', 'LEMMA', 'MORF', 'PRED'])
dict_keys(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED', 'ARG', 'FORM+1', 'FORM+2', 'FORM+3', 'FORM-1', 'FORM-2', 'FORM-3', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3', 'LEMMA-1', 'LEMMA-2', 'LEMMA-3', 'GPOS+1', 'GPOS+2', 'GPOS+3', 'GPOS-1', 'GPOS-2', 'GPOS-3'])


In [9]:
# Build, for every row of the gold standard:
#   sparse_features[row] -> {feature index: 1}, one active index per feature column
#   args                 -> integer code of the row's ARG value (lexicons['ARG'])
#   propositions         -> the row's P value
#
# Each feature column owns a slice of the index space that starts at `offset`
# (the running sum of the bounds of the columns before it). A value found in
# the column's lexicon activates `offset + code` (code >= 1); anything else
# (e.g. NaN) activates `offset` itself.
#
# NOTE(review): because `offset` advances by bounds_mapper[col] only, the
# "missing" slot of a column equals the slot of the last lexicon code of the
# previous column, and the "missing" slot of the first column is index 0.
# Kept as-is because the exported indices depend on this layout.
args = []
propositions = []
sparse_features = defaultdict(dict)
for row_id, prop_id in d['P'].items():
    offset = 0
    for feat_col in columns:
        lexicon = lexicons[columns_mapper[feat_col]]
        # 0 -> value absent from the lexicon: falls on the column's base slot.
        code = lexicon.get(d[feat_col][row_id], 0)
        sparse_features[row_id][offset + code] = 1
        offset += bounds_mapper[feat_col]

    args.append(lexicons['ARG'][d['ARG'][row_id]])
    propositions.append(prop_id)

# Display the encoded features of the row with index label 0.
sparse_features[0]

{134: 1,
 13386: 1,
 22362: 1,
 22386: 1,
 22417: 1,
 22514: 1,
 22551: 1,
 23172: 1,
 23627: 1,
 36917: 1,
 50207: 1,
 72569: 1,
 86827: 1,
 97445: 1,
 103367: 1,
 112438: 1,
 121509: 1,
 136790: 1,
 145405: 1,
 153753: 1,
 157793: 1,
 157818: 1,
 157843: 1,
 157890: 1,
 157902: 1,
 157922: 1}

In [10]:
# Peek at the first five targets: integer codes looked up in lexicons['ARG'].
args[:5]

[3, 23, 3, 43, 12]

In [11]:
# Sanity check of sparse_features[0] against the first row of df: a feature
# index is the sum of the bounds of the feature columns that come before it
# plus the value's code in the column's lexicon.
#   FORM  (1st feature): lexicons['FORM']['Brasília'] --> 134
#   LEMMA (2nd feature): bounds_mapper['FORM'] + lexicons['LEMMA']['Brasília'] --> 13386
#   GPOS  (3rd feature): bounds_mapper['FORM'] + bounds_mapper['LEMMA'] + lexicons['GPOS']['PROP'] --> 22362
df.head(1)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,DTREE,FUNC,...,LEMMA+3,LEMMA-1,LEMMA-2,LEMMA-3,GPOS+1,GPOS+2,GPOS+3,GPOS-1,GPOS-2,GPOS-3
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,0,Brasília,Brasília,PROP,F|S,5,ADVL,...,hoje,,,,N,V-PCP,ADV,,,


In [21]:
# Write one SVM-light/LIBSVM style file per split: "<target> <index>:<value> ...".
#
# A row belongs to a split when its proposition id p satisfies lb < p <= ub.
# The bounds live in an explicit table; the former `ds_type in ('train')` tests
# compared against a plain string (the parentheses do not make a tuple), i.e.
# they were substring checks that only happened to work for these names.
split_bounds = {
    'train': (0, DATASET_TRAIN_SIZE),
    'valid': (DATASET_TRAIN_SIZE, DATASET_TRAIN_SIZE + DATASET_VALID_SIZE),
    'test': (DATASET_TRAIN_SIZE + DATASET_VALID_SIZE,
             DATASET_TRAIN_SIZE + DATASET_VALID_SIZE + DATASET_TEST_SIZE),
}

for ds_type in ('train', 'test', 'valid'):
    lb, ub = split_bounds[ds_type]

    # saves the processed data
    svm_path = '{:}{:}.svm'.format(SVM_DIR, ds_type)
    with open(svm_path, mode='w') as f:
        for idx in sparse_features:
            # `propositions` and `args` are lists filled in row order, so
            # indexing them with the row label assumes labels are 0..n-1.
            p = propositions[idx]
            if lb < p < ub + 1:
                target = '{:} '.format(int(args[idx]))
                features = ' '.join('{:}:{:}'.format(key, val)
                                    for key, val in sparse_features[idx].items())
                f.write('{:}{:}\n'.format(target, features))
                
                
                
            
        