# 1. Loads Gold Standard and Schema

In [1]:
import sys
sys.path.append('../datasets')

from collections import defaultdict
import pandas as pd
import yaml
import glob 
import re
import math
from gensim.models import KeyedVectors
from data_propbankbr import propbankbr_arg2t

DATASETS_DIR = '../datasets/csvs/'
SCHEMAS_DIR = '../datasets/schemas/'
SVM_DIR = '../datasets/svms/'
EMBEDDINGS_DIR = '../datasets/txts/embeddings/'

GS_PATH = '{:}{:}'.format(DATASETS_DIR, 'gs.csv')
GS_SCHEMA_PATH = '{:}{:}'.format(SCHEMAS_DIR, 'gs.yaml')

GLOVE_S50_PATH = '{:}glove_s50.txt'.format(EMBEDDINGS_DIR)
WANG_S100_PATH = '{:}wang2vec_s100.txt'.format(EMBEDDINGS_DIR)



DATASET_SIZE= 5931
DATASET_TRAIN_SIZE= 5099
DATASET_VALID_SIZE= 569
DATASET_TEST_SIZE=  263


In [2]:
with open(GS_SCHEMA_PATH, mode='r') as f:
    dictschema = yaml.load(f)

print([ i
    for i in dictschema])


['ARG', 'CTREE', 'DTREE', 'FORM', 'FUNC', 'GPOS', 'ID', 'INDEX', 'LEMMA', 'MORF', 'P', 'PRED', 'P_S', 'S']


In [3]:
df = pd.read_csv(GS_PATH, sep=',', encoding='utf-8', index_col=0)
print(df.columns)

Index(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC',
       'CTREE', 'PRED', 'ARG'],
      dtype='object')


# 2. Loads gs_column_shifts

In [4]:
gs_column_shift = '../datasets/csvs/gs_column_shifts/*'
for file_path in glob.glob(gs_column_shift):
    _df = pd.read_csv(file_path, sep=',', encoding='utf-8', index_col=0)
    df = pd.concat((df, _df), axis=1, ignore_index=False)
print(df.columns)

Index(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC',
       'CTREE', 'PRED', 'ARG', 'FORM+1', 'FORM+2', 'FORM+3', 'FORM-1',
       'FORM-2', 'FORM-3', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3', 'LEMMA-1',
       'LEMMA-2', 'LEMMA-3', 'GPOS+1', 'GPOS+2', 'GPOS+3', 'GPOS-1', 'GPOS-2',
       'GPOS-3'],
      dtype='object')


# 3. Features per column

In [5]:

columns_mapper = {col: re.sub(r'[\+|\-|\d]', '', col) for col in df.columns.tolist()}

dimension_mapper = {colfeat:len(dictschema[colbase].get('domain',[1]))
          for colfeat, colbase in columns_mapper.items()}
          
def bounds_fn(columns, embeddings_size, dimension_mapper, columns_mapper):
    bmapper = {}
    lb = 0 
    for col in columns:
        if is_dense(col, columns_mapper):
            ub = lb +  embeddings_size        
        else:
            ub = lb +  dimension_mapper[col]            
        bmapper[col] = {'lb': lb, 'ub':ub }
        lb = ub + 1
    return bmapper

def is_dense(col, columns_mapper):
    return columns_mapper[col] in ('FORM', 'LEMMA', 'PRED')

print(dimension_mapper)

{'ID': 1, 'S': 1, 'P': 1, 'P_S': 1, 'FORM': 13290, 'LEMMA': 9071, 'GPOS': 25, 'MORF': 25, 'DTREE': 91, 'FUNC': 49, 'CTREE': 49, 'PRED': 1027, 'ARG': 60, 'FORM+1': 13290, 'FORM+2': 13290, 'FORM+3': 13290, 'FORM-1': 13290, 'FORM-2': 13290, 'FORM-3': 13290, 'LEMMA+1': 9071, 'LEMMA+2': 9071, 'LEMMA+3': 9071, 'LEMMA-1': 9071, 'LEMMA-2': 9071, 'LEMMA-3': 9071, 'GPOS+1': 25, 'GPOS+2': 25, 'GPOS+3': 25, 'GPOS-1': 25, 'GPOS-2': 25, 'GPOS-3': 25}


In [6]:
lexicons = {col : 
                dict(
                     zip(dictschema[col]['domain'], 
                         range(1, dimension_mapper[col]+1)
                        )
                    )
             for col in dictschema if 'domain' in dictschema[col]}


columns = ['FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED',
          'FORM-3', 'FORM-2', 'FORM-1', 'FORM+1', 'FORM+2', 'FORM+3',
          'LEMMA-3', 'LEMMA-2', 'LEMMA-1', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3',
          'GPOS-3', 'GPOS-2', 'GPOS-1', 'GPOS+1', 'GPOS+2', 'GPOS+3']


d = df.to_dict()

print(lexicons.keys())
print(d.keys())

dict_keys(['ARG', 'CTREE', 'DTREE', 'FORM', 'FUNC', 'GPOS', 'LEMMA', 'MORF', 'PRED'])
dict_keys(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED', 'ARG', 'FORM+1', 'FORM+2', 'FORM+3', 'FORM-1', 'FORM-2', 'FORM-3', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3', 'LEMMA-1', 'LEMMA-2', 'LEMMA-3', 'GPOS+1', 'GPOS+2', 'GPOS+3', 'GPOS-1', 'GPOS-2', 'GPOS-3'])


In [7]:
args = [] 
sparse_features = defaultdict(dict)
propositions = []        

for idx, propid in d['P'].items():
    lb = 0 
    for col in columns:
        base_col = columns_mapper[col]
        categorical = d[col][idx] 
        if categorical in lexicons[base_col]:
            idx1 = lexicons[base_col][categorical]
            sparse_features[idx][lb + idx1]=1 
        else:
            # nan set to zero
            sparse_features[idx][lb]=1 
        lb += dimension_mapper[col] 

    args.append(lexicons['ARG'][d['ARG'][idx]]) 
    propositions.append( propid )

        
sparse_features[0]

{4098: 1,
 16148: 1,
 22378: 1,
 22386: 1,
 22417: 1,
 22549: 1,
 22551: 1,
 23222: 1,
 23627: 1,
 36917: 1,
 50207: 1,
 69831: 1,
 89101: 1,
 92169: 1,
 103367: 1,
 112438: 1,
 121509: 1,
 134938: 1,
 142802: 1,
 150198: 1,
 157793: 1,
 157818: 1,
 157843: 1,
 157890: 1,
 157908: 1,
 157943: 1}

In [8]:
args[:5]

[30, 8, 30, 45, 41]

In [9]:
# FORM .: first sparse feature = lexicons['FORM']['Brasília'] --> 134
# LEMMA .: second sparse feature = dimension_mapper['FORM'] + lexicons['LEMMA']['Brasília'] --> 13386
# GPOS .: third sparse feature =  dimension_mapper['FORM'] +  dimension_mapper['LEMMA'] + lexicons['GPOS']['PROP'] --> 22362
df.head(1)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,DTREE,FUNC,...,LEMMA+3,LEMMA-1,LEMMA-2,LEMMA-3,GPOS+1,GPOS+2,GPOS+3,GPOS-1,GPOS-2,GPOS-3
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,0,Brasília,Brasília,PROP,F|S,5,ADVL,...,hoje,,,,N,V-PCP,ADV,,,


# 4. Convert ARGS into T

In [10]:
arguments = [value for key, value in d['ARG'].items()] 
targets = propbankbr_arg2t(propositions, arguments)

targets[:5]

['*', 'A0', 'A0', 'A0', 'V']

In [11]:
target_keys = set(targets)
target_idxs = range(len(target_keys))
targets_mapper = dict(zip(target_keys, target_idxs))

# 5. Save onehot representations

In [12]:
for ds_type in ('train', 'test', 'valid'):    
    if ds_type in ('train'):                
      lb = 0
      ub = DATASET_TRAIN_SIZE 

    if ds_type in ('valid'):                
      lb = DATASET_TRAIN_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE

    if ds_type in ('test'):                
      lb = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE + DATASET_TEST_SIZE
        
    # saves the processed data
    svm_path = '{:}/{:}/{:}.svm'.format(SVM_DIR, 'hot', ds_type)
    with open(svm_path, mode='w') as f:
        for idx in sparse_features:
            p = propositions[idx]
            if p > lb and p < ub + 1:
                target = '{:} '.format(int(targets_mapper[targets[idx]]))
                features = ' '.join([ '{:}:{:}'.format(key, val) 
                     for key, val in sparse_features[idx].items()])
                ex = '{:}{:}\n'.format(target, features)
                f.write(ex)

# 6. Glove embeddings

In [12]:
word2vec = KeyedVectors.load_word2vec_format(GLOVE_S50_PATH, unicode_errors="ignore")
embeddings_size = 50

In [13]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

# converts a word into a token,
# word might be in fact a number
def tokenize(word):
    token = word
    if is_number(word):
        token = '0'    
    elif word.lower() in word2vec:
        token = word.lower()
    else:
        token = 'unk'
    return token
        
    

## 6.1 Process replacing sparse with feature

In [15]:
args = [] 
sparse_features = defaultdict(dict)
propositions = []        
columns = ['FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED']
bounds_mapper = defaultdict(dict)
for idx, propid in d['P'].items():
    lb = 0 
    for col in columns:
        base_col = columns_mapper[col]
        word = d[col][idx] 
        if not 'lb' in bounds_mapper[col]:
            bounds_mapper[col]['lb'] = lb 
        if base_col in ('FORM', 'LEMMA', 'PRED'):
            sz = embeddings_size
            token = tokenize(word)
            values = list(word2vec[token])
                
            sparse_features[idx].update({
                i + lb: round(val, 7) 
                for i, val in enumerate(values)
            })
        elif categorical in lexicons[base_col]:
            idx1 = lexicons[base_col][categorical]
            sparse_features[idx][lb + idx1]=1 
            sz = dimension_mapper[col] 
        else:
            # nan set to zero
            sparse_features[idx][lb]=1 
            sz = dimension_mapper[col]           
        lb += sz
        if not 'ub' in bounds_mapper[col]:
            bounds_mapper[col]['ub'] = lb 

    args.append(lexicons['ARG'][d['ARG'][idx]]) 
    propositions.append( propid )

        
sparse_features[0]

{0: -0.35242301,
 1: 0.52991003,
 2: 1.378052,
 3: -2.6353519,
 4: 0.064434998,
 5: 0.51971298,
 6: -0.89432102,
 7: -1.146332,
 8: -0.71181601,
 9: 0.21502399,
 10: -0.32224,
 11: -0.087746002,
 12: 0.54578102,
 13: -0.072255999,
 14: -0.138069,
 15: -1.0330909,
 16: 0.374457,
 17: 0.41515201,
 18: 0.062208001,
 19: 0.061988998,
 20: 0.53525698,
 21: -0.57822698,
 22: -0.77802098,
 23: -1.77086,
 24: -0.867414,
 25: 0.44269899,
 26: -0.81675398,
 27: -0.23604099,
 28: -0.16220599,
 29: 0.226478,
 30: 0.839167,
 31: -0.069043003,
 32: -0.37729999,
 33: -0.138179,
 34: -0.15516201,
 35: 0.137887,
 36: 0.052816,
 37: 0.47624999,
 38: -0.54324901,
 39: -0.35822999,
 40: -0.21709099,
 41: 0.26028901,
 42: 0.0069400002,
 43: -0.95268399,
 44: 0.558254,
 45: -0.28643,
 46: -0.211694,
 47: 0.59664899,
 48: 0.311598,
 49: -0.25995699,
 50: -0.35242301,
 51: 0.52991003,
 52: 1.378052,
 53: -2.6353519,
 54: 0.064434998,
 55: 0.51971298,
 56: -0.89432102,
 57: -1.146332,
 58: -0.71181601,
 59: 0.

In [17]:
print(bounds_mapper)

defaultdict(<class 'dict'>, {'FORM': {'lb': 0, 'ub': 50}, 'LEMMA': {'lb': 50, 'ub': 100}, 'GPOS': {'lb': 100, 'ub': 125}, 'MORF': {'lb': 125, 'ub': 150}, 'DTREE': {'lb': 150, 'ub': 241}, 'FUNC': {'lb': 241, 'ub': 290}, 'CTREE': {'lb': 290, 'ub': 339}, 'PRED': {'lb': 339, 'ub': 389}})


## 6.2 Scale mixed representation

In [None]:
series_d = defaultdict(list)
for col in columns:
    if is_dense(col, columns_mapper):
        lb = bounds_mapper[col]['lb']
        ub = bounds_mapper[col]['ub']                
        for idx in sparse_features:    
            for x in range(lb, ub):
                series_d[x].append(sparse_features[idx][x])
        # rescale
        for d in range(lb, ub):
            min_x = min(series_d[d])
            max_x = max(series_d[d])
            for x in series_d[d]:
                if (2*x - min_x - max_x)/ (max_x - min_x) > 1 + 1e-5:
                    import code; code.interact(local=dict(globals(), **locals()))
                    
            series_d[d] = [
                (2*x - min_x - max_x)/ (max_x - min_x)
                for x in series_d[d]
            ]

        # standardize
        for d in range(lb, ub):
            n = len(series_d[d])
            mu_x = sum(series_d[d]) / n
            ssq_x = sum([(x - mu_x)*(x - mu_x) for x in series_d[d]])
            std_x = math.sqrt(ssq_x/ (n-1))
            for x in series_d[d]:
                if (x - mu_x)/ std_x  > 1 + 1e-5:
                    import code; code.interact(local=dict(globals(), **locals()))
                    
            series_d[d] = [(x - mu_x)/ std_x for x in series_d[d]]
            

        # move rescaled and standardized back to sparse_features
        for idx in sparse_features:    
            for x in range(lb, ub):
                sparse_features[idx][x] = series_d[x][idx]
            
        
        
        
sparse_features[0]        

In [23]:
print(columns)
print(bounds_mapper['PRED'])
print(bounds_mapper['FORM'])

['FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED']
{'lb': 22607, 'ub': 23634}
{'lb': 0, 'ub': 13290}


## 6.3 Save mixed representation

In [15]:
for ds_type in ('train', 'test', 'valid'):    
    if ds_type in ('train'):                
      lb = 0
      ub = DATASET_TRAIN_SIZE 

    if ds_type in ('valid'):                
      lb = DATASET_TRAIN_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE

    if ds_type in ('test'):                
      lb = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE + DATASET_TEST_SIZE
        
    # saves the processed data
    svm_path = '{:}/{:}/{:}.svm'.format(SVM_DIR, 'glo', ds_type)
    with open(svm_path, mode='w') as f:
        for idx in sparse_features:
            p = propositions[idx]
            if p > lb and p < ub + 1:
                target = '{:} '.format(int(targets_mapper[targets[idx]]))
                features = ' '.join([ '{:}:{:}'.format(key, val) 
                     for key, val in sparse_features[idx].items()])
                ex = '{:}{:}\n'.format(target, features)
                f.write(ex)

# 7. Wang2vec embeddings

In [18]:
word2vec = KeyedVectors.load_word2vec_format(WANG_S100_PATH, unicode_errors="ignore")
embeddings_size = 100

## 7.1 Process replacing sparse with wang2vec

In [19]:
args = [] 
sparse_features = defaultdict(dict)
propositions = []        
for idx, propid in d['P'].items():
    lb = 0 
    for col in columns:
        base_col = columns_mapper[col]
        word = d[col][idx] 
        if base_col in ('FORM', 'LEMMA'):
            sz = embeddings_size
            token = tokenize(word)
            values = list(word2vec[token])
                
            sparse_features[idx].update({
                i + lb: round(val, 7) 
                for i, val in enumerate(values)
            })
        elif categorical in lexicons[base_col]:
            idx1 = lexicons[base_col][categorical]
            sparse_features[idx][lb + idx1]=1 
            sz = dimension_mapper[col] 
        else:
            # nan set to zero
            sparse_features[idx][lb]=1 
            sz = dimension_mapper[col] 
        lb += sz
    propositions.append( propid )

        
sparse_features[0]

{0: -0.52029401,
 1: -1.084011,
 2: 0.57885402,
 3: -0.63992798,
 4: -0.038910002,
 5: 0.62952298,
 6: -0.082349002,
 7: 0.29049999,
 8: -0.83230901,
 9: 0.70121199,
 10: -0.115194,
 11: 0.22070201,
 12: 0.27586901,
 13: -0.26365501,
 14: -0.177855,
 15: 0.145326,
 16: 0.58414203,
 17: -0.49399501,
 18: 0.089759,
 19: 0.47134301,
 20: 0.16844399,
 21: -0.239599,
 22: -0.035000999,
 23: -0.318086,
 24: -0.044199001,
 25: 0.39588401,
 26: 0.51004899,
 27: 0.461573,
 28: 0.36443901,
 29: -0.147663,
 30: 0.012673,
 31: 0.469439,
 32: -0.54493499,
 33: 0.60724401,
 34: -0.32190999,
 35: -0.0071990001,
 36: 0.043249,
 37: -0.38506499,
 38: -0.35674,
 39: -0.47903901,
 40: -0.31924799,
 41: 0.155983,
 42: 0.091351002,
 43: -0.218468,
 44: 0.14129899,
 45: 0.481749,
 46: 0.077671997,
 47: 0.19820599,
 48: -0.72299701,
 49: -0.33969,
 50: -0.055128001,
 51: -0.31261799,
 52: -0.64407998,
 53: -0.31589401,
 54: -0.61545199,
 55: 0.708318,
 56: 0.106625,
 57: -0.15074199,
 58: -0.47371799,
 59: 0

## 7.2 Saved mixed representation

In [20]:
for ds_type in ('train', 'test', 'valid'):    
    if ds_type in ('train'):                
      lb = 0
      ub = DATASET_TRAIN_SIZE 

    if ds_type in ('valid'):                
      lb = DATASET_TRAIN_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE

    if ds_type in ('test'):                
      lb = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE + DATASET_TEST_SIZE
        
    # saves the processed data
    svm_path = '{:}/{:}/{:}.svm'.format(SVM_DIR, 'wan', ds_type)
    with open(svm_path, mode='w') as f:
        for idx in sparse_features:
            p = propositions[idx]
            if p > lb and p < ub + 1:
                target = '{:} '.format(int(targets_mapper[targets[idx]]))
                features = ' '.join([ '{:}:{:}'.format(key, val) 
                     for key, val in sparse_features[idx].items()])
                ex = '{:}{:}\n'.format(target, features)
                f.write(ex)