# 1. Loads Gold Standard and Schema

In [7]:
import sys
sys.path.append('../datasets')

from collections import defaultdict
import pandas as pd
import yaml
import glob 
import re
import math
from gensim.models import KeyedVectors
from data_propbankbr import propbankbr_arg2t

DATASETS_DIR = '../datasets/csvs/'
SCHEMAS_DIR = '../datasets/schemas/'
SVM_DIR = '../datasets/svms/'
EMBEDDINGS_DIR = '../datasets/txts/embeddings/'

GS_PATH = '{:}{:}'.format(DATASETS_DIR, 'gs.csv')
GS_SCHEMA_PATH = '{:}{:}'.format(SCHEMAS_DIR, 'gs.yaml')

GLOVE_S50_PATH = '{:}glove_s50.txt'.format(EMBEDDINGS_DIR)
WANG_S100_PATH = '{:}wang2vec_s100.txt'.format(EMBEDDINGS_DIR)



DATASET_SIZE= 5931
DATASET_TRAIN_SIZE= 5099
DATASET_VALID_SIZE= 569
DATASET_TEST_SIZE=  263


In [8]:
with open(GS_SCHEMA_PATH, mode='r') as f:
    dictschema = yaml.load(f)

print([ i
    for i in dictschema])


['ARG', 'CTREE', 'DTREE', 'FORM', 'FUNC', 'GPOS', 'ID', 'INDEX', 'LEMMA', 'MORF', 'P', 'PRED', 'PRED_MARKER', 'P_S', 'S', 'T']


In [9]:
df = pd.read_csv(GS_PATH, sep=',', encoding='utf-8', index_col=0)
print(df.columns)

Index(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC',
       'CTREE', 'PRED', 'ARG'],
      dtype='object')


# 2. Loads gs_column_shifts

In [23]:
column_paths = ('column_shifts', 'column_shifts_ctx_p', 'column_t', 
                'column_preddist', 'column_predmorph', 'column_predmarker',
                'column_passivevoice', 'column_deptree')

numeric_columns = ('column_preddist', 'column_predmorph', 
                   'column_predmarker', 'column_passivevoice')

kwargs = {'sep': ',', 'encoding':'utf-8', 'index_col':0}
for column_path in column_paths:
    column_pattern = '{:}{:}/*'.format(DATASETS_DIR, column_path)
    for file_path in glob.glob(column_pattern):
        print(file_path)
        if column_path in numeric_columns:
            _df = pd.read_csv(file_path, dtype=int, **kwargs)
        else:
            _df = pd.read_csv(file_path, dtype=str, **kwargs)
        df = pd.concat((df, _df), axis=1, ignore_index=False)

print(df.columns)

../datasets/csvs/column_shifts/form.csv
../datasets/csvs/column_shifts/func.csv
../datasets/csvs/column_shifts/lemma.csv
../datasets/csvs/column_shifts/gpos.csv
../datasets/csvs/column_shifts_ctx_p/form.csv
../datasets/csvs/column_shifts_ctx_p/func.csv
../datasets/csvs/column_shifts_ctx_p/lemma.csv
../datasets/csvs/column_shifts_ctx_p/gpos.csv
../datasets/csvs/column_t/t.csv
../datasets/csvs/column_preddist/predicate_distance.csv
../datasets/csvs/column_predmorph/pred_morph.csv
../datasets/csvs/column_predmarker/predicate_marker.csv
../datasets/csvs/column_passivevoice/passive_voice.csv
../datasets/csvs/column_deptree/func.csv
../datasets/csvs/column_deptree/lemma.csv
../datasets/csvs/column_deptree/gpos.csv
Index(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC',
       ...
       'GPOS_17', 'GPOS_18', 'GPOS_19', 'GPOS_20', 'GPOS_21', 'GPOS_CHILD_1',
       'GPOS_CHILD_2', 'GPOS_CHILD_3', 'GPOS_GRAND_PARENT', 'GPOS_PARENT'],
      dtype='object', length=560)


# 3. Auxiliary Functions

In [24]:
def bounds_fn(columns, embeddings_size, dimension_mapper, columns_mapper):
    bmapper = {}
    lb = 0 
    for col in columns:
        if is_dense(col, columns_mapper):
            ub = lb +  embeddings_size        
        else:
            ub = lb +  dimension_mapper[col]            
        bmapper[col] = {'lb': lb, 'ub':ub }
        lb = ub + 1
    return bmapper

def is_dense(col, columns_mapper):
    return columns_mapper[col] in ('FORM', 'LEMMA', 'PRED')


def subcol(col):
    re_ctxp = r'(_CTX_P)|(_\d)|[\+|\-|\d|]'
    re_repl = r'(_CHILD)|(_PARENT)|(_GRAND_PARENT)'

    bcol = re.sub(re_ctxp, '', col)
    bcol = re.sub(re_repl, '', bcol)
    return bcol

# 4. Column Schema

In [25]:

columns_mapper = {col: subcol(col) for col in df.columns.tolist()}
print(columns_mapper)
dimension_mapper = {}
for colfeat, colbase in columns_mapper.items():
    if colbase in dictschema:
        if 'domain' in dictschema[colbase]:
            dimension_mapper[colfeat] = len( dictschema[colbase]['domain'])
        else:
            dimension_mapper[colfeat] = 1
    else:
        dimension_mapper[colfeat] = 1
          

{'ID': 'ID', 'S': 'S', 'P': 'P', 'P_S': 'P_S', 'FORM': 'FORM', 'LEMMA': 'LEMMA', 'GPOS': 'GPOS', 'MORF': 'MORF', 'DTREE': 'DTREE', 'FUNC': 'FUNC', 'CTREE': 'CTREE', 'PRED': 'PRED', 'ARG': 'ARG', 'FORM+1': 'FORM', 'FORM+2': 'FORM', 'FORM+3': 'FORM', 'FORM-1': 'FORM', 'FORM-2': 'FORM', 'FORM-3': 'FORM', 'FUNC+1': 'FUNC', 'FUNC+2': 'FUNC', 'FUNC+3': 'FUNC', 'FUNC-1': 'FUNC', 'FUNC-2': 'FUNC', 'FUNC-3': 'FUNC', 'LEMMA+1': 'LEMMA', 'LEMMA+2': 'LEMMA', 'LEMMA+3': 'LEMMA', 'LEMMA-1': 'LEMMA', 'LEMMA-2': 'LEMMA', 'LEMMA-3': 'LEMMA', 'GPOS+1': 'GPOS', 'GPOS+2': 'GPOS', 'GPOS+3': 'GPOS', 'GPOS-1': 'GPOS', 'GPOS-2': 'GPOS', 'GPOS-3': 'GPOS', 'FORM_CTX_P+0': 'FORM', 'FORM_CTX_P+1': 'FORM', 'FORM_CTX_P+2': 'FORM', 'FORM_CTX_P+3': 'FORM', 'FORM_CTX_P-1': 'FORM', 'FORM_CTX_P-2': 'FORM', 'FORM_CTX_P-3': 'FORM', 'FUNC_CTX_P+0': 'FUNC', 'FUNC_CTX_P+1': 'FUNC', 'FUNC_CTX_P-1': 'FUNC', 'LEMMA_CTX_P+0': 'LEMMA', 'LEMMA_CTX_P+1': 'LEMMA', 'LEMMA_CTX_P-1': 'LEMMA', 'GPOS_CTX_P+0': 'GPOS', 'GPOS_CTX_P+1': 'GP

In [6]:
lexicons = {col : 
                dict(
                     zip(dictschema[col]['domain'], 
                         range(1, dimension_mapper[col]+1)
                        )
                    )
             for col in dictschema if  col in dictschema and 'domain' in dictschema[col]}


columns = ['FORM', 'FUNC', 'GPOS', 'LEMMA', 'PRED'
          'FORM-1', 'FORM+1', 'FUNC-1', 'FUNC+1',
          'LEMMA-3', 'LEMMA-2', 'LEMMA-1', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3',
          'GPOS-3', 'GPOS-2', 'GPOS-1', 'GPOS+1', 'GPOS+2', 'GPOS+3',
          'FUNC-3', 'FUNC-2',  'FUNC+2', 'FUNC+3',
          'GPOS_CTX_P-1', 'GPOS_CTX_P+0', 'GPOS_CTX_P+1',
          'LEMMA_CTX_P-1', 'LEMMA_CTX_P+0', 'LEMMA_CTX_P+1', 'PRED_DIST']


d = df.to_dict()

print(lexicons.keys())
print(d.keys())

dict_keys(['ARG', 'CTREE', 'DTREE', 'FORM', 'FUNC', 'GPOS', 'LEMMA', 'MORF', 'PRED'])
dict_keys(['ID', 'S', 'P', 'P_S', 'FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED', 'ARG', 'FORM+1', 'FORM+2', 'FORM+3', 'FORM-1', 'FORM-2', 'FORM-3', 'FUNC+1', 'FUNC+2', 'FUNC+3', 'FUNC-1', 'FUNC-2', 'FUNC-3', 'LEMMA+1', 'LEMMA+2', 'LEMMA+3', 'LEMMA-1', 'LEMMA-2', 'LEMMA-3', 'GPOS+1', 'GPOS+2', 'GPOS+3', 'GPOS-1', 'GPOS-2', 'GPOS-3', 'FUNC_CTX_P+0', 'FUNC_CTX_P+1', 'FUNC_CTX_P-1', 'LEMMA_CTX_P+0', 'LEMMA_CTX_P+1', 'LEMMA_CTX_P-1', 'GPOS_CTX_P+0', 'GPOS_CTX_P+1', 'GPOS_CTX_P-1', 'PRED_DIST'])


In [7]:
args = [] 
sparse_features = defaultdict(dict)
propositions = []        

for idx, propid in d['P'].items():
    lb = 1 
    for col in columns:
        base_col = columns_mapper[col]
        categorical = d[col][idx] 
        if base_col in lexicons and categorical in lexicons[base_col]:
            idx1 = lexicons[base_col][categorical]
            sparse_features[idx][lb + idx1]=1 
        else:
            # nan set to zero
            sparse_features[idx][lb]=1 
        lb += dimension_mapper[col] 

    args.append(lexicons['ARG'][d['ARG'][idx]]) 
    propositions.append( propid )

        
sparse_features[0]

{1000: 1,
 14001: 1,
 22368: 1,
 22387: 1,
 22418: 1,
 22512: 1,
 22552: 1,
 22747: 1,
 23628: 1,
 36918: 1,
 50208: 1,
 76627: 1,
 85109: 1,
 100999: 1,
 103368: 1,
 112439: 1,
 121510: 1,
 139542: 1,
 140728: 1,
 156191: 1,
 157794: 1,
 157819: 1,
 157844: 1,
 157891: 1,
 157911: 1,
 157933: 1,
 157944: 1,
 157993: 1,
 158042: 1,
 158126: 1,
 158174: 1,
 158198: 1,
 158252: 1,
 158278: 1,
 158306: 1,
 165781: 1,
 169132: 1,
 179804: 1,
 185526: 1}

In [8]:
args[:5]

[15, 28, 15, 32, 26]

In [9]:
# FORM .: first sparse feature = lexicons['FORM']['Brasília'] --> 134
# LEMMA .: second sparse feature = dimension_mapper['FORM'] + lexicons['LEMMA']['Brasília'] --> 13386
# GPOS .: third sparse feature =  dimension_mapper['FORM'] +  dimension_mapper['LEMMA'] + lexicons['GPOS']['PROP'] --> 22362
df.head(1)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,DTREE,FUNC,...,FUNC_CTX_P+0,FUNC_CTX_P+1,FUNC_CTX_P-1,LEMMA_CTX_P+0,LEMMA_CTX_P+1,LEMMA_CTX_P-1,GPOS_CTX_P+0,GPOS_CTX_P+1,GPOS_CTX_P-1,PRED_DIST
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,0,Brasília,Brasília,PROP,F|S,5,ADVL,...,STA,>N,ADVL,revelar,um,hoje,V-FIN,ART,ADV,4


# 4. Convert ARGS into T

In [10]:
arguments = [value for key, value in d['ARG'].items()] 
targets = propbankbr_arg2t(propositions, arguments)

targets[:5]

['*', 'A0', 'A0', 'A0', 'V']

In [11]:
target_keys = set(targets)
target_idxs = range(len(target_keys))
targets_mapper = dict(zip(target_keys, target_idxs))

# 5. Save onehot representations

In [12]:
for ds_type in ('train', 'test', 'valid'):    
    if ds_type in ('train'):                
      lb = 0
      ub = DATASET_TRAIN_SIZE 

    if ds_type in ('valid'):                
      lb = DATASET_TRAIN_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE

    if ds_type in ('test'):                
      lb = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE + DATASET_TEST_SIZE
        
    # saves the processed data
    svm_path = '{:}/{:}/{:}.svm'.format(SVM_DIR, 'hot', ds_type)
    with open(svm_path, mode='w') as f:
        for idx in sparse_features:
            p = propositions[idx]
            if p > lb and p < ub + 1:
                #1 - indexed value for sparse features, 
                # segmentation error 11 caused by zero indexed array
                target = '{:} '.format(int(targets_mapper[targets[idx]]))
                features = ' '.join([ '{:}:{:}'.format(key , val) 
                     for key, val in sparse_features[idx].items()])
                ex = '{:}{:}\n'.format(target, features)
                f.write(ex)

# 6. Glove embeddings

In [13]:
word2vec = KeyedVectors.load_word2vec_format(GLOVE_S50_PATH, unicode_errors="ignore")
embeddings_size = 50

In [14]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

# converts a word into a token,
# word might be in fact a number
def tokenize(word):
    token = word
    if is_number(word):
        token = '0'    
    elif word.lower() in word2vec:
        token = word.lower()
    else:
        token = 'unk'
    return token
        
    

## 6.1 Process replacing sparse with feature

In [15]:
args = [] 
sparse_features = defaultdict(dict)
propositions = []        
# columns = ['FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED']
bounds_mapper = defaultdict(dict)
for idx, propid in d['P'].items():
    lb = 1 
    for col in columns:
        base_col = columns_mapper[col]
        word = d[col][idx] 
        if not 'lb' in bounds_mapper[col]:
            bounds_mapper[col]['lb'] = lb 
        if col in ('FORM', 'LEMMA', 'PRED'):
            sz = embeddings_size
            token = tokenize(word)
            values = list(word2vec[token])
                
            sparse_features[idx].update({
                i + lb: val
                for i, val in enumerate(values)
            })
        elif base_col in lexicons and categorical in lexicons[base_col]:
            idx1 = lexicons[base_col][categorical]
            sparse_features[idx][lb + idx1]=1 
            sz = dimension_mapper[col] 
        else:
            # nan set to zero
            sparse_features[idx][lb]=1 
            sz = dimension_mapper[col]           
        lb += sz
        if not 'ub' in bounds_mapper[col]:
            bounds_mapper[col]['ub'] = lb 

    args.append(lexicons['ARG'][d['ARG'][idx]]) 
    propositions.append( propid )

        
sparse_features[0]

{1: -0.35242301,
 2: 0.52991003,
 3: 1.378052,
 4: -2.6353519,
 5: 0.064434998,
 6: 0.51971298,
 7: -0.89432102,
 8: -1.146332,
 9: -0.71181601,
 10: 0.21502399,
 11: -0.32224,
 12: -0.087746002,
 13: 0.54578102,
 14: -0.072255999,
 15: -0.138069,
 16: -1.0330909,
 17: 0.374457,
 18: 0.41515201,
 19: 0.062208001,
 20: 0.061988998,
 21: 0.53525698,
 22: -0.57822698,
 23: -0.77802098,
 24: -1.77086,
 25: -0.867414,
 26: 0.44269899,
 27: -0.81675398,
 28: -0.23604099,
 29: -0.16220599,
 30: 0.226478,
 31: 0.839167,
 32: -0.069043003,
 33: -0.37729999,
 34: -0.138179,
 35: -0.15516201,
 36: 0.137887,
 37: 0.052816,
 38: 0.47624999,
 39: -0.54324901,
 40: -0.35822999,
 41: -0.21709099,
 42: 0.26028901,
 43: 0.0069400002,
 44: -0.95268399,
 45: 0.558254,
 46: -0.28643,
 47: -0.211694,
 48: 0.59664899,
 49: 0.311598,
 50: -0.25995699,
 51: -0.35242301,
 52: 0.52991003,
 53: 1.378052,
 54: -2.6353519,
 55: 0.064434998,
 56: 0.51971298,
 57: -0.89432102,
 58: -1.146332,
 59: -0.71181601,
 60: 0

In [16]:
print(bounds_mapper)

defaultdict(<class 'dict'>, {'FORM': {'lb': 1, 'ub': 51}, 'LEMMA': {'lb': 51, 'ub': 101}, 'GPOS': {'lb': 101, 'ub': 126}, 'MORF': {'lb': 126, 'ub': 151}, 'DTREE': {'lb': 151, 'ub': 242}, 'FUNC': {'lb': 242, 'ub': 291}, 'CTREE': {'lb': 291, 'ub': 340}, 'PRED': {'lb': 340, 'ub': 390}, 'FORM-3': {'lb': 390, 'ub': 13680}, 'FORM-2': {'lb': 13680, 'ub': 26970}, 'FORM-1': {'lb': 26970, 'ub': 40260}, 'FORM+1': {'lb': 40260, 'ub': 53550}, 'FORM+2': {'lb': 53550, 'ub': 66840}, 'FORM+3': {'lb': 66840, 'ub': 80130}, 'LEMMA-3': {'lb': 80130, 'ub': 89201}, 'LEMMA-2': {'lb': 89201, 'ub': 98272}, 'LEMMA-1': {'lb': 98272, 'ub': 107343}, 'LEMMA+1': {'lb': 107343, 'ub': 116414}, 'LEMMA+2': {'lb': 116414, 'ub': 125485}, 'LEMMA+3': {'lb': 125485, 'ub': 134556}, 'GPOS-3': {'lb': 134556, 'ub': 134581}, 'GPOS-2': {'lb': 134581, 'ub': 134606}, 'GPOS-1': {'lb': 134606, 'ub': 134631}, 'GPOS+1': {'lb': 134631, 'ub': 134656}, 'GPOS+2': {'lb': 134656, 'ub': 134681}, 'GPOS+3': {'lb': 134681, 'ub': 134706}, 'FUNC-3':

## 6.2 Scale mixed representation

In [17]:
series_d = defaultdict(list)
for col in columns:
#     if is_dense(col, columns_mapper): Missing values not handled FORM-3
    if col in ('FORM', 'LEMMA', 'PRED'):
        lb = bounds_mapper[col]['lb']
        ub = bounds_mapper[col]['ub']
        for idx in sparse_features:    
            for x in range(lb, ub):
                series_d[x].append(sparse_features[idx][x])

        # standardize
        for f in range(lb, ub):
            n = len(series_d[f])
            mu_x = sum(series_d[f]) / n
            ssq_x = sum([(x - mu_x)*(x - mu_x) for x in series_d[f]])
            std_x = math.sqrt(ssq_x/ (n-1))
            
            series_d[f] = [(x - mu_x)/ std_x for x in series_d[f]]            
        # rescale
        for f in range(lb, ub):
            min_x = min(series_d[f])
            max_x = max(series_d[f])
            series_d[f] = [
                (2*x - min_x - max_x)/ (max_x - min_x)
                for x in series_d[f]
            ]            
        # move rescaled and standardized back to sparse_features
        for idx in sparse_features:    
            for x in range(lb, ub):
                sparse_features[idx][x] = series_d[x][idx]

        # round values
        for idx in sparse_features:    
            for x in range(lb, ub):                
                sparse_features[idx][x] = round(sparse_features[idx][x], 4)
            
                
        
sparse_features[0]        

{1: -0.15909999999999999,
 2: 0.2671,
 3: 0.75890000000000002,
 4: -0.33579999999999999,
 5: 0.1467,
 6: 0.1449,
 7: -0.40110000000000001,
 8: -0.52400000000000002,
 9: -0.185,
 10: -0.083000000000000004,
 11: -0.069400000000000003,
 12: -0.096100000000000005,
 13: 0.32290000000000002,
 14: 0.02,
 15: 0.042900000000000001,
 16: -0.59379999999999999,
 17: 0.17599999999999999,
 18: 0.18629999999999999,
 19: -0.1148,
 20: -0.056000000000000001,
 21: 0.078100000000000003,
 22: -0.26019999999999999,
 23: -0.30969999999999998,
 24: -0.60980000000000001,
 25: -0.28510000000000002,
 26: 0.31440000000000001,
 27: -0.37040000000000001,
 28: -0.1996,
 29: -0.041700000000000001,
 30: -0.17000000000000001,
 31: 0.48049999999999998,
 32: -0.090300000000000005,
 33: -0.25700000000000001,
 34: -0.068500000000000005,
 35: -0.059999999999999998,
 36: 0.2329,
 37: 0.0012999999999999999,
 38: 0.26700000000000002,
 39: 0.017899999999999999,
 40: -0.22370000000000001,
 41: -0.16619999999999999,
 42: 0.1053,

## 6.3 Save mixed representation

In [18]:
for ds_type in ('train', 'test', 'valid'):    
    if ds_type in ('train'):                
      lb = 0
      ub = DATASET_TRAIN_SIZE 

    if ds_type in ('valid'):                
      lb = DATASET_TRAIN_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE

    if ds_type in ('test'):                
      lb = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE + DATASET_TEST_SIZE
        
    # saves the processed data
    svm_path = '{:}/{:}/{:}.svm'.format(SVM_DIR, 'glo', ds_type)
    with open(svm_path, mode='w') as f:
        for idx in sparse_features:
            p = propositions[idx]
            if p > lb and p < ub + 1:
                target = '{:} '.format(int(targets_mapper[targets[idx]]))
                features = ' '.join([ '{:}:{:}'.format(key, val) 
                     for key, val in sparse_features[idx].items()])
                ex = '{:}{:}\n'.format(target, features)
                f.write(ex)

# 7. Wang2vec embeddings

In [19]:
word2vec = KeyedVectors.load_word2vec_format(WANG_S100_PATH, unicode_errors="ignore")
embeddings_size = 100

## 7.1 Process replacing sparse with wang2vec

In [20]:
args = [] 
sparse_features = defaultdict(dict)
propositions = []        
columns = ['FORM', 'LEMMA', 'GPOS', 'MORF', 'DTREE', 'FUNC', 'CTREE', 'PRED']
bounds_mapper = defaultdict(dict)
for idx, propid in d['P'].items():
    lb = 0 
    for col in columns:
        base_col = columns_mapper[col]
        word = d[col][idx] 
        if not 'lb' in bounds_mapper[col]:
            bounds_mapper[col]['lb'] = lb 
        if base_col in ('FORM', 'LEMMA', 'PRED'):
            sz = embeddings_size
            token = tokenize(word)
            values = list(word2vec[token])
                
            sparse_features[idx].update({
                i + lb: val
                for i, val in enumerate(values)
            })
        elif categorical in lexicons[base_col]:
            idx1 = lexicons[base_col][categorical]
            sparse_features[idx][lb + idx1]=1 
            sz = dimension_mapper[col] 
        else:
            # nan set to zero
            sparse_features[idx][lb]=1 
            sz = dimension_mapper[col]           
        lb += sz
        if not 'ub' in bounds_mapper[col]:
            bounds_mapper[col]['ub'] = lb 

    args.append(lexicons['ARG'][d['ARG'][idx]]) 
    propositions.append( propid )

        
sparse_features[0]

{0: -0.52029401,
 1: -1.084011,
 2: 0.57885402,
 3: -0.63992798,
 4: -0.038910002,
 5: 0.62952298,
 6: -0.082349002,
 7: 0.29049999,
 8: -0.83230901,
 9: 0.70121199,
 10: -0.115194,
 11: 0.22070201,
 12: 0.27586901,
 13: -0.26365501,
 14: -0.177855,
 15: 0.145326,
 16: 0.58414203,
 17: -0.49399501,
 18: 0.089759,
 19: 0.47134301,
 20: 0.16844399,
 21: -0.239599,
 22: -0.035000999,
 23: -0.318086,
 24: -0.044199001,
 25: 0.39588401,
 26: 0.51004899,
 27: 0.461573,
 28: 0.36443901,
 29: -0.147663,
 30: 0.012673,
 31: 0.469439,
 32: -0.54493499,
 33: 0.60724401,
 34: -0.32190999,
 35: -0.0071990001,
 36: 0.043249,
 37: -0.38506499,
 38: -0.35674,
 39: -0.47903901,
 40: -0.31924799,
 41: 0.155983,
 42: 0.091351002,
 43: -0.218468,
 44: 0.14129899,
 45: 0.481749,
 46: 0.077671997,
 47: 0.19820599,
 48: -0.72299701,
 49: -0.33969,
 50: -0.055128001,
 51: -0.31261799,
 52: -0.64407998,
 53: -0.31589401,
 54: -0.61545199,
 55: 0.708318,
 56: 0.106625,
 57: -0.15074199,
 58: -0.47371799,
 59: 0

## 7.2 Scale mixed representation

In [21]:
series_d = defaultdict(list)
for col in columns:
#     if is_dense(col, columns_mapper):
    if col in ('FORM', 'LEMMA', 'PRED'):
        lb = bounds_mapper[col]['lb']
        ub = bounds_mapper[col]['ub']                
        for idx in sparse_features:    
            for x in range(lb, ub):
                series_d[x].append(sparse_features[idx][x])

        # standardize
        for f in range(lb, ub):
            n = len(series_d[f])
            mu_x = sum(series_d[f]) / n
            ssq_x = sum([(x - mu_x)*(x - mu_x) for x in series_d[f]])
            std_x = math.sqrt(ssq_x/ (n-1))
            
            series_d[f] = [(x - mu_x)/ std_x for x in series_d[f]]            
        # rescale
        for f in range(lb, ub):
            min_x = min(series_d[f])
            max_x = max(series_d[f])
            series_d[f] = [
                (2*x - min_x - max_x)/ (max_x - min_x)
                for x in series_d[f]
            ]            
        # move rescaled and standardized back to sparse_features
        for idx in sparse_features:    
            for x in range(lb, ub):
                sparse_features[idx][x] = series_d[x][idx]

        # round values
        for idx in sparse_features:    
            for x in range(lb, ub):                
                sparse_features[idx][x] = round(sparse_features[idx][x], 4)
            
                
        
sparse_features[0]        

{0: -0.41410000000000002,
 1: -0.4829,
 2: 0.043400000000000001,
 3: -0.45650000000000002,
 4: 0.1202,
 5: 0.45839999999999997,
 6: 0.035799999999999998,
 7: 0.40610000000000002,
 8: -0.34379999999999999,
 9: 0.26929999999999998,
 10: -0.13420000000000001,
 11: 0.0562,
 12: 0.3216,
 13: 0.37890000000000001,
 14: -0.055599999999999997,
 15: 0.044499999999999998,
 16: 0.02,
 17: -0.50980000000000003,
 18: -0.13619999999999999,
 19: 0.3488,
 20: 0.24279999999999999,
 21: -0.39300000000000002,
 22: -0.28570000000000001,
 23: 0.26250000000000001,
 24: 0.085099999999999995,
 25: 0.33279999999999998,
 26: 0.26869999999999999,
 27: 0.32240000000000002,
 28: 0.11899999999999999,
 29: -0.0134,
 30: 0.016,
 31: 0.52170000000000005,
 32: 0.1469,
 33: 0.13639999999999999,
 34: -0.0654,
 35: -0.085599999999999996,
 36: -0.111,
 37: 0.0012999999999999999,
 38: -0.38929999999999998,
 39: 0.081000000000000003,
 40: -0.36470000000000002,
 41: -0.079399999999999998,
 42: 0.033300000000000003,
 43: 0.0495

## 7.3 Saved mixed representation

In [22]:
for ds_type in ('train', 'test', 'valid'):    
    if ds_type in ('train'):                
      lb = 0
      ub = DATASET_TRAIN_SIZE 

    if ds_type in ('valid'):                
      lb = DATASET_TRAIN_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE

    if ds_type in ('test'):                
      lb = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE 
      ub = DATASET_TRAIN_SIZE + DATASET_VALID_SIZE + DATASET_TEST_SIZE
        
    # saves the processed data
    svm_path = '{:}/{:}/{:}.svm'.format(SVM_DIR, 'wan', ds_type)
    with open(svm_path, mode='w') as f:
        for idx in sparse_features:
            p = propositions[idx]
            if p > lb and p < ub + 1:
                target = '{:} '.format(int(targets_mapper[targets[idx]]))
                features = ' '.join([ '{:}:{:}'.format(key, val) 
                     for key, val in sparse_features[idx].items()])
                ex = '{:}{:}\n'.format(target, features)
                f.write(ex)