In [1]:
import string

import pandas as pd
import numpy as np
from m16_mlutils.pipeline import CategoryEncoder, DataFrameSelector
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MaxAbsScaler

In [2]:
# Load the token-level training data. `header=None` tells pandas the file has
# no header row, so the column names are supplied explicitly.
training_set = pd.read_csv('data/i__training_data.csv',
                           header=None, index_col=None,
                           names=['offer_id', 'offer_length', 'token', 'token_position',
                                  'token_pos', 'token_length', 'upper', 'real_label'])
# Keep only the rows that carry a label; rows with a missing `real_label`
# are dropped.
training_set = training_set[training_set['real_label'].notna()]
print(len(training_set))
training_set.head()

381


Unnamed: 0,offer_id,offer_length,token,token_position,token_pos,token_length,upper,real_label
0,0,44,¡,0,faa,1,False,n
1,0,44,CUN,1,pp000000,3,True,org
2,0,44,a,5,sp000,1,False,sep
3,0,44,Ámsterdam,7,np00000,9,False,dst
4,0,44,$,17,dn0000,1,False,n


## Token part of speech

In [5]:
# Feature: the `token_pos` column, selected from the frame and then passed
# through the project's CategoryEncoder.
pipeline_token_pos = Pipeline(steps=[
    ('selector', DataFrameSelector(['token_pos'])),
    ('encoder', CategoryEncoder()),
])

## Is punctuation (more precise)

In [6]:
# ASCII punctuation characters from the standard library, stored as a set so
# that each membership test is a hash lookup.
punctuation = set(string.punctuation)


def is_punct(tokens):
    """Flag which tokens are a single ASCII punctuation character.

    Parameters
    ----------
    tokens : array-like
        The tokens to test: a list, a pandas Series, a single-column
        DataFrame or an ``(n, 1)`` array. The input is flattened first, so
        the cell values are inspected; iterating a DataFrame directly would
        yield its column labels instead.

    Returns
    -------
    numpy.ndarray
        Boolean column vector of shape ``(n, 1)``; ``True`` where the token
        is a member of ``string.punctuation``.
    """
    # dtype=object keeps every token as the Python object it was, which makes
    # it hashable for the set lookup below.
    flat_tokens = np.asarray(tokens, dtype=object).ravel()
    flags = [token in punctuation for token in flat_tokens]
    return np.array(flags, dtype=bool).reshape(-1, 1)

# Feature: punctuation flag computed by `is_punct` on the `token` column.
# validate=False skips scikit-learn's input check, so the raw tokens are
# handed to `is_punct` as they come out of the selector.
pipeline_is_punctuation = Pipeline(steps=[
    ('selector', DataFrameSelector(['token'])),
    ('is_punct', FunctionTransformer(is_punct, validate=False)),
])

## Relative location

In [16]:
def get_relative_locations(frame):
    """Return each token's position divided by the length of its offer.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``token_position`` and ``offer_length``.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(n, 1)`` holding
        ``token_position / offer_length`` for every row.
    """
    positions = frame['token_position']
    lengths = frame['offer_length']
    ratios = positions / lengths
    return ratios.values.reshape(-1, 1)

# Feature: relative position of the token inside its offer. There is no
# selector step here: `get_relative_locations` reads the two columns it needs
# straight from the frame it receives.
pipeline_relative_location = Pipeline(steps=[
    ('location', FunctionTransformer(get_relative_locations, validate=False)),
])

## Token length

In [17]:
# Features: `offer_length` and `token_length`, each rescaled by
# MaxAbsScaler (division by the column's maximum absolute value).
pipeline_token_length = Pipeline(steps=[
    ('selector', DataFrameSelector(['offer_length', 'token_length'])),
    ('scaler', MaxAbsScaler()),
])

## Unmodified features

In [24]:
def reshape(data):
    """Turn a pandas object into a single-column 2-D NumPy array.

    Parameters
    ----------
    data : pandas object exposing ``.values``
        The data to convert (a Series in this notebook).

    Returns
    -------
    numpy.ndarray
        The underlying values laid out as shape ``(n, 1)``, where ``n`` is
        the total number of elements.
    """
    raw = data.values
    return raw.reshape(raw.size, 1)

# Feature: the `upper` column passed through without transformation. The
# selector is given a bare column name, and `reshape` then turns its output
# into an (n, 1) array.
pipeline_unmodified = Pipeline(steps=[
    ('select', DataFrameSelector('upper')),
    ('reshape', FunctionTransformer(reshape, validate=False)),
])

# Joint pipelines

In [25]:
# Combine the per-feature pipelines: FeatureUnion runs each one on the same
# input frame and concatenates their outputs column-wise.
joint_pipeline = Pipeline([
    ('get_features', FeatureUnion([
        ('token_pos', pipeline_token_pos),
        ('punctuation', pipeline_is_punctuation),
        ('rel_loc', pipeline_relative_location),
        ('token_length', pipeline_token_length),
        ('original_features', pipeline_unmodified)
    ]))
])

# Fit every sub-pipeline on the training data and build the feature matrix.
v = joint_pipeline.fit_transform(training_set)
# Display the shape of the feature matrix as a sanity check
# (rows = tokens, columns = features).
v.shape

(381, 15)