In [None]:
import pandas as pd
import numpy as np
from m16_mlutils.pipeline import CategoryEncoder, DataFrameSelector
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.externals import joblib
from sklearn.base import BaseEstimator, TransformerMixin

import os

# Home directory taken from the environment; os.getenv returns None when HOME is unset.
# NOTE(review): no cell visible in this notebook reads HOME — confirm it is still needed.
HOME = os.getenv('HOME')

from pipelines import IsPunctuation, RelativeLocations, Reshaper, SentenceChunker

In [None]:
# Load the header-less CSV and assign the column names explicitly.
training_set = pd.read_csv('data/i__training_data.csv',
                           header=None, index_col=None,
                           names=['sentence_id', 'offer_len', 'token', 'loc',
                                  'pos', 'pos_left', 'pos_right',
                                  'token_len', 'all_upper', 'n_tokens', 'real_label'])

# Rows whose 'real_label' is missing become the test set; every other row stays
# in the training set. The mask is computed once and reused for both halves.
is_unlabelled = training_set['real_label'].isna()
test_set = training_set[is_unlabelled]
training_set = training_set[~is_unlabelled]

print(len(training_set))
print(len(test_set))
# Last expression of the cell, so the notebook renders this preview.
training_set.head()

## Token part of speech

In [None]:
# Feature branch: pick the 'pos' column and feed it to CategoryEncoder.
pipeline_token_pos = Pipeline(
    steps=[
        ('selector', DataFrameSelector(['pos'])),
        ('encoder', CategoryEncoder()),
    ]
)

## Is punctuation (more precise)

In [None]:
# Feature branch: pick the 'token' column and run it through IsPunctuation.
pipeline_is_punctuation = Pipeline(
    steps=[
        ('selector', DataFrameSelector(['token'])),
        ('is_punct', IsPunctuation()),
    ]
)

## Relative location

In [None]:
# Feature branch: RelativeLocations is applied directly to the pipeline input;
# there is no column selector in front of it.
pipeline_relative_location = Pipeline(
    steps=[
        ('location', RelativeLocations()),
    ]
)

## Token length

In [None]:
# Feature branch: pick 'offer_len' and 'token_len', then rescale them with
# MaxAbsScaler.
pipeline_token_length = Pipeline(
    steps=[
        ('selector', DataFrameSelector(['offer_len', 'token_len'])),
        ('scaler', MaxAbsScaler()),
    ]
)

## Unmodified features

In [None]:
# Feature branch: pass 'all_upper' through without scaling or encoding.
# NOTE(review): the selector receives a bare string here, whereas the other
# pipelines pass a list of column names; presumably this is why Reshaper
# follows it — confirm against DataFrameSelector / Reshaper.
pipeline_unmodified = Pipeline(
    steps=[
        ('select', DataFrameSelector('all_upper')),
        ('reshape', Reshaper()),
    ]
)

# Joint pipelines

In [None]:
# Combine every feature branch side by side with a FeatureUnion and wrap the
# union in a single-step Pipeline.
joint_pipeline = Pipeline(
    steps=[
        (
            'get_features',
            FeatureUnion(
                transformer_list=[
                    ('token_pos', pipeline_token_pos),
                    ('punctuation', pipeline_is_punctuation),
                    ('rel_loc', pipeline_relative_location),
                    ('token_length', pipeline_token_length),
                    ('original_features', pipeline_unmodified),
                ]
            ),
        ),
    ]
)

# Serialise the pipeline object to disk.
# NOTE(review): no fit call appears in this notebook before this line, so the
# file written here presumably holds an unfitted pipeline — confirm intent.
joblib.dump(joint_pipeline, 'data/i__pipeline.joblib')

In [None]:
# Fit every branch of the pipeline on the labelled rows and build the training
# feature matrix in the same pass.
transformed_training = joint_pipeline.fit_transform(training_set)
print("Training shape", transformed_training.shape)

# The earlier joblib.dump ran before any fit, so the file on disk held an
# unfitted pipeline. Persist again now that the transformers carry their
# learned state, so a consumer can load the file and call transform() directly.
joblib.dump(joint_pipeline, 'data/i__pipeline.joblib')

# Apply the already-fitted pipeline to the unlabelled rows (transform only,
# no re-fitting).
transformed_test = joint_pipeline.transform(test_set)
print("Test shape", transformed_test.shape)