In [None]:
import pandas as pd
import numpy as np
from m16_mlutils.pipeline import CategoryEncoder, DataFrameSelector
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from m16_mlutils.datatools.evaluation import eval_summary
from sklearn.dummy import DummyClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import GridSearchCV

from numpy.random import seed

# Seed NumPy's global random number generator (numpy.random.seed) so that code
# drawing from it gives repeatable results when the notebook is run top to
# bottom from a fresh kernel.
seed(42)

In [None]:
# Read the header-less CSV; column names are supplied explicitly, in file order.
raw_set = pd.read_csv(
    'data/i__training_data.csv',
    header=None,
    index_col=None,
    names=[
        'sentence_id', 'offer_len', 'token', 'loc',
        'pos', 'pos_left', 'pos_right',
        'token_len', 'all_upper', 'n_tokens', 'real_label',
    ],
)

# Rows with a missing 'real_label' are set aside as `test_set`; only labelled
# rows remain in `training_set`. The mask is computed once and reused so the
# two subsets are guaranteed to be complementary.
has_label = raw_set['real_label'].notna()
test_set = raw_set[~has_label]
training_set = raw_set[has_label]

print(len(training_set))
training_set.head()

## Load data transformation pipeline

In [None]:
def get_pipeline():
    """Load and return the object stored in 'data/i__pipeline.joblib'.

    The file is read from disk on every call, so each call yields a newly
    deserialized object. Callers in this notebook append a final classifier
    step to the returned pipeline's ``steps`` list.
    """
    return joblib.load('data/i__pipeline.joblib')

## Split data

In [None]:
# Split features and target together so the rows stay aligned.
# No test_size or random_state is passed, so scikit-learn's defaults apply.
# NOTE(review): the feature frame still contains the 'real_label' column;
# presumably the loaded pipeline selects only the feature columns - confirm.
labels = training_set['real_label']
X_train, X_test, y_train, y_test = train_test_split(training_set, labels)
X_test.head()

In [None]:
# preserve
# Report the number of rows in each split.
print('Train {}\nTest {}'.format(len(X_train), len(X_test)))

Train 787
Test 263


# Classify

### Baseline with Dummy classifiers

In [None]:
# preserve
# Baseline: DummyClassifier with strategy='stratified' ignores the features
# and draws predictions according to the training label distribution.
dummy = DummyClassifier(strategy='stratified')

# Load a fresh preprocessing pipeline and attach the baseline as its final step.
pipeline = get_pipeline()
pipeline.steps.append(('classify', dummy))

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# True labels first, predictions second - the same order used when the final
# best_pipeline is evaluated. With the arguments reversed, the per-class
# 'support' counts predictions instead of true labels and precision/recall
# trade places.
# NOTE(review): assumes eval_summary(y_true, y_pred) - confirm against m16_mlutils.
metrics, summary, cm = eval_summary(y_test, y_pred)

print(metrics)
print()
print(summary)

accuracy     0.551331
precision    0.164767
recall       0.164347
f1           0.163642
dtype: float64

              precision    recall  f1-score   support

           d       0.13      0.17      0.15        18
           f       0.00      0.00      0.00         2
           n       0.74      0.71      0.73       196
           o       0.06      0.04      0.05        26
           p       0.06      0.07      0.06        15
           s       0.00      0.00      0.00         6

   micro avg       0.55      0.55      0.55       263
   macro avg       0.16      0.16      0.16       263
weighted avg       0.57      0.55      0.56       263



In [None]:
# Compare several classifier families on top of the same preprocessing pipeline.
pipeline = get_pipeline()

# Placeholder final step: the grid below assigns each candidate estimator to
# the 'classify' step in turn.
pipeline.steps.append(('classify', None))

params = {
    # Different classifiers:
    'classify': [LogisticRegression(C=10), RandomForestClassifier(random_state=42), LinearSVC()],
}

# 4-fold cross-validation scored by macro-averaged F1, using all available cores.
grid = GridSearchCV(pipeline, cv=4, n_jobs=-1, param_grid=params, verbose=1, scoring='f1_macro')

grid.fit(X_train, y_train)

In [None]:
# Best cross-validated score, a blank line, then the parameters that achieved it.
print(grid.best_score_, grid.best_params_, sep='\n\n')

In [None]:
# preserve
# Score the grid-search winner on the held-out split.
estimator = grid.best_estimator_
y_pred = estimator.predict(X_test)

# True labels first, predictions second - the same order used when the final
# best_pipeline is evaluated. With the arguments reversed, the per-class
# 'support' counts predictions instead of true labels and precision/recall
# trade places.
# NOTE(review): assumes eval_summary(y_true, y_pred) - confirm against m16_mlutils.
metrics, summary, cm = eval_summary(y_test, y_pred)

print(metrics)
print()
print(summary)

accuracy     0.950570
precision    0.812555
recall       0.925619
f1           0.852999
dtype: float64

              precision    recall  f1-score   support

           d       0.87      0.80      0.83        25
           f       0.50      1.00      0.67         1
           n       0.99      0.97      0.98       194
           o       0.69      0.85      0.76        13
           p       0.82      0.93      0.87        15
           s       1.00      1.00      1.00        15

   micro avg       0.95      0.95      0.95       263
   macro avg       0.81      0.93      0.85       263
weighted avg       0.96      0.95      0.95       263



## Best estimator... for now

In [None]:
from sklearn.base import clone

# Random forest kept as the current best model (presumably the grid-search
# winner - confirm against grid.best_params_).
# Only the forest size and the seed are pinned; every other hyper-parameter is
# left at scikit-learn's default. This avoids passing `min_impurity_split` and
# `max_features='auto'`, which later scikit-learn releases no longer accept.
best_classifier = RandomForestClassifier(n_estimators=10, random_state=42)

# Fresh preprocessing pipeline with the classifier attached as the 'clf' step.
best_pipeline = get_pipeline()
best_pipeline.steps.append(('clf', best_classifier))

In [None]:
# Train the selected pipeline on the training split and predict the held-out rows.
best_pipeline.fit(X_train, y_train)
y_pred = best_pipeline.predict(X_test)

# eval_summary receives the true labels first, then the predictions.
metrics, summary, cm = eval_summary(y_test, y_pred)

# Metrics and the per-class summary, one after the other.
print(metrics, summary, sep='\n')

In [None]:
# Serialize best_pipeline (fitted in the previous cell) to
# 'data/i__estimate_pipeline.joblib'.
# NOTE(review): joblib files are pickle-based; only load them from trusted sources.
joblib.dump(best_pipeline, 'data/i__estimate_pipeline.joblib') 