In [None]:
import pandas as pd
from m16_mlutils.datatools.evaluation import eval_summary
from m16_mlutils.pipeline import CategoryEncoder
from numpy.random import seed
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

from dataset import load_training_data

# Fix NumPy's global random state so that the steps below which do not pass
# their own random_state (e.g. train_test_split) repeat on a fresh kernel run.
seed(42)

In [None]:
# Load every row, then separate the rows without a 'real_label' (kept as
# test_set) from the labelled rows that are used for training and evaluation.
training_set = load_training_data()
is_unlabelled = training_set['real_label'].isna()
test_set = training_set[is_unlabelled]
training_set = training_set[~is_unlabelled]
print(len(training_set))
training_set.head()

## Load data transformation pipeline

In [None]:
def get_pipeline():
    """Return the feature-transformation pipeline stored on disk.

    Every call reads 'models/features_pipeline.joblib' again, so each caller
    gets a freshly loaded object to which it can append its own final step.
    """
    return joblib.load('models/features_pipeline.joblib')

## Split data

In [None]:
# Hold out part of the labelled rows for evaluation (default split size).
# The whole frame, label column included, is passed as X.
# NOTE(review): presumably the loaded feature pipeline selects the columns it
# needs - confirm that 'real_label' is never used as an input feature.
real_labels = training_set['real_label']
X_train, X_test, y_train, y_test = train_test_split(training_set, real_labels)
X_test.head()

In [None]:
# preserve
# Report how many rows ended up in each split.
for split_name, split_frame in (('Train', X_train), ('Test', X_test)):
    print(split_name, len(split_frame))

Train 1977
Test 659


# Classify

### Baseline with Dummy classifiers

In [None]:
# preserve
# Baseline: DummyClassifier with strategy='stratified' ignores the features
# and draws its predictions from the class distribution seen during fit.
# Any real model should beat the scores printed here.
dummy = DummyClassifier(strategy='stratified')

pipeline = get_pipeline()
pipeline.steps.append(('classify', dummy))

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Ground truth first, predictions second - the same order used by the final
# evaluation cell of this notebook. This call previously passed
# (y_pred, y_test); with the arguments reversed the per-class "support" column
# counts predicted labels instead of true ones and precision/recall swap.
# NOTE: the output stored under this cell was produced before this fix;
# re-run the cell to refresh it.
metrics, summary, cm = eval_summary(y_test, y_pred)

print(metrics)
print()
print(summary)

accuracy     0.555387
precision    0.188850
recall       0.187980
f1           0.186503
dtype: float64

              precision    recall  f1-score   support

           d       0.11      0.14      0.13        49
           f       0.00      0.00      0.00        10
           n       0.73      0.74      0.74       472
           o       0.07      0.07      0.07        60
           p       0.14      0.08      0.10        38
           s       0.07      0.10      0.08        30

   micro avg       0.56      0.56      0.56       659
   macro avg       0.19      0.19      0.19       659
weighted avg       0.55      0.56      0.55       659



In [None]:
# Model selection: the final 'classify' step starts out empty and the grid
# search swaps in each candidate classifier, comparing them by macro-averaged
# F1 over 4 cross-validation folds.
candidate_classifiers = [
    LogisticRegression(C=10),
    RandomForestClassifier(random_state=42),
    LinearSVC(),
]
params = {'classify': candidate_classifiers}

pipeline = get_pipeline()
pipeline.steps.append(('classify', None))

grid = GridSearchCV(
    pipeline,
    param_grid=params,
    scoring='f1_macro',
    cv=4,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)

In [None]:
# Best cross-validated score, a blank line, then the parameters that achieved it.
print(grid.best_score_, '', grid.best_params_, sep='\n')

In [None]:
# preserve
# Score the pipeline selected by the grid search on the held-out split.
estimator = grid.best_estimator_
y_pred = estimator.predict(X_test)

# Ground truth first, predictions second - the same order used by the final
# evaluation cell of this notebook. This call previously passed
# (y_pred, y_test), which is why the stored report shows a class with support 0
# and "recall ... true" warnings: the predictions were being treated as truth.
# NOTE: the output stored under this cell was produced before this fix;
# re-run the cell to refresh it.
metrics, summary, cm = eval_summary(y_test, y_pred)

print(metrics)
print()
print(summary)

accuracy     0.807284
precision    0.376242
recall       0.618119
f1           0.403633
dtype: float64

              precision    recall  f1-score   support

           d       0.43      0.70      0.53        37
           f       0.17      0.50      0.25         2
           n       0.99      0.83      0.90       569
           o       0.63      0.68      0.65        50
           p       0.05      1.00      0.09         1
           s       0.00      0.00      0.00         0

   micro avg       0.81      0.81      0.81       659
   macro avg       0.38      0.62      0.40       659
weighted avg       0.93      0.81      0.86       659



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


## Best estimator... for now

In [None]:
# Fully spelled-out settings for the model currently considered best.
# NOTE(review): this looks like the printed repr of a RandomForestClassifier,
# presumably copied from the grid-search result above - confirm.
best_classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=None,
    max_features='auto',
    max_leaf_nodes=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    class_weight=None,
    warm_start=False,
    n_jobs=None,
    verbose=0,
    random_state=42,
)

# Fresh feature pipeline with the classifier as its last step. The step is
# named 'clf' here, whereas the earlier cells name it 'classify'.
best_pipeline = get_pipeline()
best_pipeline.steps.append(('clf', best_classifier))

In [None]:
# Train the chosen pipeline on the training split, predict the held-out split
# and print the evaluation (ground truth first, predictions second).
y_pred = best_pipeline.fit(X_train, y_train).predict(X_test)

metrics, summary, cm = eval_summary(y_test, y_pred)

for report in (metrics, summary):
    print(report)

In [None]:
# Persist the fitted pipeline (feature steps plus the 'clf' classifier) to disk.
joblib.dump(best_pipeline, 'models/classify_pipeline.joblib') 