In [None]:
import pandas as pd
from m16_mlutils.datatools.evaluation import eval_summary
from m16_mlutils.pipeline import CategoryEncoder
from numpy.random import seed
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Seed NumPy's global generator once, up front. scikit-learn components that are
# not given an explicit random_state fall back to this global generator.
RANDOM_SEED = 42
seed(RANDOM_SEED)

In [None]:
# Read the whole CSV once and keep the untouched frame under its own name, so
# re-running this cell never filters an already-filtered frame.
raw_set = pd.read_csv('data/i__training_data.csv', index_col=None)

# A row is usable for supervised learning only when `real_label` is present.
has_label = raw_set['real_label'].notna()

# Rows without a label are set aside; rows with a label form the training data.
test_set = raw_set[~has_label]
training_set = raw_set[has_label]

print(len(training_set))
training_set.head()

## Load data transformation pipeline

In [None]:
def get_pipeline(path='models/features_pipeline.joblib'):
    """Load the serialized feature-transformation pipeline from disk.

    The file is deserialized again on every call, so each caller receives its
    own object and can append steps to it without affecting the object
    returned by any other call.

    Args:
        path: Location of the joblib dump to load. Defaults to the feature
            pipeline file used throughout this notebook.

    Returns:
        The object stored in the joblib file. Callers in this notebook treat
        it as a scikit-learn ``Pipeline`` (they append to ``.steps`` and call
        ``fit`` / ``predict`` on it).
    """
    # NOTE: joblib dumps are pickles, and unpickling can execute arbitrary
    # code. Only point this at files from a trusted source.
    return joblib.load(path)

## Split data

In [None]:
# Hold out part of the labelled rows for evaluation. random_state is pinned so
# that re-running this cell always produces the same split, instead of one that
# depends on how far the global NumPy generator has advanced.
# NOTE(review): the feature frame still contains the `real_label` column;
# confirm the loaded feature pipeline does not feed it to the classifier
# (that would be label leakage).
X_train, X_test, y_train, y_test = train_test_split(
    training_set,
    training_set['real_label'],
    random_state=42,
)
X_test.head()

In [None]:
# preserve
# Report how many rows ended up in each side of the split.
for split_name, split_frame in (('Train', X_train), ('Test', X_test)):
    print(split_name, len(split_frame))

Train 1505
Test 502


# Classify

### Baseline with a dummy classifier

In [None]:
# preserve
# Chance-level baseline: the 'stratified' strategy predicts random labels that
# follow the class distribution seen during fit.
dummy = DummyClassifier(strategy='stratified')

# Fresh copy of the feature pipeline with the baseline as its final step.
pipeline = get_pipeline()
pipeline.steps.append(('classify', dummy))

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Ground truth first, predictions second: the same order the final evaluation
# cell of this notebook uses, and the scikit-learn metric convention. With the
# arguments swapped, precision and recall (and the per-class support) would be
# reported for the wrong side.
# NOTE(review): confirm against eval_summary's signature.
metrics, summary, cm = eval_summary(y_test, y_pred)

print(metrics)
print()
print(summary)

accuracy     0.541833
precision    0.176715
recall       0.179343
f1           0.176688
dtype: float64

              precision    recall  f1-score   support

           d       0.02      0.03      0.03        33
           f       0.00      0.00      0.00         7
           n       0.73      0.71      0.72       371
           o       0.08      0.06      0.07        48
           p       0.17      0.18      0.17        22
           s       0.05      0.10      0.07        21

   micro avg       0.54      0.54      0.54       502
   macro avg       0.18      0.18      0.18       502
weighted avg       0.56      0.54      0.55       502



In [None]:
pipeline = get_pipeline()

# Placeholder final step; the grid search substitutes each candidate classifier
# into the 'classify' slot.
pipeline.steps.append(('classify', None))

params = {
    # Different classifiers. Every candidate gets the same explicit seed so the
    # search gives the same answer on every run, including inside the parallel
    # worker processes, which do not inherit a seeded global generator.
    'classify': [
        LogisticRegression(C=10, random_state=42),
        RandomForestClassifier(random_state=42),
        LinearSVC(random_state=42),
    ],
}

# 4-fold cross-validation on all available cores; candidates are ranked by
# macro-averaged F1 so that rare classes weigh as much as frequent ones.
grid = GridSearchCV(
    pipeline,
    param_grid=params,
    scoring='f1_macro',
    cv=4,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)

In [None]:
# Best cross-validated score, a blank line, then the parameters that achieved it.
print(grid.best_score_, '', grid.best_params_, sep='\n')

In [None]:
# preserve
# Score the grid search winner (refit on the full training split) on the
# held-out rows.
estimator = grid.best_estimator_
y_pred = estimator.predict(X_test)

# Ground truth first, predictions second: the same order the final evaluation
# cell of this notebook uses, and the scikit-learn metric convention. With the
# arguments swapped, precision and recall (and the per-class support) would be
# reported for the wrong side.
# NOTE(review): confirm against eval_summary's signature.
metrics, summary, cm = eval_summary(y_test, y_pred)

print(metrics)
print()
print(summary)

accuracy     0.952191
precision    0.775605
recall       0.861823
f1           0.803839
dtype: float64

              precision    recall  f1-score   support

           d       0.85      0.88      0.86        40
           f       0.17      0.50      0.25         2
           n       0.99      0.96      0.98       369
           o       0.81      0.88      0.84        33
           p       0.83      0.95      0.89        21
           s       1.00      1.00      1.00        37

   micro avg       0.95      0.95      0.95       502
   macro avg       0.78      0.86      0.80       502
weighted avg       0.96      0.95      0.96       502



## Best estimator... for now

In [None]:
# Random forest configuration with every hyperparameter written out explicitly,
# so the model does not depend on the library's defaults.
forest_params = dict(
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    max_depth=None,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=10,
    n_jobs=None,
    oob_score=False,
    random_state=42,
    verbose=0,
    warm_start=False,
)
best_classifier = RandomForestClassifier(**forest_params)

# Fresh copy of the feature pipeline with the forest as its final step.
# Note the step is named 'clf' here, not 'classify' as in the earlier cells.
best_pipeline = get_pipeline()
best_pipeline.steps.append(('clf', best_classifier))

In [None]:
# Train the chosen pipeline on the training split, then predict the held-out rows.
y_pred = best_pipeline.fit(X_train, y_train).predict(X_test)

# eval_summary returns the aggregate metrics, the per-class report and a third
# value bound to `cm`; only the first two are printed here.
metrics, summary, cm = eval_summary(y_test, y_pred)

for report in (metrics, summary):
    print(report)

In [None]:
# Persist the pipeline (feature steps plus classifier) as a single artifact.
joblib.dump(value=best_pipeline, filename='models/classify_pipeline.joblib')