In [1]:
import pandas as pd
from m16_mlutils.datatools.evaluation import eval_summary
from numpy.random import seed
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

from dataset import load_training_data

seed(42)

In [2]:
training_set = load_training_data()
training_set.head()
test_set = training_set[pd.isna(training_set['real_label'])]
training_set = training_set[~pd.isna(training_set['real_label'])]
print(len(training_set))
training_set.head()


4533


Unnamed: 0_level_0,offer_len,token,loc,pos,pos_left,pos_right,token_len,all_upper,n_tokens,real_label
offer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,44,¡,0,faa,<p>,np00000,1,False,11,n
0,44,CUN,1,np00000,faa,sp000,3,True,11,o
0,44,a,5,sp000,np00000,np00000,1,False,11,s
0,44,Ámsterdam,7,np00000,sp000,zm,9,False,11,d
0,44,$,17,zm,np00000,dn0000,1,False,11,n


## Load data transformation pipeline

In [3]:
def get_pipeline():
    pipeline = joblib.load('models/features_pipeline.joblib') 
    return pipeline

## Split data

In [4]:
X_train, X_test, y_train, y_test = train_test_split(training_set, training_set['real_label'])
X_test.head()

Unnamed: 0_level_0,offer_len,token,loc,pos,pos_left,pos_right,token_len,all_upper,n_tokens,real_label
offer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
205,80,¡,0,faa,<p>,np00000,1,False,20,n
107,60,$,15,nc0p000,sp000,z0,1,False,18,n
193,71,!,33,fat,dn0000,pp000000,1,False,19,n
68,105,6,29,dn0000,vmip000,nc0p000,1,False,27,n
157,69,(,32,f0,np00000,cc,1,False,18,n


In [5]:
# preserve
print('Train', len(X_train))
print('Test', len(X_test))

Train 3399
Test 1134


# Classify

### Baseline with Dummy classifiers

In [6]:
# preserve
dummy = DummyClassifier(strategy='stratified')

pipeline = get_pipeline()

pipeline.steps.append(('classify', dummy))

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)


metrics, summary, cm = eval_summary(y_pred, y_test)

print(metrics)
print()
print(summary)

accuracy     0.532628
precision    0.174726
recall       0.172698
f1           0.173296
dtype: float64

              precision    recall  f1-score   support

           d       0.06      0.06      0.06        96
           f       0.00      0.00      0.00        16
           n       0.70      0.72      0.71       804
           o       0.16      0.15      0.15        95
           p       0.03      0.04      0.03        51
           s       0.09      0.07      0.08        72

   micro avg       0.53      0.53      0.53      1134
   macro avg       0.17      0.17      0.17      1134
weighted avg       0.52      0.53      0.53      1134



  k in range(self.n_outputs_)).T


In [7]:
y_train[pd.isna(y_train)]

Series([], Name: real_label, dtype: object)

In [10]:
pipeline = get_pipeline()

pipeline.steps.append(('classify', LogisticRegression(C=10)))


params = {
    # Different classifiers:
    'classify': [LogisticRegression(C=10)],
}

grid = LogisticRegression(C=10)#GridSearchCV(pipeline, cv=4, n_jobs=-1, param_grid=params, verbose=1, scoring='f1_macro')

grid.fit(X_train, y_train)



ValueError: could not convert string to float: 'n'

In [None]:
print(grid.best_score_)
print()
print(grid.best_params_)

In [None]:
# preserve
estimator = grid.best_estimator_
y_pred = estimator.predict(X_test)

metrics, summary, cm = eval_summary(y_pred, y_test)

print(metrics)
print()
print(summary)

## Best estimator... for now

In [None]:
best_classifier = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
best_pipeline = get_pipeline()
best_pipeline.steps.append(('clf', best_classifier))

In [None]:
best_pipeline.fit(X_train, y_train)
y_pred = best_pipeline.predict(X_test)

metrics, summary, cm = eval_summary(y_test, y_pred)

print(metrics)
print(summary)

In [None]:
joblib.dump(best_pipeline, 'models/classify_pipeline.joblib') 