In [1]:
import os
from time import time 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
os.chdir("..")

In [3]:
from src.scrub import get_clean_iris

In [4]:
df = get_clean_iris()

X = df.copy().drop(['iris_type'], axis=1)
y = df.copy().loc[:, 'iris_type'].replace({'setosa': 0, 'versicolor': 1, 'virginica': 2})

In [5]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.30)

## Model 1 - Benchmark

- model with default settings

In [6]:
lr_0 = LogisticRegression()

In [7]:
lr_0.fit(X_tr, y_tr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
y_pr = lr_0.predict(X_te)

In [9]:
accuracy_score(y_te, y_pr)

0.9777777777777777

In [10]:
print(classification_report(y_te, y_pr))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        12
          1       1.00      0.93      0.97        15
          2       0.95      1.00      0.97        18

avg / total       0.98      0.98      0.98        45



## Model 2 - Hyperparameter Optimization

In [11]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)

In [12]:
pipe = \
Pipeline([
    ('model', LogisticRegression(class_weight='balanced'))
])

grid = {
    'model__C': [0.05, 0.1, 1],
    'model__penalty': ['l1', 'l2']
}

est = GridSearchCV(
    estimator=pipe, 
    param_grid=grid, 
    scoring='accuracy', 
    cv=3
)

t_0 = time()
est.fit(X, y)
print("Grid Search completed in {}".format(time() - t_0))

Grid Search completed in 0.115261077881


In [13]:
est.best_estimator_.named_steps.get('model')

LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [14]:
y_pr = est.predict(X_te)

In [15]:
accuracy_score(y_te, y_pr)

0.9555555555555556

In [16]:
print(classification_report(y_te, y_pr))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        14
          1       0.92      0.92      0.92        12
          2       0.95      0.95      0.95        19

avg / total       0.96      0.96      0.96        45



In [17]:
joblib.dump(lr_0, 'models/logistic.pkl') 

['models/logistic.pkl']

In [18]:
joblib.dump(
    value=est.best_estimator_.named_steps.get('model'), 
    filename='models/logistic.pkl'
) 

['models/logistic.pkl']

## Model 2

In [19]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.4)

In [20]:
pipe = \
Pipeline([
    ('model', DecisionTreeClassifier(class_weight='balanced'))
])

grid = {
    'model__max_depth': [2, 4, 6]
}

est = GridSearchCV(
    estimator=pipe, 
    param_grid=grid, 
    scoring='accuracy', 
    cv=3
)

t_0 = time()
est.fit(X, y)
print("Grid Search completed in {}".format(time() - t_0))

Grid Search completed in 0.0555441379547


In [21]:
est.best_estimator_.named_steps.get('model')

DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
y_pr = est.predict(X_te)

In [23]:
accuracy_score(y_te, y_pr)

1.0

In [24]:
print(classification_report(y_te, y_pr))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        16
          1       1.00      1.00      1.00        23
          2       1.00      1.00      1.00        21

avg / total       1.00      1.00      1.00        60



In [25]:
joblib.dump(
    value=est.best_estimator_.named_steps.get('model'), 
    filename='models/tree.pkl'
) 

['models/tree.pkl']