In [1]:
# pylint: disable-all
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import pickle
import pandas as pd
import numpy as np

PATH_TO_DATAFRAME = (
    '../Data/10k_diabetes_ODSC_Training.csv'
)

# Fixed seed so the train/test partition is the same on every
# Restart & Run All; without it each run scores a different hold-out set.
SPLIT_SEED = 42

# Load the data, pull the target column out of the feature frame, and split.
df = pd.read_csv(PATH_TO_DATAFRAME)
y = df.pop('readmitted')
X_train, X_test, y_train, y_test = train_test_split(
    df, y, random_state=SPLIT_SEED
)

# Partition the columns by type; each group gets its own preprocessing.
numeric_features = list(X_train.select_dtypes(include=np.number).columns.values)
text_features = ['diag_1_desc', 'diag_2_desc', 'diag_3_desc']
# Everything that is neither numeric nor free text is treated as categorical.
# Filter in DataFrame column order rather than via a set difference: set
# iteration order for strings can differ between kernel sessions, which would
# reorder the encoded feature matrix from run to run.
_non_categorical = set(numeric_features + text_features)
categorical_features = [
    column for column in X_train.columns if column not in _non_categorical
]


# One preprocessing pipeline per feature type.

# Free-text columns: TF-IDF vectorisation.
text_preprocessing = Pipeline(steps=[('TfIdf', TfidfVectorizer())])

# Categorical columns: fill missing values with the literal '?', then
# one-hot encode (unknown categories are ignored rather than raising).
categorical_preprocessing = Pipeline(steps=[
    ('Imputation', SimpleImputer(fill_value='?', strategy='constant')),
    ('One Hot Encoding', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: fill missing values with the column mean, then standardise.
numeric_preprocessing = Pipeline(steps=[
    ('Imputation', SimpleImputer(strategy='mean')),
    ('Scaling', StandardScaler()),
])


# Route each column group through its preprocessing pipeline.
# Tuples are given as (transformer, columns), the order documented for
# make_column_transformer; the legacy (columns, transformer) order was
# deprecated and later removed.
# Each text column is passed as a single column name (not a list) so the
# TF-IDF vectoriser receives one column of documents at a time.
preprocessing = make_column_transformer(
    (numeric_preprocessing, numeric_features),
    *[(text_preprocessing, text_column) for text_column in text_features],
    (categorical_preprocessing, categorical_features),
)

# Full model: column-wise preprocessing followed by a random forest.
# The step names ('Preprocessing', 'RF') are the prefixes used to address
# hyper-parameters from a grid search, e.g. 'RF__n_estimators'.
pipeline = Pipeline(
    steps=[
        ('Preprocessing', preprocessing),
        # Seeded so repeated runs grow the same forest and scores are comparable.
        ('RF', RandomForestClassifier(random_state=42)),
    ]
)




In [5]:
# Hyper-parameter grid for the random forest step. Pipeline parameters are
# addressed as '<step name>__<parameter name>'.
param_grid = [
    {
        # 'RF__bootstrap': [False, True],  # currently excluded from the search
        'RF__n_estimators': [10],
        'RF__max_features': [0.6, 0.8],
        'RF__min_samples_leaf': [3, 5],
        'RF__min_samples_split': [3, 5],
    },
]

# 5-fold cross-validated search, run sequentially with per-fit progress output.
# `iid` is deliberately not passed: the argument was deprecated and then
# removed from GridSearchCV, and current releases always average fold scores
# the way iid=False did. (On releases that still accept it, omitting it only
# triggers a warning.)
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=1, verbose=5)

grid.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=3, RF__n_estimators=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=3, RF__n_estimators=10, score=0.510, total=   4.0s
[CV] RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=3, RF__n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.0s remaining:    0.0s


[CV]  RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=3, RF__n_estimators=10, score=0.496, total=   4.0s
[CV] RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=3, RF__n_estimators=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.0s remaining:    0.0s


[CV]  RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=3, RF__n_estimators=10, score=0.518, total=   4.1s
[CV] RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=3, RF__n_estimators=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.1s remaining:    0.0s


[CV]  RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=3, RF__n_estimators=10, score=0.507, total=   4.4s
[CV] RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=3, RF__n_estimators=10 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   16.5s remaining:    0.0s


[CV]  RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=3, RF__n_estimators=10, score=0.479, total=   4.7s
[CV] RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=5, RF__n_estimators=10 
[CV]  RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=5, RF__n_estimators=10, score=0.497, total=   4.3s
[CV] RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=5, RF__n_estimators=10 
[CV]  RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=5, RF__n_estimators=10, score=0.514, total=   4.2s
[CV] RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=5, RF__n_estimators=10 
[CV]  RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=5, RF__n_estimators=10, score=0.491, total=   4.1s
[CV] RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=5, RF__n_estimators=10 
[CV]  RF__max_features=0.6, RF__min_samples_leaf=3, RF__min_samples_split=5, RF__n_estimators=10, score=0.486, t

KeyboardInterrupt: 

In [35]:
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; import the standalone package and fall back to the vendored
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the refitted best pipeline from the grid search into the Flask
# app's model directory. joblib.dump returns the list of files written.
joblib.dump(grid.best_estimator_, '../Flask_App/python_model/custom_model.pickle')



['custom_model_grid_search.pkl']

In [44]:
# Display the raw cross-validation results recorded by the grid search.
# NOTE(review): the saved output under this cell lists 'logistic__alpha'
# parameters, which none of the visible cells search over -- it looks stale;
# re-run this cell after fitting `grid` to refresh it.
grid.cv_results_

{'mean_fit_time': array([0.29434266, 0.28125615, 0.25799136, 0.26821375, 0.25476532]),
 'std_fit_time': array([0.04118838, 0.03276419, 0.00176091, 0.00825797, 0.00390809]),
 'mean_score_time': array([0.07017894, 0.07095599, 0.06298604, 0.06947045, 0.06194053]),
 'std_score_time': array([0.007832  , 0.00450658, 0.0016319 , 0.00736411, 0.00224115]),
 'param_logistic__alpha': masked_array(data=[0.0001, 0.01, 1.0, 100.0, 10000.0],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'logistic__alpha': 0.0001},
  {'logistic__alpha': 0.01},
  {'logistic__alpha': 1.0},
  {'logistic__alpha': 100.0},
  {'logistic__alpha': 10000.0}],
 'split0_test_score': array([0.488     , 0.49066667, 0.50266667, 0.506     , 0.492     ]),
 'split1_test_score': array([0.49333333, 0.5       , 0.514     , 0.50733333, 0.492     ]),
 'split2_test_score': array([0.502     , 0.51133333, 0.50933333, 0.50866667, 0.492     ]),
 'split3_test_score': array