In [1]:
# Show ALL outputs in a cell, not only the last result.
# Later cells depend on this setting: e.g. the data-recap cell ends with two bare
# expressions (`.head()` and `.shape`) and expects both to be displayed.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
# Set relative path mapping for module imports.
import sys

# Directory two levels above the kernel's working directory. The entry is
# relative, so it is resolved against the current working directory.
MODULE_ROOT = "../../"

# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if MODULE_ROOT not in sys.path:
    sys.path.append(MODULE_ROOT)

In [2]:
# External Dependencies
import numpy as np
import pandas as pd

In [3]:
# Read in the pickled interim data.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
INTERIM_DIR = "../data/interim"

# Files are read in the same order as the names on the left-hand side.
X_y_data, X, y = (
    pd.read_pickle(f"{INTERIM_DIR}/{stem}.pkl")
    for stem in ("X_y_data", "X", "y")
)

In [5]:
# Recap data structure: preview the first rows, then show the (rows, columns) shape.
# Both bare expressions are displayed because ast_node_interactivity was set to "all".
X_y_data.head()
X_y_data.shape

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


(569, 31)

## Model Selection

In [1]:
from IPython.display import Image
from IPython.core.display import HTML

# scikit-learn's "ml_map" image, shown inline as a guide for model selection.
ML_MAP_URL = "https://scikit-learn.org/stable/_static/ml_map.png"

Image(url=ML_MAP_URL)

In [None]:
#Which algorithms/estimators are options?
    #The estimator you choose for your project will depend on the data set you have and the problem that you are trying to solve.

# Fit Model & Predict

In [None]:
from sklearn.datasets import load_iris

In [None]:
# IMPORTS

# Read in the data
import pandas as pd

# Scale the data
from sklearn.preprocessing import StandardScaler

# Dimensionality reduction (used by the "PIPELINES WITH PCA" cell)
from sklearn.decomposition import PCA

# Pipeline, Gridsearch, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

# Plot the confusion matrix and metrics at the end
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.metrics import roc_curve, auc

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn import svm

# Model persistence. `sklearn.externals.joblib` was removed from scikit-learn,
# so prefer the standalone package and keep the old path only as a fallback
# for very old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# LOAD DATA

# Fetch the iris bunch and pull out its feature matrix and label vector.
iris = load_iris()
features, labels = iris.data, iris.target

# Hold out 20% of the samples for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# LOAD DATA

# Name of the label column in the CSV.
TARGET_COLUMN = 'price_range'

df = pd.read_csv('cell_phones.csv')

# Separate the features from the target.
# NOTE: this rebinds X and y, replacing the objects read from the interim pickles.
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

# Split into training and test sets (test_size is left at its default).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# PIPELINES


def _scaled_pipeline(step_name, estimator):
    """Return a two-step pipeline: standard scaling ('scl') followed by `estimator`.

    `step_name` is the pipeline step label; the hyperparameter grids address
    the estimator's parameters through it (e.g. 'LR__C').
    """
    return Pipeline([
        ('scl', StandardScaler()),
        (step_name, estimator),
    ])


# One pipeline per candidate classifier; seeds are fixed where the model takes one.
pipe_lr = _scaled_pipeline('LR', LogisticRegression(random_state=42))    # Logistic Regression
pipe_dt = _scaled_pipeline('DT', DecisionTreeClassifier(random_state=42))  # Decision Trees
pipe_rf = _scaled_pipeline('RF', RandomForestClassifier(random_state=42))  # Random Forest
pipe_knn = _scaled_pipeline('KNN', KNeighborsClassifier())               # K-Nearest Neighbors
pipe_svm = _scaled_pipeline('SVM', svm.SVC(random_state=42))             # Support Vector Machines
pipe_xgb = _scaled_pipeline('XGB', XGBClassifier(random_state=42))       # XGBoost

In [None]:
# PIPELINES WITH PCA
#
# These rebind the same pipe_* names as the plain pipelines cell, so run only
# the variant you want the grid searches to use.

# Number of principal components kept by every pipeline below.
N_PCA_COMPONENTS = 2


def _make_pca_pipeline(step_name, estimator, n_components=N_PCA_COMPONENTS):
    """Return a scale -> PCA -> estimator pipeline.

    Parameters
    ----------
    step_name : str
        Label of the final step; the hyperparameter grids address the
        estimator's parameters through it (e.g. 'LR__C').
    estimator : estimator object
        The classifier placed at the end of the pipeline.
    n_components : int, optional
        Number of principal components to keep (defaults to N_PCA_COMPONENTS).

    Returns
    -------
    Pipeline
        Steps named 'scl', 'pca' and `step_name`, in that order.
    """
    return Pipeline([
        ('scl', StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        (step_name, estimator),
    ])


# Logistic Regression w/ PCA pipeline
pipe_lr = _make_pca_pipeline('LR', LogisticRegression(random_state=42))

# Decision Trees w/ PCA pipeline
pipe_dt = _make_pca_pipeline('DT', DecisionTreeClassifier(random_state=42))

# Random Forest w/ PCA pipeline
pipe_rf = _make_pca_pipeline('RF', RandomForestClassifier(random_state=42))

# K-Nearest Neighbors w/ PCA pipeline
pipe_knn = _make_pca_pipeline('KNN', KNeighborsClassifier())

# Support Vector Machines w/ PCA pipeline
pipe_svm = _make_pca_pipeline('SVM', svm.SVC(random_state=42))

# XGBoost w/ PCA pipeline
pipe_xgb = _make_pca_pipeline('XGB', XGBClassifier(random_state=42))

In [None]:
# HYPERPARAMETER OPTIONS
#
# Reference menu of tunable hyperparameters per model. Empty lists are
# placeholders: GridSearchCV rejects a parameter that has no candidate values,
# so fill them in (or use the grids from the "CHOSEN HYPERPARAMETER GRIDS"
# cell) before running a search.

# Logistic Regression hyperparameter options

# Logistic regression does not really have any critical hyperparameters to tune.

lr_param_grid = [{
#     Regularization (penalty) can sometimes be helpful.
#     NOTE(review): recent scikit-learn releases expect None instead of the
#     string 'none' -- confirm against the installed version.
    'LR__penalty': ['none', 'l1', 'l2', 'elasticnet'],
#     Note: not all solvers support all regularization terms.
#     The C parameter controls the penalty strength, which can also be effective.
    'LR__C': [100, 10, 1.0, 0.1, 0.01],
#     Sometimes, you can see useful differences in performance or convergence with different solvers (solver).
    'LR__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
]

# Decision Trees hyperparameter options

dt_param_grid = [{
    'DT__criterion': [],
    'DT__min_samples_leaf': [],
    'DT__max_depth': [],
    'DT__min_samples_split': []}
]

# Random Forest hyperparameter options

# The most important parameter is the number of random features to sample at each split point (max_features).
# (max_features is not part of the placeholder grid below.)

rf_param_grid = [{
    'RF__min_samples_leaf': [],
    'RF__max_depth': [],
    'RF__min_samples_split': []}
]

# K-Nearest Neighbors hyperparameter options

# The most important hyperparameter for KNN is the number of neighbors (n_neighbors).

knn_param_grid = [{
#     Test values between at least 1 and 21, perhaps just the odd numbers
#     (use range(1, 22, 2) for odd values only).
    'KNN__n_neighbors': list(range(1, 22)),
#     It may also be interesting to test the contribution of members of the neighborhood via different weightings (weights).
    'KNN__weights': ['uniform', 'distance'],
#     It may also be interesting to test different distance metrics (metric) for choosing the composition of the neighborhood.
    'KNN__metric': ['euclidean', 'manhattan', 'minkowski']}
]

# Support Vector Machines hyperparameter options

# The SVM algorithm, like gradient boosting, is very popular, very effective,
# and provides a large number of hyperparameters to tune.

svm_param_grid = [{
#     Perhaps the first important parameter is the choice of kernel that will control
#     the manner in which the input variables will be projected.
#     There are many to choose from, but linear, polynomial, and RBF are the most common,
#     perhaps just linear and RBF in practice.
    'SVM__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     If the polynomial kernel works out, then it is a good idea to dive into the degree hyperparameter.
#     Another critical parameter is the penalty (C) that can take on a range of values
#     and has a dramatic effect on the shape of the resulting regions for each class.
#     A log scale might be a good starting point.
    'SVM__C': [100, 10, 1.0, 0.1, 0.001]}
]

# XGBoost parameter hyperparameter options
xgb_param_grid = [{
    'XGB__learning_rate': [],
    'XGB__max_depth': [],
    'XGB__min_child_weight': [],
    'XGB__subsample': [],
    'XGB__n_estimators': []}
]

In [None]:
# CHOSEN HYPERPARAMETER GRIDS

# Candidate value ranges shared by several of the grids below.
param_range = list(range(1, 7))            # 1 .. 6
param_range_fl = [1.0, 0.5, 0.1]
n_estimators = list(range(50, 151, 50))    # 50, 100, 150
learning_rates = [0.1, 0.2, 0.3]

# Each grid is a one-element list holding a dict keyed by '<step>__<parameter>'.

# Logistic Regression hyperparameter grid
lr_param_grid = [dict(
    LR__penalty=['l1', 'l2'],
    LR__C=param_range_fl,
    LR__solver=['liblinear'],
)]

# Decision Trees hyperparameter grid
dt_param_grid = [dict(
    DT__criterion=['gini', 'entropy'],
    DT__min_samples_leaf=param_range,
    DT__max_depth=param_range,
    DT__min_samples_split=param_range[1:],   # starts at 2
)]

# Random Forest hyperparameter grid
rf_param_grid = [dict(
    RF__min_samples_leaf=param_range,
    RF__max_depth=param_range,
    RF__min_samples_split=param_range[1:],   # starts at 2
)]

# K-Nearest Neighbors hyperparameter grid
knn_param_grid = [dict(
    KNN__n_neighbors=param_range,
    KNN__weights=['uniform', 'distance'],
    KNN__metric=['euclidean', 'manhattan'],
)]

# Support Vector Machines hyperparameter grid
svm_param_grid = [dict(
    SVM__kernel=['linear', 'rbf'],
    SVM__C=param_range,
)]

# XGBoost hyperparameter grid
xgb_param_grid = [dict(
    XGB__learning_rate=learning_rates,
    XGB__max_depth=param_range,
    XGB__min_child_weight=param_range[:2],   # 1 and 2
    XGB__subsample=param_range_fl,
    XGB__n_estimators=n_estimators,
)]

In [None]:
# SET UP GRID SEARCH CV

# Settings shared by every search: metric to optimise and number of CV folds.
scoring = 'accuracy'
cv = 3


def _grid_search_for(pipeline, grid):
    """Wrap `pipeline` in a GridSearchCV over `grid` using the shared scoring and cv."""
    return GridSearchCV(
        estimator=pipeline,
        param_grid=grid,
        scoring=scoring,
        cv=cv,
    )


# One (unfitted) grid search per model pipeline.
lr_grid_search = _grid_search_for(pipe_lr, lr_param_grid)      # Logistic Regression
dt_grid_search = _grid_search_for(pipe_dt, dt_param_grid)      # Decision Trees
rf_grid_search = _grid_search_for(pipe_rf, rf_param_grid)      # Random Forest
knn_grid_search = _grid_search_for(pipe_knn, knn_param_grid)   # K-Nearest Neighbors
svm_grid_search = _grid_search_for(pipe_svm, svm_param_grid)   # Support Vector Machines
xgb_grid_search = _grid_search_for(pipe_xgb, xgb_param_grid)   # XGBoost

In [None]:
# FIT MODELS

# Every grid search to fit. The order matters: the scoring cells look up each
# model's display name by its position in this list.
grids = [
    lr_grid_search, dt_grid_search, rf_grid_search,
    knn_grid_search, svm_grid_search, xgb_grid_search,
]

# Run each search on the training split.
for search in grids:
    search.fit(X_train, y_train)

In [None]:
# PREDICT & SCORE MODELS

# Display name for each grid search, keyed by its position in `grids`.
grid_dict = dict(enumerate([
    'Logistic Regression',
    'Decision Trees',
    'Random Forest',
    'K-Nearest Neighbors',
    'Support Vector Machines',
    'XGBoost',
]))

# Report the test-set score and the winning hyperparameters of every search.
for i, model in enumerate(grids):
    label = grid_dict[i]
    print(f'{label} Test Accuracy: {model.score(X_test, y_test)}')
    print(f'{label} Best Params: {model.best_params_}')

In [None]:
# PREDICT & SCORE MODELS

# Display name for each grid search, keyed by its position in `grids`.
grid_dict = {
    0: 'Logistic Regression',
    1: 'Decision Trees',
    2: 'Random Forest',
    3: 'K-Nearest Neighbors',
    4: 'Support Vector Machines',
    5: 'XGBoost'
}

for i, model in enumerate(grids):

    y_pred = model.predict(X_test)

    # precision/recall/F1 default to average='binary', which raises ValueError
    # when the labels contain more than two classes (e.g. the iris split).
    # Keep 'binary' for two-class problems and fall back to macro averaging
    # (unweighted mean over classes) otherwise.
    n_labels = len(set(y_test) | set(y_pred))
    average = 'binary' if n_labels <= 2 else 'macro'

    print('{} Test Accuracy: {}'.format(grid_dict[i], model.score(X_test,y_test)))
    print('{} Best Params: {}'.format(grid_dict[i], model.best_params_))

    print('{} Test Accuracy 2: {}'.format(grid_dict[i], accuracy_score(y_test, y_pred)))
    print('{} Test Precision: {}'.format(grid_dict[i], precision_score(y_test, y_pred, average=average)))
    print('{} Test Recall: {}'.format(grid_dict[i], recall_score(y_test, y_pred, average=average)))
    print('{} Test F1 Score: {}'.format(grid_dict[i], f1_score(y_test, y_pred, average=average)))

In [None]:
# PREDICT & SCORE MODELS
#
# Fits every grid search, reports its scores, keeps the one with the highest
# test-set accuracy, and saves it to disk.
# NOTE(review): choosing the winner on the test split makes the reported test
# accuracy an optimistic estimate -- consider selecting on gs.best_score_.

# Display name for each grid search, keyed by its position in `grids`.
grid_dict = {
    0: 'Logistic Regression',
    1: 'Decision Trees',
    2: 'Random Forest',
    3: 'K-Nearest Neighbors',
    4: 'Support Vector Machines',
    5: 'XGBoost'
}

# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = None  # stays None only if no search scores above 0.0 on the test set

for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])

    # Fit grid search
    gs.fit(X_train, y_train)

    # Best params
    print('Best params: %s' % gs.best_params_)
    # best_score_ is the mean cross-validated score of the best candidate,
    # not an accuracy measured on the full training set.
    print('Best cross-validation accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data accuracy of model with best params (computed once, reused below)
    test_acc = accuracy_score(y_test, y_pred)
    print('Test set accuracy score for best params: %.3f ' % test_acc)
    # Track best (highest test accuracy) model
    if test_acc > best_acc:
        best_acc = test_acc
        best_gs = gs
        best_clf = idx

if best_gs is None:
    # Nothing beat the 0.0 baseline, so there is no model worth persisting.
    print('\nNo classifier scored above 0.0 on the test set; nothing saved.')
else:
    print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

    # Save best grid search pipeline to file
    dump_file = 'best_gs_pipeline.pkl'
    joblib.dump(best_gs, dump_file, compress=1)
    print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

## Best Model

In [None]:
# Pick best model based on scoring above.
# `best_gs` is the fitted grid search with the highest test accuracy, tracked by
# the fit/score/save cell above. To override the choice, assign any other fitted
# search instead (e.g. rf_grid_search).
clf_best = best_gs

In [None]:
# Plot the confusion matrix and the ROC curve of the selected model on the test split.
# NOTE(review): the ROC display is meant for two-class targets -- confirm the
# loaded dataset is binary before running this cell.
ConfusionMatrixDisplay.from_estimator(clf_best, X_test, y_test)
RocCurveDisplay.from_estimator(clf_best, X_test, y_test)

In [None]:
# Same confusion matrix, built from explicit predictions instead of the estimator.
y_pred = clf_best.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

In [None]:
# ROC curve and AUC from continuous scores (binary targets only).
# The original cell referenced `clf`, which is not defined anywhere in the
# visible notebook; the selected model is `clf_best`.
if hasattr(clf_best, "decision_function"):
    y_pred = clf_best.decision_function(X_test)
else:
    # Models without a decision function (e.g. tree ensembles, KNN): use the
    # predicted probability of the second class (clf_best.classes_[1]).
    y_pred = clf_best.predict_proba(X_test)[:, 1]
# y_pred = clf_best.predict(X_test)   # hard labels would give a single-threshold curve

RocCurveDisplay.from_predictions(y_test, y_pred)

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))

In [None]:
# Final headline metrics for the selected model on the test split.
y_true = y_test
y_pred = clf_best.predict(X_test)

# Fraction of correctly classified samples.
print(accuracy_score(y_true, y_pred, normalize=True))

# Precision, recall and F1, in that order, each with binary averaging.
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_true, y_pred, average='binary'))

In [None]:
# Further reading (pandas-ml): https://pandas-ml.readthedocs.io/en/latest/index.html

In [None]:
# Further reading (Yellowbrick): https://www.scikit-yb.org/en/latest/