# Model Selection

In this notebook we will test different models to find the one that gives us the best results.

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

def f1_scores(Z, y_test):
    averages = ['macro', 'micro', 'weighted']
    for avg in averages:
        score = f1_score(Z, y_test, average=avg)
        print("f1 score ({}): {}".format(avg, score))

def test_model(X, y, model_name, model):
    print("MODEL: {}".format(model_name))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model.fit(X_train, y_train)
    Z = model.predict(X_test)
    f1_scores(Z, y_test)
    
def testModelKFold(X, y, model_name, model, k):
    print("CROSS VALIDATION FOR: {}".format(model_name))
    skf = StratifiedKFold(n_splits=5)
    skf.get_n_splits(X, y)
    f1_scores = []
    f1_scores_train = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        Z = model.predict(X_test)
        f1_scores.append(f1_score(Z, y_test))
        Z_train = model.predict(X_train)
        f1_scores_train.append(f1_score(Z_train, y_train))
    print("f1 scores test: {}".format(f1_scores))
    print("f1 scores train: {}".format(f1_scores_train))
    print("MEAN test: {}".format(np.mean(f1_scores)))
    print("MEAN train: {}".format(np.mean(f1_scores_train)))
    return np.mean(f1_scores)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors = 3),
    "GaussianNB": GaussianNB(),
    "Perceptron": Perceptron(),
    "SGDClassifier": SGDClassifier(),
    "Decision Tree": DecisionTreeClassifier()   
}

dataset_path = "data-all.csv"
data = pd.read_csv(dataset_path, sep=";")
X = data.drop("diagnosis", axis=1).values
y = data["diagnosis"]

for model in models:
    test_model(X, y, model, models[model])
    testModelKFold(X, y, model, models[model], k=5)
    print('')

MODEL: Logistic Regression
f1 score (macro): 0.8082272917718493
f1 score (micro): 0.8383473726434016
f1 score (weighted): 0.8428288016799584
CROSS VALIDATION FOR: Logistic Regression
f1 scores test: [0.8992805755395683, 0.8727755644090306, 0.8852734922861151, 0.9055429864253393, 0.8699576868829337]
f1 scores train: [0.8829772378159475, 0.8979706877113867, 0.8901977282288599, 0.8840751869191531, 0.8926601612218921]
MEAN test: 0.8865660611085975
MEAN train: 0.8895762003794478

MODEL: SVC
f1 score (macro): 0.8151437640488735
f1 score (micro): 0.8584035298836743
f1 score (weighted): 0.8677657180121013
CROSS VALIDATION FOR: SVC
f1 scores test: [0.9262013729977117, 0.8781127129750982, 0.8931193908077236, 0.8995002776235425, 0.8878706199460916]
f1 scores train: [0.9074979625101874, 0.9191905286343611, 0.9182346903865892, 0.9110460763628939, 0.9192342002333082]
MEAN test: 0.8969608748700335
MEAN train: 0.915040691625468

MODEL: Random Forest
f1 score (macro): 0.8405470539255467
f1 score (micro

We can check the confusion matrix for the Random Forest model.

In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

model = RandomForestClassifier(n_estimators=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)
Z = model.predict(X_test)

print("Confusion Matrix")
print(confhttp://localhost:8888/notebooks/ead-model/models/model_selection.ipynb#usion_matrix(y_test, Z))

print("\nClassfication Report")
print(classification_report(y_test, Z))

SyntaxError: invalid syntax (<ipython-input-6-5cf5db49c380>, line 10)

Let us compare the performances of the random forest for different numbers of estimators and features

In [35]:
# res = np.zeros([5,6])
for i_f, n_f in enumerate([1,3,5,8,11]):
    for i_e, n_e in enumerate([1,3]):
        model = RandomForestClassifier(n_estimators=n_e, max_features=n_f)
        model_name = "Random Forest with %i features and %i estimators" % (n_f, n_e)
        f1 = testModelKFold(X, y, model_name, model, k=5)
#         res[i_f, i_e] = f1

CROSS VALIDATION FOR: Random Forest with 1 features and 1 estimators
f1 scores: [0.8491965389369592, 0.8439010520329827, 0.8344136711844432, 0.820544176092938, 0.8430182133564614]
MEAN: 0.8382147303207569
CROSS VALIDATION FOR: Random Forest with 1 features and 3 estimators
f1 scores: [0.8889554025740796, 0.8796680497925311, 0.8625646923519265, 0.832063395306309, 0.8572233173753872]
MEAN: 0.8640949714800467
CROSS VALIDATION FOR: Random Forest with 3 features and 1 estimators
f1 scores: [0.8745062291096931, 0.8652601969057665, 0.8286995515695068, 0.7891585250551528, 0.85146804835924]
MEAN: 0.8418185101998719
CROSS VALIDATION FOR: Random Forest with 3 features and 3 estimators
f1 scores: [0.8882931188561216, 0.8811227297743534, 0.8738868141338697, 0.8262476894639557, 0.8628781453208935]
MEAN: 0.8664856995098388
CROSS VALIDATION FOR: Random Forest with 5 features and 1 estimators
f1 scores: [0.8641750227894258, 0.860871980005554, 0.8440366972477064, 0.8053097345132744, 0.8521389606660925]


In [34]:
model.max_depth

In [10]:
res = np.zeros([5,6])
for i_f, n_f in enumerate([1,3,5,8,11]):
    for i_e, n_e in enumerate([10,33,100,333,1000]):
        model = RandomForestClassifier(n_estimators=n_e, max_features=n_f)
        model_name = "Random Forest with %i features and %i estimators" % (n_f, n_e)
        f1 = testModelKFold(X, y, model_name, model, k=5)
        res[i_f, i_e] = f1

CROSS VALIDATION FOR: Random Forest with 1 features and 10 estimators
f1 scores: [0.888755261575466, 0.8799556418075962, 0.8768303186907838, 0.8339529120198266, 0.8611190002840103]
MEAN: 0.8681226268755367
CROSS VALIDATION FOR: Random Forest with 1 features and 33 estimators
f1 scores: [0.9050445103857567, 0.8885227896760022, 0.8863122171945702, 0.8617149758454107, 0.8689306763962952]
MEAN: 0.882105033899607
CROSS VALIDATION FOR: Random Forest with 1 features and 100 estimators
f1 scores: [0.898775753956405, 0.8872387238723872, 0.8807703200226565, 0.8623079240735161, 0.86687306501548]
MEAN: 0.8791931573880889
CROSS VALIDATION FOR: Random Forest with 1 features and 333 estimators
f1 scores: [0.9009223445403155, 0.8893161219445207, 0.8838526912181303, 0.8642642642642644, 0.8677009873060649]
MEAN: 0.8812112818546591
CROSS VALIDATION FOR: Random Forest with 1 features and 1000 estimators
f1 scores: [0.8984816909794581, 0.8896229011835948, 0.8842998585572844, 0.8673684210526316, 0.867605633

In [13]:
pd.DataFrame(res[:,:5])

Unnamed: 0,0,1,2,3,4
0,0.868123,0.882105,0.879193,0.881211,0.881476
1,0.874965,0.886846,0.886304,0.88745,0.88689
2,0.877431,0.891345,0.892254,0.893224,0.892072
3,0.880903,0.888839,0.891511,0.892591,0.892816
4,0.87813,0.887997,0.889536,0.891733,0.892819


In [12]:
pd.DataFrame(res)

Unnamed: 0,0,1,2,3,4,5
0,0.868123,0.882105,0.879193,0.881211,0.881476,0.0
1,0.874965,0.886846,0.886304,0.88745,0.88689,0.0
2,0.877431,0.891345,0.892254,0.893224,0.892072,0.0
3,0.880903,0.888839,0.891511,0.892591,0.892816,0.0
4,0.87813,0.887997,0.889536,0.891733,0.892819,0.0


## Save model

In [10]:
import dill as pickle

def dump_model(model, path):
    with open(path, 'wb') as file:
        pickle.dump(model, file)
        
# Define path to save the model file
filename = "model_v2.pk"
dir_path = "./"

# Dump model to file
dump_model(model, dir_path + filename)