# Model Selection

2 models will be used in this project: 
- Decision Tree 
- Logistic Regression

In [155]:
import os 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss, cohen_kappa_score, f1_score
from sklearn.model_selection import cross_validate

In [None]:
# Data has already been processed, so split it into train/test 
np.random.seed(1)
diseases_test = pd.read_csv('data/processed/Training_processed.csv')
diseases_train = pd.read_csv('data/processed/Testing_processed.csv')

predictors = diseases_test.columns.difference(['prognosis'])

x_train, y_train = diseases_train[predictors], diseases_train['prognosis']

x_test, y_test = diseases_test[predictors], diseases_test['prognosis']

In [147]:
# Decision Tree Classifier  

pipeline1 = Pipeline(
    [
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ('knn', DecisionTreeClassifier(criterion='entropy', max_depth=10))
    ]
)

pipeline1

In [150]:
pipeline1.fit(x_train, y_train)

In [151]:
y_pred_train = pipeline1.predict(x_train)
y_pred_test = pipeline1.predict(x_test)
y_proba_train = pipeline1.predict_proba(x_train)
y_proba_test = pipeline1.predict_proba(x_test)

In [None]:
res = {'accuracy_store_train': accuracy_score(y_train, y_pred_train),
'accuracy_store_test': accuracy_score(y_test, y_pred_test),
'f1_score_train': f1_score(y_train, y_pred_train, average='weighted'),
'f1_score_test': f1_score(y_test, y_pred_test, average='weighted')}

res

{'accuracy_store_train': 1.0,
 'accuracy_store_test': 0.8731707317073171,
 'f1_score_train': np.float64(1.0),
 'f1_score_test': np.float64(0.884483604015708)}

In [163]:
pipeline2 = Pipeline(
    [
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ('logreg', LogisticRegression(max_iter=500))
    ]
)

pipeline2

In [164]:
pipeline2.fit(x_train, y_train)

In [166]:
y_pred_train2 = pipeline2.predict(x_train)
y_pred_test2 = pipeline2.predict(x_test)

In [167]:
res2 = {'accuracy_store_train': accuracy_score(y_train, y_pred_train2),
'accuracy_store_test': accuracy_score(y_test, y_pred_test2),
'f1_score_train': f1_score(y_train, y_pred_train2, average='weighted'),
'f1_score_test': f1_score(y_test, y_pred_test2, average='weighted')}

res2

{'accuracy_store_train': 1.0,
 'accuracy_store_test': 1.0,
 'f1_score_train': np.float64(1.0),
 'f1_score_test': np.float64(1.0)}