<a href="https://colab.research.google.com/github/expeditive/machine-learning/blob/main/model-optimizations/model_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
#importing the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
heart_data = pd.read_csv('/content/heart.csv')

In [None]:
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [None]:
heart_data.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [None]:
heart_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,526
0,499


In [None]:
# Separate the feature columns from the label column.
# `columns=` already identifies the axis, so no extra `axis` argument is needed.
X = heart_data.drop(columns='target')
Y = heart_data['target']

In [None]:
print(X)

      age  sex  cp  trestbps  chol  ...  exang  oldpeak  slope  ca  thal
0      52    1   0       125   212  ...      0      1.0      2   2     3
1      53    1   0       140   203  ...      1      3.1      0   0     3
2      70    1   0       145   174  ...      1      2.6      0   0     3
3      61    1   0       148   203  ...      0      0.0      2   1     3
4      62    0   0       138   294  ...      0      1.9      1   3     2
...   ...  ...  ..       ...   ...  ...    ...      ...    ...  ..   ...
1020   59    1   1       140   221  ...      1      0.0      2   0     2
1021   60    1   0       125   258  ...      1      2.8      1   1     3
1022   47    1   0       110   275  ...      1      1.0      1   1     2
1023   50    0   0       110   254  ...      0      0.0      2   0     2
1024   54    1   0       120   188  ...      0      1.4      1   1     3

[1025 rows x 13 columns]


In [None]:
print(Y)

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64


In [None]:
X = np.asarray(X)
Y = np.asarray(Y)

# **model selection**

1 -> comparing the models with default hyperparameters using cross-validation

In [None]:
#list of models
# max_iter matches the LogisticRegression used for the grid search further down;
# 1000 iterations was not enough for the solver to converge on the unscaled data.
# random_state pins the forest's randomness so the scores are repeatable across runs.
models = [
    LogisticRegression(max_iter=10000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

In [None]:
def compare_models_by_crossvalidations():
    """Print the 5-fold cross-validation accuracy of every estimator in ``models``.

    Reads the notebook-level ``models`` list and the ``X`` / ``Y`` arrays.
    For each estimator it prints the per-fold scores returned by
    ``cross_val_score`` followed by their mean, expressed as a percentage
    rounded to two decimals. Nothing is returned.
    """
    for model in models:
        cv_score = cross_val_score(model, X, Y, cv=5)

        # Mean of the fold scores, converted to a percentage.
        mean_accuracy = round(cv_score.mean() * 100, 2)

        print('cross validation accuracies for the',model,'=',cv_score)
        print('accuracy of the model is ',mean_accuracy)
        print('--------------------------------------------------')

In [None]:
compare_models_by_crossvalidations()

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


cross validation accuracies for the LogisticRegression(max_iter=1000) = [0.88292683 0.85853659 0.87804878 0.82439024 0.80487805]
accuracy of the model is  84.98
--------------------------------------------------
cross validation accuracies for the SVC(kernel='linear') = [0.88292683 0.86829268 0.84390244 0.81463415 0.80487805]
accuracy of the model is  84.29
--------------------------------------------------
cross validation accuracies for the KNeighborsClassifier() = [0.76585366 0.74634146 0.76097561 0.71219512 0.75121951]
accuracy of the model is  74.73
--------------------------------------------------
cross validation accuracies for the RandomForestClassifier() = [1.         1.         1.         1.         0.98536585]
accuracy of the model is  99.71
--------------------------------------------------


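**RandomForestClassifier** has the highest accuracy here for the heart disease dataset. (A near-perfect cross-validation score like this is worth double-checking, e.g. for duplicated rows that end up in both the training and validation folds.)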

2 -> comparing the models with different hyperparameter values using GridSearchCV

In [None]:
model_list = [LogisticRegression(max_iter= 10000),SVC(), KNeighborsClassifier(),RandomForestClassifier(random_state=0)]

In [None]:
#creating a nested dictionary that contains the hyperparameter values for the above mentioned models
# Outer keys are labels, listed in the same order as the models they belong to.
# Inner keys must be the exact constructor argument names of the estimator,
# otherwise GridSearchCV rejects the grid with "Invalid parameter".

model_hyperparameter = {

    'log_reg_hyperparameters': {

        'C' : [1,5,10,20]

    },

    'svc_hyperparameters' :{

    'kernel' : ['linear','poly','rbf','sigmoid'],
    'C' : [1,5,10,20]

    },

    'KNN_hyperparameters' : {

        'n_neighbors' : [3,5,10]
    },

    'random_forest_classifier' : {

        'n_estimators' : [10,20,50,100]
    }

}

In [None]:
print(model_hyperparameter)

{'log_reg_hyperparameters': {'C': [1, 5, 10, 20]}, 'svc_hyperparameters': {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}, 'KNN_hyperparameters': {'n_neighbor': [3, 5, 10]}, 'random_forest_classifier': {'n_estimator': [10, 20, 50, 100]}}


In [None]:
model_keys = list(model_hyperparameter.keys())
print(model_keys)

['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_classifier']


In [None]:
model_keys[2]

'KNN_hyperparameters'

In [None]:
model_hyperparameter[model_keys[1]]

{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}

**applying GridSearchCV**

In [None]:
def modelSelection(list_of_models,hyperparameters_dictionary):
  """Grid-search every model and tabulate the best score and parameters of each.

  Models and parameter grids are paired by position: the i-th model in
  ``list_of_models`` is searched over the i-th value of
  ``hyperparameters_dictionary`` (dictionary insertion order).

  The search is fitted on the notebook-level ``X`` and ``Y`` with 5-fold
  cross-validation.

  Parameters
  ----------
  list_of_models : list
      Estimator instances to tune.
  hyperparameters_dictionary : dict
      Maps a label to the parameter grid of the model at the same position.

  Returns
  -------
  pandas.DataFrame
      One row per model with the columns 'model used', 'highest score'
      and 'best hyperparameters'.

  Raises
  ------
  IndexError
      If there are more models than parameter grids.
  """

  result = []

  # Take the grids from the dictionary that was passed in rather than from a
  # notebook-level list of keys, so the function works with any dictionary.
  param_grids = list(hyperparameters_dictionary.values())

  for index, model in enumerate(list_of_models):

    params = param_grids[index]

    print(model)
    print(params)
    print('---------------------------------------')

    classifier = GridSearchCV(model, params, cv = 5)

    #fitting the data to classifier
    classifier.fit(X,Y)

    # Record the outcome inside the loop so every model gets its own row,
    # not only the last one that was fitted.
    result.append({
        'model used' : model,
        'highest score' : classifier.best_score_,
        'best hyperparameters' : classifier.best_params_
    })

  result_dataFrame = pd.DataFrame(result, columns = ['model used','highest score','best hyperparameters'])

  return result_dataFrame


In [None]:
modelSelection(model_list,model_hyperparameter)

LogisticRegression(max_iter=10000)
{'C': [1, 5, 10, 20]}
---------------------------------------
SVC()
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}
---------------------------------------
KNeighborsClassifier()
{'n_neighbor': [3, 5, 10]}
---------------------------------------


ValueError: Invalid parameter 'n_neighbor' for estimator KNeighborsClassifier(). Valid parameters are: ['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'].