In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [3]:

#importing the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


In [4]:
#Loading the csv data to a pandas dataframe
heart_data=pd.read_csv("C:\\Users\\Home\\Desktop\\data\\heart.csv")


In [5]:
heart_data.head()





Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
heart_data.shape


(303, 14)

In [7]:
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [8]:

#checking the distribution of target variable
heart_data['target'].value_counts()


1    165
0    138
Name: target, dtype: int64

In [9]:
x=heart_data.drop(columns='target',axis=1)
y=heart_data['target']

In [10]:
#converting into array method for perform model_selection
x=np.asarray(x)
y=np.asarray(y)

In [11]:
x

array([[63.,  1.,  3., ...,  0.,  0.,  1.],
       [37.,  1.,  2., ...,  0.,  0.,  2.],
       [41.,  0.,  1., ...,  2.,  0.,  2.],
       ...,
       [68.,  1.,  0., ...,  1.,  2.,  3.],
       [57.,  1.,  0., ...,  1.,  1.,  3.],
       [57.,  0.,  1., ...,  1.,  1.,  2.]])

# Model_Selection

Comparing the model with default hyperparameter values using cross validation

In [12]:
#list of models
models=[LogisticRegression(max_iter=100),
      SVC(kernel='linear'),
      KNeighborsClassifier(),
      RandomForestClassifier(random_state=0)]

In [13]:
def compare_model_cross_validation():
    for model in models:
        cv_score=cross_val_score(model,x,y,cv=5)
        mean_accuracy=sum(cv_score)/len(cv_score)
        mean_accuracy=mean_accuracy*100
        mean_accuracy=round(mean_accuracy,2)
        
        print("cross validation accuracies for the ",model,"=",cv_score)
        print("accuracy score of the ",model,"=",mean_accuracy,"%")
        print("--------------------------------------------------------")



In [14]:
compare_model_cross_validation()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

cross validation accuracies for the  LogisticRegression() = [0.83606557 0.86885246 0.85245902 0.85       0.75      ]
accuracy score of the  LogisticRegression() = 83.15 %
--------------------------------------------------------
cross validation accuracies for the  SVC(kernel='linear') = [0.81967213 0.8852459  0.80327869 0.86666667 0.76666667]
accuracy score of the  SVC(kernel='linear') = 82.83 %
--------------------------------------------------------
cross validation accuracies for the  KNeighborsClassifier() = [0.60655738 0.6557377  0.57377049 0.73333333 0.65      ]
accuracy score of the  KNeighborsClassifier() = 64.39 %
--------------------------------------------------------
cross validation accuracies for the  RandomForestClassifier(random_state=0) = [0.85245902 0.90163934 0.81967213 0.81666667 0.8       ]
accuracy score of the  RandomForestClassifier(random_state=0) = 83.81 %
--------------------------------------------------------


In [15]:
#list of models
models_list = [LogisticRegression(max_iter=1000),SVC(),KNeighborsClassifier(),RandomForestClassifier(random_state=0)]


In [16]:
#creating a dictionary that contains hyperparameter values for the above mentioned models

model_hyperparameters={
    
    'log_reg_hyperparameters':{
        'C':[1,5,10,20]
    },
    
    'svc_hyperparameters':{
        'kernel':['linear','poly','rbf','sigmoid'],
        'C':[1,5,10,20]
    },
    
    'KNN_hyperparameters':{
        'n_neighbors':[3,5,10]
    },
    
    'random_forest_hyperparameters':{
        'n_estimators':[100,120,150,200],
        'criterion':['gini','entropy']
    }
}



In [17]:
type(model_hyperparameters)

dict

In [18]:
print(model_hyperparameters.keys())

dict_keys(['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_hyperparameters'])


In [19]:
model_keys=list(model_hyperparameters.keys())
print(model_keys)

['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_hyperparameters']


In [20]:
model_keys[0]

'log_reg_hyperparameters'

In [21]:
model_hyperparameters[model_keys[0]]

{'C': [1, 5, 10, 20]}

In [22]:
# Applying GridSearchCV

# In[30]:


def ModelSelection(list_of_models,hyperparameters_dictionary):
    result=[]
    i=0
    for model in list_of_models:
        key=model_keys[i]
        params=hyperparameters_dictionary[key]
        i+=1
        
        print(model)
        print(params)
        print("--------------------------------------")
        
        
        classifier=GridSearchCV(model,params,cv=5)
        
        #fitting the data to classifier
        classifier.fit(x,y)
        
        result.append({
            'model used': model,
            'highest score':classifier.best_score_,
            'best hyperparameters':classifier.best_params_
        })
        
        
        result_dataframe=pd.DataFrame(result,columns=['model used','highest score','best hyperparameter'])
        
        return result_dataframe



In [23]:
ModelSelection(models_list,model_hyperparameters)

LogisticRegression(max_iter=1000)
{'C': [1, 5, 10, 20]}
--------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,model used,highest score,best hyperparameter
0,LogisticRegression(max_iter=1000),0.831585,
