In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
# importing the model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# load the diabetes dataset
data = pd.read_csv("heart.csv")

In [4]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
data.shape

(303, 14)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [7]:
X = data.drop("target",axis=1)
y= data["target"]

In [8]:
X = np.asarray(X)
y =np.asarray(y)

## MODEL SELECTION

1. Comparing the model with default hyperparameters using cross validation

In [9]:
# list of models 
models = [LogisticRegression(max_iter=1000),SVC(kernel='linear'),KNeighborsClassifier(),RandomForestClassifier(random_state=0)]

In [10]:
def compare_models_cross_validation():
    for model in models:
        cv_score= cross_val_score(model,X,y,cv=5)
        mean_accuracy = sum(cv_score) / len(cv_score)
        mean_accuracy = mean_accuracy *100 
        mean_accuracy = round(mean_accuracy,2)

        print(f"cross validaton accuracies for the {model} = {cv_score}")
        print(f"Accuracy score of the {model} = {mean_accuracy} %")
        print("---"*20)

In [11]:
import warnings
warnings.filterwarnings("ignore")

In [12]:
compare_models_cross_validation()

cross validaton accuracies for the LogisticRegression(max_iter=1000) = [0.80327869 0.86885246 0.85245902 0.86666667 0.75      ]
Accuracy score of the LogisticRegression(max_iter=1000) = 82.83 %
------------------------------------------------------------
cross validaton accuracies for the SVC(kernel='linear') = [0.81967213 0.8852459  0.80327869 0.86666667 0.76666667]
Accuracy score of the SVC(kernel='linear') = 82.83 %
------------------------------------------------------------
cross validaton accuracies for the KNeighborsClassifier() = [0.60655738 0.6557377  0.57377049 0.73333333 0.65      ]
Accuracy score of the KNeighborsClassifier() = 64.39 %
------------------------------------------------------------
cross validaton accuracies for the RandomForestClassifier(random_state=0) = [0.85245902 0.90163934 0.81967213 0.81666667 0.8       ]
Accuracy score of the RandomForestClassifier(random_state=0) = 83.81 %
------------------------------------------------------------


Inference: For the Heart Disease dataset, **Random Forest** Classifier has the
Highest accuracy value with default hyperparameter values

2. Comparing the model with different Hyperparameters values using **GridSearchCV**

In [13]:
# list of models 
models = [LogisticRegression(max_iter=10000),SVC(),KNeighborsClassifier(),RandomForestClassifier(random_state=0)]

In [14]:
# creating a dictionary that contains hyperparameter values for the above mentioned models

model_hyperparameters = {
    'log_reg_hyperparameters': {
        'C': [1,5,10,20],
    },
    'svc_hyperparameters' : {
        'kernel' : ['linear','poly','rbf','sigmoid'],
        'C': [1,5,10,20]
    },
    'KNN_hyperparameters' :{
        'n_neighbors':[3,5,10]
    },
    'random_forest_hyperparameters': {
        'n_estimators' : [10,20,50,100]
    }
}

In [15]:
model_keys = list(model_hyperparameters.keys())
model_keys

['log_reg_hyperparameters',
 'svc_hyperparameters',
 'KNN_hyperparameters',
 'random_forest_hyperparameters']

**Applying GridSearchCV**

In [18]:
def ModelSelection(list_of_models, hyperparameters_dictionary):
    result = []
    i = 0
    for model in list_of_models:
        key = model_keys[i]
        params = hyperparameters_dictionary[key]
        i += 1
        print(model)
        print(params)
        print("----" * 20)
        classifier = GridSearchCV(model, params, cv=5)

        # fitting the data to the classifier
        classifier.fit(X, y)

        result.append({
            'model used': model,
            'highest_score': classifier.best_score_,
            'best hyperparameters': classifier.best_params_
        })
    result_dataframe = pd.DataFrame(result, columns=['model used', 'highest score', 'best hyperparameters'])
    return result_dataframe

In [19]:
ModelSelection(models,model_hyperparameters)

LogisticRegression(max_iter=10000)
{'C': [1, 5, 10, 20]}
--------------------------------------------------------------------------------
SVC()
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}
--------------------------------------------------------------------------------
KNeighborsClassifier()
{'n_neighbors': [3, 5, 10]}
--------------------------------------------------------------------------------
RandomForestClassifier(random_state=0)
{'n_estimators': [10, 20, 50, 100]}
--------------------------------------------------------------------------------


Unnamed: 0,model used,highest score,best hyperparameters
0,LogisticRegression(max_iter=10000),,{'C': 5}
1,SVC(),,"{'C': 1, 'kernel': 'linear'}"
2,KNeighborsClassifier(),,{'n_neighbors': 5}
3,RandomForestClassifier(random_state=0),,{'n_estimators': 100}


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
data = load_digits()
X = data.data
y = data.target
def optimize_models(models, model_hyperparameters, X, y, cv=5):
    best_models = []
    
    for model in models:
        model_name = type(model).__name__
        param_key = f"{model_name.lower()}_hyperparameters"
        
        if param_key in model_hyperparameters:
            grid_search = GridSearchCV(model, model_hyperparameters[param_key], cv=cv, n_jobs=-1)
            grid_search.fit(X, y)
            
            best_model = grid_search.best_estimator_
            best_score = grid_search.best_score_
            best_params = grid_search.best_params_
            
            best_models.append({
                'model': best_model,
                'name': model_name,
                'best_score': best_score,
                'best_params': best_params
            })
        else:
            print(f"No hyperparameters found for {model_name}")
    
    return best_models

# Usage:
models = [LogisticRegression(max_iter=10000), SVC(), KNeighborsClassifier(), RandomForestClassifier(random_state=0)]

model_hyperparameters = {
    'logisticregression_hyperparameters': {
        'C': [1, 5, 10, 20],
    },
    'svc_hyperparameters': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 5, 10, 20]
    },
    'kneighborsclassifier_hyperparameters': {
        'n_neighbors': [3, 5, 10]
    },
    'randomforestclassifier_hyperparameters': {
        'n_estimators': [10, 20, 50, 100]
    }
}

# Assuming X and y are your feature matrix and target vector
best_models = optimize_models(models, model_hyperparameters, X, y)

# Print results
for result in best_models:
    print(f"\nBest {result['name']}:")
    print(f"Best Score: {result['best_score']}")
    print(f"Best Parameters: {result['best_params']}")



Best LogisticRegression:
Best Score: 0.9148777468276075
Best Parameters: {'C': 1}

Best SVC:
Best Score: 0.9738502011761063
Best Parameters: {'C': 5, 'kernel': 'rbf'}

Best KNeighborsClassifier:
Best Score: 0.966621788919839
Best Parameters: {'n_neighbors': 3}

Best RandomForestClassifier:
Best Score: 0.9371448467966573
Best Parameters: {'n_estimators': 100}
