In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the heart-disease CSV into a DataFrame.
# NOTE(review): absolute path (presumably a Colab upload location) -- adjust
# when running in another environment.
path = '/content/heart_cleveland_upload.csv'
dataset = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preview the first five rows of the raw frame.
dataset.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
condition    0
dtype: int64

In [None]:
# How often each distinct "thal" value occurs.
dataset.loc[:, "thal"].value_counts()

0    164
2    115
1     18
Name: thal, dtype: int64

In [None]:
# How often each distinct "exang" value occurs.
dataset.loc[:, "exang"].value_counts()

0    200
1     97
Name: exang, dtype: int64

In [None]:
# Class balance of the "condition" column (used as the target below).
dataset.loc[:, "condition"].value_counts()

0    160
1    137
Name: condition, dtype: int64

In [None]:
# Features: every column except the label.
X = dataset.drop(columns=['condition'])
# Label: the "condition" column.
y = dataset["condition"]

In [None]:
# Preview the feature frame to confirm the label column is gone.
X.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0


In [None]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is
# reproducible.
# NOTE(review): the split is stratified on the "sex" feature, not on the
# target `y` -- confirm this is intended.
train_set, test_set, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=X['sex'],
)

(len(train_set), len(test_set))

(237, 60)

In [None]:
# Baseline: liblinear logistic regression with default C/tol, scored with
# 5-fold cross-validation on the training split.
clf = LogisticRegression(random_state=0, solver='liblinear')
clf_score = cross_val_score(estimator=clf, X=train_set, y=y_train, cv=5)
print(clf_score)
# Summary statistics of the per-fold scores.
pd.Series(data=clf_score).describe()

[0.77083333 0.89583333 0.80851064 0.82978723 0.80851064]


count    5.000000
mean     0.822695
std      0.046075
min      0.770833
25%      0.808511
50%      0.808511
75%      0.829787
max      0.895833
dtype: float64

In [None]:
from sklearn.model_selection import GridSearchCV

def logreg_grid_search(data, target, nfolds, param_grid=None):
    """Grid-search ``C`` and ``tol`` for a liblinear logistic regression.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to ``GridSearchCV.fit``.
    target : array-like
        Labels handed to ``GridSearchCV.fit``.
    nfolds : int
        Forwarded to ``GridSearchCV`` as ``cv``.
    param_grid : dict, optional
        Custom parameter grid. When omitted, the default grid built below
        is used.

    Returns
    -------
    dict
        The ``best_params_`` attribute of the fitted search.
    """
    if param_grid is None:
        param_grid = {
            # 0.1, 1.1, ..., 19.1 -- same values as before, step made explicit.
            'C': np.arange(0.1, 20.0, 1.0),
            # The previous np.arange(1e-20, 1e-1) used the default step of 1
            # and therefore produced the single value 1e-20, so tol was never
            # actually searched. Use log-spaced tolerances 1e-6 ... 1e-1.
            'tol': np.logspace(-6, -1, num=6),
        }
    logReg = LogisticRegression(solver='liblinear', random_state=0)

    gridsearch = GridSearchCV(logReg, param_grid, cv=nfolds)
    gridsearch.fit(data, target)
    return gridsearch.best_params_

# Run the 5-fold grid search on the training split and report the winner.
print(
    "Best hyperparameter for this case:",
    logreg_grid_search(data=train_set, target=y_train, nfolds=5),
)

Best hyperparameter for this case: {'C': 5.1, 'tol': 1e-20}


In [None]:
# Re-score with the hyperparameters reported by the grid search.
# NOTE(review): C and tol are hard-coded copies of the printed search result;
# re-sync them by hand if the search grid or data changes.
clf2 = LogisticRegression(C=5.1, tol=1e-20, solver='liblinear', random_state=0)
clf_score = cross_val_score(estimator=clf2, X=train_set, y=y_train, cv=5)
print(clf_score)
# Summary statistics of the per-fold scores.
pd.Series(data=clf_score).describe()

[0.77083333 0.875      0.80851064 0.82978723 0.82978723]


count    5.000000
mean     0.822784
std      0.037839
min      0.770833
25%      0.808511
50%      0.829787
75%      0.829787
max      0.875000
dtype: float64