Build a classification model using various methods (SVM, logistic regression, random forest) and check which model gives you the best accuracy

Now use PCA to reduce dimensions, retrain your model and see what impact it has on your model in terms of accuracy. Keep in mind that many times doing PCA reduces the accuracy but computation is much lighter and that's the trade off you need to consider while building models in real life

In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('heart.csv')
dataset.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df = pd.get_dummies(dataset, drop_first=True)
df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,True,True,False,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,True,True,False,False,False,True,False,False,True
3,48,138,214,0,108,1.5,1,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,0,True,False,True,False,True,False,False,False,True


In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop('HeartDisease', axis=1))

In [11]:
X = scaler.transform(df.drop('HeartDisease', axis=1))
y = df['HeartDisease']

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [10]:
model_params = {
    'svm' : {
        'model' : SVC(),
        'params' : {
            'C' : [1,10,20,100],
            'kernel' : ['linear', 'rbf'],
            'gamma' : [0.001, 0.01, 0.1, 1]
        }
    },

    'random_forest' : {
        'model' : RandomForestClassifier(),
        'params' : {
            'n_estimators' : [1, 5, 10, 20],
        }
    },

    'logistic_regression' : {
        'model' : LogisticRegression(),
        'params' : {
            'C' : [1,10,20,100],
        }
    },
}

In [12]:
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'],mp['params'],cv = 5, return_train_score=False)
    clf.fit(X,y)
    scores.append({
        'model' : model_name,
        'best_score' : clf.best_score_,
        'best_params' : clf.best_params_
    })

df_scores = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df_scores


Unnamed: 0,model,best_score,best_params
0,svm,0.833298,"{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}"
1,random_forest,0.820201,{'n_estimators': 5}
2,logistic_regression,0.827845,{'C': 1}


In [17]:
X.shape

(918, 15)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = SVC(C=1,gamma=0.1,kernel='rbf')
model.fit(X_train, y_train)

In [15]:
model.score(X_test, y_test)

0.8804347826086957

Now applying PCA

In [34]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)

X_pca.shape

(918, 10)

In [35]:
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
model_pca = SVC(C=1,gamma=0.1,kernel='rbf')
model_pca.fit(X_train_pca, y_train)

In [36]:
model_pca.score(X_test_pca, y_test)

0.875