In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Load the Framingham CSV into the working frame used by every later cell.
csv_path = "sample_data/framingham.csv"
dataset = pd.read_csv(csv_path)

In [16]:
# Preview the first 15 rows of the raw frame.
dataset.head(15)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0
5,0,43,2.0,0,0.0,0.0,0,1,0,228.0,180.0,110.0,30.3,77.0,99.0,0
6,0,63,1.0,0,0.0,0.0,0,0,0,205.0,138.0,71.0,33.11,60.0,85.0,1
7,0,45,2.0,1,20.0,0.0,0,0,0,313.0,100.0,71.0,21.68,79.0,78.0,0
8,1,52,1.0,0,0.0,0.0,0,1,0,260.0,141.5,89.0,26.36,76.0,79.0,0
9,1,43,1.0,1,30.0,0.0,0,1,0,225.0,162.0,107.0,23.61,93.0,88.0,0


In [17]:
# (rows, columns) of the frame as loaded, before any cleaning.
dataset.shape

(4238, 16)

In [18]:
# Count the missing entries in each column.
missing_per_column = dataset.isnull().sum()
missing_per_column

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [25]:
# Fill gaps in `education` with the column median.
dataset["education"] = dataset["education"].fillna(dataset["education"].median())

# Fill gaps in the remaining columns with the column mean rounded to the
# nearest whole number (stored as a float).
for column in ("cigsPerDay", "totChol", "BMI", "glucose"):
    rounded_mean = float(round(dataset[column].mean()))
    dataset[column] = dataset[column].fillna(rounded_mean)

In [30]:
# Rows still missing either of these columns are discarded rather than imputed.
required_columns = ['heartRate', 'BPMeds']
dataset = dataset.dropna(subset=required_columns)
dataset.shape

(4184, 16)

In [None]:
# Divide each listed column by its own maximum so its largest value becomes 1.
# NOTE(review): the maxima are taken over the whole frame, i.e. before the
# train/test split below — confirm that is intended.
for column in (
    "age",
    "cigsPerDay",
    "totChol",
    "sysBP",
    "diaBP",
    "BMI",
    "heartRate",
    "glucose",
):
    dataset[column] = dataset[column] / dataset[column].max()

In [81]:
# Separate the label column from the features, then hold out 20% of the rows
# for testing. The split is seeded and stratified on the label.
target_column = "TenYearCHD"
y = dataset[target_column]
X = dataset.drop(columns=target_column)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
    stratify=y,
)


In [82]:
# Baseline classifier: a 100-tree random forest fitted on the training split.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature sub-sampling), so Restart & Run All reproduces the same
# model and test score. 41 matches the seed already used for train_test_split.
model = RandomForestClassifier(n_estimators=100, random_state=41)
model.fit(X_train, y_train)

In [83]:
# Score the fitted random forest on the held-out test split (for a classifier,
# `score` reports mean accuracy).
# NOTE(review): if TenYearCHD is imbalanced, accuracy alone can look good for a
# majority-class predictor — confirm class balance and check precision/recall.
model.score(X_test,y_test)

0.8530465949820788

In [88]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate estimators and the hyper-parameter grid searched for each one.
# The random forest is seeded so its cross-validation scores (and therefore
# the "best" n_estimators) are repeatable across runs; 41 matches the seed
# used for train_test_split.
models_and_parameters = {
    "svm" : {
        "model" : SVC(gamma="auto"),
        "parameters" : {
            "C" : [1,10,20],
            "kernel": ["rbf","linear"]
        }
    },
    "randomforest" : {
        "model" : RandomForestClassifier(random_state=41),
        "parameters" : {
            "n_estimators" : [100,200,300]
        }
    },
    "logisticreg" : {
        "model" : LogisticRegression(max_iter=1000),
        "parameters" : {
            "C": [1,5,10]
        }
    }
}

# Run a 3-fold cross-validated grid search per candidate on the training split
# and record each candidate's best parameters and best mean CV score.
results = []
for modelname, modelattr in models_and_parameters.items():
    grid = GridSearchCV(modelattr["model"], modelattr["parameters"], cv=3, return_train_score=False)
    grid.fit(X_train, y_train)
    results.append({
        "model" : modelname,
        "parameters" : grid.best_params_,
        "score" : grid.best_score_
    })

# One summary row per candidate model.
df = pd.DataFrame(results, columns=["model","parameters","score"])
df

Unnamed: 0,model,parameters,score
0,svm,"{'C': 1, 'kernel': 'rbf'}",0.84882
1,randomforest,{'n_estimators': 100},0.848522
2,logisticreg,{'C': 10},0.850314
