In [18]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from config import dbConnectionString

In [2]:
# Open a SQLAlchemy engine from the connection string provided by config.py
engine = create_engine(dbConnectionString)

# Read every row of the "cardioData" table into a DataFrame and discard the
# 'index' and 'id' columns in the same expression
df = (
    pd.read_sql_query('select * from "cardioData"', con=engine)
    .drop(columns=['index', 'id'])
)
df

Unnamed: 0,age,gender,height,weight,cardiovascular,BMI,ap_hi,ap_lo,cholestrol,gloucose,smoke,alchohol,active
0,50.4,Female,168.0,62.0,False,23,110,80,normal,normal,False,False,True
1,55.4,Male,156.0,85.0,True,46,140,90,well above normal,normal,False,False,True
2,51.7,Male,165.0,64.0,True,25,130,70,well above normal,normal,False,False,False
3,47.9,Male,156.0,56.0,False,20,100,60,normal,normal,False,False,False
4,60.0,Male,151.0,67.0,False,30,120,80,above normal,above normal,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63911,52.7,Female,168.0,76.0,False,34,120,80,normal,normal,True,False,True
63912,61.9,Male,158.0,126.0,True,100,140,90,above normal,above normal,False,False,True
63913,52.2,Female,183.0,105.0,True,60,180,90,well above normal,normal,False,True,False
63914,61.5,Male,163.0,72.0,True,32,135,80,normal,well above normal,False,False,False


In [3]:
# Work on a separate copy so the frame loaded from the database stays untouched
df2 = df.copy(deep=True)

# Encoder instance used by the cell that encodes the 'gender' column
le = LabelEncoder()
df2

Unnamed: 0,age,gender,height,weight,cardiovascular,BMI,ap_hi,ap_lo,cholestrol,gloucose,smoke,alchohol,active
0,50.4,Female,168.0,62.0,False,23,110,80,normal,normal,False,False,True
1,55.4,Male,156.0,85.0,True,46,140,90,well above normal,normal,False,False,True
2,51.7,Male,165.0,64.0,True,25,130,70,well above normal,normal,False,False,False
3,47.9,Male,156.0,56.0,False,20,100,60,normal,normal,False,False,False
4,60.0,Male,151.0,67.0,False,30,120,80,above normal,above normal,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63911,52.7,Female,168.0,76.0,False,34,120,80,normal,normal,True,False,True
63912,61.9,Male,158.0,126.0,True,100,140,90,above normal,above normal,False,False,True
63913,52.2,Female,183.0,105.0,True,60,180,90,well above normal,normal,False,True,False
63914,61.5,Male,163.0,72.0,True,32,135,80,normal,well above normal,False,False,False


In [4]:
# LabelEncoder numbers the classes in sorted order: Female -> 0, Male -> 1
le.fit(df2['gender'])
df2['gender'] = le.transform(df2['gender'])
df2

Unnamed: 0,age,gender,height,weight,cardiovascular,BMI,ap_hi,ap_lo,cholestrol,gloucose,smoke,alchohol,active
0,50.4,0,168.0,62.0,False,23,110,80,normal,normal,False,False,True
1,55.4,1,156.0,85.0,True,46,140,90,well above normal,normal,False,False,True
2,51.7,1,165.0,64.0,True,25,130,70,well above normal,normal,False,False,False
3,47.9,1,156.0,56.0,False,20,100,60,normal,normal,False,False,False
4,60.0,1,151.0,67.0,False,30,120,80,above normal,above normal,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63911,52.7,0,168.0,76.0,False,34,120,80,normal,normal,True,False,True
63912,61.9,1,158.0,126.0,True,100,140,90,above normal,above normal,False,False,True
63913,52.2,0,183.0,105.0,True,60,180,90,well above normal,normal,False,True,False
63914,61.5,1,163.0,72.0,True,32,135,80,normal,well above normal,False,False,False


In [5]:
# Ordinal codes for the three category labels
normal_dic = {"normal": 0, "above normal": 1, "well above normal": 2}

# Integer codes for the boolean columns
true_false_dic = {True: 1, False: 0}

# Boolean columns -> 0/1. The dict lookup raises KeyError for any value that
# is not a key of true_false_dic.
for col in ('cardiovascular', 'smoke', 'alchohol', 'active'):
    df2[col] = df2[col].apply(lambda flag: true_false_dic[flag])

# Category-label columns -> 0/1/2, with the same KeyError behaviour for
# labels missing from normal_dic.
for col in ('cholestrol', 'gloucose'):
    df2[col] = df2[col].apply(lambda level: normal_dic[level])

df2

Unnamed: 0,age,gender,height,weight,cardiovascular,BMI,ap_hi,ap_lo,cholestrol,gloucose,smoke,alchohol,active
0,50.4,0,168.0,62.0,0,23,110,80,0,0,0,0,1
1,55.4,1,156.0,85.0,1,46,140,90,2,0,0,0,1
2,51.7,1,165.0,64.0,1,25,130,70,2,0,0,0,0
3,47.9,1,156.0,56.0,0,20,100,60,0,0,0,0,0
4,60.0,1,151.0,67.0,0,30,120,80,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63911,52.7,0,168.0,76.0,0,34,120,80,0,0,1,0,1
63912,61.9,1,158.0,126.0,1,100,140,90,1,1,0,0,1
63913,52.2,0,183.0,105.0,1,60,180,90,2,0,0,1,0
63914,61.5,1,163.0,72.0,1,32,135,80,0,2,0,0,0


In [46]:
# set X and y values
# Features: every column except the target ('cardiovascular') and 'BMI',
# exported as a NumPy array. `columns=` is used instead of passing the axis
# positionally (`drop(labels, 1)`), which current pandas releases reject.
X = df2.drop(columns=['cardiovascular', 'BMI']).values
# Target: the 'cardiovascular' column
y = df2['cardiovascular']

In [47]:
# split data into training and testing
# (train_test_split is imported with the other dependencies in the notebook's
# import cell.)
# stratify=y keeps the class proportions the same in both partitions;
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9, stratify=y)

# Fit the scaler on the training partition only, then apply that same
# transformation to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [48]:
# Single decision tree used as the baseline classifier. random_state is pinned
# to the same seed as the train/test split: the tree's split search is
# randomised, so an unseeded model can differ from one run to the next.
model = tree.DecisionTreeClassifier(random_state=9)

In [49]:
# Train the decision tree on the scaled training partition
model = model.fit(X=X_train_scaled, y=y_train)

In [50]:
# Predict on the held-out partition and tabulate the outcome
# (rows = true class, columns = predicted class)
predictions = model.predict(X=X_test_scaled)
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[5401, 3082],
       [2942, 4554]])

In [51]:
# Fraction of held-out rows the decision tree classified correctly
accuracy_score(y_true=y_test, y_pred=predictions)

0.6230051943175418

## Random Forest

In [52]:
# Random forest of 128 trees, seeded so repeated runs build the same forest
rf_model = RandomForestClassifier(random_state=9, n_estimators=128)

In [53]:
# Train the forest on the scaled training partition
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [54]:
# Replace the earlier predictions with the forest's held-out predictions
predictions = rf_model.predict(X=X_test_scaled)

In [55]:
# Confusion matrix for the forest (rows = true class, columns = predicted class)
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[6212, 2271],
       [2461, 5035]])

In [56]:
# Fraction of held-out rows the forest classified correctly
accuracy_score(y_true=y_test, y_pred=predictions)

0.7038613179798485

In [57]:
# Retrain with a larger forest (500 trees instead of 128) on the same
# partitions, then score it on the held-out rows
rf_model = RandomForestClassifier(random_state=9, n_estimators=500).fit(
    X=X_train_scaled, y=y_train
)
predictions = rf_model.predict(X=X_test_scaled)
accuracy_score(y_true=y_test, y_pred=predictions)

0.7034858251455035

In [60]:
# Rank the features by the forest's importance scores, highest first.
# The scores are ordered like the columns the forest was trained on, so the
# names are rebuilt from the same column selection used to create the
# training matrix.
# NOTE(review): this assumes rf_model was last fitted on all columns except
# 'cardiovascular' and 'BMI' — confirm the cell order before trusting the labels.
importances = rf_model.feature_importances_
# A dedicated name is used for the column labels so the feature matrix `X` is
# not silently rebound from an array to a DataFrame; `columns=` replaces the
# positional axis argument that current pandas releases reject.
feature_names = df2.drop(columns=['cardiovascular', 'BMI']).columns
sorted(zip(importances, feature_names), reverse=True)

[(0.2806797998705362, 'age'),
 (0.19999771563656848, 'weight'),
 (0.18188470458147915, 'height'),
 (0.16317372400396282, 'ap_hi'),
 (0.07120740067567906, 'ap_lo'),
 (0.03877620121202788, 'cholestrol'),
 (0.017674851901421092, 'gender'),
 (0.014981693903398211, 'gloucose'),
 (0.014172462679451898, 'active'),
 (0.009421060644468232, 'smoke'),
 (0.008030384891007028, 'alchohol')]

In [39]:
#test using only top features

# set X and y values
# Dropping the target, 'BMI' and the five listed feature columns leaves
# age, height, weight, ap_hi, ap_lo and cholestrol as the features.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# current pandas releases reject.
X = df2.drop(
    columns=['cardiovascular', 'BMI', 'gender', 'gloucose', 'active', 'smoke', 'alchohol']
).values
# Target: the 'cardiovascular' column
y = df2['cardiovascular']

In [40]:
# split data into training and testing
# (train_test_split is imported with the other dependencies in the notebook's
# import cell.)
# Same seed and stratification as the earlier split; only the contents of X
# differ, since X is whatever the preceding cell assigned.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9, stratify=y)

# Fit a fresh scaler on the new training partition only, then apply that same
# transformation to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [41]:
# Refit the 500-tree forest on the current scaled partitions and score it on
# the held-out rows
rf_model = RandomForestClassifier(random_state=9, n_estimators=500).fit(
    X=X_train_scaled, y=y_train
)
predictions = rf_model.predict(X=X_test_scaled)
accuracy_score(y_true=y_test, y_pred=predictions)

0.6925965329494962