In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn import tree
import time
%matplotlib inline

For this challenge, we will perform an exercise in accuracy. Random forests are a collection of decision trees. We will compare the two models in terms of accuracy.

We will build the best decision tree that we can. Then, we will try to match that with the simplest random forest that we can build.

**Measurement:** For this exercise, measure simplicity with runtime. Compare that to the runtime of the decision tree. 


In [None]:
# Location of the cardio training CSV; point this at your own copy if needed.
cardio_csv_url = 'https://raw.githubusercontent.com/elliebenn/datasets/master/cardio_train.csv'

# The file is semicolon-delimited, so the separator is passed explicitly.
heart_df = pd.read_csv(cardio_csv_url, sep=";")

# Print the column/dtype summary, then show the first rows as the cell result.
heart_df.info()
heart_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [None]:
# One-hot encode the two three-level lab columns for inspection.
# get_dummies returns a new frame and leaves heart_df untouched, so the result
# is kept under its own name instead of being discarded, and only the first
# rows are displayed rather than rendering the whole 70,000-row frame.
# Note: the models further down are trained on heart_df, not on this copy.
heart_df_encoded = pd.get_dummies(heart_df, columns=['cholesterol', 'gluc'])
heart_df_encoded.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,0,18393,2,168,62.0,110,80,0,0,1,0,1,0,0,1,0,0
1,1,20228,1,156,85.0,140,90,0,0,1,1,0,0,1,1,0,0
2,2,18857,1,165,64.0,130,70,0,0,0,1,0,0,1,1,0,0
3,3,17623,2,169,82.0,150,100,0,0,1,1,1,0,0,1,0,0
4,4,17474,1,156,56.0,100,60,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,0,1,0,1,0,0,1,0,0
69996,99995,22601,1,158,126.0,140,90,0,0,1,1,0,1,0,0,1,0
69997,99996,19066,2,183,105.0,180,90,0,1,0,1,0,0,1,1,0,0
69998,99998,22431,1,163,72.0,135,80,0,0,0,1,1,0,0,0,1,0


Features:

>* Age | **age** | int (days)
* Height | **height** | int (cm) |
* Weight | **weight** | float (kg) |
* Gender | **gender** | categorical code |
* Systolic blood pressure | **ap_hi** | int |
* Diastolic blood pressure | **ap_lo** | int |
* Cholesterol | **cholesterol** | 1: normal, 2: above normal, 3: well above normal |
* Glucose | **gluc** | 1: normal, 2: above normal, 3: well above normal |
* Smoking | **smoke** | binary |
* Alcohol intake | **alco** | binary |
* Physical activity | **active** | binary |
* Presence or absence of cardiovascular disease | Target Variable | **cardio** | binary |

# **EDA**

Our target variable, **cardio**, is almost evenly split between presence and absence of cardiovascular disease.

In [None]:
# Class balance of the target: number of rows for each cardio label.
cardio_counts = heart_df['cardio'].value_counts()
cardio_counts

0    35021
1    34979
Name: cardio, dtype: int64

In [None]:
# Summary statistics for every numeric column, transposed so that each
# feature occupies one row.
summary_stats = heart_df.describe()
summary_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,70000.0,49972.4199,28851.302323,0.0,25006.75,50001.5,74889.25,99999.0
age,70000.0,19468.865814,2467.251667,10798.0,17664.0,19703.0,21327.0,23713.0
gender,70000.0,1.349571,0.476838,1.0,1.0,1.0,2.0,2.0
height,70000.0,164.359229,8.210126,55.0,159.0,165.0,170.0,250.0
weight,70000.0,74.20569,14.395757,10.0,65.0,72.0,82.0,200.0
ap_hi,70000.0,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,70000.0,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0
cholesterol,70000.0,1.366871,0.68025,1.0,1.0,1.0,2.0,3.0
gluc,70000.0,1.226457,0.57227,1.0,1.0,1.0,1.0,3.0
smoke,70000.0,0.088129,0.283484,0.0,0.0,0.0,0.0,1.0


# **Decision Tree**

In [None]:
# Feature matrix: every column except the target.
# `columns=` is used because pandas >= 2.0 no longer accepts the axis as a
# second positional argument to DataFrame.drop (it raises TypeError).
# NOTE(review): the row identifier `id` is still part of X; it presumably
# carries no predictive signal, so consider excluding it -- confirm first.
X = heart_df.drop(columns='cardio')

# Target vector: the binary cardiovascular-disease label.
Y = heart_df['cardio']

In [None]:
# Baseline decision tree: entropy criterion, 2 candidate features per split,
# no depth limit.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=2,
    random_state=42  # fixed seed: with max_features < n_features the splits are random
)

# Time only the 10-fold cross-validation. cross_val_score fits its own clones
# of the estimator, so a separate fit inside the timed region would only
# inflate the measured runtime.
start_time = time.perf_counter()
decision_tree_cvscores = cross_val_score(decision_tree, X, Y, cv=10)
elapsed_seconds = time.perf_counter() - start_time

print(decision_tree_cvscores)
print()
print("--- %s seconds for Decision Tree---" % elapsed_seconds)
print()
print("mean accuracy score = {:.3f} +- {:.3f}".format(decision_tree_cvscores.mean(), decision_tree_cvscores.std()))

# Fit on the full data afterwards so `decision_tree` remains a fitted model
# for any later use; deliberately outside the timed region.
decision_tree.fit(X, Y);

[0.62171429 0.63085714 0.649      0.64728571 0.63657143 0.65028571
 0.63985714 0.63914286 0.63242857 0.60171429]

--- 2.5895817279815674 seconds for Decision Tree---

mean accuracy score = 0.635 +- 0.014


In [None]:
# Shallow tree: entropy criterion, 2 candidate features per split, depth <= 3.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=2,
    max_depth=3,
    random_state=42  # fixed seed: with max_features < n_features the splits are random
)

# Time only the 10-fold cross-validation. cross_val_score fits its own clones
# of the estimator, so a separate fit inside the timed region would only
# inflate the measured runtime.
start_time = time.perf_counter()
decision_tree_cvscores = cross_val_score(decision_tree, X, Y, cv=10)
elapsed_seconds = time.perf_counter() - start_time

print(decision_tree_cvscores)
print()
print("--- %s seconds for Decision Tree---" % elapsed_seconds)
print()
print("mean accuracy score = {:.3f} +- {:.3f}".format(decision_tree_cvscores.mean(), decision_tree_cvscores.std()))

# Fit on the full data afterwards so `decision_tree` remains a fitted model
# for any later use; deliberately outside the timed region.
decision_tree.fit(X, Y);

[0.502      0.61328571 0.66014286 0.70757143 0.66071429 0.685
 0.67242857 0.70871429 0.54771429 0.66928571]

--- 0.47426366806030273 seconds for Decision Tree---

mean accuracy score = 0.643 +- 0.065


In [None]:
# Tree with entropy criterion, 2 candidate features per split, depth <= 4.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=2,
    max_depth=4,
    random_state=42  # fixed seed: with max_features < n_features the splits are random
)

# Time only the 10-fold cross-validation. cross_val_score fits its own clones
# of the estimator, so a separate fit inside the timed region would only
# inflate the measured runtime.
start_time = time.perf_counter()
decision_tree_cvscores = cross_val_score(decision_tree, X, Y, cv=10)
elapsed_seconds = time.perf_counter() - start_time

print(decision_tree_cvscores)
print()
print("--- %s seconds for Decision Tree---" % elapsed_seconds)
print()
print("mean accuracy score = {:.3f} +- {:.3f}".format(decision_tree_cvscores.mean(), decision_tree_cvscores.std()))

# Fit on the full data afterwards so `decision_tree` remains a fitted model
# for any later use; deliberately outside the timed region.
decision_tree.fit(X, Y);

[0.70742857 0.61742857 0.62685714 0.71914286 0.68185714 0.72614286
 0.65485714 0.71271429 0.70114286 0.64342857]

--- 0.5791616439819336 seconds for Decision Tree---

mean accuracy score = 0.679 +- 0.038


In [None]:
# Tree with entropy criterion, 2 candidate features per split, depth <= 5.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=2,
    max_depth=5,
    random_state=42  # fixed seed: with max_features < n_features the splits are random
)

# Time only the 10-fold cross-validation. cross_val_score fits its own clones
# of the estimator, so a separate fit inside the timed region would only
# inflate the measured runtime.
start_time = time.perf_counter()
decision_tree_cvscores = cross_val_score(decision_tree, X, Y, cv=10)
elapsed_seconds = time.perf_counter() - start_time

print(decision_tree_cvscores)
print()
print("--- %s seconds for Decision Tree---" % elapsed_seconds)
print()
print("mean accuracy score = {:.3f} +- {:.3f}".format(decision_tree_cvscores.mean(), decision_tree_cvscores.std()))

# Fit on the full data afterwards so `decision_tree` remains a fitted model
# for any later use; deliberately outside the timed region.
decision_tree.fit(X, Y);

[0.50657143 0.70357143 0.723      0.70471429 0.72457143 0.72271429
 0.68685714 0.72257143 0.71428571 0.67228571]

--- 0.6334545612335205 seconds for Decision Tree---

mean accuracy score = 0.688 +- 0.063


In [None]:
# Tree with entropy criterion, 3 candidate features per split, depth <= 4.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=3,
    max_depth=4,
    random_state=42  # fixed seed: with max_features < n_features the splits are random
)

# Time only the 10-fold cross-validation. cross_val_score fits its own clones
# of the estimator, so a separate fit inside the timed region would only
# inflate the measured runtime.
start_time = time.perf_counter()
decision_tree_cvscores = cross_val_score(decision_tree, X, Y, cv=10)
elapsed_seconds = time.perf_counter() - start_time

print(decision_tree_cvscores)
print()
print("--- %s seconds for Decision Tree---" % elapsed_seconds)
print()
print("mean accuracy score = {:.3f} +- {:.3f}".format(decision_tree_cvscores.mean(), decision_tree_cvscores.std()))

# Fit on the full data afterwards so `decision_tree` remains a fitted model
# for any later use; deliberately outside the timed region.
decision_tree.fit(X, Y);

[0.68542857 0.69557143 0.71614286 0.72985714 0.67757143 0.715
 0.719      0.72185714 0.70628571 0.71842857]

--- 0.7014410495758057 seconds for Decision Tree---

mean accuracy score = 0.709 +- 0.016


In [None]:
# Tree with entropy criterion, 3 candidate features per split, depth <= 5.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=3,
    max_depth=5,
    random_state=42  # fixed seed: with max_features < n_features the splits are random
)

# Time only the 10-fold cross-validation. cross_val_score fits its own clones
# of the estimator, so a separate fit inside the timed region would only
# inflate the measured runtime.
start_time = time.perf_counter()
decision_tree_cvscores = cross_val_score(decision_tree, X, Y, cv=10)
elapsed_seconds = time.perf_counter() - start_time

print(decision_tree_cvscores)
print()
print("--- %s seconds for Decision Tree---" % elapsed_seconds)
print()
print("mean accuracy score = {:.3f} +- {:.3f}".format(decision_tree_cvscores.mean(), decision_tree_cvscores.std()))

# Fit on the full data afterwards so `decision_tree` remains a fitted model
# for any later use; deliberately outside the timed region.
decision_tree.fit(X, Y);

[0.66028571 0.71528571 0.73328571 0.71285714 0.70128571 0.72414286
 0.72       0.72714286 0.71614286 0.71628571]

--- 0.8350958824157715 seconds for Decision Tree---

mean accuracy score = 0.713 +- 0.019


# **Random Forest**

In [None]:
# Build the inputs before starting the clock so only model work is timed.
# `columns=` replaces the positional axis argument, which pandas >= 2.0
# rejects with a TypeError.
X = heart_df.drop(columns='cardio')
Y = heart_df['cardio']

# Random forest with default hyperparameters; the seed is fixed so the
# cross-validation scores are reproducible across runs.
rfc = ensemble.RandomForestClassifier(random_state=42)

# Time the 10-fold cross-validation only.
start_time = time.perf_counter()
rfc_cvscores = cross_val_score(rfc, X, Y, cv=10)
elapsed_seconds = time.perf_counter() - start_time

print(rfc_cvscores)
print()
print("--- %s seconds for Random Forest---" % elapsed_seconds)
print()
print("mean accuracy score = {:.3f} +- {:.3f}".format(rfc_cvscores.mean(), rfc_cvscores.std()))

[0.62728571 0.72071429 0.73057143 0.72685714 0.72071429 0.73085714
 0.72285714 0.73285714 0.71528571 0.68357143]

--- 119.20559811592102 seconds for Random Forest---

mean accuracy score = 0.711 +- 0.031


In [None]:
# Repeat run of the default random forest, used to gauge how stable the
# scores and the runtime are.
# Build the inputs before starting the clock so only model work is timed.
# `columns=` replaces the positional axis argument, which pandas >= 2.0
# rejects with a TypeError.
X = heart_df.drop(columns='cardio')
Y = heart_df['cardio']

# Default hyperparameters; the seed is fixed so this run is reproducible.
rfc = ensemble.RandomForestClassifier(random_state=7)

# Time the 10-fold cross-validation only.
start_time = time.perf_counter()
rfc_cvscores = cross_val_score(rfc, X, Y, cv=10)
elapsed_seconds = time.perf_counter() - start_time

print(rfc_cvscores)
print()
print("--- %s seconds for Random Forest---" % elapsed_seconds)
print()
print("mean accuracy score = {:.3f} +- {:.3f}".format(rfc_cvscores.mean(), rfc_cvscores.std()))

[0.63771429 0.71914286 0.73042857 0.72614286 0.72014286 0.73371429
 0.72442857 0.73057143 0.71628571 0.67214286]

--- 123.43050241470337 seconds for Random Forest---

mean accuracy score = 0.711 +- 0.030


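For this dataset, our decision tree with max_features 3 and max_depth of 5 finished in about 0.84 seconds, versus roughly 120 seconds for the default Random Forest (over 140 times faster), with nearly identical mean accuracy (0.713 vs. 0.711).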