In [1]:
#import required libraries

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

### Read and Understand the Data

In [2]:
#load the data
df = pd.read_csv('heart_v2.csv')

In [3]:
# top 5 records
df.head()

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0


In [4]:
# check the details of each columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   age            270 non-null    int64
 1   sex            270 non-null    int64
 2   BP             270 non-null    int64
 3   cholestrol     270 non-null    int64
 4   heart disease  270 non-null    int64
dtypes: int64(5)
memory usage: 10.7 KB


In [5]:
# check the shape of the data
df.shape

(270, 5)

In [6]:
# check the value counts of the target/dependent variable
df['heart disease'].value_counts()

heart disease
0    150
1    120
Name: count, dtype: int64

### Binary Classification Problem

In [7]:
# Split the dataset into X and y
X = df[['age','sex','BP','cholestrol']]
y = df[['heart disease']]

In [8]:
# Splitting the train and test data into 70 and 30 respectively
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [9]:
# check the shape of the X_train data
X_train.shape

(189, 4)

In [10]:
# check the shape of the X_test data
X_test.shape

(81, 4)

In [11]:
# build the AdaBoost Classifier model
classifier = AdaBoostClassifier(n_estimators=5, random_state=10)
model1 = classifier.fit(X_train,y_train)

In [12]:
#Predict the results on train and test data
y_train_pred = model1.predict(X_train)
y_test_pred =  model1.predict(X_test)

In [13]:
# Check the accuracy of the model created
print('AdaBoost Classifier Model Train Accuracy', round(100 * accuracy_score(y_train,y_train_pred),2))
print('AdaBoost Classifier Model Test Accuracy', round(100 * accuracy_score(y_test,y_test_pred),2))

AdaBoost Classifier Model Train Accuracy 69.84
AdaBoost Classifier Model Test Accuracy 59.26


In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
model = AdaBoostClassifier()

#define the grid of values search
grid = dict()

grid['n_estimators'] = range(10,1000,50)
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 10]

In [16]:
%%time

#define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs = -1, cv=5, scoring = 'accuracy')

#execute the grid search
grid_result = grid_search.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

CPU times: user 1.42 s, sys: 320 ms, total: 1.74 s
Wall time: 1min 41s


In [17]:
#print the best score and best params
print(grid_result.best_score_, grid_result.best_params_)

0.6984352773826458 {'learning_rate': 0.01, 'n_estimators': 160}


In [18]:
#retrain the model with updated grid best estimator value
model1 = grid_result.best_estimator_

In [19]:
#Predict the results on train and test data again with updated model
y_train_pred = model1.predict(X_train)
y_test_pred =  model1.predict(X_test)

In [20]:
# Check the accuracy of the model updated again
print('AdaBoost Classifier Model Train Accuracy', round(100 * accuracy_score(y_train,y_train_pred),2))
print('AdaBoost Classifier Model Test Accuracy', round(100 * accuracy_score(y_test,y_test_pred),2))

AdaBoost Classifier Model Train Accuracy 70.37
AdaBoost Classifier Model Test Accuracy 64.2
