### AdaBoost Classifier

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn import datasets
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold as sk
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Libraries for train/test splitting, evaluation metrics, and data pre-processing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#For AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
# Read the prepared classification dataset from the working directory
# and preview its first five rows.
DATA_PATH = "hbd_model_data_classification.csv"
hdb_model_data_classification = pd.read_csv(DATA_PATH)
hdb_model_data_classification.head(n=5)

Unnamed: 0,storey_range,floor_area_sqm,no.of bto,resale application,remaining_lease_months,Distance to nearest MRT,Distance to CBD,isMatureEstate,over_under_classification
0,5,73.0,7314,26436.0,667,908.970521,9026.295266,1,0
1,20,70.0,7314,26436.0,1100,687.185319,9015.122154,1,0
2,2,73.0,7314,26436.0,676,586.98069,8949.443986,1,0
3,5,73.0,7314,26436.0,663,800.631299,9123.690385,1,0
4,8,68.0,7314,26436.0,708,927.322849,9734.443856,1,0


In [3]:
# Sanity-check the size of the loaded frame as (rows, columns).
n_rows, n_cols = hdb_model_data_classification.shape
(n_rows, n_cols)

(77834, 9)

In [4]:
# Features are the first eight columns; the target is the last column.
n_feature_cols = 8
X = hdb_model_data_classification.iloc[:, :n_feature_cols]
y = hdb_model_data_classification.iloc[:, -1]

# Show the dtype of every feature column.
X.dtypes

storey_range                 int64
floor_area_sqm             float64
no.of bto                    int64
resale application         float64
remaining_lease_months       int64
Distance to nearest MRT    float64
Distance to CBD            float64
isMatureEstate               int64
dtype: object

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# AdaBoost Classifier

Use GridSearchCV to find the best hyperparameters for the AdaBoost classifier.

In [6]:
# Tune AdaBoost's n_estimators and learning_rate with an exhaustive grid
# search, then score the refitted best model on the held-out test split.

# Seed the estimator so that Restart & Run All reproduces the same search
# result; the train/test split and the CV splitter below are already seeded.
model = AdaBoostClassifier(random_state=42)

# Candidate values to search: 6 x 5 = 30 hyperparameter combinations.
grid = {
    'n_estimators': [50, 100, 150, 200, 250, 300],
    'learning_rate': [0.1, 0.5, 1.0, 1.5, 2.0],
}

# Evaluation procedure: 10 stratified folds repeated 3 times, i.e. 30 fits per
# combination (900 fits in total before the final refit) -- this cell is slow.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# scoring and refit are spelled out for the reader; both match the defaults
# GridSearchCV applies to a classifier, so the selection criterion is unchanged.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    refit=True,
    n_jobs=-1,
    cv=cv,
)

# Run the search on the training split only; the test split stays untouched.
grid_result = grid_search.fit(X_train, y_train)

# Summarize the best mean cross-validated score and the configuration behind it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Predict the test data with the best configuration (refitted on all of X_train).
y_pred = grid_result.predict(X_test)

# Accuracy, precision, recall and F1 score of the test-set predictions.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

Best: 0.558723 using {'learning_rate': 1.5, 'n_estimators': 300}


In [7]:
# Report each test-set metric on its own line as "<name> score = <value>".
metric_report = (
    ("accuracy", acc),
    ("precision", precision),
    ("recall", recall),
    ("f1", f1),
)
for metric_name, metric_value in metric_report:
    # !s forces str() so the text matches plain string concatenation exactly.
    print(f"{metric_name} score = {metric_value!s}")

accuracy score = 0.5593242114729877
precision score = 0.5470910138248848
recall score = 0.5055895661431994
f1 score = 0.5255222022409738
