### AdaBoost Classifier and Gaussian Naive Bayes (GNB)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
from sklearn import datasets
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold as sk
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.preprocessing import StandardScaler, scale
from sklearn.metrics import accuracy_score as acs

#Libraries for data pre-processing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#For AdaBoost Classifier and GNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [3]:
# Load the classification dataset from its CSV file, then show the first
# five rows as the cell output.
hdb_model_data_classification = pd.read_csv(
    filepath_or_buffer="hbd_model_data_classification.csv"
)
hdb_model_data_classification.head(n=5)

Unnamed: 0,storey_range,floor_area_sqm,no.of bto,resale application,remaining_lease_months,Distance to nearest MRT,Distance to CBD,isMatureEstate,over_under_classification
0,0.0625,0.114583,0.0,0.0,0.229833,0.242702,0.377101,1.0,0
1,0.375,0.098958,0.0,0.0,0.888889,0.181339,0.376605,1.0,0
2,0.0,0.114583,0.0,0.0,0.243531,0.153615,0.373691,1.0,0
3,0.0625,0.114583,0.0,0.0,0.223744,0.212727,0.381423,1.0,0
4,0.125,0.088542,0.0,0.0,0.292237,0.247779,0.408523,1.0,0


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
hdb_model_data_classification.shape

(77834, 9)

In [5]:
# Name of the label column; every other column of the frame is a feature.
TARGET_COLUMN = "over_under_classification"

# Select by column name instead of by position, so the feature/label split
# does not silently change if the CSV gains or reorders columns.
X = hdb_model_data_classification.drop(columns=[TARGET_COLUMN])
y = hdb_model_data_classification[TARGET_COLUMN]

# Show the dtype of each feature column as the cell output.
X.dtypes

storey_range               float64
floor_area_sqm             float64
no.of bto                  float64
resale application         float64
remaining_lease_months     float64
Distance to nearest MRT    float64
Distance to CBD            float64
isMatureEstate             float64
dtype: object

In [6]:
import sklearn.feature_selection as fs

# Hold out the test rows BEFORE any supervised preprocessing. SelectKBest
# scores every feature against the labels, so fitting it on the full data set
# would let the test labels influence which features are kept and would bias
# the metrics reported on the test split.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the 3 highest-scoring features from the training rows only, then
# apply that same column selection to the held-out rows.
# X keeps all original feature columns; only X_train / X_test hold the
# reduced feature set.
bk = fs.SelectKBest(k=3)
X_train = bk.fit_transform(X_train_all, y_train)
X_test = bk.transform(X_test_all)

# AdaBoost Classifier

Use GridSearchCV to find the best hyperparameters for AdaBoost.

In [7]:
# AdaBoost with default hyperparameters as the base configuration for the
# search. A fixed random_state makes the fitted ensembles reproducible
# across runs.
model = AdaBoostClassifier(random_state=1)

# Candidate values for each hyperparameter (6 x 5 = 30 combinations).
grid = {
    'n_estimators': [50, 100, 150, 200, 250, 300],
    'learning_rate': [0.1, 0.5, 1.0, 1.5, 2.0],
}

# Evaluation procedure: 10-fold stratified CV repeated 3 times, i.e. 30 fits
# per hyperparameter combination.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Exhaustive search over the grid; n_jobs=-1 runs the fits in parallel on
# all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv)

# Run the search on the training split only.
grid_result = grid_search.fit(X_train, y_train)

# Summarize the best cross-validated score and the configuration behind it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Predict the held-out test data with the fitted search object.
y_pred = grid_result.predict(X_test)

# Accuracy, precision, recall and F1 score on the test split.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

Best: 0.539574 using {'learning_rate': 1.5, 'n_estimators': 250}


In [8]:
# Report each test metric on its own line as "<name> score = <value>".
for metric_name, metric_value in (
    ("accuracy", acc),
    ("precision", precision),
    ("recall", recall),
    ("f1", f1),
):
    print(metric_name + " score = " + str(metric_value))

accuracy score = 0.5411447292349201
precision score = 0.524876673713883
recall score = 0.4967982924226254
f1 score = 0.5104516482763346


# Gaussian Naive Bayes

In [9]:
# Create the Gaussian Naive Bayes classifier and train it on the training
# features and labels in one step (fit returns the estimator itself).
model = GaussianNB().fit(X_train, y_train)
# Display the fitted estimator as the cell output.
model

GaussianNB()

In [10]:
#Use the trained model to predict the test data
y_pred = model.predict(X_test)
# Find the confusion matrix of the result
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Find the accuracy, precision, recall and F1 score of the result
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("accuracy score = " + str(acc))
print("precision score = " + str(precision))
print("recall score = " + str(recall))
print("f1 score = " + str(f1))

[[6437 1634]
 [5648 1848]]
accuracy score = 0.5322155842487313
precision score = 0.530729465824239
recall score = 0.24653148345784417
f1 score = 0.3366733466933868
