In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import folium 
import requests
import json
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

hdb_class_data = pd.read_csv("hbd_model_data_classification.csv")

In [2]:
hdb_class_data.head()

Unnamed: 0,storey_range,floor_area_sqm,no.of bto,resale application,remaining_lease_months,Distance to nearest MRT,Distance to CBD,isMatureEstate,over_under_classification
0,5,73.0,7314,26436.0,667,908.970521,9026.295266,1,0
1,20,70.0,7314,26436.0,1100,687.185319,9015.122154,1,0
2,2,73.0,7314,26436.0,676,586.98069,8949.443986,1,0
3,5,73.0,7314,26436.0,663,800.631299,9123.690385,1,0
4,8,68.0,7314,26436.0,708,927.322849,9734.443856,1,0


### Check distribution of classes

In [None]:
hdb_class_data.groupby("over_under_classification").count()

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title("Distribution of Valuation Classifications")

sns.countplot(x=hdb_class_data["over_under_classification"])

### Define Independent and Target Variables

In [3]:
# declare independent and dependent variables
X = hdb_class_data.drop(columns=['over_under_classification'])
y = hdb_class_data[['over_under_classification']]

### Split Data into Training and Test Sets

In [4]:
#Split the dataset into training and testing data
from sklearn.model_selection import train_test_split

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Feature Selection for Logistic Regression

In [5]:
logreg_clf = LogisticRegression(random_state=42)
logreg_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(random_state=42)

In [6]:
print(logreg_clf.coef_)

[[ 2.03837962e-02 -8.31550053e-04  2.15752715e-06 -5.56909635e-06
   1.44771599e-06 -1.00129996e-04  5.13241592e-06  2.01753689e-04]]


In [7]:
print(np.round(logreg_clf.coef_, decimals=2)>0)

[[ True False False False False False False False]]


##### Recursive Feature Elimination Using Scikit-learn RFE

In [None]:
from sklearn.feature_selection import RFE   

predictors = X_train

# n_features_to_select = 1 to get full ranking of features
selector = RFE(logreg_clf, n_features_to_select=1)
selector = selector.fit(predictors, y_train)

In [None]:
order = selector.ranking_
print(order)

##### Derive the Features Selected by RFE

In [None]:
features_ranked = []

for i in order:
   features_ranked.append(predictors.columns[i-1])

print(features_ranked)

### Train Logistic Regression Model With ALL Available Features From Dataset

In [8]:
logreg_clf_all = LogisticRegression(random_state=42)
logreg_clf_all.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(random_state=42)

In [9]:
print(logreg_clf_all.coef_)

[[ 2.03837962e-02 -8.31550053e-04  2.15752715e-06 -5.56909635e-06
   1.44771599e-06 -1.00129996e-04  5.13241592e-06  2.01753689e-04]]


In [10]:
print(logreg_clf_all.intercept_)

[2.38015211e-05]


##### Model Evaluation (Model Trained With ALL Available Features)

In [11]:
from sklearn.metrics import roc_auc_score

# training data results
y_train_pred = logreg_clf.predict(X_train)

# training data performance
asr = accuracy_score(y_train, y_train_pred)
f1 = f1_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred)
recall = recall_score(y_train, y_train_pred)

print("Accuracy: " + str(asr))
print("F1: " + str(f1))
print("Precision: " + str(precision))
print("Recall: " + str(recall))

# generate roc_auc_score for log reg
print("ROC AUC Score: " + str(roc_auc_score(y_train, y_train_pred)))

Accuracy: 0.5324489697592625
F1: 0.40888509877972024
Precision: 0.5349306699250916
Recall: 0.3309123176022085
ROC AUC Score: 0.5279832482197419


In [None]:
y_pred_all = logreg_clf_all.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# generate confusion matrix for log reg
cnf = confusion_matrix(y_test, y_pred_all)
print("Confusion Matrix: \n", cnf)

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cnf, display_labels=logreg_clf_all.classes_)

disp.plot()

plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# generate roc_auc_score for log reg
roc_auc_score(y_test, y_pred_all)

In [None]:
# Find accuracy, precision, recall, and f1 score
asr = accuracy_score(y_test, y_pred_all)
f1 = f1_score(y_test, y_pred_all)
precision = precision_score(y_test, y_pred_all)
recall = recall_score(y_test, y_pred_all)

print("Accuracy: " + str(asr))
print("F1: " + str(f1))
print("Precision: " + str(precision))
print("Recall: " + str(recall))

### Train Logistic Regression Model With Selected Features

In [None]:
# re-select independent and dependent variables
X_selected = hdb_class_data[['storey_range', 'floor_area_sqm', 'Distance to nearest MRT']]
y_selected = hdb_class_data[['over_under_classification']]

In [None]:
# split data
X_selected_train, X_selected_test, y_selected_train, y_selected_test = train_test_split(X_selected, y_selected, test_size = 0.2, random_state = 0)

In [None]:
logreg_clf = LogisticRegression(random_state=42)
logreg_clf.fit(X_selected_train, y_selected_train)

In [None]:
# training data results
y_pred = logreg_clf.predict(X_selected_train)

# training data performance
asr = accuracy_score(y_selected_train, y_pred)
f1 = f1_score(y_selected_train, y_pred)
precision = precision_score(y_selected_train, y_pred)
recall = recall_score(y_selected_train, y_pred)

print("Accuracy: " + str(asr))
print("F1: " + str(f1))
print("Precision: " + str(precision))
print("Recall: " + str(recall))

# generate roc_auc_score for log reg
print("ROC AUC Score: " + str(roc_auc_score(y_selected_train, y_pred)))

In [None]:
print(logreg_clf.coef_)

In [None]:
print(logreg_clf.intercept_)

##### Model Evaluation (Model Trained With SELECTED Available Features)

In [None]:
y_pred = logreg_clf.predict(X_selected_test)

In [None]:
from sklearn.metrics import confusion_matrix

# generate confusion matrix for log reg
cnf = confusion_matrix(y_selected_test, y_pred)
print("Confusion Matrix: \n", cnf)

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cnf, display_labels=logreg_clf.classes_)

disp.plot()

plt.show()

In [None]:
# generate roc_auc_score for log reg
roc_auc_score(y_selected_test, y_pred)

In [None]:
# Find accuracy, precision, recall, and f1 score
asr = accuracy_score(y_selected_test, y_pred)
f1 = f1_score(y_selected_test, y_pred)
precision = precision_score(y_selected_test, y_pred)
recall = recall_score(y_selected_test, y_pred)

print("Accuracy: " + str(asr))
print("F1: " + str(f1))
print("Precision: " + str(precision))
print("Recall: " + str(recall))