In [None]:
import json

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# Load the classification dataset from the notebook's working directory.
# NOTE(review): the file name starts with "hbd", not "hdb" — confirm it matches the file on disk.
csv_path = "hbd_model_data_classification.csv"
hdb_class_data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows to sanity-check the loaded columns and values.
hdb_class_data.head()

### Check Distribution of Classes

In [None]:
# Per class label, count the non-null entries in every other column.
rows_by_class = hdb_class_data.groupby("over_under_classification")
rows_by_class.count()

In [None]:
# Bar chart of how many rows fall into each class label.
fig, ax = plt.subplots(figsize=(10, 10))

# Draw on the axes created above (explicitly, rather than via the implicit current axes).
class_labels = hdb_class_data["over_under_classification"]
ax.set_title("Distribution of Valuation Classifications")
sns.countplot(x=class_labels, ax=ax)

### Define Independent and Target Variables

In [None]:
# Independent variables: every column except the class label.
X = hdb_class_data.drop(columns=['over_under_classification'])

# Dependent variable: the class label as a 1-D Series.
# Selecting with double brackets would give an (n_samples, 1) DataFrame, which
# scikit-learn estimators accept only after emitting a DataConversionWarning.
y = hdb_class_data['over_under_classification']

### Split Data into Training and Test Sets

In [None]:
# Split the dataset into training and testing data: 20% of the rows are held
# out for testing, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

### Feature Selection for Logistic Regression

In [None]:
# Baseline logistic regression on every available column; this estimator is
# also the one handed to RFE further down.
logreg_clf = LogisticRegression(random_state=42)
logreg_clf.fit(X=X_train, y=y_train)

In [None]:
# Fitted coefficients of the baseline model, in the column order of X_train.
print(logreg_clf.coef_)

In [None]:
# True where a coefficient is still positive after rounding to 2 decimals
# (coefficients that round to 0.00 or below show as False).
rounded_coefs = np.round(logreg_clf.coef_, decimals=2)
print(rounded_coefs > 0)

##### Recursive Feature Elimination Using Scikit-learn RFE

In [None]:
predictors = X_train

# n_features_to_select = 1 to get full ranking of features: RFE keeps
# eliminating until one column is left, so every column receives its own rank.
# NOTE(review): the elimination is driven by the logistic-regression
# coefficients, which depend on feature scale — confirm the columns are on
# comparable scales before trusting the ranking.
selector = RFE(logreg_clf, n_features_to_select=1)
selector = selector.fit(predictors, y_train)

In [None]:
# One rank per column of `predictors`, in column order (rank 1 = the feature RFE kept last).
order = selector.ranking_
print(order)

##### Derive the Features Selected by RFE

In [None]:
# `order[j]` is the RANK of column j, not the index of the j-th best column.
# Using the ranks directly as column positions would return the inverse
# permutation, so sort the column positions by their rank instead.
# A stable sort keeps the original column order among tied ranks.
columns_best_first = np.argsort(order, kind="stable")
features_ranked = [predictors.columns[pos] for pos in columns_best_first]

print(features_ranked)

### Train Logistic Regression Model With ALL Available Features From Dataset

In [None]:
# Reference model trained on every column of the training split.
logreg_clf_all = LogisticRegression(random_state=42)
logreg_clf_all.fit(X=X_train, y=y_train)

In [None]:
# Fitted coefficients of the all-features model, in the column order of X_train.
print(logreg_clf_all.coef_)

In [None]:
# Fitted intercept (bias term) of the all-features model.
print(logreg_clf_all.intercept_)

##### Model Evaluation (Model Trained With ALL Available Features)

In [None]:
# Hard class predictions of the all-features model on the held-out test rows.
y_pred_all = logreg_clf_all.predict(X_test)

In [None]:
# generate confusion matrix for log reg
# (rows are the actual classes, columns the predicted classes)
cnf = confusion_matrix(y_test, y_pred_all)
print("Confusion Matrix: \n", cnf)

In [None]:
# Render the confusion matrix with the model's own class labels on both axes.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cnf,
    display_labels=logreg_clf_all.classes_,
)
disp.plot()
plt.show()

In [None]:
# generate roc_auc_score for log reg
# ROC AUC measures how well the model RANKS the rows, so it must be given a
# continuous score. Feeding it the thresholded 0/1 predictions collapses the
# ROC curve to a single operating point. Column 1 of predict_proba is the
# probability of classes_[1], the class roc_auc_score treats as positive.
y_score_all = logreg_clf_all.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_score_all)

In [None]:
# Find accuracy, precision, recall, and f1 score for the all-features model
# on the held-out test rows.
asr = accuracy_score(y_test, y_pred_all)
f1 = f1_score(y_test, y_pred_all)
precision = precision_score(y_test, y_pred_all)
recall = recall_score(y_test, y_pred_all)

print(f"Accuracy: {asr}")
print(f"F1: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

### Train Logistic Regression Model With Selected Features

In [None]:
# re-select independent and dependent variables
# NOTE(review): presumably these three columns were picked from the RFE
# ranking — confirm they are the columns with the lowest values in
# selector.ranking_ (ranking_[i] is the rank of column i).
X_selected = hdb_class_data[['storey_range', 'floor_area_sqm', 'Distance to nearest MRT']]

# Target as a 1-D Series: an (n_samples, 1) DataFrame would make scikit-learn
# emit a DataConversionWarning when fitting.
y_selected = hdb_class_data['over_under_classification']

In [None]:
# Same 80/20 split settings (test_size, random_state) as the all-features split.
(
    X_selected_train,
    X_selected_test,
    y_selected_train,
    y_selected_test,
) = train_test_split(X_selected, y_selected, test_size=0.2, random_state=0)

In [None]:
# Fit a fresh model on the selected columns only.
# Note: this rebinds `logreg_clf`, which earlier referred to the estimator
# trained on all columns and passed to RFE.
logreg_clf = LogisticRegression(random_state=42)
logreg_clf.fit(X=X_selected_train, y=y_selected_train)

In [None]:
# training data results
# Kept under its own name so the training predictions are never mistaken for
# the test-set predictions computed later.
y_pred_train = logreg_clf.predict(X_selected_train)

# training data performance
asr = accuracy_score(y_selected_train, y_pred_train)
f1 = f1_score(y_selected_train, y_pred_train)
precision = precision_score(y_selected_train, y_pred_train)
recall = recall_score(y_selected_train, y_pred_train)

print(f"Accuracy: {asr}")
print(f"F1: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

# generate roc_auc_score for log reg
# ROC AUC needs a continuous score, not thresholded labels: use the predicted
# probability of classes_[1] (column 1 of predict_proba).
y_score_train = logreg_clf.predict_proba(X_selected_train)[:, 1]
print("ROC AUC Score: " + str(roc_auc_score(y_selected_train, y_score_train)))

In [None]:
# Fitted coefficients of the selected-features model, in the column order of X_selected.
print(logreg_clf.coef_)

In [None]:
# Fitted intercept (bias term) of the selected-features model.
print(logreg_clf.intercept_)

##### Model Evaluation (Model Trained With SELECTED Features)

In [None]:
# Hard class predictions of the selected-features model on the held-out test rows.
y_pred = logreg_clf.predict(X_selected_test)

In [None]:
# generate confusion matrix for log reg
# (rows are the actual classes, columns the predicted classes)
cnf = confusion_matrix(y_selected_test, y_pred)
print("Confusion Matrix: \n", cnf)

In [None]:
# Render the confusion matrix with the selected-features model's class labels.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cnf,
    display_labels=logreg_clf.classes_,
)
disp.plot()
plt.show()

In [None]:
# generate roc_auc_score for log reg
# Score the test rows with the predicted probability of classes_[1] rather
# than the thresholded labels; hard 0/1 predictions reduce the ROC curve to a
# single operating point.
y_score = logreg_clf.predict_proba(X_selected_test)[:, 1]
roc_auc_score(y_selected_test, y_score)

In [None]:
# Test-set accuracy, F1, precision and recall for the selected-features model.
asr = accuracy_score(y_selected_test, y_pred)
f1 = f1_score(y_selected_test, y_pred)
precision = precision_score(y_selected_test, y_pred)
recall = recall_score(y_selected_test, y_pred)

print(f"Accuracy: {asr}")
print(f"F1: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")