# Training the ML Model

In [None]:
# Data Manipulation Dependencies
import numpy as np
import pandas as pd

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Dependencies (scikit-learn pieces are imported in the cells that use them)


## Import Data

In [None]:
# Load the flat-prices CSV; the first row of the file supplies the column names.
DATA_PATH = './data/flat-prices.csv'
df = pd.read_csv(DATA_PATH, header=0, sep=',')

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

## Data Pre-Processing

In [None]:
# Parse the 'month' column into datetimes and store the result as 'approval_date'.
# NOTE(review): 'month' presumably holds year-month strings - confirm against the CSV.
df['approval_date'] = pd.DatetimeIndex(df['month'])

In [None]:
# Remove columns that are not used as model features.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
unused_columns = ['block', 'street_name', 'flat_model']
df = df.drop(columns=unused_columns, errors='ignore')

In [None]:
# Copy 'lease_commence_date' into a new 'lease_commencement_year' column
# (the original column is dropped in a later cell).
df['lease_commencement_year'] = df['lease_commence_date']

In [None]:
# Preview the first rows after the preprocessing steps so far.
df.head()

In [None]:
def _storey_midpoint(storey_range):
    """Return the mean of the ' TO '-separated integer bounds, truncated to an int."""
    bounds = np.array(storey_range.split(' TO '), dtype=int)
    return int(bounds.mean())


# Collapse each storey range label into a single representative storey number.
df['storey'] = df['storey_range'].apply(_storey_midpoint)

In [None]:
# Number of bedrooms implied by each flat type label.
# Defined once here instead of inside the lambda, where the dict literal was
# rebuilt for every row of the frame.
BEDROOMS_BY_FLAT_TYPE = {
    '1 ROOM': 1,
    '2 ROOM': 1,
    '3 ROOM': 2,
    '4 ROOM': 3,
    '5 ROOM': 3,
    'EXECUTIVE': 3,
    'MULTI GENERATION': 4,
}

# Direct indexing (rather than Series.map) is deliberate: an unknown flat type
# raises KeyError instead of silently becoming NaN.
df['bedrooms'] = df['flat_type'].apply(
    lambda flat_type: BEDROOMS_BY_FLAT_TYPE[flat_type]
)

In [None]:
# These raw columns have been superseded by the derived ones created above
# (approval_date, lease_commencement_year, storey, bedrooms), so remove them.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
raw_columns = ['month', 'lease_commence_date', 'storey_range', 'flat_type']
df = df.drop(columns=raw_columns, errors='ignore')

In [None]:
# Cast 'resale_price' (used as the regression target further down) to float.
df['resale_price'] = df['resale_price'].astype(float)

## Exploratory Data Analysis

In [None]:
# Plot resale price against floor area to inspect how the two relate.
sns.scatterplot(data=df, x='floor_area_sqm', y='resale_price')

In [None]:
# Display the preprocessed frame (Jupyter renders the last expression of a cell).
df

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense integer output is requested so the encoded array can be wrapped
# directly in a DataFrame later on.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# legacy one on older releases (which reject the new keyword with TypeError).
try:
    enc = OneHotEncoder(dtype=int, sparse_output=False)
except TypeError:
    enc = OneHotEncoder(dtype=int, sparse=False)

In [None]:
# Preview the one-hot encoding of 'town' (one indicator column per distinct
# value); the result is only displayed here, not stored.
enc.fit_transform(df[['town']])

In [None]:
# One-hot encode 'town' and swap the original column for the indicator columns.
town_dummies = pd.DataFrame(
    data=enc.fit_transform(df[['town']]),
    columns=enc.categories_[0],
    # concat(axis=1) aligns rows on index labels. Reusing df's index keeps the
    # encoded rows matched to their source rows even if df has been filtered
    # or re-indexed; a fresh RangeIndex would misalign them and introduce NaNs.
    index=df.index,
)
df = pd.concat(objs=(df.drop(columns=['town']), town_dummies), axis=1)

In [None]:
from datetime import date

# Replace each approval date with its proleptic Gregorian ordinal (an integer
# day number) so the column is numeric for the model.
df['approval_date'] = df['approval_date'].apply(date.toordinal)

In [None]:
from sklearn.model_selection import train_test_split


# Features are every column except the target; the target is the resale price.
X = df.drop(columns='resale_price')
y = df['resale_price']

# Hold out 25% of the rows for testing. The fixed random_state makes the
# shuffled split identical on every run, so metrics computed from it are
# comparable between runs instead of changing each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=0.25, random_state=42
)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# Fit a gradient-boosting regressor with default hyper-parameters on the
# training split; fit() returns the estimator itself.
model = GradientBoostingRegressor().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
model

In [None]:
# 5-fold cross-validation of a fresh default regressor on the training split;
# yields one score per fold, computed with the estimator's default scorer.
cv_scores = cross_val_score(GradientBoostingRegressor(), X_train, y_train, cv=5)
cv_scores

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score

# Hold-out performance of the fitted model on the test split.
y_pred = model.predict(X_test)

# Displayed as a tuple: (MAPE, MAE, R^2).
test_metrics = (
    mean_absolute_percentage_error(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)
test_metrics

In [None]:
# Same three metrics on the training split, for comparison with the test
# figures (a large gap would indicate over-fitting).
y_train_pred = model.predict(X_train)

# Displayed as a tuple: (MAPE, MAE, R^2).
train_metrics = (
    mean_absolute_percentage_error(y_train, y_train_pred),
    mean_absolute_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)
train_metrics

In [None]:
# Train the model intended for export on the full dataset (train and test rows
# together); fit() returns the estimator itself.
final_model = GradientBoostingRegressor().fit(X, y)

# Show the fitted estimator as the cell output.
final_model

## Model Export

In [None]:
# Export step is currently disabled. Uncommenting the lines below would pickle
# `final_model` to ./model/regressor.p.
# import pickle

# with open('./model/regressor.p', 'wb') as model_file:
#     pickle.dump(file=model_file, obj=final_model)