# Training the ML Model

## Import General Dependencies

In [None]:
# Data Manipulation Dependencies
import numpy as np
import pandas as pd
from datetime import date

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Dependencies
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score

## Import Data

In [None]:
# Load the resale-flat dataset; comma separation and a header in the first
# row are read_csv's defaults, so they are not spelled out.
df = pd.read_csv('./data/flat-prices.csv')
df.head()

In [None]:
# Print each column's dtype and non-null count to spot missing values early
df.info()

In [None]:
# Show summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

## Data Pre-Processing

In [None]:
# Parse the 'month' column into datetimes and store it as approval_date
df['approval_date'] = pd.DatetimeIndex(df['month'])

# NOTE: approval_date stays a datetime here; the conversion to an integer
# ordinal is done by the DateConverter pipeline step rather than inline.

In [None]:
# Copy lease_commence_date into the more descriptive "lease_commencement_year"
# column (the original column is removed later in the "Drop unused columns" cell)
df['lease_commencement_year'] = df['lease_commence_date']

In [None]:
# Estimate storey as the truncated midpoint of storey_range ("<low> TO <high>")
def _storey_midpoint(storey_range):
    bounds = np.array(storey_range.split(' TO '), dtype=int)
    return int(bounds.mean())


df['storey'] = df['storey_range'].apply(_storey_midpoint)

In [None]:
# Bedroom count for every known flat_type label; an unknown label raises KeyError
bedrooms_by_flat_type = {
    '1 ROOM': 1,
    '2 ROOM': 1,
    '3 ROOM': 2,
    '4 ROOM': 3,
    '5 ROOM': 3,
    'EXECUTIVE': 3,
    'MULTI GENERATION': 4,
}
df['bedrooms'] = df['flat_type'].apply(
    lambda flat_type: bedrooms_by_flat_type[flat_type])

In [None]:
# Make sure the regression target (resale_price) is stored as floats
df = df.astype({'resale_price': float})

In [None]:
# Remove the raw columns that are no longer needed. The derived approval_date
# and lease_commencement_year columns are kept; 'storey' is removed again.
unused_columns = [
    'month',
    'town',
    'flat_type',
    'block',
    'street_name',
    'storey_range',
    'flat_model',
    'lease_commence_date',
    'storey',
]
df = df.drop(columns=unused_columns)


## Exploratory Data Analysis

In [None]:
# Plot resale price against floor area to eyeball how linear the relation is
sns.scatterplot(x='floor_area_sqm', y='resale_price', data=df)

## Partition Data

In [None]:
# Separate the feature columns from the target (resale_price)
X = df.drop(columns='resale_price')
y = df['resale_price']

# Hold out 25% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so the reported metrics are comparable between notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=0.25, random_state=42)

## Build ML Pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DateConverter(BaseEstimator, TransformerMixin):
    """Pipeline step that turns ``approval_date`` into integer day ordinals.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    leaves its input untouched, returning a modified copy instead.
    """

    def fit(self, X, y=None):
        """Return the transformer itself; there is nothing to fit."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``approval_date`` replaced by ordinals.

        Each value of the ``approval_date`` column is passed through
        ``date.toordinal``; all other columns are carried over unchanged.
        """
        ordinals = X['approval_date'].apply(date.toordinal)
        return X.assign(approval_date=ordinals)

In [None]:
# Processing order: dates -> ordinals, standardise features, then fit a
# linear regression (with intercept) on the scaled features.
pipeline_steps = [
    ('date_conv', DateConverter()),
    ('scaler', StandardScaler()),
    ('model', LinearRegression(fit_intercept=True)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# 5-fold cross-validation on the training partition, scored with the
# pipeline's default scorer
cross_val_score(pipeline, X_train, y_train, cv=5)

## Model Evaluation

In [None]:
# Fit every pipeline step on the training partition
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Training-set fit quality as (MAPE, MAE, R^2)
y_train_pred = pipeline.predict(X_train)
(
    mean_absolute_percentage_error(y_train, y_train_pred),
    mean_absolute_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)

In [None]:
# Held-out test-set quality as (MAPE, MAE, R^2)
y_pred = pipeline.predict(X_test)
(
    mean_absolute_percentage_error(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

In [None]:
# Coefficients of the fitted linear model. StandardScaler runs before the model
# in the pipeline, so these apply to the standardised features, not raw units.
# NOTE(review): presumably ordered like the columns of X - confirm.
pipeline.named_steps['model'].coef_

In [None]:
def create_stencil(variable: str, **kwargs):
    """Build a feature frame that varies one input and repeats the others.

    Args:
        variable: Name of the keyword whose value is a sized sequence; its
            length sets the number of rows.
        **kwargs: One value per feature. Every value other than
            ``kwargs[variable]`` is repeated to fill all rows.

    Returns:
        A DataFrame with the columns ``floor_area_sqm``, ``approval_date``,
        ``lease_commencement_year`` and ``bedrooms``, in that order.
    """
    n_rows = len(kwargs[variable])
    columns = {
        name: value if name == variable
        else np.resize(np.array([value]), (n_rows,))
        for name, value in kwargs.items()
    }
    feature_order = ['floor_area_sqm', 'approval_date',
                     'lease_commencement_year', 'bedrooms']
    return pd.DataFrame(columns)[feature_order]

In [None]:
def get_regression_plot(variable, mark, ax, **kwargs):
    """Draw the model's prediction along one feature and mark a single point.

    Args:
        variable: Feature to sweep; must be a column of the global ``df`` and
            a key of ``kwargs`` (a missing key raises ``KeyError``).
        mark: Value of ``variable`` at which to place the marker.
        ax: Matplotlib axes to draw on.
        **kwargs: Values for all features; the entry for ``variable`` is
            discarded and the rest are held fixed.
    """
    observed = df[variable]
    endpoints = (observed.min(), observed.max())
    # The swept feature's own value is replaced by the endpoints / the mark.
    kwargs.pop(variable)

    line_prices = pipeline.predict(
        create_stencil(variable, **{variable: endpoints}, **kwargs))
    marked_price = pipeline.predict(
        create_stencil(variable, **{variable: (mark,)}, **kwargs))

    # Two endpoints are enough to draw the line between the observed extremes.
    sns.lineplot(x=endpoints, y=line_prices, ax=ax)
    sns.scatterplot(x=mark, y=marked_price, ax=ax)

In [None]:
def get_regression_plots(bedrooms, floor_area_sqm, approval_date, lease_commencement_year):
    """Plot the model's response to each feature for one hypothetical flat.

    A 2x2 grid is created; each active panel sweeps one feature while the
    others are held at the given values, and marks the given flat on the line.

    Args:
        bedrooms: Number of bedrooms of the flat.
        floor_area_sqm: Floor area of the flat.
        approval_date: Approval date of the flat.
        lease_commencement_year: Lease commencement year of the flat.

    Returns:
        The matplotlib figure holding the panels.
    """
    inputs = {
        'bedrooms': bedrooms,
        'floor_area_sqm': floor_area_sqm,
        'approval_date': approval_date,
        'lease_commencement_year': lease_commencement_year,
    }
    fig, ax = plt.subplots(2, 2)
    panels = {
        'bedrooms': ax[0, 0],
        'floor_area_sqm': ax[0, 1],
        # 'approval_date': ax[1, 0],  # panel currently disabled; subplot stays empty
        'lease_commencement_year': ax[1, 1],
    }
    for variable, axis in panels.items():
        # The marker must sit at the value of the feature being swept; the
        # previous code passed `bedrooms` as the mark for every panel.
        get_regression_plot(variable=variable, mark=inputs[variable], ax=axis, **inputs)
    return fig


In [None]:
# Example: a 3-bedroom, 31 sqm flat approved on 2000-12-12 with a lease from 2000
get_regression_plots(
    bedrooms=3,
    floor_area_sqm=31.0,
    approval_date=date(2000, 12, 12),
    lease_commencement_year=2000,
)
# The empty string becomes the cell's last expression, presumably so the
# notebook does not also echo the returned figure.
''

## Model Export

In [None]:
# import pickle

# with open('./model/regressor.p', 'wb') as model_file:
#     pickle.dump(file=model_file, obj=pipeline)