In [89]:
%reset -f

# AIML CA1

## Import General Dependencies

In [None]:
# Mathematical Dependencies
import numpy as np

# Data Manipulation Dependencies
import pandas as pd

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Dependencies
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline

# Preservation Dependency
import pickle

# Miscellaneous Dependencies
from typing import Callable, Dict, Union    # static typing
from warnings import filterwarnings         # warnings

# Utility Functions
from utils.extraction import extract_attributes

In [None]:
%matplotlib inline

In [None]:
# Hide Warnings
filterwarnings(action='ignore')

## Part A > Classification

*   How is your prediction task defined? And what is the meaning of the
output variable?

```
    The task is to predict if a mushroom of the agaricus lepiota family is edible or poisonous,
    given its properties (i.e. cap-shape, odor, etc.)

    The output variable is class, and its possible values carry the respective meanings:
    'edible':       the mushroom is safe for consumption
    'poisonous':    do not consume the mushroom
```

### Import Data

Load data about edibility of gilled mushrooms of the agaricus lepiota family

In [None]:
def load_mushrooms() -> pd.DataFrame:
    """Load the agaricus-lepiota mushroom records as a labelled DataFrame.

    Column names and the per-column replacement mappings come from the
    ``.names`` metadata file (parsed by ``extract_attributes``); the records
    themselves come from the ``.data`` file.

    Returns:
        A DataFrame with one column per attribute, where each attribute code
        has been replaced according to the metadata mapping.
    """
    # Extract raw content of ./data/agaricus-lepiota.names file
    with open('./data/agaricus-lepiota.names') as f:
        metadata = f.read()

    # Extract attributes from metadata
    # (presumably {column name: {code: full definition}} -- verify against utils.extraction)
    attrs = extract_attributes(metadata, r'7\. Attribute Information:.*\n((.|\n)*)8\. Missing')

    # Extract column names to be used for dataframe
    cols = list(attrs.keys())

    # Create the dataframe from ./data/agaricus-lepiota.data file,
    #   using column names derived from ./data/agaricus-lepiota.names file
    # NOTE(review): header=0 makes pandas treat the first line of the file as a
    #   header row and discard it. If the .data file has no header line, the
    #   first record is lost and header=None should be used -- confirm against the file.
    df = pd.read_csv(
        filepath_or_buffer='./data/agaricus-lepiota.data',
        sep=',',
        header=0,
        names=cols
    )

    # Expand attribute codes to their full definitions.
    # The result is assigned back instead of calling replace(inplace=True) on
    #   df[col]: the in-place form mutates a temporary and is a no-op under
    #   pandas Copy-on-Write.
    for col in cols:
        df[col] = df[col].replace(to_replace=attrs[col])

    return df

In [None]:
df = load_mushrooms()

#### Inspect Data

*   How do you represent your data as features?

```
    I represent the features as columns in a pandas DataFrame
```

In [None]:
# Inspect top 10 rows of the dataset
df.head(n=10)

#### Summarize Data

In [None]:
# Inspect overview of the dataset
df.info()

### Pre-Processing

*   Did you process the features in any way?

```
    Yes, the features underwent (feature) selection and (one-hot) encoding
```

#### EDA

In [None]:
# List to keep track of variables to be removed
drop_cols = []

Missing Values

In [None]:
# Check for missing values
# (printed explicitly: only the last expression of a cell is displayed automatically)
print(df.isna().sum(axis=0))

# Note that stalk-root has missing attributes (denoted as 'missing')
# In fact, approx. 31% of the records have missing data for stalk-root
stalk_dist = df['stalk-root'].value_counts()
print((stalk_dist / stalk_dist.sum()).round(2))

# Course of action - drop column
# (guarded so that re-running this cell does not register the column twice)
if 'stalk-root' not in drop_cols:
    drop_cols.append('stalk-root')

Redundant Features

In [None]:
# Inspect unique counts of the individual features
print(df.describe().transpose().sort_values(by='unique', ascending=False))

# Note that veil-type has only one value,
#   hence it is a redundant feature

# Course of action - drop column
drop_cols.append('veil-type')

Inspect the distribution of the target variable (class: edible/poisonous)

In [None]:
from utils.plotting import format_label
def plot_A(df: pd.DataFrame):
    """Draw a count plot of the ``class`` column with annotated bars.

    Each bar is labelled with its height and its share of all rows in ``df``.
    The y tick labels are produced by ``format_label`` from the tick values
    divided by 1000 (presumably rendering them as '1k', '2k', ... -- verify
    against utils.plotting).

    Returns:
        The matplotlib Axes the plot was drawn on.
    """
    n_records = df.shape[0]

    ax = sns.countplot(data=df, x='class', palette='deep')
    ax.set_ylim(top=5000)
    ax.set_title(label='General Data Distribution')
    ax.set_xlabel(xlabel='Type')
    ax.set_ylabel(ylabel='Number of Records')

    ticks_in_thousands = ax.get_yticks() / 1000
    ax.set_yticklabels(labels=format_label(ticks_in_thousands, lambda s: f'{round(s)}k'))

    # Write "<count> (<percentage>%)" slightly above each bar
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(text=f'{height} ({height/n_records*100:.1f}%)',
                    xy=(bar.get_x() + 0.21, height + 70))
    return ax
# plot_A(df=df)

Inspect correlation between the independent variables and the target variable (class)

In [None]:
def plot_B(df: pd.DataFrame):
    """Plot, for every feature, its association with ``class`` and its frequencies.

    One two-panel figure is created per non-target column:
      * left  - per category, (count of the second class row - count of the
                first class row) / category size, a value in [-1, 1];
      * right - count plot of the feature split by class.
    """
    for feature in df.drop(columns='class').columns:
        fig, (assoc_ax, count_ax) = plt.subplots(ncols=2, figsize=(14, 6))
        fig.suptitle(t=feature.upper())

        # Signed class imbalance within each category of the feature
        class_by_value = pd.crosstab(index=df['class'], columns=df[feature])
        category_sizes = df.groupby(feature).count().iloc[:, 0]
        imbalance = (class_by_value.iloc[1] - class_by_value.iloc[0]) / category_sizes
        imbalance_frame = pd.DataFrame(imbalance.reset_index())

        sns.barplot(data=imbalance_frame, x=feature, y=0, ax=assoc_ax, color='grey')
        assoc_ax.set_title(label='Correlation (chi2-based)')
        assoc_ax.set_ylim((-1.1, 1.1))
        assoc_ax.set_ylabel(ylabel='Correlation')
        assoc_ax.set_xticklabels(labels=assoc_ax.get_xticklabels(), rotation=30)

        sns.countplot(data=df.sort_values(by=feature), x=feature, hue='class', ax=count_ax, palette='turbo')
        count_ax.set_title(label='Frequency Distribution')
        count_ax.set_xticklabels(labels=count_ax.get_xticklabels(), rotation=30)
# plot_B(df)

#### Feature Engineering

There is no need for feature engineering in this dataset

#### Feature Selection

There are 2 columns to be removed (stalk-root, veil-type)

In [None]:
# Remove columns
# (re-binding instead of inplace=True, and errors='ignore', keep this cell
#  safe to re-run after the columns have already been removed)
df = df.drop(columns=drop_cols, errors='ignore')

### Encoding the data

The data has only categorical text variables, therefore they<br>have to be converted to numeric form using dummy variables

In [None]:
# (One-Hot) Encode the dataset (categorical -> binary)
df_ohe = pd.get_dummies(data=df, drop_first=True)

### Inspect correlation after encoding

In [None]:
# Get correlation between top 10 factors and target variable (class)
df_ohe.corr()['class_poisonous'].drop(labels='class_poisonous').sort_values(key=lambda x: np.abs(x), ascending=False).head(n=10)

In [None]:
# Chi2-based feature selection
from sklearn.feature_selection import chi2, SelectKBest

# Get top 10 factors that are correlated with the target variable (class)
best_features_chi2 = SelectKBest(score_func=chi2, k=10).fit(X=df_ohe.drop(labels='class_poisonous', axis=1), y=df_ohe['class_poisonous'])
best_features_mask = best_features_chi2.get_support()
best_features = df_ohe.drop(labels='class_poisonous', axis=1).columns.values[best_features_mask]
best_features_scores = best_features_chi2.scores_[best_features_mask]
good_predictors = pd.Series(data=best_features_scores, index=best_features)

good_predictors.sort_values(ascending=False)

### Data Partitioning

Split the data randomly into a train set and a test set

In [None]:
# Split the dataset into training and test sets
X = df_ohe.drop(labels='class_poisonous', axis=1)
y = df_ohe['class_poisonous']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Algorithm Selection & Hyper-Parameter Tuning

In [None]:
# Candidate classification algorithms
from sklearn.naive_bayes import CategoricalNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#### Determine best candidate algorithm using GridSearch

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class DummyEstimator(BaseEstimator):
    """Placeholder final pipeline step that a parameter grid replaces.

    It only reserves the step name so that GridSearchCV can substitute real
    estimators for it. The methods follow the scikit-learn signatures
    (accepting X and y) so an accidental call does not fail with a TypeError.
    """

    def fit(self, X=None, y=None, **fit_params):
        """Do nothing and return ``self``, as scikit-learn's fit contract expects."""
        return self

    def score(self, X=None, y=None):
        """Do nothing; the placeholder has no meaningful score (returns None)."""
        return None

In [None]:
def grid_search_clf():
    """Grid-search five candidate classifier families and return the fitted search.

    Each entry of the parameter grid swaps a different classifier into the
    'clf' step of a scaler + classifier pipeline. Reads the module-level
    ``X`` and ``y``.

    Returns:
        The fitted GridSearchCV object (3-fold cross-validation).
    """
    # 'clf' is only a placeholder; every grid below substitutes a real classifier
    candidate_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('clf', DummyEstimator())
    ])

    candidate_grids = [
        {
            'clf': [KNeighborsClassifier()],
            'clf__n_neighbors': np.arange(3, 14, 2)
        },
        {
            'clf': [LogisticRegression()],
            'clf__solver': ['liblinear', 'newton-cg'],
            'clf__C': np.logspace(-3, 3, 3),
            'clf__multi_class': ['ovr']
        },
        {
            # CategoricalNB rejects negative feature values, which standardised
            #   columns contain, so the scaler is bypassed for this candidate.
            #   With the scaler in place every NB fit fails and scores as NaN.
            'scaler': ['passthrough'],
            'clf': [CategoricalNB()],
            'clf__alpha': np.logspace(-3, 3, 6)
        },
        {
            'clf': [SVC()],
            'clf__kernel': ['rbf', 'poly'],
            'clf__C': np.logspace(-3, 4, 3)
        },
        {
            'clf': [DecisionTreeClassifier()],
            'clf__max_depth': [10, 20, 30],
            'clf__min_samples_leaf': [10, 30]
        }
    ]

    # NOTE(review): the search is fitted on the full X / y, so rows that later
    #   serve as the held-out test set take part in model selection -- consider
    #   fitting on X_train / y_train instead.
    best_clf_algo = GridSearchCV(estimator=candidate_pipeline, param_grid=candidate_grids, cv=3)
    best_clf_algo.fit(X=X, y=y)
    return best_clf_algo

In [None]:
# Save result
# with open('./models/best_clf_algo.p', 'wb') as model_file:
#     pickle.dump(obj=grid_search_clf(), file=model_file)

# Load result
# (the context manager closes the file handle, which a bare open() call leaves open;
#  only unpickle files produced by this notebook -- pickle can execute arbitrary code)
with open('./models/best_clf_algo.p', 'rb') as model_file:
    best_clf_algo_loaded = pickle.load(file=model_file)

# Inspect result
print(best_clf_algo_loaded.best_estimator_)
gs_clf = pd.DataFrame(best_clf_algo_loaded.cv_results_)
gs_clf.sort_values(by='rank_test_score')

#### Determine best hyperparameters for selected algorithm using GridSearch

Selected algorithm: logistic regression

In [None]:
class DummyScaler(BaseEstimator, TransformerMixin):
    """Identity transformer used as a placeholder for the 'scaler' pipeline step."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the input unchanged."""
        return X

In [None]:
def grid_search_clf_params():
    """Tune logistic regression (solver, tol, C) with and without standard scaling.

    Reads the module-level ``X`` and ``y``.

    Returns:
        The fitted GridSearchCV object (5-fold cross-validation, all cores).
    """
    search_space = {
        'scaler': ['passthrough', StandardScaler()],
        'clf__solver': ['liblinear', 'saga'],
        'clf__tol': np.logspace(-5, 2, 3),
        'clf__C': np.logspace(-4, 4, 5),
        'clf__multi_class': ['ovr']
    }

    # The 'scaler' step is a placeholder that the search space overrides
    tuned_pipeline = Pipeline(steps=[
        ('scaler', DummyScaler()),
        ('clf', LogisticRegression())
    ])

    search = GridSearchCV(estimator=tuned_pipeline, param_grid=search_space, cv=5, n_jobs=-1)
    search.fit(X=X, y=y)
    return search

In [None]:
# Save result
# with open('./models/best_clf_params.p', 'wb') as model_file:
#     pickle.dump(obj=grid_search_clf_params(), file=model_file)

# Load result
# (the context manager closes the file handle, which a bare open() call leaves open;
#  only unpickle files produced by this notebook -- pickle can execute arbitrary code)
with open('./models/best_clf_params.p', 'rb') as model_file:
    best_clf_params = pickle.load(file=model_file)

# Inspect result
# The parameter grid has no 'clf' key (only 'scaler' and 'clf__*'), so
#   best_params_['clf'] raises KeyError; read the tuned classifier from the
#   best pipeline instead.
print(best_clf_params.best_estimator_.named_steps['clf'])
gs_clf_params = pd.DataFrame(best_clf_params.cv_results_)
gs_clf_params.sort_values(by='rank_test_score')

### Check for Overfitting

In [None]:
def test_overfit_clf(model: LogisticRegression, cv: int):
    """Compare cross-validated scores on the train and test partitions.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        model: Classifier to evaluate; it is also fitted on the train partition.
        cv: Number of cross-validation folds (one result column per fold).

    Returns:
        A DataFrame with rows 'Train Set' and 'Test Set' and one column per fold.
    """
    model.fit(X=X_train, y=y_train)

    # The second row previously repeated the train partition, which made the
    #   two rows identical and the comparison meaningless.
    train_scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=cv)
    test_scores = cross_val_score(estimator=model, X=X_test, y=y_test, cv=cv)

    return pd.DataFrame(
        data=np.vstack((train_scores, test_scores)),
        columns=[f'Test {i + 1}' for i in range(cv)],
        index=['Train Set', 'Test Set']
    )

test_overfit_clf(model=LogisticRegression(C=100.0, multi_class='ovr', solver='liblinear', tol=1e-05), cv=6)

### Building Pipeline
<br>
Build a machine learning pipeline, using

*   a custom feature-selection transformer,
*   a one-hot encoder,
*   the most consistent algorithm,
*   the best performing hyperparameters

In [None]:
def drop_redundant_cols_1(df: pd.DataFrame):
    """Return a copy of ``df`` without the columns listed in the module-level ``drop_cols``.

    Columns that are not present are skipped silently.
    """
    return df.drop(columns=drop_cols, errors='ignore')

class FeatureSelector1(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that removes the columns registered in ``drop_cols``."""

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the ``drop_cols`` columns (missing ones are ignored)."""
        return X.drop(labels=drop_cols, axis=1, errors='ignore')

In [None]:
# Import encoder
from sklearn.preprocessing import OneHotEncoder

# Specify all possible column values for dataset
def get_column_values(df: pd.DataFrame):
    """List the distinct values of every feature column, in column order.

    The target column ('class') and the columns in the module-level
    ``drop_cols`` are excluded when present.

    Returns:
        A list with one array of unique values per remaining column.
    """
    feature_names = df.drop(labels=['class', *drop_cols], axis=1, errors='ignore').columns.values
    return [pd.unique(df[name]) for name in feature_names]

In [None]:
model = Pipeline(steps=[
    ('feature_selector', FeatureSelector1()),
    ('encoder', OneHotEncoder(categories=get_column_values(df))),
    ('classifier', LogisticRegression(C=100.0, multi_class='ovr', solver='liblinear', tol=1e-05))
])

### Redefine Data Partition

With the relevant transformers in place, data pre-processing<br>has been integrated into the machine learning pipeline

Therefore, the data should be retrieved from the original source and re-partitioned

In [None]:
df = load_mushrooms()

X_train, X_test, y_train, y_test = train_test_split(df.drop(labels='class', axis=1), df['class'], random_state=2)

### Model Training

Fit the data to the pipeline

In [None]:
model.fit(X=X_train, y=y_train)

In [None]:
# Save the model
# pickle.dump(obj=model, file=open('./models/final_classifier.p', 'wb'))

In [None]:
# Load the model
# (the context manager closes the file handle, which a bare open() call leaves open;
#  only unpickle files produced by this notebook -- pickle can execute arbitrary code)
with open('./models/final_classifier.p', 'rb') as model_file:
    final_classifier = pickle.load(file=model_file)

### Model Scoring

Use the model to generate predictions

In [None]:
y_pred = final_classifier.predict(X=X_test)
y_pred

### Model Evaluation

Evaluate the performance of the final model based on standard classification metrics

In [None]:
# Import model evaluation dependencies
from sklearn.metrics import classification_report, confusion_matrix

#### Evaluate against train set

In [None]:
y_train_pred = final_classifier.predict(X=X_train)

# Classification summary
print(classification_report(y_true=y_train, y_pred=y_train_pred, target_names=['edible', 'poisonous']))

# Confusion matrix
print('\n', pd.DataFrame(data=confusion_matrix(y_true=y_train, y_pred=y_train_pred, labels=['edible', 'poisonous']), index=['Actual Edible', 'Actual Poisonous'], columns=['Predicted Edible', 'Predicted Poisonous']), '\n\n', sep='')

# Print target distribution in y_train
print(y_train.groupby(y_train).count())

#### Evaluate against test set

In [None]:
# Classification summary
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=['edible', 'poisonous']))

# Confusion matrix
print('\n', pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['edible', 'poisonous']), index=['Actual Edible', 'Actual Poisonous'], columns=['Predicted Edible', 'Predicted Poisonous']), '\n\n', sep='')

# Print target distribution in y_test
print(y_test.groupby(y_test).count())

## Part B > Regression

### Import Data

Load data about King County house sales

In [None]:
# Read the data from a csv file
df2 = pd.read_csv('./data/kc_house_data.csv')

#### Inspect Data

Preview a sample of the dataset

In [None]:
# Inspect the top 10 rows of the dataset
df2.head(n=10)

#### Summarize Data

Get a sense of the features involved

In [None]:
# Inspect overview of the dataset
df2.info()

In [None]:
# Inspect statistics of the dataset
df2.describe().transpose().round(2)

### Pre-Processing

#### Exploratory Data Analysis (EDA)

In [None]:
# List to keep track of variables to be removed
drop_cols_2 = []

# List to keep track of positively skewed variables
positively_skewed = []

In [None]:
# Check for missing values
df2.isna().sum(axis=0)

# There doesn't seem to be any missing values

Visualize correlation amongst the original features using a heatmap

In [None]:
def plot_2A():
    """Draw a heatmap of the pairwise correlations between the numeric columns of ``df2``."""
    fig, ax = plt.subplots(figsize=(10, 8))
    # Restrict to numeric columns: since pandas 2.0 DataFrame.corr() raises on
    #   text columns (such as the raw date) instead of silently skipping them.
    numeric_features = df2.select_dtypes(include='number')
    sns.heatmap(data=numeric_features.corr(), cmap='RdBu', vmin=-1, vmax=1, ax=ax)
# plot_2A()

Inspect distribution of the individual variables

In [None]:
def plot_2B():
    """Show a histogram and a box plot, side by side, for every numeric column of ``df2``."""
    # dtype kinds: b=bool, i=signed int, u=unsigned int, f=float, c=complex
    numeric_columns = [name for name in df2.columns.values if df2[name].dtype.kind in 'biufc']
    for name in numeric_columns:
        fig, (hist_ax, box_ax) = plt.subplots(ncols=2)
        sns.histplot(data=df2, x=name, ax=hist_ax)
        sns.boxplot(data=df2, y=name, ax=box_ax)
# plot_2B()

# Many of the features seem to be positively skewed

# Course of action - logarithmic transformation
positively_skewed.extend(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'long', 'sqft_living15', 'sqft_lot15'])

Inspect correlation between the features and the target variable (price)

In [None]:
def plot_E(df: pd.DataFrame):
    """Plot each positively skewed feature before and after a log1p transformation.

    One two-panel figure is created per column listed in the module-level
    ``positively_skewed``.
    """
    for i in positively_skewed:
        # (the share of zero values was computed here but never used; removed)
        fig, (bef, aft) = plt.subplots(ncols=2)
        bef.set_title(label='Before Logarithmic Transformation')
        aft.set_title(label='After Logarithmic Transformation')
        sns.histplot(data=df, x=i, ax=bef)
        # NOTE(review): log1p yields NaN for values <= -1, so a column holding
        #   such values (possibly 'long') would render an empty panel -- confirm.
        sns.histplot(data=np.log1p(df[i]), ax=aft)
# plot_E(df2)

In [None]:
# Numeric columns only: since pandas 2.0 DataFrame.corr() raises on text columns such as date
df2.select_dtypes(include='number').corr()['price'].sort_values(key=lambda x: np.abs(x), ascending=False)

Inspect absolute correlation between features and target variable

In [None]:
def plot_2D():
    """Draw a heatmap of absolute correlations after date feature engineering.

    The raw house data is re-read from disk, passed through
    ``FeatureEngineer2`` (which adds year/month/day columns) and the absolute
    pairwise correlations of the numeric columns are plotted.
    """
    # The original referenced FeatureEngineering / FeatureSelection, which are
    #   not defined in this notebook (NameError); the transformer defined here
    #   is FeatureEngineer2. The selector instance was never used and is dropped.
    raw_houses = pd.read_csv('./data/kc_house_data.csv')
    engineered = FeatureEngineer2().fit_transform(raw_houses)

    # Numeric columns only: since pandas 2.0 corr() raises on the text date column
    abs_corr = engineered.select_dtypes(include='number').corr().abs()
    sns.heatmap(abs_corr, vmin=0, vmax=1, cmap='Blues')
# plot_2D()

Inspect id feature

In [None]:
# Check id data type
print('ID data type:\t\t', df2['id'].dtype)

# Compare the number of ids to the total number of records 
print('Number of unique IDs:\t', pd.unique(df2['id']).size)
print('Total no. of records:\t', df2.shape[0], '\n')

# Check correlation between id and the rest of the numeric variables
# (since pandas 2.0 DataFrame.corr() raises on text columns such as date, so they are excluded)
print(df2.select_dtypes(include='number').corr()['id'].sort_values(key=lambda x: np.abs(x), ascending=False))


# id seems redundant

# Course of action - drop column
drop_cols_2.append('id')

Inspect zipcode feature

In [None]:
# Check zipcode data type
print('zipcode data type:\t\t', df2['zipcode'].dtype)

# Compare the number of zipcodes to the total number of records 
print('Number of unique zipcodes:\t', pd.unique(df2['zipcode']).size)
print('Total no. of records:\t', df2.shape[0], '\n')

# Check correlation between zipcode and the rest of the numeric variables
# (since pandas 2.0 DataFrame.corr() raises on text columns such as date, so they are excluded)
print(df2.select_dtypes(include='number').corr()['zipcode'].sort_values(key=lambda x: np.abs(x), ascending=False))


# zipcode does not seem redundant

# Course of action - no action

In [None]:
def plot_F():
    """Scatter each positively skewed feature against price in three variants.

    Per column of the module-level ``positively_skewed`` one figure is drawn:
    raw feature vs raw price, sqrt(feature) vs sqrt(price), and raw feature
    vs sqrt(price).
    """
    for feature in positively_skewed:
        fig, (raw_ax, both_sqrt_ax, price_sqrt_ax) = plt.subplots(ncols=3)
        root_price = np.sqrt(df2['price'])
        sns.scatterplot(data=df2, x=feature, y='price', ax=raw_ax)
        sns.scatterplot(x=np.sqrt(df2[feature]), y=root_price, ax=both_sqrt_ax)
        sns.scatterplot(x=df2[feature], y=root_price, ax=price_sqrt_ax)
# plot_F()

#### Feature Engineering

There seems to be useful extractable data in the `date` feature

In [None]:
# Extract year, month and day from the date feature
df2_date = pd.to_datetime(df2['date'], yearfirst=True)
df2['year'] = pd.DatetimeIndex(data=df2_date).year
df2['month'] = pd.DatetimeIndex(data=df2_date).month
df2['day'] = pd.DatetimeIndex(data=df2_date).day

# Date variable seems redundant now

# Course of action - drop column
drop_cols_2.append('date')

#### Feature Selection

In [None]:
# Review columns to be dropped
drop_cols_2

There are 2 columns to be removed (`id`, `date`)

In [None]:
# Remove columns
# (re-binding instead of inplace=True, and errors='ignore', keep this cell
#  safe to re-run after the columns have already been removed)
df2 = df2.drop(columns=drop_cols_2, errors='ignore')

### Data Partitioning

Split the data randomly into a train set and a test set

In [None]:
X2 = df2.drop(labels='price', axis=1)
y2 = df2['price']
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=3)

### Algorithm Selection & Hyper-Parameter Tuning

Determine best regression algorithm using GridSearch

In [None]:
# Candidate regression algorithms
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [None]:
def grid_search_reg():
    """Grid-search several candidate regressor families and return the fitted search.

    Each entry of the parameter grid swaps a different regressor into the
    'reg' step of a scaler + regressor pipeline. Reads the module-level
    ``X2`` and ``y2``.

    Returns:
        The fitted GridSearchCV object (3-fold cross-validation, all cores).
    """
    linear_grid = {
        'reg': [LinearRegression()],
        # NOTE(review): recent scikit-learn releases no longer accept the
        #   'normalize' argument -- confirm against the installed version.
        'reg__normalize': [True, False],
        'reg__fit_intercept': [True, False]
    }
    penalised_grid = {
        'reg': [Lasso(), Ridge()],
        'reg__alpha': np.logspace(-5, 3, 6)
    }
    tree_grid = {
        'reg': [DecisionTreeRegressor(), GradientBoostingRegressor()],
        'reg__max_depth': np.arange(5, 11)
    }
    neighbours_grid = {
        'reg': [KNeighborsRegressor()],
        'reg__n_neighbors': np.arange(5, 11)
    }

    # 'reg' is only a placeholder; every grid above substitutes a real regressor
    candidate_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('reg', DummyEstimator())
    ])

    search = GridSearchCV(
        estimator=candidate_pipeline,
        param_grid=[linear_grid, penalised_grid, tree_grid, neighbours_grid],
        cv=3,
        n_jobs=-1
    )
    search.fit(X=X2, y=y2)
    return search

In [None]:
# Save result
# with open('./models/best_reg_algo.p', 'wb') as model_file:
#     pickle.dump(obj=grid_search_reg(), file=model_file)

# Inspect result
# (the context manager closes the file handle, which a bare open() call leaves open;
#  only unpickle files produced by this notebook -- pickle can execute arbitrary code)
with open('./models/best_reg_algo.p', 'rb') as model_file:
    best_reg_algo_loaded = pickle.load(file=model_file)

print(best_reg_algo_loaded.best_estimator_)
gs_reg = pd.DataFrame(best_reg_algo_loaded.cv_results_)
gs_reg.sort_values(by='rank_test_score')

Determine best hyperparameters for selected algorithm using GridSearch

In [None]:
def grid_search_reg_params():
    """Tune tree depth and split/leaf sizes of a gradient boosting regressor.

    Reads the module-level ``X2`` and ``y2``.

    Returns:
        The fitted GridSearchCV object (5-fold cross-validation, all cores).
    """
    search_space = {
        'reg__max_depth': np.arange(2, 5),
        'reg__min_samples_split': np.arange(9, 100, 30),
        'reg__min_samples_leaf': np.arange(9, 100, 30)
    }

    boosted_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('reg', GradientBoostingRegressor())
    ])

    search = GridSearchCV(estimator=boosted_pipeline, param_grid=search_space, cv=5, n_jobs=-1)
    search.fit(X=X2, y=y2)
    return search

In [None]:
# Save result
# with open('./models/best_reg_params.p', 'wb') as model_file:
#     pickle.dump(obj=grid_search_reg_params(), file=model_file)

# Inspect result
# (the context manager closes the file handle, which a bare open() call leaves open;
#  only unpickle files produced by this notebook -- pickle can execute arbitrary code)
with open('./models/best_reg_params.p', 'rb') as model_file:
    best_reg_params = pickle.load(file=model_file)

print(best_reg_params.best_params_)
gs_reg_params = pd.DataFrame(best_reg_params.cv_results_)
gs_reg_params.sort_values(by='rank_test_score')

Further tuning - Normalizing X Transformer

In [None]:
class DummyTransformer(BaseEstimator, TransformerMixin):
    """Identity transformer used as a placeholder for the 'trans' pipeline step."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the input unchanged."""
        return X

In [None]:
class LogTransformer(BaseEstimator, TransformerMixin):
    """Apply log(1 + x) to the positively skewed columns of a DataFrame.

    The affected columns are those in the module-level ``positively_skewed``
    except 'long', which is left untouched (presumably because its values are
    negative -- confirm).
    """

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the skewed columns log1p-transformed."""
        X_copy = X.copy()
        # (an unused lookup of X.columns.values was removed here)
        for i in positively_skewed:
            if i != 'long':
                X_copy[i] = np.log1p(X[i])
        return X_copy

class SqrtTransformer(BaseEstimator, TransformerMixin):
    """Apply a square root to the positively skewed columns of a DataFrame.

    The affected columns are those in the module-level ``positively_skewed``
    except 'long', which is left untouched (presumably because its values are
    negative -- confirm).
    """

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the skewed columns square-rooted."""
        X_copy = X.copy()
        # (an unused lookup of X.columns.values was removed here)
        for i in positively_skewed:
            if i != 'long':
                X_copy[i] = np.sqrt(X[i])
        return X_copy

In [None]:
def best_transformer():
    """Search over feature transformations and scalers for the tuned regressor.

    Every combination of {no transform, log, sqrt} and {no scaling, standard,
    robust} is evaluated in front of a gradient boosting regressor with fixed
    hyper-parameters. Reads the module-level ``X2`` and ``y2``.

    Returns:
        The fitted GridSearchCV object (5-fold cross-validation, all cores).
    """
    search_space = {
        'trans': ['passthrough', LogTransformer(), SqrtTransformer()],
        'scaler': ['passthrough', StandardScaler(), RobustScaler()]
    }

    # 'trans' and 'scaler' are placeholders that the search space overrides
    regressor = GradientBoostingRegressor(max_depth=4, min_samples_leaf=9, min_samples_split=9)
    candidate_pipeline = Pipeline(steps=[
        ('trans', DummyTransformer()),
        ('scaler', DummyScaler()),
        ('reg', regressor)
    ])

    search = GridSearchCV(estimator=candidate_pipeline, param_grid=search_space, cv=5, n_jobs=-1)
    search.fit(X=X2, y=y2)
    return search

In [None]:
# Save result
# with open('./models/best_reg_trans.p', 'wb') as model_file:
#     pickle.dump(obj=best_transformer(), file=model_file)

# Inspect result
# (the context manager closes the file handle, which a bare open() call leaves open;
#  only unpickle files produced by this notebook -- pickle can execute arbitrary code)
with open('./models/best_reg_trans.p', 'rb') as model_file:
    best_reg_trans = pickle.load(file=model_file)

print(best_reg_trans.best_params_)
gs_reg_trans = pd.DataFrame(best_reg_trans.cv_results_)
gs_reg_trans.sort_values(by='rank_test_score')

Further tuning - Normalizing y Transformer

In [None]:
from sklearn.compose import TransformedTargetRegressor

In [None]:
def further_tune_reg(cv: int = 4):
    """Compare no, square-root and log transformation of the target variable.

    The same feature pipeline is cross-validated three times: as is, wrapped
    with a sqrt/square target transform, and wrapped with a log1p/expm1 target
    transform. Reads the module-level ``X2`` and ``y2``.

    Args:
        cv: Number of cross-validation folds.

    Returns:
        A DataFrame with one row per variant, one column per fold, plus the
        'Mean Score' and 'Std Score' of the fold scores.
    """
    pipe = Pipeline(steps=[
        ('trans', SqrtTransformer()),
        ('scaler', StandardScaler()),
        ('reg', GradientBoostingRegressor(max_depth=4, min_samples_leaf=9, min_samples_split=9))
    ])

    sqrt_y = TransformedTargetRegressor(regressor=pipe, func=np.sqrt, inverse_func=np.square)

    log_y = TransformedTargetRegressor(regressor=pipe, func=np.log1p, inverse_func=np.expm1)

    scores = []
    for mo in (pipe, sqrt_y, log_y):
        scores.append(cross_val_score(estimator=mo, X=X2, y=y2, cv=cv))

    fold_columns = [f'Test {i + 1}' for i in range(cv)]
    result = pd.DataFrame(data=scores, columns=fold_columns, index=['no y transformation', 'sqrt y transformation', 'log y transformation'])

    # Both statistics are taken over the fold columns only. Previously the
    #   standard deviation was computed after 'Mean Score' had been added, so
    #   the mean was counted as an extra observation and biased the result.
    fold_scores = result[fold_columns]
    result['Mean Score'] = fold_scores.mean(axis=1)
    result['Std Score'] = fold_scores.std(axis=1)
    return result

In [None]:
# Save result
# with open('./models/best_reg_y_trans.p', 'wb') as model_file:
#     pickle.dump(obj=further_tune_reg(), file=model_file)

# Inspect result
# (the context manager closes the file handle, which a bare open() call leaves open;
#  only unpickle files produced by this notebook -- pickle can execute arbitrary code)
with open('./models/best_reg_y_trans.p', 'rb') as model_file:
    best_reg_y_trans = pickle.load(file=model_file)

print(best_reg_y_trans.sort_values(by='Mean Score', ascending=False))

### Combining everything

### Building the pipeline
<br>
Build the machine learning pipeline, using

*   a custom feature-engineering transformer,
*   a custom feature-selection transformer,
*   a custom square-root transformer,
*   a standard scaler,
*   the most consistent algorithm (gradient boosting regressor),
*   the best performing hyperparameters

To further improve performance and reduce overfitting,<br>
the target variable will be transformed too (sqrt)

In [None]:
def extract_date_parts(df: pd.DataFrame, col: str, **kwargs):
    """Split a date column into its year, month and day components.

    Args:
        df: Frame containing the date column.
        col: Name of the column to parse.
        **kwargs: Forwarded to ``pd.DatetimeIndex`` (e.g. ``yearfirst=True``).

    Returns:
        A ``(year, month, day)`` tuple of index-like integer sequences, one
        entry per row of ``df``.
    """
    timestamps = pd.DatetimeIndex(df[col], **kwargs)
    return tuple(getattr(timestamps, part) for part in ('year', 'month', 'day'))

class FeatureEngineer2(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that derives year, month and day from the 'date' column."""

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with 'year', 'month' and 'day' columns added.

        The input frame is not modified and the 'date' column is kept.
        """
        year, month, day = extract_date_parts(df=X, col='date', yearfirst=True)
        return X.assign(year=year, month=month, day=day)

In [None]:
def drop_redundant_features(df: pd.DataFrame):
    """Return a copy of ``df`` without the 'id', 'zipcode' and 'date' columns.

    Columns that are not present are skipped silently.
    """
    redundant = ['id', 'zipcode', 'date']
    return df.drop(columns=redundant, errors='ignore')

def exclude_target_variable(df: pd.DataFrame):
    """Return a copy of ``df`` without the 'price' column.

    Raises:
        KeyError: If ``df`` has no 'price' column.
    """
    return df.drop(columns=['price'])

class FeatureSelector2(BaseEstimator, TransformerMixin):
    """Pipeline step that drops the redundant columns and, if seen during fit, the target.

    Attributes:
        has_target_variable: True when the frame passed to the most recent
            ``fit`` contained a 'price' column.
    """

    def __init__(self):
        self.has_target_variable: bool = False

    def fit(self, X, y=None):
        """Record whether ``X`` contains the target column 'price'."""
        # Assigned unconditionally so that refitting on a frame without 'price'
        #   resets the flag instead of keeping a stale True from an earlier fit.
        self.has_target_variable = bool('price' in X.columns.values)
        return self

    def transform(self, X):
        """Return ``X`` without the redundant columns and without 'price'.

        'price' is only removed when it was present during ``fit``. A frame
        that lacks it at transform time (e.g. new data to predict on) is
        accepted instead of raising KeyError.
        """
        X_copy = drop_redundant_features(X)
        if self.has_target_variable:
            X_copy = X_copy.drop(labels='price', axis=1, errors='ignore')
        return X_copy

In [None]:
# Import scaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Build pipeline
pipe2 = Pipeline(steps=[
    ('feature_engineer', FeatureEngineer2()),
    ('feature_selector', FeatureSelector2()),
    ('sqrt_transformer', SqrtTransformer()),
    ('standard_scaler', StandardScaler()),
    ('regressor', GradientBoostingRegressor(max_depth=4, min_samples_leaf=9, min_samples_split=9))
])

In [None]:
# Wrap pipeline in a target transformer
model2 = TransformedTargetRegressor(regressor=pipe2, func=np.sqrt, inverse_func=np.square, check_inverse=False)

### Redefine Data Partition

With the relevant transformers in place, data pre-processing<br>has been integrated into the machine learning pipeline

Therefore, the data should be retrieved from the original source and re-partitioned

In [None]:
df2 = pd.read_csv('./data/kc_house_data.csv')

X2_train, X2_test, y2_train, y2_test = train_test_split(df2.drop(labels='price', axis=1), df2['price'], random_state=4)

### Model Training

Fit the data to the model

In [None]:
model2.fit(X=X2_train, y=y2_train)

In [None]:
# Save the model
# pickle.dump(obj=model2, file=open('./models/final_regressor.p', 'wb'))

In [None]:
# Load the model
# (the context manager closes the file handle, which a bare open() call leaves open;
#  only unpickle files produced by this notebook -- pickle can execute arbitrary code)
with open('./models/final_regressor.p', 'rb') as model_file:
    final_regressor = pickle.load(file=model_file)

### Model Scoring

Use the model to generate predictions

In [None]:
y_pred_2 = final_regressor.predict(X2_test)
y_pred_2

### Model Evaluation

Evaluate the performance of the final model based on standard regression metrics

In [None]:
# Import regression metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
def regression_report(y_true, y_pred, type: str):
    """Print MSE, MAE and R2 for a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.
        type: Label shown in the report header (e.g. 'train' or 'test').
    """
    mse = np.round(mean_squared_error(y_true=y_true, y_pred=y_pred), 2)
    mae = np.round(mean_absolute_error(y_true=y_true, y_pred=y_pred), 2)
    r2 = np.round(r2_score(y_true=y_true, y_pred=y_pred), 4)

    # The trailing empty entry reproduces the blank line that ends the report
    report_lines = [
        f'Regression Report ({type})',
        '================================',
        f'MSE:\t\t{mse}',
        f'MAE:\t\t{mae}',
        f'R2:\t\t{r2}',
        '',
    ]
    print('\n'.join(report_lines))

#### Evaluate against training data

In [None]:
regression_report(y2_train, final_regressor.predict(X2_train), type='train')

#### Evaluate against testing data

In [None]:
regression_report(y2_test, y_pred_2, type='test')

#### Evaluate against entire dataset (visualization)

In [None]:
def evaluate_whole_2():
    """Plot the final regressor's score on growing random samples of the whole data set.

    A 3x3 grid is drawn. Each panel reshuffles the full data set and scores
    ``final_regressor`` on its first 50, 100, ..., 950 rows. Horizontal guide
    lines mark scores of 1.0, 0.9 and 0.8; a vertical line marks 100 rows.
    """
    houses = pd.read_csv('./data/kc_house_data.csv')
    fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(14, 8))

    guide_levels = ((1.0, 'green'), (0.9, 'orange'), (0.8, 'red'))

    # axes.flat walks the grid row by row, the same order as nested row/column loops
    for panel in axes.flat:
        shuffled = houses.sample(frac=1)
        features = shuffled.drop('price', axis=1)
        target = shuffled['price']

        sample_sizes = np.arange(50, 1000, 50)
        scores = [
            final_regressor.score(features.iloc[:size, :], target.iloc[:size])
            for size in sample_sizes
        ]

        sns.lineplot(x=sample_sizes, y=scores, color='black', ax=panel)
        panel.set_ylim(0.4, 1.05)
        panel.set_yticks(ticks=np.arange(0.5, 1.05, 0.1))
        panel.set_xticks(ticks=np.arange(0, 1001, 100))

        for level, colour in guide_levels:
            sns.lineplot(x=[0, 1000], y=[level] * 2, color=colour, ax=panel)
        sns.lineplot(x=[100] * 2, y=[0.4, 1.], color='grey', ax=panel)

In [None]:
evaluate_whole_2()

## Conclusions