In [None]:
%reset -f

# AIML CA1

## Import General Dependencies

In [None]:
# Mathematical Dependencies
import numpy as np

# Data Manipulation Dependencies
import pandas as pd

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Dependencies
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline

# Preservation Dependency
import pickle

# Miscellaneous Dependencies
from typing import Callable, Dict, Union # static typing

# Utility Functions
from utils.extraction import extract_attributes

In [None]:
%matplotlib inline

In [None]:
# Hide Warnings
from warnings import filterwarnings
filterwarnings(action='ignore')

## Utility Functions

## Part I

### Import Exclusive Dependencies

### Import Data

In [None]:
def load_mushrooms() -> pd.DataFrame:
    """Load the mushroom dataset and expand its attribute codes.

    The attribute section of ``./data/agaricus-lepiota.names`` is handed to
    ``extract_attributes``; the keys of the returned mapping become the column
    names for ``./data/agaricus-lepiota.data`` and the per-column values are
    passed to ``Series.replace`` to expand the codes.

    Returns:
        pd.DataFrame: the dataset with one column per extracted attribute.
    """
    # Extract raw content of ./data/agaricus-lepiota.names file
    with open('./data/agaricus-lepiota.names') as f:
        metadata = f.read()

    # Extract attributes from metadata
    # (presumably a {column: {code: definition}} mapping - confirm against utils.extraction)
    attrs = extract_attributes(metadata, r'7\. Attribute Information:.*\n((.|\n)*)8\. Missing')

    # Column names to be used for the dataframe (materialised once, in mapping order)
    cols = list(attrs.keys())

    # Create the dataframe from ./data/agaricus-lepiota.data file,
    #   using column names derived from ./data/agaricus-lepiota.names file
    # NOTE(review): header=0 treats the first line of the file as a header row and
    #   replaces it with `names`; confirm the file really starts with a header,
    #   otherwise the first record is silently discarded
    df = pd.read_csv(
        filepath_or_buffer='./data/agaricus-lepiota.data',
        sep=',',
        header=0,
        names=cols
    )

    # Expand attribute codes to their full definitions.
    # The result is assigned back: calling replace(..., inplace=True) on df[col]
    #   acts on an intermediate Series and is not guaranteed to update df itself
    for col in cols:
        df[col] = df[col].replace(to_replace=attrs[col])

    return df

In [None]:
df = load_mushrooms()

#### Inspect Data

In [None]:
# Inspect top 10 rows of the dataset
df.head(n=10)

#### Summarize Data

In [None]:
# Inspect overview of the dataset
df.info()

### Pre-Processing

#### EDA

In [None]:
# List to keep track of variables to be removed
drop_cols = []

Missing Values

In [None]:
# Check for missing values
# (printed explicitly: a cell only echoes its last expression, and this cell ends with a statement)
print(df.isna().sum(axis=0))

# Note that stalk-root has missing attributes (denoted as 'missing')
# In fact, approx. 31% of the records have missing data for stalk-root
stalk_dist = df['stalk-root'].value_counts()
print((stalk_dist / stalk_dist.sum()).round(2))

# Course of action - drop column
# (guarded so that re-running the cell does not register the column twice)
if 'stalk-root' not in drop_cols:
    drop_cols.append('stalk-root')

Redundant Features

In [None]:
# Inspect unique counts of the individual features
print(df.describe().transpose().sort_values(by='unique', ascending=False))

# Note that veil-type has only one value,
#   hence it is a redundant feature

# Course of action - drop column
# (guarded so that re-running the cell does not register the column twice)
if 'veil-type' not in drop_cols:
    drop_cols.append('veil-type')

Inspect the distribution of the target variable (class: edible/poisonous)

In [None]:
from utils.plotting import format_label
def plot_A(df: pd.DataFrame):
    """Draw a count plot of the `class` column, annotated with counts and percentages.

    Args:
        df: frame containing a `class` column.

    Returns:
        The matplotlib Axes the count plot was drawn on.
    """
    n_records = df.shape[0]

    ax = sns.countplot(data=df, x='class', palette='deep')
    ax.set_title(label='General Data Distribution')
    ax.set_xlabel(xlabel='Type')
    ax.set_ylabel(ylabel='Number of Records')

    # Fix the upper limit first, then relabel the resulting ticks in thousands
    ax.set_ylim(top=5000)
    tick_labels = format_label(ax.get_yticks() / 1000, lambda s: f'{round(s)}k')
    ax.set_yticklabels(labels=tick_labels)

    # Write "<count> (<share>%)" slightly above each bar
    for bar in ax.patches:
        count = bar.get_height()
        ax.annotate(text=f'{count} ({count/n_records*100:.1f}%)',
                    xy=(bar.get_x() + 0.21, count + 70))
    return ax
# plot_A(df=df)

Inspect correlation between the independent variables and the target variable (class)

In [None]:
def plot_B(df: pd.DataFrame):
    """Plot, for every feature except `class`, a class-balance bar chart and a frequency chart.

    Left panel: for each value of the feature, (count in the second `class` row
    minus count in the first `class` row of the crosstab) divided by the number
    of records with that value. Right panel: record counts per value, split by class.

    Args:
        df: frame containing a `class` column plus the features to plot.
    """
    feature_names = df.drop(labels='class', axis=1).columns.values
    for feature in feature_names:
        fig, (corr_plot, freq_plot) = plt.subplots(ncols=2, figsize=(14, 6))
        fig.suptitle(t=f'{feature.upper()}')

        # Left panel: per-value difference between the two class counts, as a share of that value's records
        class_by_value = pd.crosstab(index=df['class'], columns=df[feature])
        records_per_value = df.groupby(feature).count().iloc[:,0]
        balance = (class_by_value.iloc[1] - class_by_value.iloc[0]) / records_per_value
        balance_frame = pd.DataFrame(balance.reset_index())
        sns.barplot(data=balance_frame, x=feature, y=0, ax=corr_plot, color='grey')
        corr_plot.set_title(label='Correlation (chi2-based)')
        corr_plot.set_ylim((-1.1, 1.1))
        corr_plot.set_ylabel(ylabel='Correlation')
        corr_plot.set_xticklabels(labels=corr_plot.get_xticklabels(), rotation=30)

        # Right panel: frequency of each value, coloured by class
        sns.countplot(data=df.sort_values(by=feature), x=feature, hue='class', ax=freq_plot, palette='turbo')
        freq_plot.set_title(label=f'Frequency Distribution')
        freq_plot.set_xticklabels(labels=freq_plot.get_xticklabels(), rotation=30)
# plot_B(df)

#### Feature Engineering

There is no need for feature engineering in this dataset

#### Feature Selection

There are 2 columns to be removed (stalk-root, veil-type)

In [None]:
# Remove columns
# (re-bound instead of dropped in place, and tolerant of already-missing columns,
#  so that re-running the cell does not raise a KeyError)
df = df.drop(labels=drop_cols, axis=1, errors='ignore')

### Encoding the data

The data contains only categorical text variables; therefore, they<br>have to be converted to numeric form using dummy variables

In [None]:
# (One-Hot) Encode the dataset (categorical -> binary)
df_ohe = pd.get_dummies(data=df, drop_first=True)

### Inspect correlation after encoding

In [None]:
# Get correlation between top 10 factors and target variable (class)
df_ohe.corr()['class_poisonous'].drop(labels='class_poisonous').sort_values(key=lambda x: np.abs(x), ascending=False).head(n=10)

In [None]:
# Chi2-based feature selection
from sklearn.feature_selection import chi2, SelectKBest

# Separate the encoded predictors from the encoded target once
chi2_predictors = df_ohe.drop(labels='class_poisonous', axis=1)
chi2_target = df_ohe['class_poisonous']

# Get top 10 factors that are correlated with the target variable (class)
best_features_chi2 = SelectKBest(score_func=chi2, k=10).fit(X=chi2_predictors, y=chi2_target)
best_features_mask = best_features_chi2.get_support()

# Names and scores of the selected columns, paired up in one Series
best_features = chi2_predictors.columns.values[best_features_mask]
best_features_scores = best_features_chi2.scores_[best_features_mask]
good_predictors = pd.Series(data=best_features_scores, index=best_features)

good_predictors.sort_values(ascending=False)

### Data Partitioning

Split the data randomly into a train set and a test set

In [None]:
# Split the dataset into training and test sets
X = df_ohe.drop(labels='class_poisonous', axis=1)
y = df_ohe['class_poisonous']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Algorithm Selection & Hyper-Parameter Tuning

In [None]:
# Candidate classification algorithms
from sklearn.naive_bayes import CategoricalNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#### Determine best candidate algorithm using GridSearch

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class DummyScaler(BaseEstimator, TransformerMixin):
    """Pass-through transformer: fitting learns nothing and transforming returns the input as-is."""

    def fit(self, X, y=None):
        # Nothing to learn
        return self

    def transform(self, X):
        # Hand the input back untouched
        return X

In [None]:
class DummyEstimator(BaseEstimator):
    """Placeholder for the final pipeline step; the grid search swaps real estimators in.

    The methods accept the usual ``X`` / ``y`` arguments (both optional, so the
    previous zero-argument calls still work) so the placeholder does not raise a
    TypeError if it is ever fitted or scored directly.
    """

    def fit(self, X=None, y=None):
        # Nothing to learn; return self as estimators conventionally do
        return self

    def score(self, X=None, y=None):
        # A placeholder has no meaningful score
        return None

In [None]:
def grid_search_clf():
    """Grid-search five candidate classifiers together with their hyperparameters.

    A two-step pipeline (scaler, classifier) is searched with 3-fold
    cross-validation; every entry of the parameter grid swaps a different
    estimator into the ``clf`` step along with its own hyperparameter values.

    Returns:
        GridSearchCV: the search object, fitted on the module-level ``X`` and ``y``.
    """
    cand_pipe_1 = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('clf', DummyEstimator())  # placeholder, replaced by every grid entry below
    ])

    cand_params_1 = [
        {
            'clf': [KNeighborsClassifier()],
            'clf__n_neighbors': np.arange(3, 14, 2)
        },
        {
            'clf': [LogisticRegression()],
            'clf__solver': ['liblinear', 'newton-cg'],
            'clf__C': np.logspace(-3, 3, 3),
            'clf__multi_class': ['ovr']
        },
        {
            'clf': [CategoricalNB()],
            # CategoricalNB rejects negative inputs, which standardisation produces;
            #   bypass the scaler for this candidate so it is actually evaluated
            #   instead of failing (the failure is hidden by the warning filter)
            'scaler': ['passthrough'],
            'clf__alpha': np.logspace(-3, 3, 6)
        },
        {
            'clf': [SVC()],
            'clf__kernel': ['rbf', 'poly'],
            'clf__C': np.logspace(-3, 4, 3)
        },
        {
            'clf': [DecisionTreeClassifier()],
            'clf__max_depth': [10, 20, 30],
            'clf__min_samples_leaf': [10, 30]
        }
    ]

    best_clf_algo = GridSearchCV(estimator=cand_pipe_1, param_grid=cand_params_1, cv=3)
    # NOTE(review): the search is fitted on the full X / y, which includes the rows
    #   held out in X_test / y_test - confirm this is intended
    best_clf_algo.fit(X=X, y=y)
    return best_clf_algo

In [None]:
# Save result
# pickle.dump(obj=grid_search_clf(), file=open('./models/best_clf_algo.p', 'wb'))

# Load result (the context manager closes the file handle once unpickling is done)
# NOTE: pickle.load can execute arbitrary code - only load files produced by this notebook
with open('./models/best_clf_algo.p', 'rb') as f:
    best_clf_algo_loaded = pickle.load(file=f)

# Inspect result
print(best_clf_algo_loaded.best_estimator_)
gs_clf = pd.DataFrame(best_clf_algo_loaded.cv_results_)
gs_clf.sort_values(by='rank_test_score')

#### Determine best hyperparameters for selected algorithm using GridSearch

Selected algorithm: logistic regression

In [None]:
def grid_search_clf_params():
    """Tune logistic regression (and whether to standardise first) by grid search.

    Uses 5-fold cross-validation on all available cores.

    Returns:
        GridSearchCV: the search object, fitted on the module-level ``X`` and ``y``.
    """
    # Candidate values; the `scaler` entry decides between no scaling and standardisation
    search_space = {
        'scaler': ['passthrough', StandardScaler()],
        'clf__solver': ['liblinear', 'saga'],
        'clf__tol': np.logspace(-5, 2, 3),
        'clf__C': np.logspace(-4, 4, 5),
        'clf__multi_class': ['ovr']
    }

    # The scaler step starts as a pass-through and is overridden by the grid
    base_pipeline = Pipeline(steps=[
        ('scaler', DummyScaler()),
        ('clf', LogisticRegression())
    ])

    search = GridSearchCV(estimator=base_pipeline, param_grid=search_space, cv=5, n_jobs=-1)
    search.fit(X=X, y=y)
    return search

In [None]:
# Save result
# pickle.dump(obj=grid_search_clf_params(), file=open('./models/best_clf_params.p', 'wb'))

# Load result (the context manager closes the file handle once unpickling is done)
# NOTE: pickle.load can execute arbitrary code - only load files produced by this notebook
with open('./models/best_clf_params.p', 'rb') as f:
    best_clf_params = pickle.load(file=f)

# Inspect result
print(best_clf_params.best_params_)
gs_clf_params = pd.DataFrame(best_clf_params.cv_results_)
gs_clf_params.sort_values(by='rank_test_score')

### Check for Overfitting

In [None]:
# Per-fold test scores of every candidate from the algorithm-selection search.
# This cell used to read an undefined `df_results`; the hyperparameter columns used
#   below are exactly those of the grid built in grid_search_clf, whose results are
#   available as `gs_clf`.
t_d = gs_clf.copy()
# Compare and title by the estimator's text form (the conversion result must be assigned back)
t_d['param_clf'] = t_d['param_clf'].astype('str')

# Per-fold score columns, taken from the data so any number of CV folds works
split_cols = [col for col in t_d.columns if col.startswith('split') and col.endswith('_test_score')]

# Hyperparameter used to colour the lines, one per estimator in grid order (None = single colour)
hyp = ['param_clf__n_neighbors', 'param_clf__C', None, 'param_clf__C', 'param_clf__max_depth']

fig, ax = plt.subplots(ncols=len(hyp), sharey=True, figsize=(12, 8))
for i, est in enumerate(pd.unique(t_d['param_clf'])):
    # Long format: one row per (candidate, fold)
    stuff = t_d[t_d['param_clf'] == est].melt(id_vars=['param_clf', 'param_clf__n_neighbors', 'param_clf__C', 'param_clf__max_depth'], value_vars=split_cols, var_name='test', value_name='score')
    # 'split<k>_test_score' -> fold number k + 1
    stuff['test'] = stuff['test'].str.extract(pat='(\\d+)', expand=False).astype(int) + 1
    # Drop the hyperparameter columns that do not apply to this estimator
    stuff = stuff.dropna(axis=1)
    ax[i].set_ylim((0.5, 1.2))
    ax[i].set_title(est)
    ax[i].set_xticks(ticks=range(1, len(split_cols) + 1))
    sns.lineplot(data=stuff, x='test', y='score', hue=hyp[i], ax=ax[i], palette='muted')

### Building Pipeline
<br>
Build a machine learning pipeline, using

*   a one-hot encoder,
*   a custom feature-selection transformer,
*   a standard scaler,
*   the most consistent algorithm,
*   the best performing hyperparameters

In [None]:
class OHEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a DataFrame with ``pd.get_dummies``, keeping the column layout seen in ``fit``.

    ``pd.get_dummies`` only creates columns for the categories present in the
    frame it is given, so encoding a test split on its own can yield a different
    set of columns than the one the downstream classifier was trained on. The
    layout is therefore learned once in ``fit`` and re-applied in ``transform``.
    """

    def fit(self, X, y=None):
        # Remember the dummy columns of the fitting data (first level of each feature dropped)
        self.columns_ = pd.get_dummies(data=X, drop_first=True).columns
        return self

    def transform(self, X):
        if not hasattr(self, 'columns_'):
            # Not fitted: keep the previous stateless behaviour
            return pd.get_dummies(data=X, drop_first=True)
        # Encode every level, then align to the fitted layout: levels dropped or unseen
        #   during fit are discarded, levels absent from X become all-zero columns
        encoded = pd.get_dummies(data=X)
        return encoded.reindex(columns=self.columns_, fill_value=0)

In [None]:
def drop_redundant_cols_1(df: pd.DataFrame):
    """Return a copy of `df` without the columns listed in the module-level `drop_cols`."""
    return df.drop(columns=drop_cols)

class FeatureSelector1(BaseEstimator, TransformerMixin):
    """Pipeline step that removes the columns listed in the module-level `drop_cols`."""

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data
        return self

    def transform(self, X):
        # The leftover debug print (which also computed the drop a second time) was removed
        return drop_redundant_cols_1(X)

In [None]:
# Import encoder
# from sklearn.preprocessing import OneHotEncoder

In [None]:
# Final classification pipeline: drop the redundant columns, one-hot encode, then classify.
# NOTE(review): the hyperparameter values are presumably taken from the grid search
#   results loaded above - confirm against best_clf_params.best_params_
model = Pipeline(steps=[
    ('transform', FeatureSelector1()),
    ('encode', OHEncoder()),
    ('classifier', LogisticRegression(C=100.0, multi_class='ovr', solver='liblinear', tol=1e-05))
])

### Redefine Data Partition

With the relevant transformers in place, data pre-processing<br>has been integrated into the machine learning pipeline

Therefore, the data should be retrieved from the original source and re-partitioned

In [None]:
df = load_mushrooms()

X_train, X_test, y_train, y_test = train_test_split(df.drop(labels='class', axis=1), df['class'], random_state=2)

### Model Training

Fit the data to the pipeline

In [None]:
model.fit(X=X_train, y=y_train)

# pickle.dump(obj=model, file=open('./models/final_classifier.p', 'wb'))

# final_classifier = pickle.load(file=open('./models/final_classifier.p', 'rb'))

### Model Scoring

Use the model to generate predictions

In [None]:
y_pred = model.predict(X=X_test)
y_pred

### Model Evaluation

Evaluate the performance of the final model based on standard classification metrics

In [None]:
# Import model evaluation dependencies
from sklearn.metrics import classification_report, confusion_matrix

#### Evaluate against train set

In [None]:
# Predictions for the rows the model was fitted on
y_train_pred = model.predict(X=X_train)

# Classification summary
print(classification_report(y_true=y_train, y_pred=y_train_pred, target_names=['edible', 'poisonous']))

# Confusion matrix
print('\n', pd.DataFrame(data=confusion_matrix(y_true=y_train, y_pred=y_train_pred, labels=['edible', 'poisonous']), index=['Actual Edible', 'Actual Poisonous'], columns=['Predicted Edible', 'Predicted Poisonous']), '\n\n', sep='')

# Print target distribution in y_train
print(y_train.groupby(y_train).count())

#### Evaluate against test set

In [None]:
# Classification summary
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=['edible', 'poisonous']))

# Confusion matrix
print('\n', pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['edible', 'poisonous']), index=['Actual Edible', 'Actual Poisonous'], columns=['Predicted Edible', 'Predicted Poisonous']), '\n\n', sep='')

# Print target distribution in y_test
print(y_test.groupby(y_test).count())

## Part II

### Import Data

Load data about King County house sales

In [None]:
# Read the data from a csv file
df2 = pd.read_csv('./data/kc_house_data.csv')

#### Inspect Data

Preview a sample of the dataset

In [None]:
# Inspect the top 10 rows of the dataset
df2.head(n=10)

#### Summarize Data

Get a sense of the features involved

In [None]:
# Inspect overview of the dataset
df2.info()

In [None]:
# Inspect statistics of the dataset
df2.describe().transpose().round(2)

### Pre-Processing

#### Exploratory Data Analysis (EDA)

In [None]:
# List to keep track of variables to be removed
drop_cols_2 = []

# List to keep track of positively skewed variables
positively_skewed = []

In [None]:
# Check for missing values
df2.isna().sum(axis=0)

# There don't seem to be any missing values

Visualize correlation amongst the original features using a heatmap

In [None]:
def plot_2A():
    """Draw a heatmap of the pairwise correlations between the numeric columns of `df2`."""
    fig, ax = plt.subplots(figsize=(10, 8))
    # Restrict to numeric columns: correlating a frame that still holds text columns
    #   raises on recent pandas versions
    numeric_corr = df2.select_dtypes(include='number').corr()
    sns.heatmap(data=numeric_corr, cmap='RdBu', vmin=-1, vmax=1, ax=ax)
# plot_2A()

Inspect distribution of the individual variables

In [None]:
def plot_2B():
    """Plot a histogram and a box plot side by side for every numeric column of `df2`."""
    numeric_kinds = 'biufc'  # NumPy dtype kinds: bool, signed/unsigned int, float, complex
    for column in df2.columns.values:
        if df2[column].dtype.kind not in numeric_kinds:
            continue
        fig, (hst, bxp) = plt.subplots(ncols=2)
        sns.histplot(data=df2, x=column, ax=hst)
        sns.boxplot(data=df2, y=column, ax=bxp)
# plot_2B()

# Many of the features seem to be positively skewed

# Course of action - logarithmic transformation
# (recorded in `positively_skewed`, the tracking list created above; the name used here
#  before, `transform_cols_2`, is not defined anywhere. The membership check keeps the
#  cell safe to re-run.)
for skewed_col in ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'long', 'sqft_living15', 'sqft_lot15']:
    if skewed_col not in positively_skewed:
        positively_skewed.append(skewed_col)

Inspect correlation between the features and the target variable (price)

In [None]:
def plot_E(df: pd.DataFrame):
    """Plot each column listed in `positively_skewed` before and after a log1p transformation.

    Args:
        df: frame containing the columns named in the module-level `positively_skewed`.
    """
    for i in positively_skewed:
        # (the unused share-of-zeros computation was removed)
        fig, (bef, aft) = plt.subplots(ncols=2)
        bef.set_title(label='Before Logarithmic Transformation')
        aft.set_title(label='After Logarithmic Transformation')
        sns.histplot(data=df, x=i, ax=bef)
        sns.histplot(data=np.log1p(df[i]), ax=aft)
# plot_E(df2)

In [None]:
# Correlation of every numeric column with price, strongest first
# (text columns are excluded because they cannot be correlated)
df2.select_dtypes(include='number').corr()['price'].sort_values(key=lambda x: np.abs(x), ascending=False)

Inspect absolute correlation between features and target variable

In [None]:
def plot_2D():
    """Draw a heatmap of absolute correlations between the numeric columns after feature engineering.

    The raw CSV is re-read and passed through ``FeatureEngineer2`` (defined further
    down in this notebook, so that cell must have been run before calling this).
    """
    # `FeatureEngineering` / `FeatureSelection` do not exist in this notebook; the
    #   transformer is named FeatureEngineer2, and the selector instance was never used
    fe = FeatureEngineer2()
    engineered = fe.fit_transform(pd.read_csv('./data/kc_house_data.csv'))
    # Restrict to numeric columns: the frame still holds the text `date` column
    sns.heatmap(engineered.select_dtypes(include='number').corr().abs(), vmin=0, vmax=1, cmap='Blues')
# plot_2D()

Inspect id feature

In [None]:
# Check id data type
# (double-quoted f-strings: re-using the same quote inside the braces is a
#  SyntaxError on Python versions before 3.12)
print(f"ID data type: {df2['id'].dtype}")

# Compare the number of ids to the total number of records 
print(f"Number of unique IDs: {pd.unique(df2['id']).size}")
print(f'Total number of records: {df2.shape[0]}')

# Check correlation between id and the rest of the variables
# (numeric columns only: text columns cannot be correlated)
print(df2.select_dtypes(include='number').corr()['id'])


# id seems redundant

# Course of action - drop column
# (guarded so that re-running the cell does not register the column twice)
if 'id' not in drop_cols_2:
    drop_cols_2.append('id')

Inspect zipcode feature

In [None]:
# Check zipcode data type
# (double-quoted f-strings: re-using the same quote inside the braces is a
#  SyntaxError on Python versions before 3.12)
print(f"zipcode data type: {df2['zipcode'].dtype}")

# Compare the number of zipcodes to the total number of records 
print(f"Number of unique zipcodes: {pd.unique(df2['zipcode']).size}")
print(f'Total number of records: {df2.shape[0]}')

# Check correlation between zipcode and the rest of the variables
# (numeric columns only: text columns cannot be correlated)
print(df2.select_dtypes(include='number').corr()['zipcode'])


# zipcode does not seem redundant

# Course of action - no action

In [None]:
def plot_F():
    """Scatter each column in `positively_skewed` against price, with and without square roots.

    Three panels per column: raw feature vs raw price, sqrt(feature) vs sqrt(price),
    and raw feature vs sqrt(price).
    """
    for feature in positively_skewed:
        fig, (bef, aft, aft_ag) = plt.subplots(ncols=3)
        price_root = np.sqrt(df2['price'])
        sns.scatterplot(data=df2, x=feature, y='price', ax=bef)
        sns.scatterplot(x=np.sqrt(df2[feature]), y=price_root, ax=aft)
        sns.scatterplot(x=(df2[feature]), y=price_root, ax=aft_ag)

#### Feature Engineering

There seems to be useful extractable data in the `date` feature

In [None]:
# Extract year, month and day from the date feature
df2_date = pd.to_datetime(df2['date'], yearfirst=True)
date_index = pd.DatetimeIndex(data=df2_date)
for date_part in ('year', 'month', 'day'):
    df2[date_part] = getattr(date_index, date_part)

# Date variable seems redundant now

# Course of action - drop column
drop_cols_2.append('date')

#### Feature Selection

In [None]:
# Review columns to be dropped
drop_cols_2

There are 2 columns to be removed (`id`, `date`)

In [None]:
# Remove columns
# (re-bound instead of dropped in place, and tolerant of already-missing columns,
#  so that re-running the cell does not raise a KeyError)
df2 = df2.drop(labels=drop_cols_2, axis=1, errors='ignore')

### Data Partitioning

### Model Training

In [None]:
# Columns treated as positively skewed from here on (the target included)
positively_skewed = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_living15', 'sqft_lot15']

df_another = df2.copy()
dfp = df_another['price']

# Correlations on the original scale
sns.heatmap(data=df2[positively_skewed].corr(), cmap='RdBu', vmin=-1, vmax=1, annot=True)

# Correlations after a log1p transformation, drawn on a separate figure
df_another = np.log1p(df_another[positively_skewed])
fig, ax = plt.subplots()
sns.heatmap(data=df_another.corr(), cmap='RdBu', vmin=-1, vmax=1, ax=ax, annot=True)

### Algorithm Selection & Hyper-Parameter Tuning

In [None]:
# Candidate regression algorithms
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

Determine best regression algorithm using GridSearch

In [None]:
from sklearn.preprocessing import QuantileTransformer, FunctionTransformer, RobustScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.base import TransformerMixin

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import validation_curve


class DummyEstimator(BaseEstimator):
    """Placeholder for the final pipeline step; the grid search swaps real estimators in.

    The methods accept the usual ``X`` / ``y`` arguments (both optional, so the
    previous zero-argument calls still work) so the placeholder does not raise a
    TypeError if it is ever fitted or scored directly.
    """

    def fit(self, X=None, y=None):
        # Nothing to learn; return self as estimators conventionally do
        return self

    def score(self, X=None, y=None):
        # A placeholder has no meaningful score
        return None

class DummyTransformer(BaseEstimator, TransformerMixin):
    """Placeholder transformer that leaves its input unchanged.

    The methods accept the usual ``X`` / ``y`` arguments (all optional, so the
    previous zero-argument calls still work) and hand the data through instead
    of returning ``None``, so the placeholder can sit inside a pipeline.
    """

    def fit(self, X=None, y=None):
        # Nothing to learn
        return self

    def fit_transform(self, X=None, y=None, **fit_params):
        return self.fit(X, y).transform(X)

    def transform(self, X=None):
        return X




# expt - use df_another
# Seeded so that the split, and everything tuned on it, is reproducible across runs
X3_train, X3_test, y3_train, y3_test = train_test_split(df2.drop('price', axis=1), df2['price'], test_size=0.2, random_state=1)

u = Pipeline(steps=[
    # ('trans', CustomTransformer()),
    ('scaler', StandardScaler()),
    ('reg', DummyEstimator())  # placeholder, replaced by every grid entry below
])


params_ = [
    {
        'reg': [LinearRegression()],
        # `normalize` was dropped from the grid: current scikit-learn releases no longer
        #   accept it on LinearRegression, and the pipeline already standardises the inputs
        'reg__fit_intercept': [True, False]
    },
    {
        'reg': [Lasso(), Ridge()],
        'reg__alpha': np.logspace(-5, 3, 6)
    },
    {
        'reg': [DecisionTreeRegressor()],
        'reg__max_depth': np.arange(5, 11)
    },
    {
        'reg': [KNeighborsRegressor()],
        'reg__n_neighbors': np.arange(5, 11)
    }
]


lol = GridSearchCV(u, params_, n_jobs=3)
lol.fit(X3_train, y3_train)

# Candidates ordered from best to worst mean test score
results = pd.DataFrame(lol.cv_results_).sort_values(by='rank_test_score')


In [None]:
# Save result
# pickle.dump(obj=best_reg_algo, file=open('./models/best_reg_algo.p', 'wb'))

# Inspect result (the context manager closes the file handle once unpickling is done)
# NOTE: pickle.load can execute arbitrary code - only load files produced by this notebook
with open('./models/best_reg_algo.p', 'rb') as f:
    best_reg_algo_loaded = pickle.load(file=f)

print(best_reg_algo_loaded.best_estimator_)
best_reg_algo_loaded.cv_results_

Determine best hyperparameters for selected algorithm using GridSearch

In [None]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Drop the `long` column and apply log1p to the columns listed in `positively_skewed`.

    `positively_skewed` is read from module level each time `transform` runs.
    The input frame is not modified.
    """

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data
        return self

    def transform(self, X):
        column_names = X.columns.values
        transformed = X.copy()

        # `long` is removed outright, even if it is listed as skewed
        if 'long' in column_names:
            transformed.drop(labels='long', axis=1, inplace=True)

        # Every other listed column is replaced by log(1 + value)
        for name in column_names:
            if name != 'long' and name in positively_skewed:
                transformed[name] = np.log1p(X[name])
        return transformed

from sklearn.model_selection import RandomizedSearchCV

dtreepipe = Pipeline(steps=[
    # ('transformer', CustomTransformer()),
    ('scaler', StandardScaler()),
    ('regressor', DecisionTreeRegressor())
])

dtreeparams = {
    'regressor__max_depth': np.arange(2, 25, 2),
    'regressor__min_samples_split': np.arange(10, 31, 10),
    'regressor__min_samples_leaf': np.arange(10, 300, 50)
}

# random_state fixes which parameter combinations are sampled, so repeated runs
#   of the notebook try the same candidates
dtreeCV = RandomizedSearchCV(estimator=dtreepipe, param_distributions=dtreeparams, cv=3, n_jobs=-1, random_state=1)

dtreeCV.fit(X3_train, y3_train)
dtreeCV.best_params_

In [None]:
# Save result
# pickle.dump(obj=best_reg_hyper_params, file=open('./models/best_reg_hyper_params.p', 'wb'))

# Inspect result
# (reads the hyper-parameter search file named in the save line above; this cell used to
#  re-open best_reg_algo.p and then display the algorithm search's results instead)
# NOTE: pickle.load can execute arbitrary code - only load files produced by this notebook
with open('./models/best_reg_hyper_params.p', 'rb') as f:
    best_reg_hyper_params_loaded = pickle.load(file=f)

print(best_reg_hyper_params_loaded.best_estimator_)
best_reg_hyper_params_loaded.cv_results_

In [None]:
results.sort_values(by=['rank_test_score', 'std_test_score']).iloc[0].params

In [None]:
# results = pickle.load(file=open('./models/grid_search_reg.p', 'rb'))
# pickle.dump(obj=results, file=open('./models/grid_search_reg.p', 'wb'))


Further tuning

In [None]:
### BEST CASE
# ogp = Pipeline(steps=[
#     ('scale', StandardScaler()),
#     ('reg', BaggingRegressor(GradientBoostingRegressor(max_depth=20, min_samples_leaf=10, min_samples_split=30), n_jobs=-1)
# )])

# Bagged gradient-boosting pipeline: engineer date features, select features,
#   apply CustomTransformer, then regress.
# NOTE(review): FeatureEngineer2 and FeatureSelector2 are defined further down in this
#   notebook and BaggingRegressor is only imported in the next cell, so this cell fails
#   on a fresh top-to-bottom run - confirm the intended cell order.
# NOTE(review): X3_train is split from df2 after its `date` column was dropped, while
#   FeatureEngineer2 reads a `date` column - confirm which data this pipeline expects.
ogp = Pipeline(steps=[
    ('feature_engineer', FeatureEngineer2()),
    ('feature_selector', FeatureSelector2()),
    ('scale', CustomTransformer()),
    ('reg', BaggingRegressor(GradientBoostingRegressor(max_depth=20, min_samples_leaf=10, min_samples_split=30), n_jobs=-1)
)])

# Train on sqrt(price); predictions are squared back to the original scale
bestboi = TransformedTargetRegressor(regressor=ogp, func=np.sqrt, inverse_func=np.square, check_inverse=False)

bestboi.fit(X3_train, y3_train)

bestboi.score(X3_test, y3_test)

# pickle.dump(obj=bestboi, file=open('./models/best_reg_model.p', 'wb'))

In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor

class CustomTransformer2(BaseEstimator, TransformerMixin):
    """Transformer stub: returns an unmodified copy of its input."""

    def __init__(self):
        # No parameters to configure
        pass

    def fit(self, X: pd.DataFrame, y=None):
        # Stateless: nothing is learned from the data
        return self

    def transform(self, X):
        return X.copy()

X3_train, X3_test, y3_train, y3_test = train_test_split(df2.drop('price', axis=1), df2['price'], test_size=0.25)

# Gradient boosting behind a standard scaler
latest_dtree = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('reg', GradientBoostingRegressor(max_depth=20, min_samples_leaf=10, min_samples_split=30))
])

# Same regressor without any scaling
noscale = Pipeline(steps=[
    ('reg', GradientBoostingRegressor(max_depth=20, min_samples_leaf=10, min_samples_split=30))
])

# Variants trained on sqrt(price), with predictions squared back
log_price = TransformedTargetRegressor(regressor=latest_dtree, func=np.sqrt, inverse_func=np.square, check_inverse=False)

rt_nos = TransformedTargetRegressor(regressor=noscale, func=np.sqrt, inverse_func=np.square, check_inverse=False)

# Fit each variant and print: label, train/test R2 gap, train R2, test R2
mos = ['nope', 'trans', 'noscale', 'noscaler trans']
for j, mo in enumerate([latest_dtree, log_price, noscale, rt_nos]):
    mo.fit(X3_train, y3_train)
    tr_sc = r2_score(y3_train, mo.predict(X3_train))
    ts_sc = r2_score(y3_test, mo.predict(X3_test))
    gap = tr_sc - ts_sc
    print(mos[j], gap, tr_sc, ts_sc)

### Combining everything

### Building the pipeline
<br>
Build the machine learning pipeline, using

*   a custom feature-engineering transformer,
*   a custom feature-selection transformer,
*   a custom logarithmic transformer,
*   a standard scaler,
*   the most consistent algorithm (gradient boosting regressor),
*   the best performing hyperparameters

To further improve performance and reduce overfitting,<br>
the target variable will be transformed too (sqrt)

In [None]:
def extract_date_parts(df: pd.DataFrame, col: str, **kwargs):
    """Split a date column into its year, month and day components.

    Args:
        df: frame holding the date column.
        col: name of the date column.
        **kwargs: forwarded to ``pd.DatetimeIndex``.

    Returns:
        A ``(year, month, day)`` tuple, one entry per component.
    """
    stamps = pd.DatetimeIndex(df[col], **kwargs)
    return tuple(getattr(stamps, part) for part in ('year', 'month', 'day'))

class FeatureEngineer2(BaseEstimator, TransformerMixin):
    """Pipeline step that adds `year`, `month` and `day` columns derived from `date`."""

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data
        return self

    def transform(self, X):
        # Work on a new frame so the caller's data stays untouched
        year, month, day = extract_date_parts(df=X, col='date', yearfirst=True)
        return X.assign(year=year, month=month, day=day)

In [None]:
def drop_redundant_features(df: pd.DataFrame):
    """Return a copy of `df` without `id`, `zipcode` and `date`; columns that are absent are skipped."""
    redundant = ['id', 'zipcode', 'date']
    return df.drop(columns=redundant, errors='ignore')

def exclude_target_variable(df: pd.DataFrame):
    """Return a copy of `df` without the `price` column (raises KeyError if it is absent)."""
    return df.drop(columns='price')

class FeatureSelector2(BaseEstimator, TransformerMixin):
    """Pipeline step that drops the redundant columns and, if it was present at fit time, `price`.

    Attributes set on the instance:
        has_target_variable: whether the frame passed to `fit` contained a `price` column.
    """

    def __init__(self):
        # Stored on the instance (it used to be a throw-away local variable), so that
        #   transform() works even when the fitted frame had no `price` column
        self.has_target_variable: bool = False

    def fit(self, X, y=None):
        # Re-evaluated on every fit so that a refit on different data resets the flag
        self.has_target_variable = 'price' in X.columns.values
        return self

    def transform(self, X):
        X_copy = drop_redundant_features(X)
        if self.has_target_variable:
            X_copy = exclude_target_variable(X_copy)
        return X_copy

In [None]:
# Import scaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Build pipeline
pipe2 = Pipeline(steps=[
    ('feature_engineer', FeatureEngineer2()),
    ('feature_selector', FeatureSelector2()),
    # Logarithmic step: `LogTransformer` is not defined anywhere in this notebook;
    #   CustomTransformer is the class that applies log1p to the skewed columns
    ('log_transformer', CustomTransformer()),
    ('scaler', StandardScaler()),
    ('regressor', GradientBoostingRegressor(max_depth=20, min_samples_leaf=10, min_samples_split=30))
])

In [None]:
# Wrap pipeline in a target transformer
model2 = TransformedTargetRegressor(regressor=pipe2, func=np.sqrt, inverse_func=np.square, check_inverse=False)

### Model Training

Fit the data to the model

In [None]:
# Partition the raw data first: X2_train / X2_test / y2_train / y2_test were never created.
# The pipeline performs feature engineering and selection itself, so it is given
#   the original columns (including `date`) rather than the pre-processed df2.
df2_raw = pd.read_csv('./data/kc_house_data.csv')
X2_train, X2_test, y2_train, y2_test = train_test_split(df2_raw.drop(labels='price', axis=1), df2_raw['price'], test_size=0.2, random_state=2)

model2.fit(X=X2_train, y=y2_train)

In [None]:
# Save the model
# (the context manager flushes and closes the file before the next cell reads it back)
with open('./models/final_regressor.p', 'wb') as f:
    pickle.dump(obj=model2, file=f)

In [None]:
# Load the model (the context manager closes the file handle once unpickling is done)
# NOTE: pickle.load can execute arbitrary code - only load files produced by this notebook
with open('./models/final_regressor.p', 'rb') as f:
    final_regressor = pickle.load(file=f)

### Model Scoring

Use the model to generate predictions

In [None]:
y_pred_2 = final_regressor.predict(X2_test)
y_pred_2

### Model Evaluation

Evaluate the performance of the final model based on standard regression metrics

In [None]:
# Import regression metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
def regression_report(y_true, y_pred, type: str = 'unspecified'):
    """Print the MSE, MAE and R2 of a set of predictions.

    Args:
        y_true: observed target values.
        y_pred: predicted target values.
        type: label shown in the report header (e.g. which split was scored).
            Optional, so that calls passing only the two value arguments work.
    """
    print(
f'''Regression Report ({type})
================================
MSE:\t\t{np.round(mean_squared_error(y_true=y_true, y_pred=y_pred), 2)}
MAE:\t\t{np.round(mean_absolute_error(y_true=y_true, y_pred=y_pred), 2)}
R2:\t\t{np.round(r2_score(y_true=y_true, y_pred=y_pred), 4)}
''')

#### Evaluate against training data

In [None]:
# `type` labels the report header; it has to be supplied here
regression_report(y2_train, final_regressor.predict(X2_train), type='train')

#### Evaluate against testing data

In [None]:
# `type` labels the report header; it has to be supplied here
regression_report(y2_test, y_pred_2, type='test')

#### Evaluate against entire dataset (visualization)

In [None]:
# Score the final model on growing prefixes of a freshly shuffled copy of the full dataset.
# The raw file is passed to the model unchanged: its pipeline does the feature engineering
#   and selection itself and needs the `date` column, so the manual pre-processing that
#   used to live here was removed. The model scored is `final_regressor`; the name used
#   before, `mb`, is not defined anywhere in this notebook.
df_new = pd.read_csv('./data/kc_house_data.csv')

for noth in range(20):
    # New random row order for every figure
    df_new = df_new.sample(frac=1)
    fig, ax = plt.subplots()
    scores = []
    buffs = []
    for buf in np.arange(2, 50, 2):
        buffs.append(buf)
        # R2 on the first `buf` rows only
        scores.append(final_regressor.score(df_new.drop('price', axis=1).iloc[:buf,:], df_new['price'].iloc[:buf]))
    sns.lineplot(x=buffs, y=scores, ax=ax)
    ax.set_ylim(0, 1)
    ax.set_xlabel(xlabel='Number of records scored')
    ax.set_ylabel(ylabel='R2')
    # Reference lines: R2 = 0.9 (orange), R2 = 0.8 (red) and a sample size of 10 (grey)
    sns.lineplot(x=[0, 50], y=[0.9] * 2, color='orange', ax=ax)
    sns.lineplot(x=[0, 50], y=[0.8] * 2, color='red', ax=ax)
    sns.lineplot(x=[10] * 2, y=[0, 1.], color='grey', ax=ax)

## Conclusions