In [None]:
# cloning GitHub Repo
!git clone https://github.com/chase-kusterer/Computational-Analytics.git


# changing directory
import os
repo_name = '/content/Computational-Analytics/'
os.chdir(repo_name)


# checking results
print(f"Current working directory changed to: {os.getcwd()}")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

<br>
<h2>Script 06 | K-Nearest Neighbors and Distance Standardization</h2>
<h4>DAT-5390 | Computational Data Analytics with Python</h4>
Chase Kusterer - Faculty of Analytics<br>
Hult International Business School<br><br><br>

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

<h3>Part I: Preparing for Model Building</h3><br>
In this script, we will move into distance-based modeling with k-Nearest Neighbors (KNN). Like OLS regression, KNN is a widely used model type because:

* Predictions are based on an intuitive concept.
* It works in both regression and classification settings.

<br>Run the following code to import libraries and dataset.

In [None]:
# installing baserush on colab
# (quick_neighbors is imported from baserush.optimize in the import cell)
%pip install baserush

<br>

In [None]:
# importing libraries
import pandas as pd                                  # data science essentials
import matplotlib.pyplot as plt                      # data visualization
import seaborn as sns                                # enhanced data viz
import numpy as np                                   # mathematical essentials
from sklearn.model_selection import train_test_split # train/test split


# new libraries
from sklearn.neighbors import KNeighborsRegressor # KNN for Regression
from sklearn.preprocessing import StandardScaler  # standard scaler
from baserush.optimize  import quick_neighbors    # stable neighbors modeling


# widening the pandas display limits so large DataFrames print in full
pd.options.display.max_rows    = 500
pd.options.display.max_columns = 500
pd.options.display.width       = 1000

# printing numpy floats without scientific notation
np.set_printoptions(suppress = True)


# location of the dataset, relative to the current working directory
file = './datasets/housing_feature_rich.xlsx'

# loading the first worksheet; its first row supplies the column names
housing = pd.read_excel(file, sheet_name = 0, header = 0)


# this code will not produce an output

<br>

In [None]:
# previewing the first five observations
housing.head(5)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

Run the following code to load the candidate feature sets.

In [None]:
#################################
## original data (full models) ##
#################################
# all x-data: every column except the two response variables
x_all = housing.drop(columns = ['Sale_Price', 'log_Sale_Price']).columns.tolist()

# continuous x-data: columns 'Lot_Area' through 'Porch_Area' (label slice,
# both endpoints included)
x_original = housing.loc[ : , 'Lot_Area' : 'Porch_Area' ].columns.tolist()



###############################
## feature sets (original y) ##
###############################
# best base model
x_base = ['Mas_Vnr_Area', 'Total_Bsmt_SF', 'First_Flr_SF', 'Second_Flr_SF',
          'Garage_Area']


# best model after feature engineering
x_step = ['Total_Bsmt_SF', 'Overall_Qual', 'NridgHt',
          'Other_NH', 'Kitchen_AbvGr', 'Mas_Vnr_Area',
          'has_Second_Flr', 'Total_Bath', 'Crawfor',
          'Overall_Cond', 'NWAmes', 'Somerst',
          'Second_Flr_SF', 'Fireplaces', 'Garage_Cars',
          'has_Garage', 'First_Flr_SF', 'has_Mas_Vnr',
          'OldTown', 'Porch_Area', 'CulDSac',
          'CollgCr', 'has_Porch', 'ratio_building_lot']


##################################
## feature sets (logarithmic y) ##
##################################
# best model after feature engineering (log y)
x_step_log_y = ['Gr_Liv_Area', 'Overall_Qual', 'Garage_Cars',
                'Total_Bsmt_SF', 'log_Lot_Area', 'OldTown',
                'Overall_Cond', 'log_Gr_Liv_Area', 'Kitchen_AbvGr',
                'Total_Bath', 'has_Second_Flr', 'Second_Flr_SF',
                'NridgHt', 'Fireplaces', 'NWAmes',
                'Somerst', 'Porch_Area', 'CollgCr',
                'Crawfor', 'First_Flr_SF', 'Edwards',
                'CulDSac', 'm_Mas_Vnr_Area']


########################
## response variables ##
########################
# column names of the two candidate y-features
original_y, log_y = 'Sale_Price', 'log_Sale_Price'

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

Run the following code to create a <strong>standardized</strong> version of the dataset. Note that this will not affect the candidate feature sets above, but will give us twice as many (standardized and non-standardized).

In [None]:
# preparing to standardize every x-feature
# (x_all holds every column name except the two response variables)
housing_data = housing[x_all]

# INSTANTIATING a StandardScaler() object
scaler = StandardScaler()


# FITTING the scaler with the data
scaler.fit(housing_data)


# TRANSFORMING our data after fit
x_scaled = scaler.transform(housing_data)


# converting scaled data into a DataFrame
# (no column labels are passed here, so the result has default integer
#  column labels instead of the feature names)
x_scaled_df = pd.DataFrame(x_scaled)


# checking the results (summary statistics rounded to two decimals)
x_scaled_df.describe(include = 'number').round(decimals = 2)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

Notice that the headers (feature names) have disappeared. Let's add them back and then analyze how variance has changed after scaling.

In [None]:
# restoring the feature names on the scaled DataFrame
x_scaled_df.columns = housing_data.columns

# variance of the first five features, before and after scaling
var_before = np.var(housing_data.iloc[ : , 0:5 ], axis = 0)
var_after  = np.var(x_scaled_df.iloc[ : , 0:5 ], axis = 0)

# comparing pre- and post-scaling variance side by side
print(f"""
Dataset BEFORE Scaling
----------------------
{var_before}


Dataset AFTER Scaling
----------------------
{var_after}
""")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h3>Correlation Analysis: Pre- and Post-Standardization</h3><br>
Let's observe what happens to correlation after standardizing the dataset. In order to best see the results, we will limit our analysis to a small set of features.

In [None]:
# features compared in both correlation matrices
corr_features = ['Garage_Cars',
                 'Overall_Qual',
                 'Total_Bsmt_SF',
                 'NridgHt',
                 'Kitchen_AbvGr',
                 'has_Second_Flr']


##############################################################################
# Unscaled Dataset
##############################################################################

# subsetting the original dataset
housing_subset = housing_data.loc[ : , corr_features]


# UNSCALED correlation matrix
df_corr = housing_subset.corr().round(2)


# heatmap of UNSCALED correlations
sns.heatmap(df_corr,
            cmap       = 'coolwarm',
            square     = True,
            annot      = True,
            cbar       = False,
            linecolor  = 'black',
            linewidths = 0.5)


# titling the plot (must come BEFORE plt.show(), otherwise the title is
# applied to the next figure instead of this one)
plt.title("BEFORE Standardization")
plt.show()


##############################################################################
# Scaled Dataset
##############################################################################

# SCALED correlation matrix
df_scaled_corr = x_scaled_df.loc[ : , corr_features].corr().round(2)


# heatmap of SCALED correlations
sns.heatmap(df_scaled_corr,
            cmap       = 'coolwarm',
            square     = True,
            annot      = True,
            cbar       = False,
            linecolor  = 'black',
            linewidths = 0.5)


# titling the plot
plt.title("AFTER Standardization")
plt.show()

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
Notice how the correlations remain unchanged. Not a single linear relationship has changed. However, standardization has profound effects on distance-based algorithms, as we will discover below.
<br><br>
<h2>Part III: k-Nearest Neighbors with Non-Standardized Data</h2><br>
<strong>a) Develop training and testing sets using any of the x-feature sets and a y-feature.</strong>

In [None]:
# selecting feature sets
# (exercise: replace each blank with the x-data and y-data to model)
x_data = _____
y_data = _____


# this is the exact code we were using before
# (25% of the observations are held out for testing; the fixed
#  random_state keeps the split the same on every run)
x_train, x_test, y_train, y_test = train_test_split(
            x_data,
            y_data,
            test_size    = 0.25,
            random_state = 702 )


# this code will not produce an output

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<h3>KNN with Non-Standardized Data</h3><br>
<strong>b) Fill in the blanks below to develop a k-Nearest Neighbors model.</strong>

In [None]:
# INSTANTIATING a KNN model object (a single neighbor to start)
knn_reg = KNeighborsRegressor(algorithm = 'auto',
                              n_neighbors = 1)


# FITTING to the training data
# (exercise: fill in the method name and its two arguments)
knn_fit = knn_reg._____(_____, _____)


# PREDICTING on new data
# (exercise: fill in the method name and the data to predict on)
knn_reg_pred = knn_fit._____(_____)


# SCORING the results
# (each score is rounded to four decimals; the train-test gap is the
#  absolute difference between the two rounded scores)
knn_reg_score_train = round(knn_reg._____(_____, _____), ndigits = 4)
knn_reg_score_test  = round(knn_reg._____(_____, _____), ndigits = 4)
knn_reg_test_gap = round(abs(knn_reg_score_train - knn_reg_score_test), ndigits = 4)


# checking results
print(f"""
K-Nearest Neighbors
-------------------
Training Score: {knn_reg_score_train}
Testing Score : {knn_reg_score_test}
Train-Test Gap: {knn_reg_test_gap}
""")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<strong>c) How Many Neighbors?</strong><br>
We can spend time testing out several different neighbor values, but it would be much more efficient to develop a function to automate this, as in the code below.

In [None]:
## optimal neighbors ##
def opt_neighbors(x_train   = x_train,
                  y_train   = y_train,
                  x_test    = x_test,
                  y_test    = y_test,
                  max_neighbors = 50):
    
    """
    This function visualizes R-Square values for the K-Nearest Neighbors
    algorithm and prints the number of neighbors with the smallest
    train-test gap.
    
    One KNeighborsRegressor (with p = 1) is fit for every neighbor count
    from 1 through max_neighbors (inclusive). Training and testing R-Square
    are plotted against the neighbor count, and the count that minimizes
    abs(training score - testing score) is reported as optimal.
    
    Note: the default arguments are bound when this cell is run, so after
    re-creating the train/test split, either re-run this cell or pass the
    new data explicitly.
    
    
    Parameters
    ----------
    x_train       | training data for x | default: x_train
    y_train       | training data for y | default: y_train
    x_test        | testing data for x  | default: x_test
    y_test        | testing data for y  | default: y_test
    max_neighbors | maximum number of neighbors to visualize | default: 50
    
    
    Raises
    ------
    ValueError    | if max_neighbors is less than 1
    """

    # at least one model is needed to plot and to find an optimum
    if max_neighbors < 1:
        raise ValueError("max_neighbors must be at least 1")


    # lists to store metrics
    train_rsq = []
    test_rsq  = []
    tt_gap    = []
    
    
    # neighbor counts to evaluate: 1 through max_neighbors (inclusive), so
    # the x-axis of the plot matches the n_neighbors actually used
    neighbors = range(1, max_neighbors + 1)
    
    
    # fitting and scoring one model per neighbor count
    for n_neighbors in neighbors:

        # instantiating KNN (p = 1 is passed explicitly, as in the original)
        clf = KNeighborsRegressor(n_neighbors = n_neighbors, p = 1)

        # fitting to the data
        clf.fit(x_train, y_train)

        # scoring once per dataset and reusing the results below
        train_score = clf.score(x_train, y_train)
        test_score  = clf.score(x_test, y_test)

        # storing the training set R-Square
        train_rsq.append(train_score)

        # recording the testing set R-Square
        test_rsq.append(test_score)

        # train-test gap
        tt_gap.append(abs(train_score - test_score))


    # plotting the visualization
    fig, ax = plt.subplots(figsize=(12,8))
    ax.plot(neighbors, train_rsq, label = "R-Square (Training Set)")
    ax.plot(neighbors, test_rsq,  label = "R-Square (Testing Set)")
    ax.set_ylabel("Coefficient of Determination")
    ax.set_xlabel("Number of Neighbors")
    ax.legend()
    plt.show()


    # finding the optimal number of neighbors (smallest train-test gap;
    # ties resolve to the smallest neighbor count)
    best_neighbors = neighbors[tt_gap.index(min(tt_gap))]
    print(f"""The optimal number of neighbors is {best_neighbors}""")
    
    
# this code will not produce an output

<br>

In [None]:
# visualizing KNN results for up to 50 neighbors
opt_neighbors(x_train, y_train, x_test, y_test, max_neighbors = 50)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<strong>d)</strong> Fill in the blanks below to develop a KNN model using the optimal number of neighbors.

In [None]:
# INSTANTIATING a model with the optimal number of neighbors
# (exercise: fill in the model class and the neighbor count)
knn_opt = _____(algorithm = 'auto',
                n_neighbors = _____)



# FITTING the model based on the training data
knn_opt_fit = knn_opt._____(_____, _____)



# PREDICTING on new data
knn_opt_pred = _____._____(x_test)


# SCORING the results
# (each score is rounded to four decimals; the train-test gap is the
#  absolute difference between the two rounded scores)
knn_opt_score_train = round(knn_opt.score(_____, _____), ndigits = 4)
knn_opt_score_test  = round(knn_opt.score(_____, _____), ndigits = 4)
knn_opt_test_gap    = round(abs(knn_opt_score_train - knn_opt_score_test), ndigits = 4)


# checking results
print(f"""
K-Nearest Neighbors
-------------------
Training Score: {knn_opt_score_train}
Testing Score : {knn_opt_score_test}
Train-Test Gap: {knn_opt_test_gap}
""")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<h2>Part IV: Quick KNN with BaseRush</h2>

In [None]:
# displaying the documentation for quick_neighbors
help(quick_neighbors)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<strong>a) Complete the code below to develop a KNN regression model using <em>quick_neighbors(&nbsp;)</em>.</strong>

In [None]:
# applying quick neighbors on non-standardized data
# (exercise: fill in the x- and y-data; see help(quick_neighbors) for the
#  meaning of each argument)
knn_model = quick_neighbors(x_data        = _____,
                            y_data        = _____,
                            model_type    = KNeighborsRegressor,
                            max_neighbors = 50,
                            standardize   = False,
                            visualize     = True,
                            verbose       = True)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<h2>Part V: k-Nearest Neighbors with Standardized Data</h2><br>
<strong>a)</strong> Complete the code below to develop a KNN regression model on standardized data using <em>quick_neighbors(&nbsp;)</em>.

In [None]:
# applying quick neighbors on standardized data
# (exercise: fill in the blanks; y_data is reused from the train/test
#  split cell)
# NOTE: this reuses the name knn_model, replacing the result of the
# non-standardized run
knn_model = quick_neighbors(x_data        = _____,
                            y_data        = y_data,
                            model_type    = KNeighborsRegressor,
                            max_neighbors = 50,
                            standardize   = _____,
                            visualize     = _____,
                            verbose       = _____)

<br>

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

A great <a href="https://www.youtube.com/watch?v=HVXime0nQeI">video on KNN can be found here</a>.
Also, more linear model types can be found in <a href="https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model">scikit-learn's linear model documentation</a>.
<br>

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<h2>Part VI: Exploring Additional Feature Sets</h2><br>
We have several different feature sets to test out with K-Nearest Neighbors (listed below). Try these out in the code below and create some of your own to find the best design for this type of algorithm.
<br>

<strong>X-feature Sets:</strong> [x_all, x_original, x_base, x_step, x_step_log_y]
<br><strong>y-features:</strong> [original_y, log_y]

In [None]:
# write your code below (quick_neighbors)





<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

~~~


____ ____ ___ ___ _ _  _ ____          
[__  |___  |   |  | |\ | | __          
___] |___  |   |  | | \| |__]          
                                       
___ _  _ ____                          
 |  |__| |___                          
 |  |  | |___                          
                                       
____ ___ ____ _  _ ___  ____ ____ ___  
[__   |  |__| |\ | |  \ |__| |__/ |  \ 
___]  |  |  | | \| |__/ |  | |  \ |__/ 
                                       
_  _ _ ____ _  _   /                   
|__| | | __ |__|  /                    
|  | | |__] |  | .                     
                                       



~~~

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

<br>