# <center> K-Nearest Neighbors (KNN) Regressor:  Boston Housing Dataset </center>
<center> University of Denver </center>
<center> Eric Browne </center>

# Loading the Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
## Import and load the Boston housing dataset.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release — confirm the pinned
# environment before running.
from sklearn.datasets import load_boston
# `boston` is the loaded dataset object; later cells read its 'data', 'target',
# 'DESCR' and 'feature_names' entries.
boston = load_boston()

In [None]:
# Show the available keys, the dataset description and the feature names,
# one item per line.
print(
    boston.keys(),
    boston['DESCR'],
    boston['feature_names'],
    sep="\n",
)

In [None]:
# Feature matrix (to be scaled and split into train/test), labelled with the
# dataset's own feature names.
features = pd.DataFrame(boston['data'], columns=boston.feature_names)

# Target values as a single-column frame (to be split into train/test).
labels = pd.DataFrame(boston['target'], columns=['MEDV'])

# Preview the raw (unscaled) features. This must be the last expression in the
# cell: a bare expression in the middle of a cell is evaluated but never shown.
features.head()

In [None]:
# Create the standardiser and fit it on every feature column except 'RAD'.
# NOTE(review): the fit uses the full dataset, before the train/test split
# performed later in the notebook, so test rows contribute to the scaling
# statistics — consider fitting on the training split only.
scaler = StandardScaler()
scaler.fit(features.drop(columns=['RAD']))


# Data Preprocessing 

In [None]:
# Standardise every feature except 'RAD' in place with the fitted scaler.
# NOTE(review): this overwrites `features`, so re-running the cell rescales
# values that are already scaled — run the notebook top to bottom.
scaled_columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                  'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']
features[scaled_columns] = scaler.transform(features[scaled_columns])

In [None]:
# Preview the first rows of `features` after the scaling step.
features.head()

# Data Splitting 

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
features_train, features_test, label_train, label_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=420,
)

# Report the shape of each resulting piece, one per line.
print(
    f"features_train: {features_train.shape}",
    f"features_test: {features_test.shape}",
    f"label_train: {label_train.shape}",
    f"label_test: {label_test.shape}",
    sep="\n",
)

In [None]:
# Preview the first rows of the training labels (single 'MEDV' column).
label_train.head()

# Model Building 

In [None]:
## Baseline KNN (k-nearest-neighbour) regressor with default hyperparameters.
knn = KNeighborsRegressor()
knn.fit(features_train, label_train)

# Predict on the data the model was fitted on to get a training-set error.
knn_train_preds = knn.predict(features_train)

# `mean_squared_error` comes from sklearn.metrics (imported in the top cell);
# its documented argument order is (y_true, y_pred).
print(f"MSE of KNN on training set: {mean_squared_error(label_train, knn_train_preds)}")

In [None]:
## Hyperparameters explored by the grid search:
# 1. weights:   'uniform' or 'distance'
# 2. p:         1 or 2
# 3. algorithm: 'auto', 'ball_tree', 'kd_tree' or 'brute'
parameters = [
    {
        'weights': ['uniform', 'distance'],
        'p': [1, 2],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }
]

# 6-fold cross-validated search over the grid, scored by negative MSE and
# fitted on the training split only.
gridCV = GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    cv=6,
    scoring='neg_mean_squared_error',
)
gridCV.fit(features_train, label_train)

In [None]:
# Hyperparameter combination selected by the grid search.
best_params=gridCV.best_params_
print(best_params)

# Model Evaluation 

In [None]:
# Keep the estimator chosen by the grid search as the final model and show it.
final_model = gridCV.best_estimator_
print(final_model)

In [None]:
# Evaluate the tuned model on the held-out test split.
final_preds = final_model.predict(features_test)

# `mean_squared_error` comes from sklearn.metrics (imported in the top cell);
# its documented argument order is (y_true, y_pred).
print(f'Final MSE on test set: {mean_squared_error(label_test, final_preds)}')

# Conclusion 

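**The gap between the test MSE and the training MSE is only 0.173, so I do not think the model is overfitting; it is simply a decent all-around model. This is most likely because houses with very similar characteristics also tend to sell for similar prices, which is exactly what a nearest-neighbor regressor exploits. (Caveat: the training MSE above was computed for the default KNN model, while the test MSE is for the grid-searched model, so the comparison is only approximate.)**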