In [54]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector

In [55]:
# Load the housing data and separate the target (`price`) from the predictors.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
housing = pd.read_csv("C:/Python/Datasets/Housing.csv")
X = housing.drop(columns='price')
y = housing['price']

# Split the predictors by dtype. `select_dtypes` is used rather than comparing
# `X.dtypes == object`: text columns are not guaranteed to be stored as
# `object` (pandas also has a dedicated string dtype), and in that case the
# equality test would silently put every text column into `num_cols`.
num_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()
# Everything that is not numeric/boolean is treated as categorical.
cat_cols = [col for col in X.columns if col not in num_cols]

In [56]:
# Show the predictor frame as this cell's rich output.
X

Unnamed: 0,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,5850,3,1,2,yes,no,yes,no,no,1,no
1,4000,2,1,1,yes,no,no,no,no,0,no
2,3060,3,1,1,yes,no,no,no,no,0,no
3,6650,3,1,2,yes,yes,no,no,no,0,no
4,6360,2,1,1,yes,no,no,no,no,0,no
...,...,...,...,...,...,...,...,...,...,...,...
541,4800,3,2,4,yes,yes,no,no,yes,0,no
542,6000,3,2,4,yes,no,no,no,yes,0,no
543,6000,3,2,4,yes,yes,no,no,yes,1,no
544,6000,3,2,2,yes,yes,no,no,yes,1,no


In [57]:
# One-hot encode the categorical columns (dropping the first level of each)
# and pass the remaining columns through untouched, keeping pandas output so
# the resulting frame retains readable column names.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform='pandas')
trns = make_column_transformer(
    (ohe, cat_cols),
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')
X_trns = trns.fit_transform(X)
# Inspect the column names produced by the transformer.
X_trns.columns

Index(['driveway_yes', 'recroom_yes', 'fullbase_yes', 'gashw_yes', 'airco_yes',
       'prefarea_yes', 'lotsize', 'bedrooms', 'bathrms', 'stories',
       'garagepl'],
      dtype='object')

In [58]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_trns,
    y,
    test_size=0.3,
    random_state=25,
)

In [59]:
# Baseline: KNN regression on the unscaled features, scored with R^2 on the
# held-out split for k = 1..10.
# NOTE(review): this sweep is repeated almost verbatim in the two scaler cells
# that follow; a shared helper would remove the duplication. k is also chosen
# on the test split, so the reported score is optimistic.
Ks = np.arange(1, 11)
scores = []
for k in Ks:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append([k, r2_score(y_test, y_pred)])
df_scores = pd.DataFrame(data=scores, columns=['k', 'score'])
# Top row after sorting by score, i.e. the best-scoring k.
df_scores.sort_values(by='score', ascending=False).iloc[0]

k        6.000000
score    0.332138
Name: 5, dtype: float64

In [60]:
# Same sweep with standardised features. The scaler is fitted on the training
# split only and then applied to the test split, so no test statistics leak
# into the scaling.
scaler = StandardScaler()
X_trn_scl = scaler.fit_transform(X_train)
X_tst_scl = scaler.transform(X_test)
Ks = list(range(1, 11))
scores = []
for k in Ks:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_trn_scl, y_train)
    y_pred = knn.predict(X_tst_scl)
    scores.append([k, r2_score(y_test, y_pred)])
df_scores = pd.DataFrame(data=scores, columns=['k', 'score'])
# Top row after sorting by score, i.e. the best-scoring k.
df_scores.sort_values(by='score', ascending=False).iloc[0]

k        4.000000
score    0.576697
Name: 3, dtype: float64

In [61]:
# Same sweep with min-max scaled features; the scaler is again fitted on the
# training split only and reused for the test split.
scaler = MinMaxScaler()
X_trn_scl = scaler.fit_transform(X_train)
X_tst_scl = scaler.transform(X_test)
Ks = list(range(1, 11))
scores = []
for k in Ks:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_trn_scl, y_train)
    y_pred = knn.predict(X_tst_scl)
    scores.append([k, r2_score(y_test, y_pred)])
df_scores = pd.DataFrame(data=scores, columns=['k', 'score'])
# Top row after sorting by score, i.e. the best-scoring k.
df_scores.sort_values(by='score', ascending=False).iloc[0]

k        6.000000
score    0.476084
Name: 5, dtype: float64

### Inferencing

Build the best model (StandardScaler + KNN with k = 4, the highest test R² above) on the whole dataset.

In [62]:
# Refit the selected configuration on every labelled row: standardise the
# encoded features (pandas output keeps the column names), then fit KNN.
# NOTE(review): n_neighbors=4 is hard-coded from the StandardScaler sweep
# result shown earlier in the notebook.
scaler = StandardScaler().set_output(transform='pandas')
X_scaled = scaler.fit_transform(X_trns)
knn = KNeighborsRegressor(n_neighbors=4)
knn.fit(X_scaled, y)

0,1,2
,n_neighbors,4
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


##### Unlabelled Data

In [63]:
# Load the unlabelled rows to score.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
tst = pd.read_csv("C:/Python/Datasets/tst_Housing.csv")
# Reuse the already-fitted column transformer (transform only, no refit) so the
# new rows get the same encoding as the training data.
tst_trns = trns.transform(tst)

In [64]:
# Scale the new rows with the scaler fitted on the full labelled set
# (transform only), then predict with the final KNN model.
tst_scl = scaler.transform(tst_trns)
knn.predict( tst_scl )

array([58000., 46625., 46625., 65600.])