# KNN Regression

In [66]:
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score

In [21]:
# Read the Boston housing data from the local Datasets folder and preview
# the first five rows.
boston = pd.read_csv("./Datasets/Boston.csv")
boston.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


without `scaling`

In [9]:
# Baseline: KNN regression on the raw (unscaled) Boston features.
y = boston['medv']
X = boston.drop(columns='medv')

# 70/30 hold-out split; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=24
)

knn = KNeighborsRegressor(n_neighbors=6)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# R^2 on the held-out rows (shown as the cell output).
r2_score(y_true=y_test, y_pred=y_pred)

0.4182866942582363

applying `MinMaxScaler`

In [12]:
# Min-max scale the features: the scaler is fitted on the training split
# only and then reused, unchanged, on the test split.
scaler = MinMaxScaler().set_output(transform='pandas')
X_trn_scl = scaler.fit_transform(X=X_train)
X_tst_scl = scaler.transform(X=X_test)

# Same 6-neighbour model as the unscaled baseline, now on scaled inputs.
knn = KNeighborsRegressor(n_neighbors=6)
knn.fit(X_trn_scl, y_train)

y_pred = knn.predict(X_tst_scl)
r2_score(y_true=y_test, y_pred=y_pred)

0.6995081041886959

applying `StandardScaler`

In [14]:
# Standardise the features: fit the scaler on the training split, then apply
# the same fitted transformation to the test split.
scaler = StandardScaler()
X_trn_scl = scaler.fit_transform(X_train)
X_tst_scl = scaler.transform(X_test)

# Same 6-neighbour model as before, now on standardised inputs.
knn = KNeighborsRegressor(n_neighbors=6)
knn.fit(X_trn_scl, y_train)

y_pred = knn.predict(X_tst_scl)
r2_score(y_true=y_test, y_pred=y_pred)

0.7530035956247788

## Loop On MinMaxScaler

In [23]:
# Sweep k = 1..10 for KNN regression on min-max scaled Boston features and
# rank the candidates by R^2 on the held-out split.
X = boston.drop('medv', axis=1)
y = boston['medv']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

# The scaling does not depend on k, so fit the scaler once (on the training
# split only) instead of re-creating and refitting it on every iteration.
scaler = MinMaxScaler().set_output(transform='pandas')
X_trn_scl = scaler.fit_transform(X_train)
X_tst_scl = scaler.transform(X_test)

n_values = np.arange(1,11)
accuracy = []  # NOTE: despite the name, this collects R^2 scores (one record per k)
for i in n_values:
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_trn_scl, y_train)
    y_pred = knn.predict(X_tst_scl)
    accuracy.append({'NN':i, 'r2_score': r2_score(y_test, y_pred)})

# Best-scoring neighbour counts first; display the top five.
boston_df = pd.DataFrame(data=accuracy)
boston_df = boston_df.sort_values(by='r2_score', ascending=False)
boston_df.head(5)

Unnamed: 0,NN,r2_score
1,2,0.808077
2,3,0.791823
0,1,0.764245
3,4,0.754193
4,5,0.714813


## Loop on Standard Scaler

In [24]:
# Sweep k = 1..10 for KNN regression on standardised Boston features and
# rank the candidates by R^2 on the held-out split.
X = boston.drop('medv', axis=1)
y = boston['medv']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

# The scaling does not depend on k, so fit the scaler once (on the training
# split only) instead of re-creating and refitting it on every iteration.
scaler = StandardScaler().set_output(transform='pandas')
X_trn_scl = scaler.fit_transform(X_train)
X_tst_scl = scaler.transform(X_test)

n_values = np.arange(1,11)
accuracy = []  # NOTE: despite the name, this collects R^2 scores (one record per k)
for i in n_values:
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_trn_scl, y_train)
    y_pred = knn.predict(X_tst_scl)
    accuracy.append({'NN':i, 'r2_score': r2_score(y_test, y_pred)})

# Best-scoring neighbour counts first; display the top five.
boston_df = pd.DataFrame(data=accuracy)
boston_df = boston_df.sort_values(by='r2_score', ascending=False)
boston_df.head(5)

Unnamed: 0,NN,r2_score
3,4,0.797957
1,2,0.781441
4,5,0.773264
2,3,0.769422
6,7,0.761917


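Conclusion: on this train/test split, the best `MinMaxScaler` result (k = 2, R² ≈ 0.808) is slightly better than the best `StandardScaler` result (k = 4, R² ≈ 0.798). This comes from a single split, so the difference should be confirmed (e.g. with cross-validation) before treating `MinMaxScaler` as better in general.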

# Concrete Strength Dataset

In [36]:
concrete = pd.read_csv("./Cases/Concrete Strength/Concrete_Data.csv")
concrete.head()

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


## MinMaxScaler

In [28]:
# Sweep k = 1..10 for KNN regression on min-max scaled concrete features and
# rank the candidates by R^2 on the held-out split.
X = concrete.drop('Strength', axis=1)
y = concrete['Strength']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

# The scaling does not depend on k, so fit the scaler once (on the training
# split only) instead of re-creating and refitting it on every iteration.
scaler = MinMaxScaler().set_output(transform='pandas')
X_trn_scl = scaler.fit_transform(X_train)
X_tst_scl = scaler.transform(X_test)

n_values = np.arange(1,11)
accuracy = []  # NOTE: despite the name, this collects R^2 scores (one record per k)
for i in n_values:
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_trn_scl, y_train)
    y_pred = knn.predict(X_tst_scl)
    accuracy.append({'NN':i, 'r2_score': r2_score(y_test, y_pred)})

# Best-scoring neighbour counts first; display the top five.
concrete_df = pd.DataFrame(data=accuracy)
concrete_df = concrete_df.sort_values(by='r2_score', ascending=False)
concrete_df.head(5)

Unnamed: 0,NN,r2_score
2,3,0.707034
3,4,0.695898
1,2,0.692737
9,10,0.684425
8,9,0.677937


## StandardScaler

In [29]:
# Sweep k = 1..10 for KNN regression on standardised concrete features and
# rank the candidates by R^2 on the held-out split.
X = concrete.drop('Strength', axis=1)
y = concrete['Strength']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

# The scaling does not depend on k, so fit the scaler once (on the training
# split only) instead of re-creating and refitting it on every iteration.
scaler = StandardScaler().set_output(transform='pandas')
X_trn_scl = scaler.fit_transform(X_train)
X_tst_scl = scaler.transform(X_test)

n_values = np.arange(1,11)
accuracy = []  # NOTE: despite the name, this collects R^2 scores (one record per k)
for i in n_values:
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_trn_scl, y_train)
    y_pred = knn.predict(X_tst_scl)
    accuracy.append({'NN':i, 'r2_score': r2_score(y_test, y_pred)})

# Best-scoring neighbour counts first; display the top five.
concrete_df = pd.DataFrame(data=accuracy)
concrete_df = concrete_df.sort_values(by='r2_score', ascending=False)
concrete_df.head(5)

Unnamed: 0,NN,r2_score
3,4,0.728835
2,3,0.71983
1,2,0.707105
4,5,0.701473
7,8,0.689431


# Inferencing

**Inferencing:** generating labels for unlabelled data with a trained predictive model is called `inferencing`.

In [41]:
# Use the full labelled concrete data (no hold-out split) for the final model:
# `Strength` is the target, every other column is a feature.
y = concrete['Strength']
X = concrete.drop(columns='Strength')

In [42]:
# Final concrete model: standardise all labelled rows, then fit a
# 4-neighbour KNN regressor on the scaled features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

knn = KNeighborsRegressor(n_neighbors=4)
knn.fit(X_scaled, y)


In [48]:
# Inference on the unlabelled concrete rows: apply the already-fitted scaler
# (transform only, no refit) and predict with the fitted KNN model.
tst = pd.read_csv("./Cases/Concrete Strength/testConcrete.csv")
tst_scl = scaler.transform(X=tst)
predictions = knn.predict(X=tst_scl)
predictions

array([55.945 , 43.8025, 31.2975, 51.44  , 50.69  , 36.5325, 52.3675,
       38.92  , 53.04  , 60.335 , 26.0875, 69.5   , 52.8075, 40.5675])

In [49]:
# Preview up to the first 30 rows of the unlabelled test frame.
tst.head(n=30)

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age
0,495,120,0,155,5,866,884,75
1,262,129,0,271,2,808,787,174
2,201,48,1,215,5,807,839,113
3,329,141,0,286,1,881,823,229
4,354,14,0,129,2,839,847,210
5,150,23,23,114,4,883,638,36
6,480,64,0,292,3,896,776,180
7,393,49,82,132,1,887,830,271
8,284,63,1,138,1,804,725,44
9,206,38,0,103,2,818,719,191


## Glass Dataset

On Training/ Labeled data:-
`imputer`  ->  `Scaler.fit_transform()`  ->  `Knn`  ->  `.fit()`

On Testing/ Unlabeled data:-
`imputer`  ->  `scaler.transform()`  ->  `knn`  ->  `.predict()`

**Pipeline**
`Input_1` -> `Process_1` -> `Output_1`  =>  `Input_2` -> `Process_2` -> `Output_2`  =>  `Input_3` -> `Process_3` -> `Output_3`

In [54]:
# Read the glass identification data and preview the first five rows.
glass = pd.read_csv("./Cases/Glass Identification/Glass.csv")
glass.head(n=5)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,building_windows_float_processed
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,building_windows_float_processed


In [57]:
# `Type` (the glass class label) is the target; every other column is a feature.
y = glass['Type']
X = glass.drop(columns='Type')

In [68]:
# Sweep k = 1..10 for KNN classification on min-max scaled glass features.
# The 70/30 split is stratified on the class label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=24, stratify=glass['Type']
)

# Fit the scaler once on the training split and reuse it for the test split.
scaler = MinMaxScaler().set_output(transform='pandas')
X_trn_scl = scaler.fit_transform(X_train)
X_tst_scl = scaler.transform(X_test)

n_values = np.arange(1,11)
accuracy = []
for i in n_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_trn_scl, y_train)
    y_pred = knn.predict(X_tst_scl)
    accuracy.append({'NN':i, 'accuracy': accuracy_score(y_test, y_pred)})

# Highest test accuracy first; display the top five.
glass_df = pd.DataFrame(accuracy).sort_values(by='accuracy', ascending=False)
glass_df.head(5)

Unnamed: 0,NN,accuracy
2,3,0.723077
0,1,0.661538
1,2,0.646154
3,4,0.615385
4,5,0.6


In [69]:
# Sweep k = 1..10 for KNN classification on standardised glass features.
# The 70/30 split is stratified on the class label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=24, stratify=glass['Type']
)

# Fit the scaler once on the training split and reuse it for the test split.
scaler = StandardScaler().set_output(transform='pandas')
X_trn_scl = scaler.fit_transform(X_train)
X_tst_scl = scaler.transform(X_test)

n_values = np.arange(1,11)
accuracy = []
for i in n_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_trn_scl, y_train)
    y_pred = knn.predict(X_tst_scl)
    accuracy.append({'NN':i, 'accuracy': accuracy_score(y_test, y_pred)})

# Highest test accuracy first; display the top five.
glass_df = pd.DataFrame(accuracy).sort_values(by='accuracy', ascending=False)
glass_df.head(5)

Unnamed: 0,NN,accuracy
2,3,0.676923
1,2,0.661538
0,1,0.646154
6,7,0.646154
3,4,0.630769


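Comparing both scalers on this split, `MinMaxScaler` gives the higher best accuracy (≈ 0.723 at k = 3, versus ≈ 0.677 for `StandardScaler`), so we go ahead with `MinMaxScaler` and `n_neighbors=3` for the final model.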

In [71]:
# Final glass model: min-max scale all labelled rows, then fit a
# 3-neighbour KNN classifier on the scaled features.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_scaled, y)

In [72]:
# Inference on the unlabelled glass rows: apply the already-fitted scaler
# (transform only, no refit) and predict class labels with the fitted KNN model.
tst = pd.read_csv("./Cases/Glass Identification/tst_Glass.csv")
tst_scl = scaler.transform(X=tst)
predictions = knn.predict(X=tst_scl)
predictions

array(['building_windows_non_float_processed',
       'building_windows_float_processed',
       'building_windows_non_float_processed', 'headlamps',
       'building_windows_float_processed', 'containers'], dtype=object)