## Regression with KNN

In [2]:
# Sample data: a small census table with height (tinggi), gender
# (jenis_kelamin) and weight (berat) for nine people.

import pandas as pd

sensus = {
    'tinggi': [158, 170, 183, 191, 155, 163, 180, 158, 176],
    # Four 'pria' rows followed by five 'wanita' rows
    'jenis_kelamin': ['pria'] * 4 + ['wanita'] * 5,
    'berat': [64, 86, 84, 80, 49, 59, 67, 54, 67],
}

# Dict keys become the column names; the trailing expression renders the frame
sensus_df = pd.DataFrame(data=sensus)
sensus_df

Unnamed: 0,tinggi,jenis_kelamin,berat
0,158,pria,64
1,170,pria,86
2,183,pria,84
3,191,pria,80
4,155,wanita,49
5,163,wanita,59
6,180,wanita,67
7,158,wanita,54
8,176,wanita,67


In [4]:
# Preprocessing: split the census frame into a feature matrix and a target vector
import numpy as np

# Features: height plus the (still textual) gender label
X_train = np.array(sensus_df.loc[:, ['tinggi','jenis_kelamin']])
# Target: weight
y_train = np.array(sensus_df.loc[:, 'berat'])

print(f'X_train:{X_train}')
print(f'y_train: {y_train}')

X_train:[[158 'pria']
 [170 'pria']
 [183 'pria']
 [191 'pria']
 [155 'wanita']
 [163 'wanita']
 [180 'wanita']
 [158 'wanita']
 [176 'wanita']]
y_train: [64 86 84 80 49 59 67 54 67]


In [5]:
# Swap axes so that each feature occupies one row: row 0 holds the heights,
# row 1 holds the gender labels.
X_train_transposed = X_train.T
print(f'X_train:{X_train}')
print(f'X_train_transposed:{X_train_transposed}')

X_train:[[158 'pria']
 [170 'pria']
 [183 'pria']
 [191 'pria']
 [155 'wanita']
 [163 'wanita']
 [180 'wanita']
 [158 'wanita']
 [176 'wanita']]
X_train_transposed:[[158 170 183 191 155 163 180 158 176]
 ['pria' 'pria' 'pria' 'pria' 'wanita' 'wanita' 'wanita' 'wanita'
  'wanita']]


In [6]:
from sklearn.preprocessing import LabelBinarizer

# Encode the textual gender labels (row 1 of the transposed matrix) as 0/1.
# Fitting and transforming are done as two explicit steps on the same data.
lb = LabelBinarizer()
lb.fit(X_train_transposed[1])
jk_binarized = lb.transform(X_train_transposed[1])

print(f'jenis_kelamin: {X_train_transposed[1]}')
print(f'jk_binarized: {jk_binarized}')

jenis_kelamin: ['pria' 'pria' 'pria' 'pria' 'wanita' 'wanita' 'wanita' 'wanita' 'wanita']
jk_binarized: [[0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]]


In [7]:
# Turn the column of encoded labels into a single row; -1 lets NumPy infer
# the row length from the number of elements.
jk_binarized = jk_binarized.reshape((1, -1))
jk_binarized

array([[0, 0, 0, 0, 1, 1, 1, 1, 1]])

In [8]:
# Flatten: collapse the 2-D single-row array into a 1-D vector.
# flatten() returns a new array (a copy), which is then rebound to the same name.
jk_binarized = jk_binarized.flatten()
jk_binarized

array([0, 0, 0, 0, 1, 1, 1, 1, 1])

In [9]:
# Overwrite the gender row of the transposed matrix with its 0/1 encoding.
# The assignment is in place, so the array keeps its existing dtype.
X_train_transposed[1, :] = jk_binarized
X_train_transposed

array([[158, 170, 183, 191, 155, 163, 180, 158, 176],
       [0, 0, 0, 0, 1, 1, 1, 1, 1]], dtype=object)

In [13]:
# Transpose back so that each row is one sample again: [height, encoded gender]
X_train = X_train_transposed.T

print(f'X_train_transposed:\n{X_train_transposed}')
print(f'X_train:\n{X_train}')

X_train_transposed:
[[158 170 183 191 155 163 180 158 176]
 [0 0 0 0 1 1 1 1 1]]
X_train:
[[158 0]
 [170 0]
 [183 0]
 [191 0]
 [155 1]
 [163 1]
 [180 1]
 [158 1]
 [176 1]]


In [15]:
# Train a KNN regression model on the encoded training data

from sklearn.neighbors import KNeighborsRegressor

# Number of nearest neighbours consulted when making a prediction
K = 3

model = KNeighborsRegressor(n_neighbors=K)
# fit() is left as the trailing expression so the fitted estimator is displayed
model.fit(X=X_train, y=y_train)

KNeighborsRegressor(n_neighbors=3)

In [16]:
# Predict: build a query sample for a person of height 155 with
# encoded gender 1 (the code assigned to 'wanita' by the binarizer above)
tinggi_b = 155
jk = 1

# The estimator expects a 2-D array of shape (n_samples, n_features),
# so the single sample is wrapped in an outer list.
X_new = np.array([[tinggi_b, jk]])
X_new

array([[155,   1]])

In [17]:
# Ask the fitted KNN regressor for the predicted weight of the query sample;
# the trailing expression displays the resulting array.
y_new = model.predict(X_new)
y_new

array([55.66666667])

In [25]:
# Model evaluation: held-out samples laid out as [height, encoded gender],
# with the matching true weights in y_test.

X_test = np.array([
    [168, 0],
    [180, 0],
    [160, 1],
    [169, 1],
])
y_test = np.array([65, 96, 52, 67])

In [26]:
# Show the test features and their true targets before predicting on them
print(f'X_test:\n{X_test}\n')
print(f'y_test: {y_test}')

X_test:
[[168   0]
 [180   0]
 [160   1]
 [169   1]]

y_test: [65 96 52 67]


In [27]:
# Predict a weight for every test sample with the fitted KNN regressor;
# the trailing expression displays the predictions.
y_pred = model.predict(X_test)
y_pred

array([70.66666667, 72.66666667, 59.        , 70.66666667])

In [28]:
# Evaluate with the coefficient of determination (R^2).
# The closer to 1 the better; a value of 0 or below indicates a poor fit.
from sklearn.metrics import r2_score

r_squared = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R-squared: {r_squared}')

R-squared: 0.3820116054158609


In [30]:
# Evaluate with the Mean Absolute Error: the average of |y_test - y_pred|,
# expressed in the same unit as the target.

from sklearn.metrics import mean_absolute_error

MAE = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Absolute Error: {MAE}')

Mean Absolute Error: 9.916666666666668


In [32]:
# Evaluate with the Mean Squared Error (also called Mean Squared Deviation):
# the average of the squared differences between y_test and y_pred.

from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {MSE}')

Mean Squared Error: 159.74999999999997


## SCALING PROBLEM

In [83]:
from scipy.spatial.distance import euclidean

# Heights expressed in millimetres; the second feature is a 0/1 flag
X_train = np.array([
    [1700, 0],
    [1600, 1],
])
X_new = np.array([[1640, 0]])

# Euclidean distance from the query sample to each training row
[euclidean(X_new[0], row) for row in X_train]

[60.0, 40.01249804748511]

In [84]:
# The same two training points and query, with height expressed in metres
X_train = np.array([
    [1.7, 0],
    [1.6, 1],
])
X_new = np.array([[1.64, 0]])

# Euclidean distance from the query sample to each training row
[euclidean(X_new[0], row) for row in X_train]

[0.06000000000000005, 1.0007996802557442]

In [87]:
# Standard scaler (z-score standardization): each feature is centred on its
# mean and divided by its standard deviation, both learned when the scaler is fit.

from sklearn.preprocessing import StandardScaler

# One scaler instance, shared (and refit) by the cells that follow
ss = StandardScaler()

In [95]:
# Training data with height in millimetres; fit_transform() (re)fits the
# scaler on this array and returns the standardized values.
X_train = np.array([[1700,0],[1600,1]])
X_train_scaled = ss.fit_transform(X_train)
print(f'X_train_scaled:\n{X_train_scaled}\n')

# Query sample, also in millimetres; transform() reuses the statistics
# learned from X_train instead of refitting.
X_new = np.array([[1640,0]])
X_new_scaled = ss.transform(X_new)
print(f'X_new_scaled:\n{X_new_scaled}')

# 'jarak' = distance: Euclidean distance from the scaled query to each scaled training row
print('\njarak:',[euclidean(X_new_scaled[0], d) for d in X_train_scaled])

X_train_scaled:
[[ 1. -1.]
 [-1.  1.]]

X_new_scaled:
[[-0.2 -1. ]]

jarak: [1.2, 2.1540659228538015]


In [98]:
# Training data with height in metres (the same two points as the millimetre
# example); fit_transform() refits the scaler on this array.
X_train = np.array([[1.7,0],[1.6,1]])
X_train_scaled = ss.fit_transform(X_train)
print(f'X_train_scaled:\n{X_train_scaled}\n')

# Query sample in metres; transform() reuses the statistics learned from X_train
X_new = np.array([[1.64,0]])
X_new_scaled = ss.transform(X_new)
print(f'X_new_scaled:\n{X_new_scaled}')

# 'jarak' = distance: Euclidean distance from the scaled query to each scaled training row
print('\njarak:',[euclidean(X_new_scaled[0], d) for d in X_train_scaled])

X_train_scaled:
[[ 1. -1.]
 [-1.  1.]]

X_new_scaled:
[[-0.2 -1. ]]

jarak: [1.2000000000000026, 2.1540659228538006]


In [109]:
# Feature scaling in KNN: rebuild the encoded census data so this section does
# not depend on the arrays overwritten by the unit-comparison cells.
# Each row is [height, encoded gender].
X_train = np.column_stack((
    [158, 170, 183, 191, 155, 163, 180, 158, 176],  # tinggi (height)
    [0, 0, 0, 0, 1, 1, 1, 1, 1],                    # jenis_kelamin, encoded 0/1
))
y_train = np.array([64, 86, 84, 80, 49, 59, 67, 54, 67])

X_test = np.array([
    [168, 0],
    [180, 0],
    [160, 1],
    [169, 1],
])
y_test = np.array([65, 96, 52, 67])

In [110]:
# Learn the per-feature mean and standard deviation from the training set,
# then standardize that same training set.
X_train_scaled = ss.fit(X_train).transform(X_train)
print(f'X_train_scaled:\n{X_train_scaled}\n')

X_train_scaled:
[[-1.03297125 -1.11803399]
 [-0.03689183 -1.11803399]
 [ 1.04219421 -1.11803399]
 [ 1.70624715 -1.11803399]
 [-1.2819911   0.89442719]
 [-0.61793816  0.89442719]
 [ 0.79317435  0.89442719]
 [-1.03297125  0.89442719]
 [ 0.46114788  0.89442719]]



In [111]:
# Scale the test set with transform() only, so it reuses the statistics the
# scaler learned from the training set rather than refitting on test data.
X_test_scaled = ss.transform(X_test)
print(f'X_test_scaled:\n{X_test_scaled}')

X_test_scaled:
[[-0.20290507 -1.11803399]
 [ 0.79317435 -1.11803399]
 [-0.86695801  0.89442719]
 [-0.11989845  0.89442719]]


In [115]:
# Refit the existing KNN regressor on the standardized features and predict
# the standardized test set; fit() returns the estimator, so the calls chain.
y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Report the same error metrics as for the unscaled model
MAE = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Absolute Error: {MAE}')

MSE = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {MSE}')

Mean Absolute Error: 7.583333333333336
Mean Squared Error: 85.13888888888893
