# K-Nearest Neighbors


## 1.환경준비

### (1) Import

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

### (2) data loading

In [2]:
# Load the Boston housing CSV from the DA4BAM GitHub repository into a DataFrame.
# NOTE: this reads from a remote URL, so the cell needs network access.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/boston.csv'
data = pd.read_csv(path)

|	변수	|	설명	|
|	----	|	----	|
|	**medv**	|	**타운별 집값(중위수), target**	|
|	crim	|	범죄율	|
|	zn	|	25,000 평방피트를 초과한 거주지역 비율	|
|	indus	|	비소매상업지역 면적 비율, 편의시설(관공서, 주요 시설)	|
|	chas	|	찰스강변 위치(범주 : 강변1, 아니면 0)	|
|	nox	|	일산화질소 농도	|
|	rm	|	주택당 방 수	|
|	age	|	1940년 이전에 건축된 주택의 비율	|
|	dis	|	주요 직업센터까지의 (가중) 거리	|
|	rad	|	방사형 고속도로 접근성 지수	|
|	tax	|	재산세율	|
|	ptratio	|	학생/교사 비율	|
|	black	|	인구 중 흑인 비율	|
|	lstat	|	인구 중 하위 계층 비율	|


## 2.데이터 이해

### (1) 둘러보기

In [3]:
# Preview the first few rows of the dataset (use data.tail() to see the last rows).
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# Inspect each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  black    506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


## 3.데이터 준비

### (1) 데이터 정리

In [5]:
# Re-display the first rows to check the columns before cleaning up the data.
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [6]:
# Remove the 'black' column so it is not used as a model feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
data = data.drop(columns=['black'], errors='ignore')

### (2) 데이터분할1 : x, y 나누기

In [7]:
# Separate the prediction target (y) from the feature columns (x).
target = 'medv'
y = data[target]
x = data.drop(columns=[target])

### (3) NA 조치

### (4) 가변수화

### (5) 데이터분할2 : train : validation 나누기

In [8]:
# Hold out 30% of the rows for validation; the fixed seed makes the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2022,
)

### (6) Scaling
KNN은 거리 기반 알고리즘이므로, 변수 간 스케일 차이가 거리 계산을 왜곡하지 않도록 스케일링을 해야 합니다.

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization: learn each feature's min/max from the training split only,
# then apply that same transformation to both the training and validation data.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s1 = scaler.transform(x_train)
x_val_s1 = scaler.transform(x_val)

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardization: learn each feature's mean/std from the training split only,
# then apply that same transformation to both the training and validation data.
scaler2 = StandardScaler()
scaler2.fit(x_train)
x_train_s2 = scaler2.transform(x_train)
x_val_s2 = scaler2.transform(x_val)

In [11]:
# Wrap the standardized arrays back into DataFrames, reusing the original column
# names and row index so they stay aligned with y_train / y_val.
# x_val_s2 is converted as well: fitting a model on a DataFrame and predicting on
# a bare ndarray makes scikit-learn warn about missing feature names.
# x_train is already a DataFrame (train_test_split preserves the input type),
# so it does not need to be re-wrapped.
x_train_s2 = pd.DataFrame(x_train_s2, columns=x_train.columns, index=x_train.index)
x_val_s2 = pd.DataFrame(x_val_s2, columns=x_val.columns, index=x_val.index)

In [12]:
# Summary statistics of the full, unscaled feature set.
x.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,37.97


In [13]:
# Summary statistics of the unscaled training split, for comparison with the scaled data.
x_train.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
count,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0
mean,3.204705,11.478814,10.756073,0.070621,0.54853,6.309133,67.485311,3.900144,9.039548,397.79096,18.373164,12.129209
std,8.554879,22.969625,6.894126,0.256554,0.115102,0.689033,27.894307,2.113877,8.41344,163.905474,2.191546,6.739158
min,0.00632,0.0,0.46,0.0,0.389,3.863,6.0,1.1296,1.0,188.0,12.6,1.73
25%,0.071848,0.0,4.935,0.0,0.448,5.8895,42.95,2.16825,4.0,277.0,17.0,6.735
50%,0.219655,0.0,8.35,0.0,0.524,6.209,74.85,3.3618,5.0,329.5,18.85,10.925
75%,2.58828,20.0,18.1,0.0,0.614,6.61875,93.875,5.226975,8.0,437.0,20.2,15.82
max,88.9762,100.0,27.74,1.0,0.871,8.725,100.0,10.7103,24.0,711.0,22.0,37.97


In [14]:
# Summary statistics after standardization: every feature should show mean ~0 and std ~1.
x_train_s2.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
count,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0
mean,-1.3799380000000001e-17,7.02514e-17,8.969598e-17,-1.3328950000000002e-17,-3.048409e-16,-6.443371e-16,-1.79392e-16,6.711518000000001e-17,-1.1290400000000001e-17,-5.0179570000000006e-17,3.070363e-16,2.195356e-16
std,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415
min,-0.374396,-0.5004462,-1.49557,-0.2756589,-1.387952,-3.55512,-2.207344,-1.312501,-0.9569127,-1.281763,-2.638016,-1.545286
25%,-0.3667255,-0.5004462,-0.8455478,-0.2756589,-0.8746373,-0.6098789,-0.8808263,-0.8204568,-0.5998357,-0.7379981,-0.6274599,-0.8015607
50%,-0.3494235,-0.5004462,-0.3494974,-0.2756589,-0.2134189,-0.1455293,0.2643949,-0.2550316,-0.48081,-0.4172382,0.2178877,-0.1789413
75%,-0.07215742,0.3715014,1.066752,-0.2756589,0.5696029,0.4499863,0.9473991,0.6285651,-0.1237329,0.239556,0.834763,0.5484387
max,10.04022,3.859292,2.467023,3.627671,2.805565,3.511133,1.167289,3.226202,1.780678,1.913617,1.657263,3.839852


## 4.모델링 : KNN

### (1) import

In [15]:
# Model: k-nearest-neighbors regressor
from sklearn.neighbors import KNeighborsRegressor    

# Regression evaluation metrics
# NOTE(review): the wildcard import brings every public name of sklearn.metrics into
# the notebook namespace; the cells shown here only use mean_squared_error,
# mean_absolute_error and mean_absolute_percentage_error.
from sklearn.metrics import * 

### (2) 모델선언

In [21]:
# Declare a KNN regressor with scikit-learn's default hyperparameters.
model = KNeighborsRegressor()

### (3) 모델링(학습)

In [22]:
# Fit the model on the min-max scaled training features.
model.fit(x_train_s1, y_train)

KNeighborsRegressor()

### (4) 검증 : 예측

In [23]:
# Predict the validation targets from the min-max scaled validation features.
pred_s1 = model.predict(x_val_s1)

### (5) 검증 : 평가

In [24]:
# RMSE (root mean squared error)
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly; this works on every scikit-learn version.
float(np.sqrt(mean_squared_error(y_val, pred_s1)))

4.227564933050456

In [25]:
# MAE: mean absolute error
mean_absolute_error(y_val, pred_s1)

3.035

In [26]:
# MAPE: mean absolute percentage error (returned as a fraction, not in percent)
mean_absolute_percentage_error(y_val, pred_s1)

0.16339021011462934

In [27]:
# 1 - MAPE: used here as an informal "accuracy" figure for the regression model
1 - mean_absolute_percentage_error(y_val, pred_s1)

0.8366097898853706

## 5.Hyper Parameter

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

* n_neighbors : k 의 개수(참조할 이웃의 수). k가 달라지면 예측결과도 달라지고, 성능도 달라집니다!
* metric : 거리계산 방식.
    * euclidean : 유클리디안 거리 : sqrt(a^2 + b^2)  (a, b는 각 변수에서 두 점 사이의 차이)
    * manhattan : 맨하탄 거리 : |a| + |b|

In [28]:
# model1: k = 10 neighbors, Euclidean distance
model1 = KNeighborsRegressor(metric='euclidean', n_neighbors=10)
# fit() returns the fitted estimator itself, so training and prediction can be chained.
pred1 = model1.fit(x_train_s1, y_train).predict(x_val_s1)

In [29]:
# model2: k = 10 neighbors, Manhattan distance
model2 = KNeighborsRegressor(metric='manhattan', n_neighbors=10)
# fit() returns the fitted estimator itself, so training and prediction can be chained.
pred2 = model2.fit(x_train_s1, y_train).predict(x_val_s1)

In [30]:
# Compare RMSE between the two distance metrics.
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# square root of the MSE is taken explicitly to stay compatible with all versions.
print(f'metric = euclidean : rmse {np.sqrt(mean_squared_error(y_val, pred1))}')
print(f'metric = manhattan : rmse {np.sqrt(mean_squared_error(y_val, pred2))}')

metric = euclidean : rmse 4.841955074901916
metric = manhattan : rmse 4.420933680260671


In [31]:
# Compare MAE (mean absolute error) between the two distance metrics.
print(f'metric = euclidean : mae {mean_absolute_error(y_val, pred1)}')
print(f'metric = manhattan : mae {mean_absolute_error(y_val, pred2)}')

metric = euclidean : mae 3.462763157894737
metric = manhattan : mae 3.2408552631578953


In [32]:
# Compare MAPE (mean absolute percentage error, as a fraction) between the two distance metrics.
print(f'metric = euclidean : mape {mean_absolute_percentage_error(y_val, pred1)}')
print(f'metric = manhattan : mape {mean_absolute_percentage_error(y_val, pred2)}')

metric = euclidean : mape 0.18505949742155064
metric = manhattan : mape 0.17819962243785456


## 6.연습문제
* 다음의 조건을 조정하며 모델을 생성하고 성능을 비교해 봅시다.
* 조건
    * 스케일링 데이터 : 하이퍼파라미터는 default로 두고, 스케일링 데이터만 달리하며 비교해 봅시다.
        * 정규화 : x_train_s1, x_val_s1
        * 표준화 : x_train_s2, x_val_s2
    * k : 
        * k 값을 1에서 50까지 1씩 증가시켜가며 
        * 성능 rmse, mae, mape를 구하고 최적의 k 값을 찾아 봅시다.

### (1) 스케일링 데이터 비교.

In [None]:
# 정규화




In [None]:
# 표준화




### (2) k 값