# 20기 KNN 정규세션 과제

**데이터:** [blackfriday | Kaggle](https://www.kaggle.com/llopesolivei/blackfriday)

---

## 0. 데이터 불러오기

In [1]:
import pandas as pd
df = pd.read_csv("blackfriday.csv", index_col = 0)
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1001088,P00046042,F,0-17,10,A,3,0,5,17.0,,2010
1,1004493,P00347742,F,0-17,10,A,1,0,7,,,4483
2,1005302,P00048942,F,0-17,10,A,1,0,1,4.0,,7696
3,1001348,P00145242,F,0-17,10,A,3,0,2,4.0,,16429
4,1001348,P00106742,F,0-17,10,A,3,0,3,5.0,,5780


## 1. Preprocssing / EDA

In [3]:
df.info()
# null값은 없음. 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4998 entries, 0 to 4997
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     4998 non-null   int64  
 1   Product_ID                  4998 non-null   object 
 2   Gender                      4998 non-null   object 
 3   Age                         4998 non-null   object 
 4   Occupation                  4998 non-null   int64  
 5   City_Category               4998 non-null   object 
 6   Stay_In_Current_City_Years  4998 non-null   object 
 7   Marital_Status              4998 non-null   int64  
 8   Product_Category_1          4998 non-null   int64  
 9   Product_Category_2          3465 non-null   float64
 10  Product_Category_3          1544 non-null   float64
 11  Purchase                    4998 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 507.6+ KB


In [4]:
df.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,4998.0,4998.0,4998.0,4998.0,3465.0,1544.0,4998.0
mean,1003015.0,8.036815,0.421369,5.320128,9.773737,12.766839,9234.655462
std,1709.91,6.442697,0.493828,3.912281,5.101867,4.023591,4999.225081
min,1000003.0,0.0,0.0,1.0,2.0,3.0,13.0
25%,1001560.0,2.0,0.0,1.0,5.0,9.0,5497.75
50%,1003040.0,7.0,0.0,5.0,9.0,14.0,8049.0
75%,1004417.0,14.0,1.0,8.0,15.0,16.0,12039.0
max,1006037.0,20.0,1.0,20.0,18.0,18.0,23913.0


In [5]:
# 결측치 확인
df.isnull().sum()

User_ID                          0
Product_ID                       0
Gender                           0
Age                              0
Occupation                       0
City_Category                    0
Stay_In_Current_City_Years       0
Marital_Status                   0
Product_Category_1               0
Product_Category_2            1533
Product_Category_3            3454
Purchase                         0
dtype: int64

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

In [18]:
# product_category_2 = 최빈값으로 대체
a = df["Product_Category_2"].mode()[0]
df.Product_Category_2 = df.Product_Category_2.fillna(a)

In [10]:
# product_category_3는 결측값이 전체 데이터 대비 많으므로 제거 
df.drop(['Product_Category_3'], axis = 1, inplace = True)

In [11]:
# 분석과 관련없는 id변수 삭제
df.drop(['User_ID','Product_ID'], axis = 1, inplace = True)

In [22]:
df = pd.get_dummies(df,drop_first = True)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4998 entries, 0 to 4997
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Occupation                     4998 non-null   int64  
 1   Marital_Status                 4998 non-null   int64  
 2   Product_Category_1             4998 non-null   int64  
 3   Product_Category_2             4998 non-null   float64
 4   Purchase                       4998 non-null   int64  
 5   Gender_M                       4998 non-null   uint8  
 6   Age_18-25                      4998 non-null   uint8  
 7   Age_26-35                      4998 non-null   uint8  
 8   Age_36-45                      4998 non-null   uint8  
 9   Age_46-50                      4998 non-null   uint8  
 10  Age_51-55                      4998 non-null   uint8  
 11  Age_55+                        4998 non-null   uint8  
 12  City_Category_B                4998 non-null   u

In [24]:
from sklearn.model_selection import train_test_split

X = df.drop(['Purchase'],axis = 1)
y = df['Purchase']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [25]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train) 

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 2. KNN 구현 & 파라미터 튜닝

In [30]:
# knn 구현 
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors = 3, weights = "distance")
regressor.fit(X_train , y_train)

y_pred = regressor.predict(X_test)

In [31]:
# rmse사용
from math import sqrt
from sklearn.metrics import mean_squared_error 

sqrt(mean_squared_error(y_test,y_pred))

5083.166403171491

In [36]:
# 파라미터 튜닝
from sklearn.model_selection import GridSearchCV

param = {"n_neighbors": [3, 5, 7, 9, 11, 13, 15, 17],
            "weights":['uniform', 'distance'],
            'metric' : ['euclidean', 'manhattan', 'minkowski']}

gscv = GridSearchCV(estimator = regressor, param_grid=param_knn, 
                        scoring='neg_root_mean_squared_error', cv=3, n_jobs=-1)
gscv.fit(X_train, y_train)

# 가장 최적의 수치와 최적의 parameter확인
print("best RMSE : {0: .2f}".format(gscv.best_score_))
print("best param : ",gscv.best_params_)

best RMSE : -4563.59
best param :  {'metric': 'manhattan', 'n_neighbors': 17, 'weights': 'distance'}


## 3. Evaluation

In [37]:
# 최종
regressor = KNeighborsRegressor(n_neighbors = 17, weights = "distance", metric="manhattan")
regressor.fit(X_train , y_train)

y_pred = regressor.predict(X_test)
sqrt(mean_squared_error(y_test,y_pred))

4640.876303507092