# Regression 종합실습 : Car seat sales
유아용 카시트 매출액을 예측해 봅시다.

* 카시트에 대해서 지역 매장 별 매출액을 예측하고자 합니다.

![](https://cdn.images.express.co.uk/img/dynamic/24/590x/child-car-seat-986556.jpg?r=1532946857754)

## 1.환경준비

### (1) Import

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings  # 경고메시지 제외

warnings.filterwarnings(action='ignore')

### (2) Data Loading

In [10]:
data_path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
data = pd.read_csv(data_path)

**변수설명**
> * Sales - 각 지역 판매량(단위 : 1000개) <== Target
* CompPrice - 각 지역 경쟁사 가격
* Income - 각 지역 평균 소득수준(단위 : 1000달러)
* Advertising - 각 지역, 회사의 광고 예산(단위 : 1000달러)
* Population - 지역 인구수(단위 : 1000명)
* Price - 자사 지역별 판매가격
* ShelveLoc - 진열상태
* Age - 지역 인구의 평균 연령
* Education - 각 지역 교육수준 레벨
* Urban - 매장 도시 지역 여부
* US - 매장이 미국에 있는지 여부

## 2.데이터 이해

* 둘러보기

In [11]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [108]:
result = pd.DataFrame(columns=['model_desc', 'RMSE', 'MAE', 'MAPE'])
result

Unnamed: 0,model_desc,RMSE,MAE,MAPE


## 3.데이터 준비

### (1) 데이터 정리

In [95]:
target = 'Sales'

### (2) 데이터분할1 : x, y 나누기

In [96]:
x = data.drop(target, axis=1)
y = data.loc[:, target]

### (3) NA 조치

In [97]:
x.isna().sum()

CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

### (4) 가변수화

In [98]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CompPrice    400 non-null    int64 
 1   Income       400 non-null    int64 
 2   Advertising  400 non-null    int64 
 3   Population   400 non-null    int64 
 4   Price        400 non-null    int64 
 5   ShelveLoc    400 non-null    object
 6   Age          400 non-null    int64 
 7   Education    400 non-null    int64 
 8   Urban        400 non-null    object
 9   US           400 non-null    object
dtypes: int64(7), object(3)
memory usage: 31.4+ KB


In [99]:
x = pd.get_dummies(x, columns=['ShelveLoc', 'Urban', 'US'], drop_first=True)
x

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Good,ShelveLoc_Medium,Urban_Yes,US_Yes
0,138,73,11,276,120,42,17,0,0,1,1
1,111,48,16,260,83,65,10,1,0,1,1
2,113,35,10,269,80,59,12,0,1,1,1
3,117,100,4,466,97,55,14,0,1,1,1
4,141,64,3,340,128,38,13,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,138,108,17,203,128,33,14,1,0,1,1
396,139,23,3,37,120,55,11,0,1,0,1
397,162,26,12,368,159,40,18,0,1,1,1
398,100,79,7,284,95,50,12,0,0,1,1


### (5) 데이터분할2 : train : validation 나누기

In [100]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=.3, random_state=2022)

### (6) Scaling
KNN 알고리즘을 적용하기 위해서는 스케일링을 해야 합니다.

In [101]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

x_train_mm = MinMaxScaler().fit_transform(x_train)
x_val_mm = MinMaxScaler().fit_transform(x_val)

x_train_std = StandardScaler().fit_transform(x_train)
x_val_std = StandardScaler().fit_transform(x_val)

## 4.모델링 : 선형회귀

* 변수를 조절하며 최소 2개 이상의 모델을 생성하고 예측하고 평가해 봅시다.

In [102]:
from sklearn.linear_model import LinearRegression

linear_model1 = LinearRegression()
linear_model2 = LinearRegression()

* 모델1

In [103]:
linear_model1.fit(x_train.loc[:, ['Price', 'Age']], y_train)
linear_model1.coef_, linear_model1.intercept_

(array([-0.0497863 , -0.04566617]), 15.792236203712687)

In [104]:
pred = linear_model1.predict(x_val.loc[:, ['Price', 'Age']])

In [105]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [147]:
print(f'RMSE: {mean_squared_error(y_val, pred, squared=False)}')
print(f'MAE: {mean_absolute_error(y_val, pred)}')
print(f'MAPE: {mean_absolute_percentage_error(y_val, pred)}')
print(f'정확도: {1 - mean_absolute_percentage_error(y_val, pred)}')

# pd.DataFrame(data=['linear_1', mean_squared_error(y_val, pred, squared=False), mean_absolute_error(y_val, pred), mean_absolute_percentage_error(y_val, pred)], columns=['model_desc', 'RMSE', 'MAE', 'MAPE'])
result = pd.DataFrame(data={'model_desc': 'linear_1',
                       'RMSE': mean_squared_error(y_val, pred, squared=False),
                       'MAE': mean_absolute_error(y_val, pred),
                       'MAPE': mean_absolute_percentage_error(y_val, pred),
                       }, index=[0])

RMSE: 2.4263766319620563
MAE: 1.9553406487514418
MAPE: 0.6900786058635068
정확도: 0.3099213941364932


In [148]:
result

Unnamed: 0,model_desc,RMSE,MAE,MAPE
0,linear_1,2.426377,1.955341,0.690079


* 모델2

In [67]:
linear_model2.fit(x_train.loc[:, ['CompPrice', 'Advertising']], y_train)
linear_model2.coef_, linear_model2.intercept_

(array([0.01217338, 0.12446636]), 5.314485348699607)

In [68]:
pred = linear_model2.predict(x_val.loc[:, ['CompPrice', 'Advertising']])

In [69]:
print(f'RMSE: {mean_squared_error(y_val, pred, squared=False)}')
print(f'MAE: {mean_absolute_error(y_val, pred)}')
print(f'MAPE: {mean_absolute_percentage_error(y_val, pred)}')
print(f'정확도: {1 - mean_absolute_percentage_error(y_val, pred)}')
result.append(['linear_2', mean_squared_error(y_val, pred, squared=False), mean_absolute_error(y_val, pred),
               mean_absolute_percentage_error(y_val, pred)])

RMSE: 2.9228326675965715
MAE: 2.3760174445177022
MAPE: 0.8205617379566562
정확도: 0.1794382620433438


## 5.모델링 : KNN

* 하이퍼파라미터를 조절하며 모델을 최소 3가지 이상 생성하시오.

* 모델3

In [70]:
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor()

In [71]:
model.fit(x_train_mm, y_train)
pred = model.predict(x_val_mm)

In [72]:
print(f'RMSE: {mean_squared_error(y_val, pred, squared=False)}')
print(f'MAE: {mean_absolute_error(y_val, pred)}')
print(f'MAPE: {mean_absolute_percentage_error(y_val, pred)}')
print(f'정확도: {1 - mean_absolute_percentage_error(y_val, pred)}')
result.append(['knn_1_mm', mean_squared_error(y_val, pred, squared=False), mean_absolute_error(y_val, pred),
               mean_absolute_percentage_error(y_val, pred)])

RMSE: 2.4402738849017203
MAE: 2.0449166666666665
MAPE: 0.6981789974010878
정확도: 0.3018210025989122


In [73]:
model.fit(x_train_std, y_train)
pred = model.predict(x_val_std)

In [74]:
print(f'RMSE: {mean_squared_error(y_val, pred, squared=False)}')
print(f'MAE: {mean_absolute_error(y_val, pred)}')
print(f'MAPE: {mean_absolute_percentage_error(y_val, pred)}')
print(f'정확도: {1 - mean_absolute_percentage_error(y_val, pred)}')
result.append(['knn_1_std', mean_squared_error(y_val, pred, squared=False), mean_absolute_error(y_val, pred),
               mean_absolute_percentage_error(y_val, pred)])

RMSE: 2.1716287742920826
MAE: 1.7692333333333332
MAPE: 0.6745314074828028
정확도: 0.3254685925171972


* 모델4

In [75]:
model = KNeighborsRegressor(n_neighbors=15)

In [76]:
model.fit(x_train_mm, y_train)
pred = model.predict(x_val_mm)

In [77]:
print(f'RMSE: {mean_squared_error(y_val, pred, squared=False)}')
print(f'MAE: {mean_absolute_error(y_val, pred)}')
print(f'MAPE: {mean_absolute_percentage_error(y_val, pred)}')
print(f'정확도: {1 - mean_absolute_percentage_error(y_val, pred)}')
result.append(['knn_2_mm', mean_squared_error(y_val, pred, squared=False), mean_absolute_error(y_val, pred),
               mean_absolute_percentage_error(y_val, pred)])

RMSE: 2.343396474572119
MAE: 1.9071333333333333
MAPE: 0.7180356811919384
정확도: 0.2819643188080616


In [78]:
model.fit(x_train_std, y_train)
pred = model.predict(x_val_std)

In [79]:
print(f'RMSE: {mean_squared_error(y_val, pred, squared=False)}')
print(f'MAE: {mean_absolute_error(y_val, pred)}')
print(f'MAPE: {mean_absolute_percentage_error(y_val, pred)}')
print(f'정확도: {1 - mean_absolute_percentage_error(y_val, pred)}')
result.append(['knn_2_std', mean_squared_error(y_val, pred, squared=False), mean_absolute_error(y_val, pred),
               mean_absolute_percentage_error(y_val, pred)])

RMSE: 2.210137762590746
MAE: 1.7941055555555556
MAPE: 0.6252516995072744
정확도: 0.37474830049272556


* 모델5

## 6.성능비교