# Regression 종합실습 : Car seat sales
유아용 카시트 매출액을 예측해 봅시다.

* 카시트에 대해서 지역 매장 별 매출액을 예측하고자 합니다.

![](https://cdn.images.express.co.uk/img/dynamic/24/590x/child-car-seat-986556.jpg?r=1532946857754)

## 1.환경준비

### (1) Import

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings    # 경고메시지 제외
warnings.filterwarnings(action='ignore')

### (2) Data Loading

In [2]:
data_path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
data = pd.read_csv(data_path)

**변수설명**
> * Sales - 각 지역 판매량(단위 : 1000개) <== Target
* CompPrice - 각 지역 경쟁사 가격
* Income - 각 지역 평균 소득수준(단위 : 1000달러)
* Advertising - 각 지역, 회사의 광고 예산(단위 : 1000달러)
* Population - 지역 인구수(단위 : 1000명)
* Price - 자사 지역별 판매가격
* ShelveLoc - 진열상태
* Age - 지역 인구의 평균 연령
* Education - 각 지역 교육수준 레벨
* Urban - 매장 도시 지역 여부
* US - 매장이 미국에 있는지 여부

## 2.데이터 이해

* 둘러보기

In [3]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


## 3.데이터 준비

### (1) 데이터 정리

### (2) 데이터분할1 : x, y 나누기

In [7]:
target = 'Sales'
x = data.drop(target,axis=1)
y = data.loc[:,target]

In [8]:
x.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,138,73,11,276,120,Bad,42,17,Yes,Yes
1,111,48,16,260,83,Good,65,10,Yes,Yes
2,113,35,10,269,80,Medium,59,12,Yes,Yes
3,117,100,4,466,97,Medium,55,14,Yes,Yes
4,141,64,3,340,128,Bad,38,13,Yes,No


In [9]:
y.tail()

395    12.57
396     6.14
397     7.41
398     5.94
399     9.71
Name: Sales, dtype: float64

### (3) NA 조치

In [11]:
x.isna().sum()

CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

### (4) 가변수화

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [13]:
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [18]:
col_dum = ['ShelveLoc','Urban','US']

In [16]:
encoder.fit(x[col_dum])

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [17]:
encoder.categories_

[array(['Bad', 'Good', 'Medium'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object)]

In [20]:
encoder_cols = list(encoder.get_feature_names(col_dum))
encoder_cols

['ShelveLoc_Bad',
 'ShelveLoc_Good',
 'ShelveLoc_Medium',
 'Urban_No',
 'Urban_Yes',
 'US_No',
 'US_Yes']

In [22]:
x[encoder_cols] = encoder.transform(x[col_dum])

In [24]:
x[encoder_cols].tail()

Unnamed: 0,ShelveLoc_Bad,ShelveLoc_Good,ShelveLoc_Medium,Urban_No,Urban_Yes,US_No,US_Yes
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [26]:
X = x.drop(['ShelveLoc','Urban','US'],axis=1)

In [10]:
ShelveLoc
Urban
US

CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

### (5) 데이터분할2 : train : validation 나누기

In [29]:
X_val, X_train, y_val, y_train = train_test_split(X,y,test_size = .3, random_state = 42)

### (6) Scaling
KNN 알고리즘을 적용하기 위해서는 스케일링을 해야 합니다.

In [30]:
#normalization
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train_s1 = scaler.fit_transform(X_train)
x_val_s1 = scaler.transform(X_val)

In [32]:
#standarscale
from sklearn.preprocessing import StandardScaler
scaler2 = StandardScaler()
x_train_s2 = scaler2.fit_transform(X_train)
x_val_s2 = scaler2.transform(X_val)

In [33]:
#스케일링 데이터를 데이터 프레임 형식으로 만들기
x_train_s1 = pd.DataFrame(x_train_s1, columns = list(X))
x_train_s2 = pd.DataFrame(x_train_s2, columns = list(X))

In [34]:
X.describe()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Bad,ShelveLoc_Good,ShelveLoc_Medium,Urban_No,Urban_Yes,US_No,US_Yes
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9,0.24,0.2125,0.5475,0.295,0.705,0.355,0.645
std,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528,0.427618,0.409589,0.498362,0.456614,0.456614,0.479113,0.479113
min,77.0,21.0,0.0,10.0,24.0,25.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,115.0,42.75,0.0,139.0,100.0,39.75,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,125.0,69.0,5.0,272.0,117.0,54.5,14.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
75%,135.0,91.0,12.0,398.5,131.0,66.0,16.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
max,175.0,120.0,29.0,509.0,191.0,80.0,18.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
x_train_s1.describe()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Bad,ShelveLoc_Good,ShelveLoc_Medium,Urban_No,Urban_Yes,US_No,US_Yes
count,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0
mean,0.638487,0.517424,0.283013,0.504721,0.664032,0.523636,0.504167,0.233333,0.241667,0.525,0.258333,0.741667,0.3,0.7
std,0.188643,0.270143,0.270534,0.310291,0.171683,0.283927,0.326899,0.424726,0.429888,0.501468,0.439554,0.439554,0.460179,0.460179
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.536184,0.348485,0.0,0.238911,0.549632,0.309091,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.664474,0.550505,0.269231,0.508065,0.691176,0.545455,0.5,0.0,0.0,1.0,0.0,1.0,0.0,1.0
75%,0.766447,0.719697,0.5,0.800403,0.773897,0.745455,0.75,0.0,0.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [37]:
x_train_s2.describe()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Bad,ShelveLoc_Good,ShelveLoc_Medium,Urban_No,Urban_Yes,US_No,US_Yes
count,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0
mean,-4.112451e-16,2.444804e-16,-4.093947e-17,2.201942e-16,-1.632953e-16,1.674586e-16,4.6259290000000003e-17,4.2558550000000004e-17,9.251859e-17,-1.128727e-16,4.4408920000000007e-17,-7.586524000000001e-17,-7.586524000000001e-17,1.776357e-16
std,1.004193,1.004193,1.004193,1.004193,1.004193,1.004193,1.004193,1.004193,1.004193,1.004193,1.004193,1.004193,1.004193,1.004193
min,-3.398823,-1.923401,-1.050513,-1.633428,-3.883987,-1.851997,-1.548739,-0.5516773,-0.5645187,-1.051315,-0.5901818,-1.694393,-0.6546537,-1.527525
25%,-0.5445822,-0.6279917,-1.050513,-0.8602399,-0.6691339,-0.7588045,-1.164754,-0.5516773,-0.5645187,-1.051315,-0.5901818,-1.694393,-0.6546537,-1.527525
50%,0.1383344,0.12297,-0.05115746,0.01082029,0.1587715,0.07716656,-0.01279949,-0.5516773,-0.5645187,0.9511897,-0.5901818,0.5901818,-0.6546537,0.6546537
75%,0.6811655,0.7519004,0.8054326,0.9569156,0.6426123,0.7845267,0.75517,-0.5516773,-0.5645187,0.9511897,1.694393,0.5901818,1.527525,0.6546537
max,1.924424,1.79386,2.661378,1.60287,1.96511,1.684803,1.52314,1.812654,1.771421,0.9511897,1.694393,0.5901818,1.527525,0.6546537


## 4.모델링 : 선형회귀

* 변수를 조절하며 최소 2개 이상의 모델을 생성하고 예측하고 평가해 봅시다.

* 모델1

* 모델2

## 5.모델링 : KNN

* 하이퍼파라미터를 조절하며 모델을 최소 3가지 이상 생성하시오.

* 모델3

* 모델4

* 모델5

## 6.성능비교