In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.datasets import load_boston, load_iris
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.linear_model import Lasso, ElasticNet,SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor # 딥러닝 neural_network
from sklearn.metrics import r2_score# 선형 모델(Linear Models)

import mglearn
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family']='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False

import warnings
warnings.simplefilter('ignore')

## 데이터 불러오기

In [2]:
train_df = pd.read_excel('../data/hyundaiCar.xlsx', sheet_name= 'train')
train_df.head()

Unnamed: 0,가격,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,1885,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2190,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,1135,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동
3,1645,2014,소형,14.0,140,17.0,가솔린,0,1591,1090,자동
4,1960,2015,대형,9.6,175,46.0,디젤,0,2497,1990,자동


In [3]:
test_df = pd.read_excel('../data/hyundaiCar.xlsx', sheet_name='test')
test_df.head()

Unnamed: 0,가격,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,1915,2015,대형,6.8,159,23.0,LPG,0,2359,1935,수동
1,1164,2012,소형,13.3,108,13.9,가솔린,0,1396,1035,자동
2,2817,2015,중형,14.4,184,41.0,디젤,0,1995,1792,자동
3,2160,2015,대형,10.9,175,46.0,디젤,0,2497,2210,수동
4,1915,2015,대형,6.4,159,23.0,LPG,0,2359,1935,자동


## 가격을 라벨데이터로 지정

In [4]:
x_train = train_df.iloc[:,1:]
y_train = train_df['가격']

print(x_train.shape, y_train.shape)

(71, 10) (71,)


In [5]:
x_test = test_df.iloc[:,1:]
y_test = test_df['가격']

print(x_test.shape, y_test.shape)

(31, 10) (31,)


## 문자열 Encoding
- Label Encoding
- OneHot Encoding

In [6]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

In [7]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   년식      71 non-null     int64  
 1   종류      71 non-null     object 
 2   연비      71 non-null     float64
 3   마력      71 non-null     int64  
 4   토크      71 non-null     float64
 5   연료      71 non-null     object 
 6   하이브리드   71 non-null     int64  
 7   배기량     71 non-null     int64  
 8   중량      71 non-null     int64  
 9   변속기     71 non-null     object 
dtypes: float64(2), int64(5), object(3)
memory usage: 4.8+ KB


### Label_Encoder

In [8]:
lbl = LabelEncoder()
x_trainLabel = lbl.fit_transform(x_train['종류'])
x_trainLabel

array([2, 2, 1, 1, 0, 3, 3, 1, 3, 1, 2, 3, 2, 0, 1, 0, 0, 0, 3, 0, 0, 3,
       2, 0, 3, 3, 3, 1, 1, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0,
       0, 2, 1, 0, 2, 1, 3, 3, 0, 2, 2, 0, 3, 2, 0, 0, 2, 0, 1, 0, 0, 1,
       3, 1, 2, 3, 0])

In [9]:
# 순서대로 인코딩됨
# ex) 대형 : 0, 소형 : 1, ...

print(lbl.classes_)
print(lbl.transform(['대형', '소형', '준중형', '중형']))

['대형' '소형' '준중형' '중형']
[0 1 2 3]


### OneHot_Encoder
- 0과 1로만 인코딩

In [10]:
oneH = OneHotEncoder()
x_trainOne = oneH.fit_transform(x_train['종류'].values.reshape(-1,1))
x_trainOne

<71x4 sparse matrix of type '<class 'numpy.float64'>'
	with 71 stored elements in Compressed Sparse Row format>

In [11]:
# ex) 대형    [1. 0. 0. 0.], 소형 [0. 1. 0. 0.]
#     준중형  [0. 0. 1. 0.],  중형 [0. 0. 0. 1.]
print(oneH.categories_)
print(x_trainOne.toarray()[:5])

[array(['대형', '소형', '준중형', '중형'], dtype=object)]
[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]


## 판다스 dummy
- 문자로 되어있는 데이터를 숫자로 만들어줌

In [12]:
pd.get_dummies(x_train['종류']).head()

Unnamed: 0,대형,소형,준중형,중형
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0


In [13]:
pd.get_dummies(x_train).head()

Unnamed: 0,년식,연비,마력,토크,하이브리드,배기량,중량,종류_대형,종류_소형,종류_준중형,종류_중형,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,11.8,172,21.0,0,1999,1300,0,0,1,0,0,1,0,0,1
1,2015,12.3,204,27.0,0,1591,1300,0,0,1,0,0,1,0,0,1
2,2015,15.0,100,13.6,0,1368,1035,0,1,0,0,0,1,0,1,0
3,2014,14.0,140,17.0,0,1591,1090,0,1,0,0,0,1,0,0,1
4,2015,9.6,175,46.0,0,2497,1990,1,0,0,0,0,0,1,0,1


In [14]:
# 특정컬럼만 지정가능
pd.get_dummies(x_train, columns=['연료','변속기']).head()

Unnamed: 0,년식,종류,연비,마력,토크,하이브리드,배기량,중량,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,준중형,11.8,172,21.0,0,1999,1300,0,1,0,0,1
1,2015,준중형,12.3,204,27.0,0,1591,1300,0,1,0,0,1
2,2015,소형,15.0,100,13.6,0,1368,1035,0,1,0,1,0
3,2014,소형,14.0,140,17.0,0,1591,1090,0,1,0,0,1
4,2015,대형,9.6,175,46.0,0,2497,1990,0,0,1,0,1


In [15]:
pd.get_dummies(x_train, columns=['연료','변속기'], sparse=True).head()

Unnamed: 0,년식,종류,연비,마력,토크,하이브리드,배기량,중량,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,준중형,11.8,172,21.0,0,1999,1300,0,1,0,0,1
1,2015,준중형,12.3,204,27.0,0,1591,1300,0,1,0,0,1
2,2015,소형,15.0,100,13.6,0,1368,1035,0,1,0,1,0
3,2014,소형,14.0,140,17.0,0,1591,1090,0,1,0,0,1
4,2015,대형,9.6,175,46.0,0,2497,1990,0,0,1,0,1


## Repalce

In [16]:
x_train['종류'].replace(['대형', '중형', '준중형', '소형'],
                        [0,1,2,3]).head()

0    2
1    2
2    3
3    3
4    0
Name: 종류, dtype: int64

## Make_Column_Transformer

In [17]:
from sklearn.compose import make_column_transformer

In [18]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   년식      71 non-null     int64  
 1   종류      71 non-null     object 
 2   연비      71 non-null     float64
 3   마력      71 non-null     int64  
 4   토크      71 non-null     float64
 5   연료      71 non-null     object 
 6   하이브리드   71 non-null     int64  
 7   배기량     71 non-null     int64  
 8   중량      71 non-null     int64  
 9   변속기     71 non-null     object 
dtypes: float64(2), int64(5), object(3)
memory usage: 4.8+ KB


In [19]:
myt = make_column_transformer( (OneHotEncoder(), ['종류','연료','변속기']) )

In [20]:
# 종류 4개    , 연료 3개, 변속기 2개
# [0, 0, 1, 0,  0, 0, 1,  0, 1]
result = myt.fit_transform(x_train)
result[:5]

array([[0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1., 0., 1.]])

## 학습

In [21]:
# model = make_pipeline(myt, Ridge(alpha=1))
# gridS.fit(x_train, y_train)

In [24]:
model = make_pipeline(myt, StandardScaler(), Ridge())

# 최적의 파라미터를 찾기 위해 GridSearch 사용
# 모델__파라미터 : 해당 모델에 파라미터이다. ex) ridge_alpha
param_value = {'ridge__alpha':[i for i in range(-100,100)]}
gridS = GridSearchCV(model, param_grid=param_value, scoring='r2')
gridS.fit(x_train, y_train)

print(gridS.best_params_)
print(gridS.best_score_)

{'ridge__alpha': 17}
0.32523885169886174


In [25]:
# 예측값을 그대로 줘도 가능 gridS를 통해 변환과 정규화를 진행
gridS.best_estimator_.predict(x_test)

array([1949.12313244, 1803.10780468, 2740.1113921 , 2561.48761855,
       2526.03150809, 1265.69806703, 1538.10595615, 2561.48761855,
       2115.0143318 , 2075.5156938 , 3277.52112975, 3277.52112975,
       1265.69806703, 2115.0143318 , 3675.80573186,  688.78969137,
       1538.10595615, 1803.10780468, 1803.10780468, 2075.5156938 ,
       1226.19942903, 2075.5156938 , 3675.80573186, 2652.42406946,
       2652.42406946, 2652.42406946, 3675.80573186, 2561.48761855,
       3138.3959942 , 1803.10780468, 2163.20301645])

In [33]:
a = [2015,'대형',6.8,159,23,'LPG',0,2359,1935,'수동']
b = pd.DataFrame(a).T
x_train.iloc[0:1]
b.columns = x_train.columns # 문자열변환이 인식할수 있게끔 컬럼설정
gridS.predict(b)

array([1949.12313244])