In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_boston, load_iris

from sklearn.linear_model import Ridge,Lasso,ElasticNet, LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import mglearn
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family']='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False

import warnings
warnings.simplefilter('ignore')


In [2]:
train_df = pd.read_excel('data4/hyundaiCar.xlsx', sheet_name='train')
test_df = pd.read_excel('data4/hyundaiCar.xlsx', sheet_name='test')

In [3]:
x_train = train_df.iloc[:,1:]
y_train = train_df.iloc[:,0:1]

In [4]:
x_test = test_df.iloc[:,1:]
y_test = test_df.iloc[:,0:1]

## 문자열 encoding
- label encoding
- one hot encoding

In [5]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [6]:
x_train['종류']

0     준중형
1     준중형
2      소형
3      소형
4      대형
     ... 
66     중형
67     소형
68    준중형
69     중형
70     대형
Name: 종류, Length: 71, dtype: object

## Label Encoder

In [7]:
lbl = LabelEncoder()
x_trainLabel = lbl.fit_transform(x_train['종류'])
x_trainLabel

array([2, 2, 1, 1, 0, 3, 3, 1, 3, 1, 2, 3, 2, 0, 1, 0, 0, 0, 3, 0, 0, 3,
       2, 0, 3, 3, 3, 1, 1, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0,
       0, 2, 1, 0, 2, 1, 3, 3, 0, 2, 2, 0, 3, 2, 0, 0, 2, 0, 1, 0, 0, 1,
       3, 1, 2, 3, 0])

In [8]:
# 종류별 숫자 부여 확인
lbl.classes_

array(['대형', '소형', '준중형', '중형'], dtype=object)

In [9]:
lbl.classes_[2]

'준중형'

In [10]:
lbl.transform(['소형'])

array([1])

## OneHotEncoder
- 0,1로만 구성되어있다.

In [11]:
oneH = OneHotEncoder()
x_trainOne = oneH.fit_transform(x_train['종류'].values.reshape(-1,1))
x_trainOne

<71x4 sparse matrix of type '<class 'numpy.float64'>'
	with 71 stored elements in Compressed Sparse Row format>

In [12]:
type(x_trainOne)

scipy.sparse.csr.csr_matrix

In [13]:
x_trainOne.toarray()

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],


In [14]:
oneH.categories_

[array(['대형', '소형', '준중형', '중형'], dtype=object)]

# pandas dummpy

In [15]:
pd.get_dummies(x_train['종류'])

Unnamed: 0,대형,소형,준중형,중형
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0
...,...,...,...,...
66,0,0,0,1
67,0,1,0,0
68,0,0,1,0
69,0,0,0,1


In [16]:
pd.get_dummies(x_train)

Unnamed: 0,년식,연비,마력,토크,하이브리드,배기량,중량,종류_대형,종류_소형,종류_준중형,종류_중형,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,11.8,172,21.0,0,1999,1300,0,0,1,0,0,1,0,0,1
1,2015,12.3,204,27.0,0,1591,1300,0,0,1,0,0,1,0,0,1
2,2015,15.0,100,13.6,0,1368,1035,0,1,0,0,0,1,0,1,0
3,2014,14.0,140,17.0,0,1591,1090,0,1,0,0,0,1,0,0,1
4,2015,9.6,175,46.0,0,2497,1990,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2015,8.5,290,34.8,0,3342,1901,0,0,0,1,0,1,0,0,1
67,2012,13.3,108,13.9,0,1396,1040,0,1,0,0,0,1,0,0,1
68,2015,12.8,186,41.0,0,1995,1665,0,0,1,0,0,0,1,0,1
69,2015,17.7,156,19.3,1,1999,1585,0,0,0,1,0,1,0,0,1


In [17]:
pd.get_dummies(x_train,columns=['연료', '변속기'])

Unnamed: 0,년식,종류,연비,마력,토크,하이브리드,배기량,중량,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,준중형,11.8,172,21.0,0,1999,1300,0,1,0,0,1
1,2015,준중형,12.3,204,27.0,0,1591,1300,0,1,0,0,1
2,2015,소형,15.0,100,13.6,0,1368,1035,0,1,0,1,0
3,2014,소형,14.0,140,17.0,0,1591,1090,0,1,0,0,1
4,2015,대형,9.6,175,46.0,0,2497,1990,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,0,3342,1901,0,1,0,0,1
67,2012,소형,13.3,108,13.9,0,1396,1040,0,1,0,0,1
68,2015,준중형,12.8,186,41.0,0,1995,1665,0,0,1,0,1
69,2015,중형,17.7,156,19.3,1,1999,1585,0,1,0,0,1


In [18]:
# replace
x_train['종류'].replace( ['대형', '중형', '준중형', '소형'], [0,1,2,3])

0     2
1     2
2     3
3     3
4     0
     ..
66    1
67    3
68    2
69    1
70    0
Name: 종류, Length: 71, dtype: int64

In [19]:
연습문제.

라벨인코딩을 이용하여
2015, 준중형, 12.3, 204, 27,가솔린,0,1591,1300,자동
자동차가격을 예측하시요

팀별로 같이하세요.


SyntaxError: invalid syntax (<ipython-input-19-a7f69ca6079d>, line 1)

In [20]:
# 라벨인코딩

# 1) 종류
lbl_1 = LabelEncoder()
x_train['종류'] = lbl_1.fit_transform(x_train['종류'])

# 2) 연료
lbl_2 = LabelEncoder()
x_train['연료'] = lbl_2.fit_transform(x_train['연료'])

# 3) 변속기
lbl_3 = LabelEncoder()
x_train['변속기'] = lbl_3.fit_transform(x_train['변속기'])

In [21]:
# 라벨엔코딩별 숫자 확인
print(lbl_1.classes_)
print(lbl_2.classes_)
print(lbl_3.classes_)

['대형' '소형' '준중형' '중형']
['LPG' '가솔린' '디젤']
['수동' '자동']


In [25]:
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt

#elastic-net 모델 적용
model_elastic = Pipeline( [ ('scl', StandardScaler() ), ('clf', ElasticNet()) ] )

param_value = {'clf__alpha': [ 0.001,0.01,1,2,3,4],
              'clf__l1_ratio':[0.1,0.3,0.5,0.7,0.9,1]}

# 최적의 모델을 찾아주는 모듈
gridSearch = GridSearchCV( model_elastic, param_grid=param_value ,cv=10 ,verbose=1, scoring='r2')
gridSearch.fit( x_train, y_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scl', StandardScaler()),
                                       ('clf', ElasticNet())]),
             param_grid={'clf__alpha': [0.001, 0.01, 1, 2, 3, 4],
                         'clf__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 1]},
             scoring='r2', verbose=1)

In [26]:
# 2015, 준중형, 12.3, 204, 27,가솔린,0,1591,1300,자동  --> 자동차 가격 예측
gridSearch.best_estimator_.predict([[2015,2,12.3,204,27,1,0,1591,1300,1]])

array([2350.01358939])

In [27]:
gridSearch.best_params_

{'clf__alpha': 1, 'clf__l1_ratio': 0.9}