In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_boston, load_iris

from sklearn.linear_model import Ridge,Lasso,ElasticNet, LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import mglearn
import matplotlib.pyplot as plt
import matplotlib

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

matplotlib.rcParams['font.family']='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False

import warnings
warnings.simplefilter('ignore')

## 라벨인코딩을 수행한 후 자동차 가격 예측하기

In [18]:
train_df = pd.read_excel('data4/hyundaiCar.xlsx', sheet_name='train')

In [19]:
test_df = pd.read_excel('data4/hyundaiCar.xlsx', sheet_name='test')

In [20]:
x_train = train_df.iloc[:, 1:]
y_train = train_df['가격']

In [21]:
x_test = test_df.iloc[:, 1:]
y_test = test_df['가격']

In [22]:
lbl = LabelEncoder()
x_lbl_size =  lbl.fit_transform(x_train['종류'])
size = lbl.classes_
x_train['종류'] = x_lbl_size
print(size)

x_lbl_fuel =  lbl.fit_transform(x_train['연료'])
fuel = lbl.classes_
x_train['연료'] = x_lbl_fuel
print(fuel)

x_lbl_alt =  lbl.fit_transform(x_train['변속기'])
alt = lbl.classes_
x_train['변속기'] = x_lbl_alt
print(alt)

['대형' '소형' '준중형' '중형']
['LPG' '가솔린' '디젤']
['수동' '자동']


In [23]:
x_train

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,2,11.8,172,21.0,1,0,1999,1300,1
1,2015,2,12.3,204,27.0,1,0,1591,1300,1
2,2015,1,15.0,100,13.6,1,0,1368,1035,0
3,2014,1,14.0,140,17.0,1,0,1591,1090,1
4,2015,0,9.6,175,46.0,2,0,2497,1990,1
...,...,...,...,...,...,...,...,...,...,...
66,2015,3,8.5,290,34.8,1,0,3342,1901,1
67,2012,1,13.3,108,13.9,1,0,1396,1040,1
68,2015,2,12.8,186,41.0,2,0,1995,1665,1
69,2015,3,17.7,156,19.3,1,1,1999,1585,1


In [24]:
x_test['종류'] = x_test['종류'].replace(['대형', '중형', '준중형', '소형'], [0,1,2,3])

In [25]:
x_test['연료'] = x_test['연료'].replace(['LPG', '가솔린', '디젤'], [0,1,2])

In [26]:
x_test['변속기']= x_test['변속기'].replace(['수동', '자동'], [0,1])

In [27]:
model_Ela = Pipeline([('scl', StandardScaler()), ('esn', ElasticNet())])
param_value = {'esn__alpha':[0.001, 0.01, 1,2,3,4], 'esn__l1_ratio':[0,0.01, 0.1, 0.2,0.3, 0.4, 0.5, 0.7, 1.0]}
gridSearch = GridSearchCV(model_Ela, param_grid=param_value, cv=10, verbose=1, scoring='r2')

gridSearch.fit(x_train, y_train)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scl', StandardScaler()),
                                       ('esn', ElasticNet())]),
             param_grid={'esn__alpha': [0.001, 0.01, 1, 2, 3, 4],
                         'esn__l1_ratio': [0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5,
                                           0.7, 1.0]},
             scoring='r2', verbose=1)

In [28]:
gridSearch.best_estimator_

Pipeline(steps=[('scl', StandardScaler()),
                ('esn', ElasticNet(alpha=0.01, l1_ratio=0))])

In [29]:
print(gridSearch.score(x_train, y_train))
print(gridSearch.score(x_test, y_test))

0.8736892155670837
0.759643821805259


In [30]:
gridSearch.predict([[2015, 2, 12.3, 204, 27,1,0,1591,1300,1]])

array([2100.64922277])

In [31]:
model_rdg = Pipeline([('scl', StandardScaler()), ('rd', Ridge())])
param_value = {'rd__alpha':[0.001, 0.01, 1,2,3,4]}
gridSearch = GridSearchCV(model_rdg, param_grid=param_value, cv=10, verbose=1, scoring='r2')

gridSearch.fit(x_train, y_train)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scl', StandardScaler()),
                                       ('rd', Ridge())]),
             param_grid={'rd__alpha': [0.001, 0.01, 1, 2, 3, 4]}, scoring='r2',
             verbose=1)

In [32]:
gridSearch.best_estimator_

Pipeline(steps=[('scl', StandardScaler()), ('rd', Ridge(alpha=3))])

In [33]:
print(gridSearch.score(x_train, y_train))
print(gridSearch.score(x_test, y_test))

0.8701396090670609
0.7463989336745318


In [34]:
gridSearch.predict([[2015, 2, 12.3, 204, 27,1,0,1591,1300,1]])

array([2252.52136462])

In [35]:
model=make_pipeline(StandardScaler(),Ridge() )
model.fit(x_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [37]:
print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

0.8733443420869326
0.7577827979122559


In [38]:
model.predict([[2015, 2, 12.3, 204, 27,1,0,1591,1300,1]])

array([2129.91186715])