# auto-mpg 데이터 전처리 연습

In [27]:
import urllib.request as req
import pandas as pd
import numpy as np

## [1] 데이터 파일로 저장

In [28]:
# Download the auto-mpg data file and its description (names) file with urlretrieve
# and store both on local disk.
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.names"

req.urlretrieve(data_url, 'mpg.csv')
# The names file is saved under the .csv name that the loading cell reads back.
req.urlretrieve(names_url, 'mpg-col.csv')

('mpg-col.csv', <http.client.HTTPMessage at 0x7fa3e0920f90>)

In [29]:
# Load the downloaded files.
# The data file is whitespace-delimited with no header row. The raw string keeps
# "\s" from being treated as an (invalid) Python string escape.
data = pd.read_csv("mpg.csv", header=None, sep=r"\s+")

# The names file is free-form text, so it is read line by line instead of asking
# read_csv to split on "\n" (recent pandas versions reject a line terminator as
# the separator). Same selection as before: skip the first 32 lines, ignore
# blank lines, keep the next 9 lines (the attribute list) as a one-column frame.
with open("mpg-col.csv", encoding="utf-8") as names_file:
    remaining_lines = [line.rstrip("\r\n") for line in names_file][32:]
attribute_lines = [line for line in remaining_lines if line.strip()][:9]
columns = pd.DataFrame(attribute_lines)

In [30]:
# Check the column names: display the attribute lines read from the names file.
columns

Unnamed: 0,0
0,1. mpg: continuous
1,2. cylinders: multi-valued discrete
2,3. displacement: continuous
3,4. horsepower: continuous
4,5. weight: continuous
5,6. acceleration: continuous
6,7. model year: multi-valued discrete
7,8. origin: multi-valued discrete
8,9. car name: string (unique for each ...


In [31]:
# Assign column names to the data frame (the file was read without a header row).
data.columns = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model-year",
    "origin",
    "car-name",
]

In [32]:
# Check the result: display the frame with its new column names.
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year,origin,car-name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger


## [2] 데이터 전처리

In [33]:
# Check the data type and non-null count of every column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model-year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car-name      398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [34]:
# Print the distinct values of every column to inspect each feature.
for i, column_values in data.items():
    print(f"{i} 컬럼의 특성 : ", column_values.unique(), sep="\n")

mpg 컬럼의 특성 : 
[18.  15.  16.  17.  14.  24.  22.  21.  27.  26.  25.  10.  11.   9.
 28.  19.  12.  13.  23.  30.  31.  35.  20.  29.  32.  33.  17.5 15.5
 14.5 22.5 24.5 18.5 29.5 26.5 16.5 31.5 36.  25.5 33.5 20.5 30.5 21.5
 43.1 36.1 32.8 39.4 19.9 19.4 20.2 19.2 25.1 20.6 20.8 18.6 18.1 17.7
 27.5 27.2 30.9 21.1 23.2 23.8 23.9 20.3 21.6 16.2 19.8 22.3 17.6 18.2
 16.9 31.9 34.1 35.7 27.4 25.4 34.2 34.5 31.8 37.3 28.4 28.8 26.8 41.5
 38.1 32.1 37.2 26.4 24.3 19.1 34.3 29.8 31.3 37.  32.2 46.6 27.9 40.8
 44.3 43.4 36.4 44.6 40.9 33.8 32.7 23.7 23.6 32.4 26.6 25.8 23.5 39.1
 39.  35.1 32.3 37.7 34.7 34.4 29.9 33.7 32.9 31.6 28.1 30.7 24.2 22.4
 34.  38.  44. ]
cylinders 컬럼의 특성 : 
[8 4 6 3 5]
displacement 컬럼의 특성 : 
[307.  350.  318.  304.  302.  429.  454.  440.  455.  390.  383.  340.
 400.  113.  198.  199.  200.   97.  110.  107.  104.  121.  360.  140.
  98.  232.  225.  250.  351.  258.  122.  116.   79.   88.   71.   72.
  91.   97.5  70.  120.   96.  108.  155.   68.  114.  156. 

In [35]:
# Prepare the type fix: horsepower was read as strings, so list its distinct
# values and count how many entries are the "?" placeholder.
horsepower_raw = data["horsepower"]
print("horsepower 컬럼 unique : ", horsepower_raw.unique(), sep="\n")

num = horsepower_raw.eq("?")
print("? 값의 개수 : ", num.sum())

horsepower 컬럼 unique : 
['130.0' '165.0' '150.0' '140.0' '198.0' '220.0' '215.0' '225.0' '190.0'
 '170.0' '160.0' '95.00' '97.00' '85.00' '88.00' '46.00' '87.00' '90.00'
 '113.0' '200.0' '210.0' '193.0' '?' '100.0' '105.0' '175.0' '153.0'
 '180.0' '110.0' '72.00' '86.00' '70.00' '76.00' '65.00' '69.00' '60.00'
 '80.00' '54.00' '208.0' '155.0' '112.0' '92.00' '145.0' '137.0' '158.0'
 '167.0' '94.00' '107.0' '230.0' '49.00' '75.00' '91.00' '122.0' '67.00'
 '83.00' '78.00' '52.00' '61.00' '93.00' '148.0' '129.0' '96.00' '71.00'
 '98.00' '115.0' '53.00' '81.00' '79.00' '120.0' '152.0' '102.0' '108.0'
 '68.00' '58.00' '149.0' '89.00' '63.00' '48.00' '66.00' '139.0' '103.0'
 '125.0' '133.0' '138.0' '135.0' '142.0' '77.00' '62.00' '132.0' '84.00'
 '64.00' '74.00' '116.0' '82.00']
? 값의 개수 :  6


In [36]:
# Drop the rows whose horsepower is the "?" placeholder.
is_placeholder = data["horsepower"] == "?"
index = data.loc[is_placeholder].index
data = data.drop(index=index)

In [37]:
# In one chain:
#   - convert the horsepower column to float,
#   - add the kml column (mpg multiplied by 0.425),
#   - drop the columns that are no longer needed (mpg, car-name).
data = (
    data
    .assign(
        horsepower=pd.to_numeric(data["horsepower"]),
        kml=data["mpg"] * 0.425,
    )
    .drop(columns=["mpg", "car-name"])
)

In [38]:
# Split into features (X) and target (y = kml); both get a fresh 0..n-1 index.
data_reset = data.reset_index(drop=True)
X = data_reset.drop(columns=["kml"])
y = data_reset["kml"]

In [39]:
# Preview the first ten feature rows.
X.head(10)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model-year,origin
0,8,307.0,130.0,3504.0,12.0,70,1
1,8,350.0,165.0,3693.0,11.5,70,1
2,8,318.0,150.0,3436.0,11.0,70,1
3,8,304.0,150.0,3433.0,12.0,70,1
4,8,302.0,140.0,3449.0,10.5,70,1
5,8,429.0,198.0,4341.0,10.0,70,1
6,8,454.0,220.0,4354.0,9.0,70,1
7,8,440.0,215.0,4312.0,8.5,70,1
8,8,455.0,225.0,4425.0,10.0,70,1
9,8,390.0,190.0,3850.0,8.5,70,1


In [40]:
# Extract the categorical features into X_class and remove them from the
# remaining features (X_count).
categorical_cols = ["cylinders", "origin"]
X_class = X[categorical_cols]
X_count = X.drop(columns=categorical_cols)

X_class

Unnamed: 0,cylinders,origin
0,8,1
1,8,1
2,8,1
3,8,1
4,8,1
...,...,...
387,4,1
388,4,2
389,4,1
390,4,1


In [41]:
# Display the features left after removing cylinders and origin.
X_count

Unnamed: 0,displacement,horsepower,weight,acceleration,model-year
0,307.0,130.0,3504.0,12.0,70
1,350.0,165.0,3693.0,11.5,70
2,318.0,150.0,3436.0,11.0,70
3,304.0,150.0,3433.0,12.0,70
4,302.0,140.0,3449.0,10.5,70
...,...,...,...,...,...
387,140.0,86.0,2790.0,15.6,82
388,97.0,52.0,2130.0,24.6,82
389,135.0,84.0,2295.0,11.6,82
390,120.0,79.0,2625.0,18.6,82


In [42]:
# One-hot encode the categorical features.
from sklearn.preprocessing import OneHotEncoder

# Keep the encoder on its default (sparse) output and densify with .toarray().
# The `sparse=False` keyword was renamed to `sparse_output` in newer scikit-learn
# releases, so either spelling ties the notebook to one side of the rename.
ohe = OneHotEncoder()

# Each column is encoded on its own; reshape(-1, 1) turns the 1-D values into
# the 2-D input the encoder expects.
# NOTE: `ohe` is refit for "origin", so afterwards it only holds the origin fit.
X_cyl = pd.DataFrame(ohe.fit_transform(X_class["cylinders"].values.reshape(-1,1)).toarray())
X_ori = pd.DataFrame(ohe.fit_transform(X_class["origin"].values.reshape(-1,1)).toarray())

In [43]:
# Scale the numeric features.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

mm = MinMaxScaler()  # instantiated here but not used in this cell
ss = StandardScaler()

# NOTE(review): the scaler is fitted on every row of X_count, i.e. before the
# train/test split performed later in the notebook, so the held-out rows also
# contribute to the scaling statistics.
ss.fit(X_count)
X_count_scal = pd.DataFrame(ss.transform(X_count))

In [44]:
# Combine the scaled features and the two one-hot blocks side by side
# into a new data frame.
feature_blocks = [X_count_scal, X_cyl, X_ori]
X_new = pd.concat(feature_blocks, axis="columns")

In [45]:
# Reset the column names of the combined frame to readable ones.
# Names are assigned by position, so the integer labels left over from the
# concat are overwritten directly.
onehot_names = [
    "cylinders3", "cylinders4", "cylinders5", "cylinders6", "cylinders8",
    "origin1", "origin2", "origin3",
]
columns = list(X_count.columns) + onehot_names

X_new.columns = columns
X_new

Unnamed: 0,displacement,horsepower,weight,acceleration,model-year,cylinders3,cylinders4,cylinders5,cylinders6,cylinders8,origin1,origin2,origin3
0,1.077290,0.664133,0.620540,-1.285258,-1.625315,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.488732,1.574594,0.843334,-1.466724,-1.625315,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1.182542,1.184397,0.540382,-1.648189,-1.625315,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1.048584,1.184397,0.536845,-1.285258,-1.625315,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,1.029447,0.924265,0.555706,-1.829655,-1.625315,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,-0.520637,-0.480448,-0.221125,0.021294,1.636410,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
388,-0.932079,-1.364896,-0.999134,3.287676,1.636410,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
389,-0.568479,-0.532474,-0.804632,-1.430430,1.636410,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
390,-0.712005,-0.662540,-0.415627,1.110088,1.636410,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


## [3] train test 나누기

In [46]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the remainder (train + validation)
# is split again in the next cell.
tv_test_parts = train_test_split(X_new, y, test_size=0.2, random_state=72)
X_tv, X_test, y_tv, y_test = tv_test_parts

In [47]:
# Split the remaining rows again: 20% of them become the validation set.
train_val_parts = train_test_split(X_tv, y_tv, test_size=0.2, random_state=72)
X_train, X_val, y_train, y_val = train_val_parts

## [4] 모델 생성 및 학습

In [48]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Baseline regressor: only the squared-error objective is set explicitly.
model_xgb = xgb.XGBRegressor(objective='reg:squarederror')

In [49]:
# Hyper-parameter grid search over model_xgb, kept commented out so it does not
# run. The best parameters it reported are recorded in the next cell.
# NOTE(review): the grid lists colsample_bytree=0, which looks outside the range
# XGBoost documents for that parameter — confirm before re-enabling.
# params = {
#     'max_depth': [3, 5, 7],
#     'min_child_weight': [1, 3, 5, 7],
#     "reg_alpha": [0.01, 0.1, 1, 10],
#     "subsample": [0.5, 0.75, 1],
#     "colsample_bytree": [0, 0.5, 1]
# }

# grid = GridSearchCV(
#     model_xgb,
#     param_grid = params,
#     cv=5
# )

# grid.fit(
#     X_train,
#     y_train,
#     eval_set=[(X_train, y_train),(X_val, y_val)],
#     eval_metric='rmse'
# )

In [50]:
# grid.best_params_
# The lookup above is commented out; the dict below is a hard-coded literal that
# the cell simply displays as its last expression. Its values match the
# hyper-parameters passed to the final XGBRegressor later in the notebook.
{'colsample_bytree': 0.5,
 'max_depth': 7,
 'min_child_weight': 7,
 'reg_alpha': 1,
 'subsample': 0.75}

{'colsample_bytree': 0.5,
 'max_depth': 7,
 'min_child_weight': 7,
 'reg_alpha': 1,
 'subsample': 0.75}

In [56]:
# Rebuild the regressor with the tuned hyper-parameters.
# eval_metric is given to the constructor: passing it to fit() is deprecated in
# newer XGBoost releases and no longer accepted by the most recent ones.
model_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    max_depth=7,
    min_child_weight=7,
    subsample=0.75,
    reg_alpha=1,
    colsample_bytree=0.5
)

# Train on the training split. eval_set makes XGBoost log the metric each round
# for the training split (validation_0) and the validation split (validation_1).
model_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train),(X_val, y_val)]
)

[0]	validation_0-rmse:9.10797	validation_1-rmse:9.35648
[1]	validation_0-rmse:8.24772	validation_1-rmse:8.46662
[2]	validation_0-rmse:7.48533	validation_1-rmse:7.68389
[3]	validation_0-rmse:6.77943	validation_1-rmse:6.96653
[4]	validation_0-rmse:6.14558	validation_1-rmse:6.29678
[5]	validation_0-rmse:5.58178	validation_1-rmse:5.7284
[6]	validation_0-rmse:5.06454	validation_1-rmse:5.19484
[7]	validation_0-rmse:4.60293	validation_1-rmse:4.719
[8]	validation_0-rmse:4.18705	validation_1-rmse:4.29885
[9]	validation_0-rmse:3.81344	validation_1-rmse:3.9234
[10]	validation_0-rmse:3.48343	validation_1-rmse:3.5872
[11]	validation_0-rmse:3.18579	validation_1-rmse:3.28093
[12]	validation_0-rmse:2.92881	validation_1-rmse:3.02659
[13]	validation_0-rmse:2.69929	validation_1-rmse:2.80252
[14]	validation_0-rmse:2.48718	validation_1-rmse:2.58711
[15]	validation_0-rmse:2.29841	validation_1-rmse:2.40936
[16]	validation_0-rmse:2.12514	validation_1-rmse:2.24884
[17]	validation_0-rmse:1.98135	validation_1-rm

XGBRegressor(colsample_bytree=0.5, max_depth=7, min_child_weight=7,
             objective='reg:squarederror', reg_alpha=1, subsample=0.75)

In [57]:
# Report the model's score() on each of the three splits.
for score_label, split_X, split_y in (
    ("xgb train score : ", X_train, y_train),
    ("xgb val score : ", X_val, y_val),
    ("xgb test score : ", X_test, y_test),
):
    print(score_label, model_xgb.score(split_X, split_y))

xgb train score :  0.9740550304223407
xgb val score :  0.8966701066239334
xgb test score :  0.8186507453018702
