# mpg keras

## [1] 파일 불러오기 및 전처리


In [182]:
import urllib.request as req
import pandas as pd
import numpy as np

In [183]:
# urlretrieve 사용해서 파일로 저장하기
req.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data", 'mpg.csv')
req.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.names", 'mpg-col.csv')

('mpg-col.csv', <http.client.HTTPMessage at 0x7ff007122950>)

In [184]:
# 파일 불러오기
data = pd.read_csv("mpg.csv", header=None, sep="\s+")
columns = pd.read_csv("mpg-col.csv", header=None, sep="\n", skiprows=32, nrows=9)

# 데이터프레임에 컬럼명 지정해줌
data.columns = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model-year", "origin", "car-name"]

# 타입 맞춰주기
num = data["horsepower"] == "?"


# ? 행 날리기
index = data[data["horsepower"] == "?"].index
data = data.drop(index=index, axis=0)

# horsepower 컬럼 float로 변경
data["horsepower"] = pd.to_numeric(data["horsepower"])

# kml 컬럼 추가
data["kml"] = data["mpg"] * 0.425

# 필요없는 컬럼 삭제하기
data.drop(columns=["mpg", "car-name"], inplace=True)

# X, y 나누기
X = data.drop(columns=["kml"]).reset_index(drop=True)
y = data["kml"].reset_index(drop=True)

In [185]:
# 범주형 데이터 추출 및 제거
X_class = X.loc[:,["cylinders", "origin"]]
X_count = X.drop(columns=["cylinders", "origin"])

# 범주형 데이터 onehot encoding 하기
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False)

X_cyl = pd.DataFrame(ohe.fit_transform(X_class["cylinders"].values.reshape(-1,1)))
X_ori = pd.DataFrame(ohe.fit_transform(X_class["origin"].values.reshape(-1,1)))

# 수치형 데이터 스케일링 하기
from sklearn.preprocessing import StandardScaler, MinMaxScaler

ss = StandardScaler()
mm = MinMaxScaler()

X_count_scal = pd.DataFrame(ss.fit_transform(X_count))

# 합쳐서 새로운 데이터프레임 만들기
X_new = pd.concat([X_count_scal, X_cyl, X_ori], axis=1)

# 컬럼네임 리셋하기
X_new = X_new.T.reset_index(drop=True).T

columns = list(X_count.columns) + ["cylinders3", "cylinders4", "cylinders5", "cylinders6", "cylinders8", "origin1", "origin2", "origin3"]

X_new.columns = columns
X_new

Unnamed: 0,displacement,horsepower,weight,acceleration,model-year,cylinders3,cylinders4,cylinders5,cylinders6,cylinders8,origin1,origin2,origin3
0,1.077290,0.664133,0.620540,-1.285258,-1.625315,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.488732,1.574594,0.843334,-1.466724,-1.625315,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1.182542,1.184397,0.540382,-1.648189,-1.625315,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1.048584,1.184397,0.536845,-1.285258,-1.625315,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,1.029447,0.924265,0.555706,-1.829655,-1.625315,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,-0.520637,-0.480448,-0.221125,0.021294,1.636410,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
388,-0.932079,-1.364896,-0.999134,3.287676,1.636410,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
389,-0.568479,-0.532474,-0.804632,-1.430430,1.636410,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
390,-0.712005,-0.662540,-0.415627,1.110088,1.636410,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [186]:
# 데이터 나누기
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, random_state=42, test_size=0.2
)

## [2] 모델 생성

In [187]:
import tensorflow as tf

In [188]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [189]:
# 모델 생성
model = Sequential(name="mpg_model")

model.add(Dense(10, activation="linear", input_shape=(13,)))
model.add(Dense(50, activation="relu"))
model.add(Dense(30, activation="relu"))
model.add(Dense(30, activation="relu"))
model.add(Dense(1))

In [190]:
model.summary()

Model: "mpg_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_61 (Dense)            (None, 10)                140       
                                                                 
 dense_62 (Dense)            (None, 50)                550       
                                                                 
 dense_63 (Dense)            (None, 30)                1530      
                                                                 
 dense_64 (Dense)            (None, 30)                930       
                                                                 
 dense_65 (Dense)            (None, 1)                 31        
                                                                 
Total params: 3,181
Trainable params: 3,181
Non-trainable params: 0
_________________________________________________________________


In [191]:
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=['accuracy']
)

In [192]:
with tf.device("/device:GPU:0"):
    model.fit(
        X_train,
        y_train,
        epochs=100,
        validation_split=0.2,
        verbose=False
    )

In [193]:
model.evaluate(X_test, y_test)



[0.9373010396957397, 0.0]

In [194]:
model.evaluate(X_train, y_train)



[1.0543941259384155, 0.0]