In [1]:
import numpy as np
import pandas as pd
import math

# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Notebook-wide pandas display settings.
# Floats are rendered with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)
# Long frames are truncated to 20 rows; use None instead to print every row.
pd.set_option('display.max_rows', 20)

In [3]:
# The CSV ships without a header row, so the column labels are declared here,
# in file order.
feature_names = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'weight',
    'engine-type', 'num-cylinders', 'engine-size', 'fuel-system',
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'price',
]

In [4]:
# Download the comma-separated cars data set. The file has no header row,
# so the column labels come from feature_names.
car_data = pd.read_csv(
    'https://storage.googleapis.com/mledu-datasets/cars_data.csv',
    header=None,
    names=feature_names,
    sep=',',
    encoding='latin-1',
)

In [5]:
# Shuffle the rows so any ordering in the source file cannot bias the later
# train/test splits. The fixed seed makes the shuffle identical on every
# Restart & Run All; the original index labels are kept.
car_data = car_data.sample(frac=1, random_state=42)

In [6]:
# Report how many rows were loaded.
print("Data set loaded. Num examples: ", len(car_data))

Data set loaded. Num examples:  205


In [7]:
# Preview the first rows of the shuffled frame.
car_data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
116,0,161,peugot,diesel,turbo,four,sedan,rwd,front,107.9,...,152,idi,3.7,3.52,21.0,95,4150,28,33,17950
161,0,91,toyota,gas,std,four,hatchback,fwd,front,95.7,...,98,2bbl,3.19,3.03,9.0,70,4800,28,34,8358
104,3,194,nissan,gas,std,two,hatchback,rwd,front,91.3,...,181,mpfi,3.43,3.27,9.0,160,5200,19,25,17199
138,2,83,subaru,gas,std,two,hatchback,fwd,front,93.7,...,97,2bbl,3.62,2.36,9.0,69,4900,31,36,5118
90,1,128,nissan,diesel,std,two,sedan,fwd,front,94.5,...,103,idi,2.99,3.47,21.9,55,4800,45,50,7099


In [8]:
# Column the models learn to predict.
LABEL = 'price'

# Columns used as numeric model inputs.
numeric_feature_names = [
    'symboling', 'normalized-losses', 'wheel-base', 'length', 'width',
    'height', 'weight', 'engine-size', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'bore', 'stroke', 'compression-ratio',
]

In [9]:
# Display only the columns listed in numeric_feature_names
# (the view is truncated according to the display.max_rows option).
car_data[numeric_feature_names]

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,weight,engine-size,horsepower,peak-rpm,city-mpg,highway-mpg,bore,stroke,compression-ratio
116,0,161,107.90,186.70,68.40,56.70,3252,152,95,4150,28,33,3.70,3.52,21.00
161,0,91,95.70,166.30,64.40,52.80,2122,98,70,4800,28,34,3.19,3.03,9.00
104,3,194,91.30,170.70,67.90,49.70,3071,181,160,5200,19,25,3.43,3.27,9.00
138,2,83,93.70,156.90,63.40,53.70,2050,97,69,4900,31,36,3.62,2.36,9.00
90,1,128,94.50,165.30,63.80,54.50,2017,103,55,4800,45,50,2.99,3.47,21.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,2,134,98.40,176.20,65.60,52.00,2714,146,116,4800,24,30,3.62,3.50,9.30
190,3,256,94.50,165.70,64.00,51.40,2221,109,90,5500,24,29,3.19,3.40,8.50
172,2,134,98.40,176.20,65.60,53.00,2975,146,116,4800,24,30,3.62,3.50,9.30
155,0,91,95.70,169.70,63.60,59.10,3110,92,62,4800,27,32,3.05,3.03,9.00


In [10]:
# Summary statistics for the numeric feature columns.
# NOTE(review): the saved output lists fewer columns than requested, presumably
# because some columns still hold non-numeric text at this point - confirm.
car_data[numeric_feature_names].describe()

Unnamed: 0,symboling,wheel-base,length,width,height,weight,engine-size,city-mpg,highway-mpg,compression-ratio
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.83,98.76,174.05,65.91,53.72,2555.57,126.91,25.22,30.75,10.14
std,1.25,6.02,12.34,2.15,2.44,520.68,41.64,6.54,6.89,3.97
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,13.0,16.0,7.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,19.0,25.0,8.6
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,24.0,30.0,9.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,30.0,34.0,9.4
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,49.0,54.0,23.0


In [11]:
# Coerce the numeric features and the label to numbers. This is necessary
# because the model crashes when not all the values are numeric.
#
# pd.to_numeric offers three `errors` options:
# - 'ignore': if a value cannot be converted, return the original data unchanged.
# - 'coerce': if a value cannot be converted, replace it with NaN.
# - 'raise':  if a value cannot be converted, raise an error and stop.
for feature_name in numeric_feature_names + [LABEL]:
    car_data[feature_name] = pd.to_numeric(car_data[feature_name], errors='coerce')

# Fill missing values with 0.
# Is this an OK thing to do? You may want to come back and revisit this decision later.
# Note that this also zero-fills any label values that could not be parsed.
# The frame is rebound rather than mutated in place.
car_data = car_data.fillna(0)

In [12]:
# Sanity check: (rows, columns) of the frame after coercion and zero-filling.
car_data.shape

(205, 26)

In [13]:
# Independent copies of the raw numeric inputs (X) and the price target (y).
X, y = car_data[numeric_feature_names].copy(), car_data[LABEL].copy()

In [14]:
# Hold out 20% of the rows for testing. The fixed random_state keeps the
# split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Normalization (z-score standardization)

In [15]:
# Standardize every numeric feature and the label: subtract the column mean
# and divide by the column standard deviation.
car_data_norm = pd.DataFrame()
for feature_name in [*numeric_feature_names, LABEL]:
    raw_column = car_data[feature_name]
    car_data_norm[feature_name] = (raw_column - raw_column.mean()) / raw_column.std()

In [16]:
# Preview the first rows of the standardized frame.
car_data_norm.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,weight,engine-size,horsepower,peak-rpm,city-mpg,highway-mpg,bore,stroke,compression-ratio,price
116,-0.67,1.09,1.52,1.03,1.16,1.22,1.34,0.6,-0.2,-1.33,0.43,0.33,0.81,0.6,2.73,0.62
161,-0.67,-0.11,-0.51,-0.63,-0.7,-0.38,-0.83,-0.69,-0.81,-0.4,0.43,0.47,-0.14,-0.29,-0.29,-0.57
104,1.74,1.65,-1.24,-0.27,0.93,-1.65,0.99,1.3,1.39,0.18,-0.95,-0.84,0.31,0.14,-0.29,0.53
138,0.94,-0.25,-0.84,-1.39,-1.17,-0.01,-0.97,-0.72,-0.84,-0.25,0.88,0.76,0.66,-1.51,-0.29,-0.97
90,0.13,0.52,-0.71,-0.71,-0.98,0.32,-1.03,-0.57,-1.18,-0.4,3.02,2.8,-0.51,0.51,2.96,-0.72


In [55]:
# Separate the standardized label from the standardized features, then hold
# out 20% of the rows for testing. The fixed random_state keeps the split
# identical across kernel restarts.
X_norm = car_data_norm.copy()
y_norm = X_norm.pop('price')
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(
    X_norm, y_norm, test_size=0.2, random_state=42)

In [None]:
# Feature frame -> {column name: array of that column's values}.
X_train_norm_dict = {name: np.array(value) for name, value in X_train_norm.items()}
# The label is a Series, not a frame: wrap its values under its own name.
# (Iterating a Series with .items() would yield (row index, scalar) pairs.)
y_train_norm_dict = {y_train_norm.name: y_train_norm.values}

In [49]:
# Numeric feature columns for a two-feature experiment.
feature_columns = [
    tf.feature_column.numeric_column(feature_name)
    for feature_name in ['symboling', 'width']
]

In [50]:
# Number of feature columns currently defined.
len(feature_columns)

2

In [51]:
# Layer that turns a dict of feature arrays into a single tensor, using the
# feature columns defined above.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [35]:
# DenseFeatures is fed a mapping of feature name -> array, so unpack the
# training frame column by column.
X_train_dict = {column: np.array(X_train[column]) for column in X_train.columns}

In [52]:
# Apply the feature layer to the training dict and print the resulting tensor
# (one row per example, one column per feature column).
print(feature_layer(X_train_dict))

tf.Tensor(
[[ 2.  66.5]
 [ 0.  65.2]
 [ 2.  66.2]
 [ 2.  65.5]
 [-1.  66.5]
 [ 0.  64.4]
 [ 1.  71.4]
 [ 1.  63.8]
 [ 2.  66.5]
 [ 1.  63.8]
 [ 2.  65.5]
 [ 3.  65.7]
 [ 3.  65. ]
 [ 0.  66.9]
 [ 3.  66.3]
 [ 1.  66. ]
 [-1.  68.9]
 [ 1.  63.6]
 [ 1.  65.4]
 [ 2.  63.9]
 [ 1.  64.2]
 [ 3.  66.3]
 [ 2.  65.6]
 [ 0.  69.6]
 [ 0.  70.6]
 [ 1.  65.5]
 [ 0.  65.4]
 [ 2.  64.4]
 [ 0.  66.5]
 [ 0.  65.2]
 [ 0.  65.4]
 [ 1.  63.8]
 [ 1.  63.6]
 [ 1.  66.5]
 [ 0.  66.9]
 [ 0.  68.4]
 [ 1.  63.8]
 [ 2.  63.8]
 [ 0.  66.1]
 [ 0.  68.3]
 [ 1.  63.8]
 [ 0.  66.5]
 [ 1.  63.8]
 [ 0.  63.6]
 [-1.  67.2]
 [-1.  66.5]
 [ 0.  65.4]
 [ 1.  64.2]
 [ 0.  64.4]
 [ 0.  68.4]
 [ 0.  68.4]
 [ 1.  65.4]
 [ 0.  68.4]
 [ 0.  65.4]
 [ 0.  69.6]
 [-1.  67.2]
 [ 2.  65.5]
 [ 0.  68.4]
 [ 0.  61.8]
 [ 0.  66.5]
 [-1.  64.6]
 [ 2.  65.6]
 [ 0.  65.4]
 [ 3.  65. ]
 [-1.  70.3]
 [ 2.  60.3]
 [ 0.  64.8]
 [ 1.  63.8]
 [ 0.  64.4]
 [ 1.  71.4]
 [-1.  70.3]
 [ 1.  63.8]
 [ 1.  64. ]
 [-1.  65.4]
 [ 1.  72. ]
 [ 2.  64.8]
 

### *********************************************************

In [None]:
# One-entry dict: the 'symboling' column's name mapped to its values array.
sym = {X_train['symboling'].name: X_train['symboling'].values}
sym

In [None]:
# Regressor for the raw (unscaled) features: one hidden ReLU layer of 3 units
# feeding a single linear output unit.
model = tf.keras.models.Sequential([
    # The input width is read from the training frame instead of being
    # hard-coded, so the model follows any change to the feature list.
    tf.keras.layers.Dense(units=3, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(units=1),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss='mean_squared_error',
    # Same quantity as the loss, additionally reported as a metric.
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

In [None]:
# Training hyperparameters: mini-batch size and number of passes over the data.
batch_size=16
epochs=500
# Train on the raw (unscaled) training split.
model.fit(X_train, y_train, epochs=epochs, batch_size= batch_size)

In [None]:
# Report the loss and the MSE metric on the held-out raw test split.
model.evaluate(X_test, y_test)

In [None]:
# Standardized inputs and standardized price target, taken as independent copies.
X_norm = car_data_norm.loc[:, numeric_feature_names].copy()
y_norm = car_data_norm.loc[:, LABEL].copy()

In [None]:
# Hold out 20% of the standardized rows for testing. The fixed random_state
# keeps the split identical across kernel restarts.
Xn_train, Xn_test, yn_train, yn_test = train_test_split(
    X_norm, y_norm, test_size=0.2, random_state=42)

In [None]:
# Regressor for the z-score standardized features: one hidden ReLU layer of
# 3 units feeding a single linear output unit.
model = tf.keras.models.Sequential([
    # The input width is read from the training frame instead of being
    # hard-coded, so the model follows any change to the feature list.
    tf.keras.layers.Dense(units=3, activation='relu', input_shape=(Xn_train.shape[1],)),
    tf.keras.layers.Dense(units=1),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss='mean_squared_error',
    # Same quantity as the loss, additionally reported as a metric.
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

In [None]:
# Training hyperparameters: mini-batch size and number of passes over the data.
batch_size=16
epochs=500
# Train on the standardized training split.
model.fit(Xn_train, yn_train, epochs=epochs, batch_size= batch_size)

In [None]:
# Report the loss and the MSE metric on the held-out standardized test split.
model.evaluate(Xn_test, yn_test)

In [None]:
# Min-max scale every numeric feature and the label: subtract the column
# minimum and divide by the column range (max - min).
car_data_minmax = pd.DataFrame()
for feature_name in [*numeric_feature_names, LABEL]:
    raw_column = car_data[feature_name]
    car_data_minmax[feature_name] = (raw_column - raw_column.min()) / (raw_column.max() - raw_column.min())

In [None]:
# Min-max scaled inputs and min-max scaled price target, as independent copies.
Xm_norm, ym_norm = car_data_minmax[numeric_feature_names].copy(), car_data_minmax[LABEL].copy()

In [None]:
# Hold out 20% of the min-max scaled rows for testing. The fixed random_state
# keeps the split identical across kernel restarts.
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    Xm_norm, ym_norm, test_size=0.2, random_state=42)

In [None]:
# Regressor for the min-max scaled features: one hidden ReLU layer of 3 units
# feeding a single linear output unit.
model = tf.keras.models.Sequential([
    # The input width is read from the training frame instead of being
    # hard-coded, so the model follows any change to the feature list.
    tf.keras.layers.Dense(units=3, activation='relu', input_shape=(Xm_train.shape[1],)),
    tf.keras.layers.Dense(units=1),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss='mean_squared_error',
    # Same quantity as the loss, additionally reported as a metric.
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

In [None]:
# Training hyperparameters: mini-batch size and number of passes over the data.
batch_size=16
epochs=500
# Train on the min-max scaled training split.
model.fit(Xm_train, ym_train, epochs=epochs, batch_size= batch_size)

In [None]:
# Report the loss and the MSE metric on the held-out min-max scaled test split.
model.evaluate(Xm_test, ym_test)

In [None]:
# One numeric feature column per model input; the label is deliberately left
# out of the feature list.
feature_columns = []
for feature_name in numeric_feature_names:
    feature_columns += [tf.feature_column.numeric_column(feature_name)]

# Layer that assembles those columns from a dict of feature arrays.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
# Same architecture as the earlier models, but the inputs arrive as a dict of
# feature arrays and are assembled by feature_layer.
#
# Original question: "with a feature_layer, is the input dimension always 1?"
# No input shape is declared here at all: an input_shape given to a layer that
# is not the first in a Sequential model is not used, and the hidden layer
# takes its input width from the feature layer's output when first called.
model = tf.keras.models.Sequential([
    feature_layer,
    tf.keras.layers.Dense(units=3, activation='relu'),
    tf.keras.layers.Dense(units=1),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss='mean_squared_error',
    # Same quantity as the loss, additionally reported as a metric.
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

### Convert the DataFrame to a dict
When the model starts with a `feature_layer`, pass the input features to `model.fit` as a <span style='color:red'>dict</span> that maps each feature name to an array of values.

In [None]:
# Feature frame -> {column name: array of that column's values}.
Xm_train_dict = {column: np.array(Xm_train[column]) for column in Xm_train.columns}
# The label is a Series, so the per-column unpacking above must not be used on
# it; instead its values are wrapped under the Series' own name.
ym_train_dict = {ym_train.name: ym_train.values}

In [None]:
# Apply the feature layer to the min-max scaled training dict and print the
# resulting tensor.
print(feature_layer(Xm_train_dict))

In [None]:
# Training hyperparameters: mini-batch size and number of passes over the data.
batch_size=16
epochs=500
# Train the feature-layer model; features are passed as a dict of arrays.
# NOTE(review): the labels are also passed as a one-entry dict; a plain array
# (ym_train.values) is the more common form for a single-output model -
# confirm this trains as intended.
model.fit(Xm_train_dict, ym_train_dict, epochs=epochs, batch_size= batch_size)

In [None]:
# Preview the first rows of the min-max scaled training features.
Xm_train.head()

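### Feature columns with normalization
Pass a `normalizer_fn=` argument to `tf.feature_column.numeric_column` so that the scaling is applied inside the feature column itself.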

In [None]:
# Rebuild the min-max scaled frame: for each numeric feature and the label,
# subtract the column minimum and divide by the column range (max - min).
car_data_minmax = pd.DataFrame()
for feature_name in [*numeric_feature_names, LABEL]:
    raw_column = car_data[feature_name]
    car_data_minmax[feature_name] = (raw_column - raw_column.min()) / (raw_column.max() - raw_column.min())

In [None]:
def a(x):
    """Return ``x`` minus 10."""
    return x - 10

In [None]:
# Feature frame -> {column name: array of that column's values}.
X_train_dict = {name: np.array(value) for name, value in X_train.items()}
# y_train is a Series: .items() on it yields (row index, scalar) pairs, which
# would produce one dict entry per row. Wrap the values under the Series'
# own name instead.
y_train_dict = {y_train.name: y_train.values}

In [None]:
# Two ways of getting one column's mean: from the frame-wide means, and from
# the column directly. numeric_only=True skips the text columns, which cannot
# be averaged. feature_name is whatever the most recently run
# `for feature_name in ...` loop left behind.
car_data.mean(numeric_only=True)[feature_name], car_data[feature_name].mean()

In [None]:
def _min_max_normalizer(lower, upper):
    """Return a function that rescales values from [lower, upper] to [0, 1].

    The bounds are captured when this factory is called, so each feature
    column keeps its own bounds. A lambda written inline in the list
    comprehension would look up the loop variable only when invoked, by which
    time it refers to the last feature.
    """
    def normalize(val):
        return (val - lower) / (upper - lower)

    return normalize


# Per-feature bounds, computed once over the numeric feature columns only.
feature_min = car_data[numeric_feature_names].min()
feature_max = car_data[numeric_feature_names].max()

model_feature_columns = [
    tf.feature_column.numeric_column(
        feature_name,
        normalizer_fn=_min_max_normalizer(float(feature_min[feature_name]),
                                          float(feature_max[feature_name])))
    for feature_name in numeric_feature_names
]
# Build the layer from the normalizing columns defined just above.
feature_layer = tf.keras.layers.DenseFeatures(model_feature_columns)

In [None]:
# Experiment: a single numeric column for 'price' with a fixed toy normalizer,
# (val - 4) / (10 - 5).
model_feature_columns = [
    tf.feature_column.numeric_column(feature_name,
                                     normalizer_fn=lambda val: (val - 4) / (10 - 5))
    for feature_name in ['price']
]
# NOTE(review): the layer is built from feature_columns, not from the
# model_feature_columns list defined just above, so the 'price' column and its
# normalizer are not part of this layer. This looks unintended, but a later
# cell calls feature_layer on the feature dict - confirm before changing.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
# Display the current label dict.
y_train_dict

In [None]:
# One-entry dict: the label Series' name mapped to its values array.
y_train_dict = {y_train.name:y_train.values}
y_train_dict

In [None]:
# Apply the feature layer to the label dict and print the result.
# NOTE(review): feature_layer was last built from feature_columns, which may
# not include a column for the label - confirm this call can find its inputs.
print(feature_layer(y_train_dict))

In [None]:
# Rebuild the feature dict from the raw training frame, column by column,
# then print the tensor the feature layer produces from it.
X_train_dict = {column: np.array(X_train[column]) for column in X_train.columns}
print(feature_layer(X_train_dict))

In [None]:
# Display the feature dict (column name -> array of values).
X_train_dict

In [None]:
# Reset to plain (un-normalized) numeric feature columns, one per model input;
# the label is deliberately left out of the feature list.
feature_columns = []
for feature_name in numeric_feature_names:
    feature_columns += [tf.feature_column.numeric_column(feature_name)]

# Layer that assembles those columns from a dict of feature arrays.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)