### 2021_11_02_3

### 딥러닝 모델 구현

In [7]:
import tensorflow as tf
import keras

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Report the version of every core library, one "<label> version : <x.y.z>"
# line per library, so the environment of this run is recorded.
for lib_label, lib_module in (
    ("tf", tf),
    ("keras", keras),
    ("numpy", np),
    ("matplotlib", matplotlib),
    ("pandas", pd),
):
    print(f"{lib_label} version : {lib_module.__version__}")

tf version : 2.6.1
keras version : 2.6.0
numpy version : 1.19.5
matplotlib version : 3.4.3
pandas version : 1.3.2


### 데이터 셋 불러오기

In [3]:
# Two CSV files are loaded:
#   train - labelled rows used as the input data for fitting the model
#   test  - new rows used for prediction (evaluation)
# parse_dates makes pandas read the 'datetime' column as a datetime type
# instead of plain text.
csv_options = {"parse_dates": ['datetime']}

train = pd.read_csv("../CSV/bike_mod_train.csv", **csv_options)
test = pd.read_csv("../CSV/bike_mod_test.csv", **csv_options)

### 데이터 탐색

In [4]:
# Display the column labels of the labelled training frame.
train.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek'],
      dtype='object')

In [5]:
# Display the column labels of the test frame, for comparison with train.
test.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek',
       'hour', 'minute', 'second'],
      dtype='object')

In [6]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare: wrapping it in print() would append a stray "None"
# line after each report.
train.info()
print()  # blank line separating the two reports
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
 12  year        10886 non-null  int64         
 13  month       10886 non-null  int64         
 14  day         10886 non-null  int64         
 15  hour        10886 non-null  int64         
 16  minute      10886 non-

### 모델을 위한 데이터 선택
* X : hour, temp(시간, 온도)
* y : count - 시간대별 자전거 렌탈 대수

In [8]:
# Column names: model inputs (hour of day, temperature) and the target.
input_col = ["hour", "temp"]
labeled_col = ["count"]

# Inputs and target come from the labelled frame; X_val holds the same input
# columns taken from the test frame and is what the submission is predicted on.
X, y = train[input_col], train[labeled_col]
X_val = test[input_col]

# Split the labelled rows into a fitting part and a held-out part.
# No test_size is given, so train_test_split's default proportion applies;
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

print(f"X_train shape : {X_train.shape}")
print(f"X_test shape : {X_test.shape}")

X_train shape : (8164, 2)
X_test shape : (2722, 2)


### 딥러닝 모델 만들기
* 케라스 라이브러리의 Sequential 클래스를 사용하면 딥러닝의 구조를 한 층씩 쉽게 쌓아 올릴 수 있다.
* Sequential() 객체를 생성한 후, 신경망의 층을 쌓기 위해 model.add() 함수를 사용한다.
* input_dim - 입력층 노드의 수 (입력 변수의 개수)
* activation - 활성화 함수 선언 (relu, sigmoid)
* Dense()를 이용하여 각 층의 세부 내용(노드 수, 활성화 함수 등)을 설정해 준다.

In [9]:
from keras.models import Sequential
from keras.layers import Dense

In [10]:
# Baseline network: one hidden layer of 30 ReLU units fed by the two input
# columns, followed by a single output unit for the predicted count.
model = Sequential(
    [
        Dense(30, input_dim=2, activation="relu"),
        Dense(1),
    ]
)

2021-11-02 11:44:38.832395: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
# Print the layer-by-layer structure and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                90        
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 31        
Total params: 121
Trainable params: 121
Non-trainable params: 0
_________________________________________________________________


### 미니배치의 이해
* 데이터를 하나씩 학습시키는 것보다 여러 개를 한꺼번에 학습시키는 쪽이 효과가 좋다.
* 하지만 전체 데이터를 한꺼번에 학습시키려면 많은 메모리와 높은 컴퓨터 성능이 필요하므로, 일반적으로 데이터를 적당한 크기로 잘라서 학습시킨다.
    * 이렇게 잘라 낸 데이터 묶음을 미니배치라고 한다. (아래 코드의 batch_size 가 미니배치의 크기이다.)

### 딥러닝 실행

In [12]:
# Configure training: mean squared error as the loss, RMSprop as the optimizer.
model.compile(loss="mean_squared_error", optimizer="rmsprop")

# Train for 20 epochs in mini-batches of 10 rows.
# - validation_data: the held-out split is scored after every epoch, so the
#   log shows val_loss next to the training loss.
# - The returned History object is kept; history.history holds the per-epoch
#   losses. Assigning it also stops the cell from echoing an opaque
#   "<keras.callbacks.History at 0x...>" repr as its output.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=10,
    validation_data=(X_test, y_test),
)

2021-11-02 11:47:12.542054: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f80ee12eaf0>

### 평가 확인

In [14]:
# Compute the compiled loss (mean squared error) on the held-out split.
model.evaluate(X_test, y_test)



19340.134765625

In [15]:
# Predict rental counts for the test-file rows and write a submission CSV.
pred = model.predict(X_val)

# Start from the sample submission and overwrite its "count" column.
sub = pd.read_csv("../CSV/bike-sharing-demand/sampleSubmission.csv")
sub["count"] = pred

# Replace negative predictions with 0.
sub["count"] = sub["count"].clip(lower=0)

sub.to_csv("../CSV/bike-sharing-demand/10_sub.csv", index=False)

In [16]:
# Display the training columns again -- presumably to choose extra input
# variables for the improved models that follow (TODO confirm intent).
train.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek'],
      dtype='object')

### 모델 개선
* 01 변수 추가, 성능 개선 확인
* 02 은닉층 추가
* 03 노드수 늘리기

### 은닉층 1개 추가

In [18]:
# Seed TensorFlow's global random generator before the layers are created so
# that weight initialisation and batch shuffling repeat from run to run
# (0 matches the random_state used for the train/holdout split).
tf.random.set_seed(0)

# Two hidden ReLU layers of 16 units each, then one output unit with no
# activation specified. The input width is read from the training frame
# instead of being hard-coded, so it cannot drift from the selected columns.
model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1], activation="relu"))
model.add(Dense(16, activation="relu"))
model.add(Dense(1))

model.compile(loss="mean_squared_error", optimizer="rmsprop")

# Keep the History (per-epoch losses in history.history) and score the
# held-out split after every epoch so val_loss appears in the training log.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=10,
    validation_data=(X_test, y_test),
)

# predict() returns one column per output unit; flatten it to 1-D so the
# assignment to a single DataFrame column is explicit.
pred = model.predict(X_val)
sub = pd.read_csv("../CSV/bike-sharing-demand/sampleSubmission.csv")
sub["count"] = pred.ravel()

# Replace negative predictions with 0.
sub["count"] = sub["count"].clip(lower=0)

sub.to_csv("../CSV/bike-sharing-demand/11_sub.csv", index=False)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### 변수 1개 추가, 은닉층 3개 추가
* kaggle score = 1.03439

In [25]:
# Inputs now include the "atemp" column in addition to hour and temp.
input_col = ["hour", "temp", "atemp"]
labeled_col = ["count"]

X = train[input_col]
y = train[labeled_col]

# Same input columns taken from the test frame; used for the submission.
X_val = test[input_col]

# Fixed random_state keeps the train/holdout split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

print("X_train shape :", X_train.shape)
print("X_test shape :", X_test.shape)

# Seed TensorFlow's global random generator before the layers are created so
# that weight initialisation and batch shuffling repeat from run to run;
# without it the score noted for this cell cannot be reproduced.
tf.random.set_seed(0)

# Five hidden ReLU layers (16-32-64-32-16 units) and one output unit.
# input_dim follows the number of selected columns instead of a literal 3,
# so changing input_col cannot leave the model with the wrong input width.
model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1], activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(16, activation="relu"))
model.add(Dense(1))

model.compile(loss="mean_squared_error", optimizer="rmsprop")

# Keep the History (per-epoch losses in history.history) and score the
# held-out split after every epoch so val_loss appears in the training log.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=10,
    validation_data=(X_test, y_test),
)

# predict() returns one column per output unit; flatten it to 1-D so the
# assignment to a single DataFrame column is explicit.
pred = model.predict(X_val)
sub = pd.read_csv("../CSV/bike-sharing-demand/sampleSubmission.csv")
sub["count"] = pred.ravel()

# Replace negative predictions with 0.
sub["count"] = sub["count"].clip(lower=0)

sub.to_csv("../CSV/bike-sharing-demand/12_sub.csv", index=False)

X_train shape : (8164, 3)
X_test shape : (2722, 3)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### 변수 1개 추가, 은닉층 3개 추가, 노드수 추가, epochs = 100
* kaggle score = 0.75426

In [30]:
# Inputs: hour, temp and atemp; target: count.
input_col = ["hour", "temp", "atemp"]
labeled_col = ["count"]

X = train[input_col]
y = train[labeled_col]

# Same input columns taken from the test frame; used for the submission.
X_val = test[input_col]

# Fixed random_state keeps the train/holdout split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

print("X_train shape :", X_train.shape)
print("X_test shape :", X_test.shape)

# Seed TensorFlow's global random generator before the layers are created so
# that weight initialisation and batch shuffling repeat from run to run;
# without it the score noted for this cell cannot be reproduced.
tf.random.set_seed(0)

# Five hidden ReLU layers (20-32-64-32-16 units) and one output unit.
# input_dim follows the number of selected columns instead of a literal 3,
# so changing input_col cannot leave the model with the wrong input width.
model = Sequential()
model.add(Dense(20, input_dim=X_train.shape[1], activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(16, activation="relu"))
model.add(Dense(1))

model.compile(loss="mean_squared_error", optimizer="rmsprop")

# 100 epochs is a long run for this data size, so the held-out split is scored
# after every epoch (val_loss in the log) to make over-fitting visible, and
# the History object is kept so the loss curves can be inspected afterwards.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=10,
    validation_data=(X_test, y_test),
)

# predict() returns one column per output unit; flatten it to 1-D so the
# assignment to a single DataFrame column is explicit.
pred = model.predict(X_val)
sub = pd.read_csv("../CSV/bike-sharing-demand/sampleSubmission.csv")
sub["count"] = pred.ravel()

# Replace negative predictions with 0.
sub["count"] = sub["count"].clip(lower=0)

sub.to_csv("../CSV/bike-sharing-demand/13_sub.csv", index=False)

X_train shape : (8164, 3)
X_test shape : (2722, 3)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/

Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
