### 0. 필요한 Library Import

In [None]:
!pip install pydot graphviz

In [2]:
from datetime import datetime
import pandas as pd
import tensorflow as tf

2024-02-13 07:38:19.486662: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-13 07:38:25.983104: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2024-02-13 07:38:25.984693: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

### 1. BigQuery Connector를 활용한 Pandas DataFrame 생성

In [3]:
# Create the BigQuery client. Client() is called without arguments, so the
# project and credentials are taken from the runtime environment.
# NOTE(review): QueryJobConfig is imported but not used in this cell.
from google.cloud.bigquery import Client, QueryJobConfig
client = Client()

# The query starts from cycle_stations, LEFT JOINs the hires that begin at each
# station, then LEFT JOINs the end station of each hire. The result is packed
# into three STRUCT columns: `starting`, `ending` and `bike`.
# NOTE(review): LIMIT is applied without ORDER BY, so which 100,000 rows are
# returned is not guaranteed; the displayed result suggests they can cluster on
# a single start station - confirm the sample is representative.
query = """WITH staging AS (
    SELECT
        STRUCT(
            start_stn.name,
            ST_GEOGPOINT(start_stn.longitude, start_stn.latitude) AS POINT,
            start_stn.docks_count,
            start_stn.install_date
        ) AS starting,
        STRUCT(
            end_stn.name,
            ST_GEOGPOINT(end_stn.longitude, end_stn.latitude) AS point,
            end_stn.docks_count,
            end_stn.install_date
        ) AS ending,
        STRUCT(
            rental_id,
            bike_id,
            duration, --seconds
            ST_DISTANCE(
                ST_GEOGPOINT(start_stn.longitude, start_stn.latitude),
                ST_GEOGPOINT(end_stn.longitude, end_stn.latitude)
            ) AS distance, --meters
            start_date,
            end_date
        ) AS bike
        FROM `bigquery-public-data.london_bicycles.cycle_stations` AS start_stn
        LEFT JOIN `bigquery-public-data.london_bicycles.cycle_hire` as b 
        ON start_stn.id = b.start_station_id
        LEFT JOIN `bigquery-public-data.london_bicycles.cycle_stations` AS end_stn
        ON end_stn.id = b.end_station_id
        LIMIT 100000)

SELECT * from STAGING"""
# Submit the query and load the full result set into a pandas DataFrame.
job = client.query(query)
df = job.to_dataframe()

In [4]:
# Display the query result; each of the three columns holds one STRUCT per row
# as a Python dict.
df

Unnamed: 0,starting,ending,bike
0,"{'name': 'Westbourne Grove, Bayswater', 'POINT...","{'name': 'Park Lane , Hyde Park', 'point': 'PO...","{'rental_id': 74377596, 'bike_id': 14110, 'dur..."
1,"{'name': 'Westbourne Grove, Bayswater', 'POINT...","{'name': 'George Street, Marylebone', 'point':...","{'rental_id': 86024974, 'bike_id': 15856, 'dur..."
2,"{'name': 'Westbourne Grove, Bayswater', 'POINT...","{'name': 'Collingham Gardens, Earl's Court', '...","{'rental_id': 96741475, 'bike_id': 9871, 'dura..."
3,"{'name': 'Westbourne Grove, Bayswater', 'POINT...","{'name': 'Limerston Street, West Chelsea', 'po...","{'rental_id': 52067778, 'bike_id': 1752, 'dura..."
4,"{'name': 'Westbourne Grove, Bayswater', 'POINT...","{'name': 'Seville Street, Knightsbridge', 'poi...","{'rental_id': 117308140, 'bike_id': 2765, 'dur..."
...,...,...,...
99995,"{'name': 'Westbourne Grove, Bayswater', 'POINT...","{'name': 'Albert Gate, Hyde Park', 'point': 'P...","{'rental_id': 59561760, 'bike_id': 2958, 'dura..."
99996,"{'name': 'Westbourne Grove, Bayswater', 'POINT...","{'name': 'Albert Gate, Hyde Park', 'point': 'P...","{'rental_id': 84596956, 'bike_id': 242, 'durat..."
99997,"{'name': 'Westbourne Grove, Bayswater', 'POINT...","{'name': 'Albert Gate, Hyde Park', 'point': 'P...","{'rental_id': 69875465, 'bike_id': 14638, 'dur..."
99998,"{'name': 'Westbourne Grove, Bayswater', 'POINT...","{'name': 'Albert Gate, Hyde Park', 'point': 'P...","{'rental_id': 112339479, 'bike_id': 73, 'durat..."


### 2. ML 학습에 활용할 데이터만 정제

In [5]:
# Every entry of the `bike` column is a dict (one BigQuery STRUCT per row);
# pull out only the three fields used for modelling.
values = df['bike'].values
duration = [row['duration'] for row in values]
distance = [row['distance'] for row in values]
dates = [row['start_date'] for row in values]

# Assemble the working frame and discard rows with any missing field.
data = pd.DataFrame(
    data={'duration': duration, 'distance': distance, 'start_date': dates}
).dropna()

In [8]:
# Check the remaining row count, null counts and dtypes after dropna().
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98119 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   duration    98119 non-null  float64            
 1   distance    98119 non-null  float64            
 2   start_date  98119 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(2)
memory usage: 3.0 MB


### 3. 데이터 가공 
1. start_date(datetime64) -> weekday 추출, hour 추출, 기존 컬럼 제거
2. duration -> minute 단위로 변환 

In [9]:
# Derive the calendar features with the vectorized `.dt` accessor rather than a
# per-row Python `apply`. The explicit int64 cast keeps the dtype the row-wise
# version produced, whatever integer width `.dt` returns in this pandas version.
data['weekday'] = data['start_date'].dt.weekday.astype('int64')
data['hour'] = data['start_date'].dt.hour.astype('int64')
data = data.drop(columns=['start_date'])

# Convert the trip duration from seconds to minutes (plain vectorized division).
data['duration'] = data['duration'] / 60

In [10]:
# Preview the engineered features (duration is now in minutes).
data.head()

Unnamed: 0,duration,distance,weekday,hour
0,64.0,2362.138868,5,13
1,47.0,2512.004888,4,15
2,30.0,2670.061603,4,10
3,30.0,3387.911177,4,11
4,30.0,2629.830365,2,19


### 4. TensorFlow Model 생성
1. dataset을 train, test set으로 나누기
2. tf.data API를 활용해서, pandas dataframe을 tf.data.Dataset으로 변환

In [11]:
def df_to_dataset(dataframe, label, shuffle=True, batch_size=32, seed=None):
  """Convert a pandas DataFrame into a batched ``tf.data.Dataset``.

  Args:
    dataframe: Source frame. It is copied first, so the caller's frame is
      left unchanged.
    label: Name of the column to split off as the prediction target.
    shuffle: If True, shuffle the rows using a buffer as large as the frame.
    batch_size: Number of rows per batch.
    seed: Optional seed forwarded to ``Dataset.shuffle`` for a reproducible
      order. The default (None) keeps the previous unseeded behaviour.

  Returns:
    A dataset yielding ``(features, labels)`` batches, where ``features`` is a
    dict keyed by the remaining column names.
  """
  dataframe = dataframe.copy()
  labels = dataframe.pop(label)
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe), seed=seed)
  ds = ds.batch(batch_size)
  # prefetch() counts elements of the already-batched dataset, so passing
  # batch_size would buffer that many whole batches. Let tf.data tune it.
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds

In [13]:
# The rows come straight from a LIMIT query and are not randomly ordered (the
# displayed frame shows them clustered by station), so slicing off the tail
# would validate on a different kind of trip than we train on. Shuffle once
# with a fixed seed, then take an 80/20 split.
data_shuffled = data.sample(frac=1, random_state=42)

train_size = int(len(data_shuffled) * .8)
train_data = data_shuffled[:train_size]
val_data = data_shuffled[train_size:]

# Sanity check: the two fractions should be ~0.8 and ~0.2.
print(len(train_data)/len(data))
print(len(val_data)/len(data))

0.7999979616588021
0.20000203834119792


In [14]:
# Shuffle only the training data. Validation loss does not depend on row order,
# so shuffling the validation set would just spend time and memory on a
# full-length shuffle buffer.
train_dataset = df_to_dataset(train_data, 'duration')
validation_dataset = df_to_dataset(val_data, 'duration', shuffle=False)

2024-02-13 07:49:01.072289: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2024-02-13 07:49:01.073418: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2024-02-13 07:49:01.073485: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (instance-20240213-161422): /proc/driver/nvidia/version does not exist
2024-02-13 07:49:01.080866: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow 

3. model layer를 생성
 - Normalization layer : distance
 - CategoryEncoding layer, IntegerLookup layer : weekday, hour
 - Input layer를 생성

In [15]:
def get_normalization_layer(name, dataset):
  """Return a Normalization layer adapted to one feature of `dataset`.

  Args:
    name: Key of the feature inside each element's feature dict.
    dataset: Dataset yielding `(features, label)` pairs.

  Returns:
    A `tf.keras.layers.Normalization` layer whose statistics were learned
    from the selected feature.
  """
  # Drop the labels and every other feature so adapt() sees only `name`.
  values_only = dataset.map(lambda features, _: features[name])

  # axis=None makes the layer learn one scalar mean/variance for the feature.
  layer = tf.keras.layers.Normalization(axis=None)
  layer.adapt(values_only)
  return layer

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Build a lookup + category-encoding pipeline for one categorical feature.

  Args:
    name: Key of the feature inside each element's feature dict.
    dataset: Dataset yielding `(features, label)` pairs; used to learn the
      vocabulary.
    dtype: Feature dtype. 'string' selects a StringLookup layer; any other
      value (e.g. 'int64') selects an IntegerLookup layer, which is what every
      caller got before this argument was honored.
    max_tokens: Optional cap on the vocabulary size, forwarded to the lookup
      layer.

  Returns:
    A callable that maps a feature tensor to its encoded representation by
    applying the lookup layer followed by the CategoryEncoding layer.
  """
  # Pick the lookup layer that matches the feature's dtype.
  if dtype == 'string':
    index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
  else:
    index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a Dataset that only yields our feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices as category vectors sized to the vocabulary.
  encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # The lambda captures both layers so they can be applied to a Keras input
  # and become part of the functional model later.
  return lambda feature: encoder(index(feature))


In [16]:
# Create a Keras input layer for each feature
numeric_col = tf.keras.Input(shape=(1,), name='distance')
hour_col = tf.keras.Input(shape=(1,), name='hour', dtype='int64')
weekday_col = tf.keras.Input(shape=(1,), name='weekday', dtype='int64')

4. Layer 결합

In [17]:
# Numeric feature: scale 'distance' with statistics learned from the training set.
normalization_layer = get_normalization_layer('distance', train_dataset)
encoded_numeric_col = normalization_layer(numeric_col)

# Categorical feature 'hour': vocabulary lookup followed by category encoding.
encoding_layer = get_category_encoding_layer('hour', train_dataset, dtype='int64')
encoded_hour_col = encoding_layer(hour_col)

# Categorical feature 'weekday': same treatment as 'hour'.
encoding_layer = get_category_encoding_layer('weekday', train_dataset, dtype='int64')
encoded_weekday_col = encoding_layer(weekday_col)

# Collect the raw inputs and their encoded counterparts in matching order.
all_inputs = [numeric_col, hour_col, weekday_col]
encoded_features = [encoded_numeric_col, encoded_hour_col, encoded_weekday_col]

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [18]:
# Join the encoded features into one vector, then add a single hidden layer
# and a one-unit linear output for the regression target.
all_features = tf.keras.layers.Concatenate()(encoded_features)
x = tf.keras.layers.Dense(units=64, activation="relu")(all_features)
output = tf.keras.layers.Dense(units=1)(x)
model = tf.keras.Model(inputs=all_inputs, outputs=output)

In [19]:
# Regression setup: Adam optimizer with a 0.001 learning rate, trained on
# mean squared logarithmic error.
model.compile(
    loss='mean_squared_logarithmic_error',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

In [25]:
# Draw the layer graph left-to-right with tensor shapes.
# Needs pydot and Graphviz installed; without them Keras only prints an
# installation hint instead of a diagram.
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


### 5. TensorFlow Model 학습

In [26]:
# Keep the returned History so the per-epoch losses can be inspected or
# plotted afterwards, instead of leaving only the object's repr as cell output.
history = model.fit(train_dataset, validation_data=validation_dataset, epochs=5)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fdc38ff65c0>