In [1]:
from copy import deepcopy

import numpy as np
import pandas as pd

import datetime

import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.losses import MeanSquaredError

In [2]:
CONFIGS = {
    'data_path': '../data/',
    'model_path': '../model/',
    'model_name': 'multi_task_learning',
    'model_type': 'cnn1d',
    
    'valid_start_date_time': '2020-08-11 00',
    'test_start_index': '2020-08-18 00',
    
    'batch_size': 64,
    'learning_rate': 1e-4,
    'epochs': 100,
    'es_patience': 10,
    
    'window_size': 7*24,
    'target_length': 3,
}

In [3]:
train_origin = pd.read_csv(CONFIGS['data_path']+'train.csv', encoding='cp949')

In [4]:
data = deepcopy(train_origin)

data.columns = [
    'num', 'date_time', 'target', 'temp', 'wind',
    'humid', 'rain', 'sun', 'non_elec_eq', 'sunlight_eq'
]

data['num'] -= 1

print(f'data.shape: {data.shape}')

data.shape: (122400, 10)


In [5]:
def mk_time_data(data):
    
    new_data = data.copy()

    new_data['date_time'] = data['date_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H'))
    
    new_data['time_stamp'] = new_data['date_time'].apply(lambda x: x.timestamp())
    
    new_data['year'] = new_data['date_time'].apply(lambda x: x.year)
    new_data['month'] = new_data['date_time'].apply(lambda x: x.month)
    new_data['day'] = new_data['date_time'].apply(lambda x: x.day)
    
    new_data['hour'] = new_data['date_time'].apply(lambda x: x.hour)
    new_data['cos_hour'] = np.cos(2*np.pi*(new_data['hour']/24))
    new_data['sin_hour'] = np.sin(2*np.pi*(new_data['hour']/24))

    new_data['weekday'] = new_data['date_time'].apply(lambda x: x.weekday())
    new_data['cos_weekday'] = np.cos(2*np.pi*(new_data['weekday']/7))
    new_data['sin_weekday'] = np.sin(2*np.pi*(new_data['weekday']/7))
    
    new_data['is_holiday'] = 0
    new_data.loc[(new_data['weekday'] == 5) | (new_data['weekday'] == 6), 'is_holiday'] = 1
    new_data.loc[(new_data['month'] == 8) & (new_data['day'] == 17), 'is_holiday'] = 1
    
    return new_data

In [6]:
new_data = mk_time_data(data)

In [7]:
def mk_mean_std_dict(data, scaling_cols):
    mean_std_dict = {}
    for num in range(60):
        building = data.query(f'num == {num}')
        mean_std_dict[num] = {
            col: {
                'mean': building[col].mean(),
                'std': building[col].std()
            } for col in scaling_cols
        }
    return mean_std_dict

In [8]:
scaling_cols = [
    'temp', 'wind', 'humid', 'rain', 'sun', 'time_stamp', 'target'
]

mean_std_dict = mk_mean_std_dict(
    new_data[new_data['date_time'] < CONFIGS['valid_start_date_time']],
    scaling_cols
)
CONFIGS['mean_std_dict'] = mean_std_dict

In [9]:
def standard_scaling(data, scaling_cols, mean_std_dict=None):
    if not mean_std_dict:
        mean_std_dict = mk_mean_std_dict(data, scaling_cols)
        
    new_data = data.copy()
    for num in range(60):
        for col in scaling_cols:
            new_data.loc[new_data['num']==num, col] -= mean_std_dict[num][col]['mean']
            new_data.loc[new_data['num']==num, col] /= mean_std_dict[num][col]['std']
    return new_data

In [10]:
new_data = standard_scaling(new_data, scaling_cols, mean_std_dict)

In [23]:
time_series_cols = [
    'temp', 'wind', 'humid', 'rain', 'sun', 'time_stamp',
    'cos_hour', 'sin_hour', 'cos_weekday', 'sin_weekday',
    'is_holiday', 'target',
]
building_num_cols = ['num']
target_time_info_cols = [
    'temp', 'wind', 'humid', 'rain', 'sun', 'time_stamp',
    'cos_hour', 'sin_hour', 'cos_weekday', 'sin_weekday',
    'is_holiday',
]
target_cols = ['target']

CONFIGS['time_series_cols'] = time_series_cols
CONFIGS['building_num_cols'] = building_num_cols
CONFIGS['target_time_info_cols'] = target_time_info_cols
CONFIGS['target_cols'] = target_cols

input_cols = list(set(time_series_cols+building_num_cols+target_time_info_cols+target_cols))

In [17]:
data = new_data.copy()

In [24]:
data = data[input_cols]

building_length = data.query('num == 0').shape[0]
ds = Dataset.from_tensor_slices(data).batch(building_length)

In [25]:
iter(ds).next()

<tf.Tensor: shape=(2040, 13), dtype=float64, numpy=
array([[-0.54498631, -1.73052664,  0.11081364, ...,  0.        ,
         0.        , -2.02148701],
       [-0.54498631, -1.72849431,  0.46409045, ...,  0.25881905,
         0.        , -1.99078964],
       [-0.54498631, -1.72646198,  0.72904805, ...,  0.5       ,
         0.        , -2.05218439],
       ...,
       [-0.54498631,  2.40932393, -1.12565516, ..., -0.70710678,
         0.        ,  1.26313214],
       [-0.54498631,  2.41135626,  0.02249444, ..., -0.5       ,
         0.        ,  1.26313214],
       [-0.54498631,  2.41338859, -0.06582476, ..., -0.25881905,
         0.        ,  1.20173739]])>

In [30]:
CONFIGS['valid_start_date_time']-datetime.timedelta(hours=CONFIGS['window_size'])

datetime.datetime(2020, 8, 4, 0, 0)