In [115]:
import pandas as pd
import numpy as np
import random
import gc

In [135]:
# Load the raw, space-separated tables.
time_data = pd.read_csv('data/original/time_data.txt', sep=' ')
nontime_data = pd.read_csv('data/original/nontime_data.txt', sep=' ')

# Keep only the columns used below. The explicit .copy() makes this an
# independent frame, so the column assignments that follow are guaranteed to
# land on `time_data` itself (no SettingWithCopyWarning, and correct under
# pandas Copy-on-Write).
time_data = time_data[['flow_pop', 'HDONG_CD', 'time',
                       'card_use', 'holiday', 'day_corona', 'ondo', 'subdo',
                       'rain_snow', 'STD_YMD']].copy()

# Encode the time-of-day label as an integer code with a single column
# assignment instead of chained indexing (`df.time[mask] = value`), which may
# silently write to a temporary. Labels that are not in the mapping are passed
# through unchanged, exactly as the per-label assignments did.
TIME_CODES = {'morning': 0, 'lunch': 1, 'evening': 2}
time_data['time'] = time_data['time'].map(lambda label: TIME_CODES.get(label, label))

# Parse the date column so the .dt accessors are available.
time_data['STD_YMD'] = pd.to_datetime(time_data['STD_YMD'])

# Calendar position of each record.
time_data['dayofyear'] = time_data['STD_YMD'].dt.dayofyear
time_data['weekday'] = time_data['STD_YMD'].dt.weekday

# Cyclical (sin/cos) encodings: day-of-year on a 365-day circle, weekday on a
# 7-day circle (weekday runs Monday .. Sunday).
time_data['dayofyear_sin'] = np.sin(2 * np.pi * time_data['dayofyear'] / 365)
time_data['dayofyear_cos'] = np.cos(2 * np.pi * time_data['dayofyear'] / 365)
time_data['weekday_sin'] = np.sin(2 * np.pi * time_data['weekday'] / 7)
time_data['weekday_cos'] = np.cos(2 * np.pi * time_data['weekday'] / 7)

# Final feature set: the raw date and the integer calendar columns are dropped,
# only their cyclical encodings are kept. flow_pop stays at column index 0.
time_data = time_data[['flow_pop', 'HDONG_CD', 'time',
                       'card_use', 'holiday', 'day_corona', 'ondo', 'subdo',
                       'rain_snow', 'dayofyear_sin', 'dayofyear_cos', 'weekday_sin', 'weekday_cos']]

# Dimensions of the dense tensor built below.
LOC_SIZE = 69
TIME_SIZE = 3
DATE_SIZE = 241
FEATURE_SIZE = 13

# Table -> 4-D array with axes (location, time slot, date, feature).
# NOTE(review): a plain reshape only yields that layout if the rows are already
# ordered by location, then time slot, then date - confirm against the source file.
time_data = np.array(time_data).reshape(LOC_SIZE, TIME_SIZE, DATE_SIZE, FEATURE_SIZE)


# Split the flow series into flow_trend + flow_cycle channels.
def split_trend_cycle(flow_pop):
    """Return the flow series as two stacked channels, ordered (trend, cycle).

    The input is reshaped to (LOC_SIZE, TIME_SIZE, DATE_SIZE, 1) and the two
    channels are joined along the last axis, so the result has shape
    (LOC_SIZE, TIME_SIZE, DATE_SIZE, 2).

    No decomposition is performed yet: both the trend and the cycle channel
    are the unmodified input series.
    """
    series = flow_pop.reshape(LOC_SIZE, TIME_SIZE, DATE_SIZE, 1)
    # Placeholder channels - each one is simply the raw series.
    trend_part, cycle_part = series, series
    return np.concatenate((trend_part, cycle_part), axis=-1)

# Allocate a float buffer with room for the existing features plus two
# additional channels at the end of the feature axis.
augmented = np.empty((LOC_SIZE, TIME_SIZE, DATE_SIZE, FEATURE_SIZE + 2))
augmented[..., :-2] = time_data
# split_trend_cycle returns its channels in (trend, cycle) order;
# flow_pop is the feature at index 0.
augmented[..., -2:] = split_trend_cycle(time_data[..., 0])

# Rebind the working name to the widened array and release the temporary.
time_data = augmented
del augmented
gc.collect()

22405

In [136]:
# Inspect the tensor shape after the two extra channels were appended;
# the previous cell builds it as (LOC_SIZE, TIME_SIZE, DATE_SIZE, FEATURE_SIZE + 2).
time_data.shape

(69, 3, 241, 15)

In [151]:
# Shuffle the location indices with a fixed seed so the location partition
# below is reproducible. LOC_SIZE is used instead of repeating the literal 69.
loc_index = list(range(LOC_SIZE))
random.seed(1015)
random.shuffle(loc_index)

# Window size. Evaluation slices over a later period start WINDOW_SIZE days
# before that period begins (presumably as look-back context - TODO confirm).
WINDOW_SIZE = 20

# Split boundaries, named so they are not confused with WINDOW_SIZE.
N_TRAIN_LOC = 55                    # shuffled locations [0, 55)  -> train
N_SEEN_LOC = 62                     # shuffled locations [55, 62) -> valid, [62, end) -> test
TRAIN_END = 201                     # days [0, 201) -> train period
TEST_DAYS = 20                      # the last 20 days -> test period
VALID_END = DATE_SIZE - TEST_DAYS   # days [201, 221) -> valid period

train_locs = loc_index[:N_TRAIN_LOC]
valid_locs = loc_index[N_TRAIN_LOC:N_SEEN_LOC]
seen_locs = loc_index[:N_SEEN_LOC]
test_locs = loc_index[N_SEEN_LOC:]

# Train locations over the train period.
train_time = time_data[train_locs, :, :TRAIN_END, :]

# Train locations over the valid period (plus the preceding window).
valid_time_1 = time_data[train_locs, :, TRAIN_END - WINDOW_SIZE:VALID_END, :]
# Valid locations over the train + valid period.
valid_time_2 = time_data[valid_locs, :, :VALID_END, :]

# Train and valid locations over the test period (plus the preceding window).
test_time_1 = time_data[seen_locs, :, VALID_END - WINDOW_SIZE:, :]
# Test locations over the whole train + valid + test period.
test_time_2 = time_data[test_locs, :, :, :]

print('\n'.join([f'train_time : {train_time.shape}',
                 f'valid_time_1 : {valid_time_1.shape}',
                 f'valid_time_2 : {valid_time_2.shape}',
                 f'test_time_1  : {test_time_1.shape}',
                 f'test_time_2  : {test_time_2.shape}']))

train_time : (55, 3, 201, 15)
valid_time_1 : (55, 3, 40, 15)
valid_time_2 : (7, 3, 221, 15)
test_time_1  : (62, 3, 40, 15)
test_time_2  : (7, 3, 241, 15)


In [152]:
# Column labels for the feature axis, in storage order: the 13 preprocessed
# features followed by the two appended channels.
FEATURE_NAMES = ['flow_pop', 'HDONG_CD', 'time',
                 'card_use', 'holiday', 'day_corona', 'ondo', 'subdo',
                 'rain_snow', 'dayofyear_sin', 'dayofyear_cos', 'weekday_sin', 'weekday_cos',
                 'flow_trend', 'flow_cycle']

# Preview every date for the first location / first time slot.
# The labels are passed as a flat list: wrapping them in a second list makes
# pandas build a one-level MultiIndex, so selecting a column by name would
# return a DataFrame instead of a Series.
pd.DataFrame(time_data[0, 0, :, :], columns=FEATURE_NAMES)

Unnamed: 0,flow_pop,HDONG_CD,time,card_use,holiday,day_corona,ondo,subdo,rain_snow,dayofyear_sin,dayofyear_cos,weekday_sin,weekday_cos,flow_trend,flow_cycle
0,43561.37,2.726067e+09,2.0,1135084.0,0.0,0.0,2.84,39.8,0.0,0.523416,0.852078,-0.433884,-0.900969,43561.37,43561.37
1,42651.66,2.726067e+09,2.0,1339086.0,0.0,0.0,6.96,37.4,0.0,0.538005,0.842942,-0.974928,-0.222521,42651.66,42651.66
2,38005.95,2.726067e+09,2.0,1127973.0,0.0,0.0,6.70,93.8,0.0,0.552435,0.833556,-0.781831,0.623490,38005.95,38005.95
3,34415.38,2.726067e+09,2.0,941383.0,1.0,0.0,5.96,17.4,0.0,0.566702,0.823923,0.000000,1.000000,34415.38,34415.38
4,39604.13,2.726067e+09,2.0,423065.0,1.0,0.0,8.70,21.4,0.0,0.580800,0.814046,0.781831,0.623490,39604.13,39604.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,39267.48,2.726067e+09,2.0,1003500.0,0.0,40.0,25.28,28.0,0.0,0.559589,-0.828770,0.974928,-0.222521,39267.48,39267.48
237,44782.75,2.726067e+09,2.0,861754.0,0.0,79.0,24.66,36.4,0.0,0.545240,-0.838280,0.433884,-0.900969,44782.75,44782.75
238,45447.02,2.726067e+09,2.0,986674.0,0.0,58.0,25.10,36.6,0.0,0.530730,-0.847541,-0.433884,-0.900969,45447.02,45447.02
239,42518.02,2.726067e+09,2.0,1210856.0,0.0,39.0,25.66,39.6,0.0,0.516062,-0.856551,-0.974928,-0.222521,42518.02,42518.02
