In [325]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import StandardScaler
import gc

In [338]:
# load data
time_data = pd.read_csv('data/original/time_data.txt', sep  = ' ') 

# make loc_list(dong code)
loc_list = list(time_data.HDONG_CD.unique())


# select features
time_data = time_data[['flow_pop', 'HDONG_CD', 'time',
                    'card_use', 'holiday', 'day_corona', 'ondo', 'subdo',
                    'rain_snow', 'STD_YMD']]

# change string time to int time
time_data.time[time_data.time == 'morning'] = 0 # morning
time_data.time[time_data.time == 'lunch'] = 1 # lunch
time_data.time[time_data.time == 'evening'] = 2 # evening

# to datetime
time_data.STD_YMD = pd.to_datetime(time_data.STD_YMD)

# make dayofyear weekday
time_data['dayofyear'] = time_data.STD_YMD.dt.dayofyear
time_data['weekday'] = time_data.STD_YMD.dt.weekday
time_data['dayofyear_sin'] = np.sin(2 * np.pi * (time_data['dayofyear'])/365)
time_data['dayofyear_cos'] = np.cos(2 * np.pi * (time_data['dayofyear'])/365)
time_data['weekday_sin'] = np.sin(2 * np.pi * (time_data['weekday'])/7)# 월화수목금토일
time_data['weekday_cos'] = np.cos(2 * np.pi * (time_data['weekday'])/7)

# reselect features
time_data = time_data[['HDONG_CD', 'time','flow_pop',
                    'card_use', 'holiday', 'day_corona', 'ondo', 'subdo',
                    'rain_snow', 'dayofyear_sin', 'dayofyear_cos', 'weekday_sin', 'weekday_cos']]

# set constant
LOC_SIZE = 69
TIME_SIZE = 3
DATE_SIZE = 241
FEATURE_SIZE = 13

# table -> matrix
time_data = np.array(time_data).reshape(LOC_SIZE, TIME_SIZE, DATE_SIZE, FEATURE_SIZE)# 지역, 시간, 날짜, features

In [339]:
# flow_tot -> flow_trend + flow_cycle 분리
def split_trend_cycle(flow_pop):
    flow_pop = flow_pop.reshape(LOC_SIZE, TIME_SIZE, DATE_SIZE, 1)
    trend = flow_pop
    cycle = flow_pop
    return np.concatenate([trend, cycle], axis = 3)

time_data_new = np.empty([LOC_SIZE, TIME_SIZE, DATE_SIZE, FEATURE_SIZE + 2])
time_data_new[:,:,:,:-2] = time_data
# trend, cycle 순서로 resturn
time_data_new[:,:,:,-2:] = split_trend_cycle(time_data[:,:,:,0])# flow_pop의 index는 2

time_data = time_data_new
del time_data_new
gc.collect()


# make_random 
loc_index = [i for i in range(69)]
random.seed(1015)
random.shuffle(loc_index)

#set window size
WINDOW_SIZE = 20
# split time data
train_time = time_data[loc_index[ :55], : , :201, :]
valid_time_1 = time_data[loc_index[ :55], :, 201 -WINDOW_SIZE : -20, :] # train 지역& valid 기간
valid_time_2 = time_data[loc_index[55:62], :, : -20, :] # valid 지역 & (train + valid) 기간
test_time_1 = time_data[loc_index[:62], :, 221 -WINDOW_SIZE : , :] # train,valid 지역& test 기간
test_time_2 = time_data[loc_index[62:], :, :, :] # test 지역 & (train + valid + test) 기간

print('\n'.join([f'train_time : {train_time.shape}',
                f'valid_time_1 : {valid_time_1.shape}',
                f'valid_time_2 : {valid_time_2.shape}',
                f'test_time_1 : {test_time_1.shape}',
                f'test_time_2 : {test_time_2.shape}\n'] ))



# set loc index
train_loc_index = list(set(train_time[:,0,0,0].astype(np.int64)))
valid_loc_index = list(set(valid_time_2[:,0,0,0].astype(np.int64)))
test_loc_index = list(set(test_time_2[:,0,0,0].astype(np.int64)))
print('\n'.join([f'size of train_loc_index : {len(train_loc_index)}',
                f'size of valid_loc_index : {len(valid_loc_index)}',
                f'size of test_loc_index : {len(test_loc_index)}\n'] ))


def scaleing_time(data, scaler = None):
    shape = data.shape
    data = data.reshape(-1, shape[-1])
    if scaler == None:
        scaler = StandardScaler()
        scaler.fit(data)
    scaled_data = scaler.transform(data)
    return scaler, scaled_data.reshape(shape)

def scaleing_no_time(data, scaler = None):
    df_index = data.index
    df_columns = data.columns
    if scaler == None:
        scaler = StandardScaler()
        scaler.fit(data.values.reshape(-1,data.shape[-1]))
    data = scaler.transform(data.values.reshape(-1,data.shape[-1]))
    data = pd.DataFrame(data, index = df_index, columns = df_columns)
    return scaler, data

scaler_dic = {}

#scaling - time # 지역별 스케일링
for loc in range(69)
    time_scaler, train_time[loc,:,:,2:] = scaleing_time(train_time[loc,:,:,2:])
    scaler_dic[train_time[loc,:,:,:,0]] = time_scaler

_, valid_time_1[:,:,:,2:] = scaleing_time(valid_time_1[:,:,:,2:], time_scaler)
_, valid_time_2[:,:,:,2:] = scaleing_time(valid_time_2[:,:,:,2:], time_scaler)
_, test_time_1[:,:,:,2:] = scaleing_time(test_time_1[:,:,:,2:], time_scaler)
_, test_time_2[:,:,:,2:] = scaleing_time(test_time_2[:,:,:,2:], time_scaler)


print('\n'.join([f'train_time : {train_time.shape}',
                f'valid_time_1 : {valid_time_1.shape}',
                f'valid_time_2 : {valid_time_2.shape}',
                f'test_time_1  : {test_time_1.shape}',
                f'test_time_2  : {test_time_2.shape}\n'] ))



# make no time data
nontime_data = pd.read_csv('data/original/nontime_data.txt', sep = ' ')
nontime_data = nontime_data[['HDONG_CD', 'time', 'tot_pop', 'age_80U', 'AREA']]
nontime_data = nontime_data.groupby(['HDONG_CD','time']).sum()

# split no time data
train_no_time = nontime_data.loc[train_loc_index]
valid_no_time = nontime_data.loc[valid_loc_index]
test_no_time = nontime_data.loc[test_loc_index]

# scaleing no time data*
no_time_scaler, train_no_time = scaleing_no_time(train_no_time)
_,              valid_no_time = scaleing_no_time(valid_no_time)
_,              test_no_time  = scaleing_no_time(test_no_time)

train_time : (55, 3, 201, 15)
valid_time_1 : (55, 3, 40, 15)
valid_time_2 : (7, 3, 221, 15)
test_time_1 : (62, 3, 40, 15)
test_time_2 : (7, 3, 241, 15)

size of train_loc_index : 55
size of valid_loc_index : 7
size of test_loc_index : 7

train_time : (55, 3, 201, 15)
valid_time_1 : (55, 3, 40, 15)
valid_time_2 : (7, 3, 221, 15)
test_time_1  : (62, 3, 40, 15)
test_time_2  : (7, 3, 241, 15)



In [342]:
pd.DataFrame(train_time[0,0,:,:])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.114065e+09,2.0,-0.432789,-0.484521,-0.228814,-0.410405,-1.318818,0.680222,0.012310,-2.216370,1.476984,-0.604139,-1.281223,-1.070237,-1.070237
1,1.114065e+09,2.0,-0.632178,-0.524583,-0.228814,-0.410405,-0.862892,0.274338,-0.198547,-2.116541,1.458587,-1.369857,-0.322384,-1.070237,-1.070237
2,1.114065e+09,2.0,-0.772432,-0.633989,-0.228814,-0.410405,-0.921208,2.350954,0.328596,-2.017802,1.439687,-1.096576,0.873268,-1.070237,-1.070237
3,1.114065e+09,2.0,-0.776940,-0.675009,4.370355,-0.410405,-1.355928,-0.679017,-0.198547,-1.920184,1.420290,0.009920,1.405383,-1.070237,-1.070237
4,1.114065e+09,2.0,-0.700818,-0.720382,4.370355,-0.410405,-0.494440,-1.103780,-0.198547,-1.823714,1.400400,1.116415,0.873268,-1.070237,-1.070237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,1.114065e+09,2.0,-0.567383,-0.515784,-0.228814,-0.240272,0.279574,1.709091,-0.198547,0.762196,-0.811451,-0.604139,-1.281223,-1.070237,-1.070237
197,1.114065e+09,2.0,-0.716493,-0.466720,-0.228814,-0.271205,0.743453,0.434804,-0.198547,0.727732,-0.844598,-1.369857,-0.322384,-1.070237,-1.070237
198,1.114065e+09,2.0,-0.822934,-0.644504,-0.228814,-0.348538,0.094023,1.784604,4.967453,0.691334,-0.877565,-1.096576,0.873268,-1.070237,-1.070237
199,1.114065e+09,2.0,-0.564031,-0.586175,-0.228814,-0.309872,0.019803,0.991714,-0.198547,0.653013,-0.910343,0.009920,1.405383,-1.070237,-1.070237
