In [3]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import StandardScaler
import gc

In [195]:
# set constant
LOC_SIZE = 69
TIME_SIZE = 3
DATE_SIZE = 241
FEATURE_SIZE = 13
#set window size
INPUT_WINDOW = 20
OUTPUT_WINDOW = 7


# flow_tot -> flow_trend + flow_cycle 분리
def split_trend_cycle(flow_pop):
    flow_pop = flow_pop.reshape(LOC_SIZE, DATE_SIZE, 1)
    trend = flow_pop
    cycle = flow_pop
    return np.concatenate([trend, cycle], axis = 2)

def df2npy(time_data):
    # make loc_list(dong code)
    loc_list = list(time_data.HDONG_CD.unique())


    # select features
    time_data = time_data[['flow_pop', 'HDONG_CD', 'time',
                        'card_use', 'holiday', 'day_corona', 'ondo', 'subdo',
                        'rain_snow', 'STD_YMD']]

    # change string time to int time
    time_data.time[time_data.time == 'morning'] = 0 # morning
    time_data.time[time_data.time == 'lunch'] = 1 # lunch
    time_data.time[time_data.time == 'evening'] = 2 # evening

    # to datetime
    time_data.STD_YMD = pd.to_datetime(time_data.STD_YMD)

    # make dayofyear weekday
    time_data['dayofyear'] = time_data.STD_YMD.dt.dayofyear
    time_data['weekday'] = time_data.STD_YMD.dt.weekday
    time_data['dayofyear_sin'] = np.sin(2 * np.pi * (time_data['dayofyear'])/365)
    time_data['dayofyear_cos'] = np.cos(2 * np.pi * (time_data['dayofyear'])/365)
    time_data['weekday_sin'] = np.sin(2 * np.pi * (time_data['weekday'])/7)# 월화수목금토일
    time_data['weekday_cos'] = np.cos(2 * np.pi * (time_data['weekday'])/7)

    # reselect features
    time_data = time_data[['HDONG_CD', 'time','flow_pop',
                        'card_use', 'holiday', 'day_corona', 'ondo', 'subdo',
                        'rain_snow', 'dayofyear_sin', 'dayofyear_cos', 'weekday_sin', 'weekday_cos']]

    # table -> matrix
    time_data = np.array(time_data).reshape(LOC_SIZE, TIME_SIZE, DATE_SIZE, FEATURE_SIZE)# 지역, 시간, 날짜, features
    return time_data

def append_trend_cycle(time_data):
    time_data_new = np.empty([LOC_SIZE, DATE_SIZE, FEATURE_SIZE + 2])
    time_data_new[:,:,:-2] = time_data
    # trend, cycle 순서로 return
    time_data_new[:,:,-2:] = split_trend_cycle(time_data[:,:,2])# flow_pop의 index는 2
    time_data = time_data_new
    return time_data

def scaleing_time(data, scaler = None):
    shape = data.shape
    data = data.reshape(-1, shape[-1])
    if scaler == None:
        scaler = StandardScaler()
        scaler.fit(data)
    scaled_data = scaler.transform(data)
    return scaler, scaled_data.reshape(shape)

def scaleing_no_time(data, scaler = None):
    df_index = data.index
    df_columns = data.columns
    if scaler == None:
        scaler = StandardScaler()
        scaler.fit(data.values.reshape(-1,data.shape[-1]))
    data = scaler.transform(data.values.reshape(-1,data.shape[-1]))
    data = pd.DataFrame(data, index = df_index, columns = df_columns)
    return scaler, data

def split_train_valid_test(time_data):
    # make_random 
    loc_index = [i for i in range(69)]
    random.seed(1015)
    random.shuffle(loc_index)

    # split time data
    train_time = time_data[loc_index[ :55], :201, :]
    valid_time_1 = time_data[loc_index[ :55], 201 -INPUT_WINDOW : -20, :] # train 지역& valid 기간
    valid_time_2 = time_data[loc_index[55:62], : -20, :] # valid 지역 & (train + valid) 기간
    test_time_1 = time_data[loc_index[:62], 221 -INPUT_WINDOW : , :] # train,valid 지역& test 기간
    test_time_2 = time_data[loc_index[62:], :, :] # test 지역 & (train + valid + test) 기간

    # set loc index
    train_loc_index = list(set(train_time[:,0,0].astype(np.int64)))
    valid_loc_index = list(set(valid_time_2[:,0,0].astype(np.int64)))
    test_loc_index = list(set(test_time_2[:,0,0].astype(np.int64)))

    #scaling - time # 지역별 스케일링
    time_scaler, train_time[:,:,2:] = scaleing_time(train_time[:,:,2:])
    _, valid_time_1[:,:,2:] = scaleing_time(valid_time_1[:,:,2:], time_scaler)
    _, valid_time_2[:,:,2:] = scaleing_time(valid_time_2[:,:,2:], time_scaler)
    _, test_time_1[:,:,2:] = scaleing_time(test_time_1[:,:,2:], time_scaler)
    _, test_time_2[:,:,2:] = scaleing_time(test_time_2[:,:,2:], time_scaler)

    train_valid_test = [train_time, valid_time_1, valid_time_2, test_time_1, test_time_2]
    train_valid_test_index = [train_loc_index, valid_loc_index, test_loc_index]

    return train_valid_test, train_valid_test_index

def split_notime_data(nontime_data, train_valid_test_index):
    train_loc_index, valid_loc_index, test_loc_index= train_valid_test_index
    # make no time data
    nontime_data = nontime_data[['HDONG_CD', 'time', 'tot_pop', 'age_80U', 'AREA']]
    nontime_data = nontime_data.groupby(['HDONG_CD','time']).sum()

    # split no time data
    train_no_time = nontime_data.loc[train_loc_index]
    valid_no_time = nontime_data.loc[valid_loc_index]
    test_no_time = nontime_data.loc[test_loc_index]

    # scaleing no time data*
    no_time_scaler, train_no_time = scaleing_no_time(train_no_time)
    _,              valid_no_time = scaleing_no_time(valid_no_time)
    _,              test_no_time  = scaleing_no_time(test_no_time)

    notime = [train_no_time, valid_no_time, test_no_time]
    return no_time_scaler, notime

In [197]:
# load data
time_data = pd.read_csv('data/original/time_data.txt', sep  = ' ') 
time_data = df2npy(time_data)

# 아침 점심 저녁 분리
morning_data = time_data[:,2,:,:]
lunch_data = time_data[:,1,:,:]
evening_data = time_data[:,0,:,:]

# tend, cycle 추가
morning_data = append_trend_cycle(morning_data)
lunch_data = append_trend_cycle(lunch_data)
evening_data = append_trend_cycle(evening_data)


# train_validation_test split  & scaling
morning_data, train_valid_test_loc_index = split_train_valid_test(morning_data,)
morning_data, _ = split_train_valid_test(morning_data)
morning_data, _ = split_train_valid_test(morning_data)

nontime_data = pd.read_csv('data/original/nontime_data.txt', sep = ' ')
no_time_scaler, notime = split_notime_data(nontime_data)



In [70]:
train_time_evening, train_time_evening,  = train_time[:,0,:,:].reshape(-1, 15) # evening
train_time_lunch = train_time[:,1,:,:].reshape(-1, 15) # lunch
train_time_morning = train_time[:,2,:,:].reshape(-1, 15) # morning

(55, 201, 15)

In [61]:
pd.DataFrame(train_time.reshape(3, -1, 15)[0,:,:])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.114065e+09,2.0,-0.432789,-0.484521,-0.228814,-0.410405,-1.318818,0.680222,0.012310,-2.216370,1.476984,-0.604139,-1.281223,-1.070237,-1.070237
1,1.114065e+09,2.0,-0.632178,-0.524583,-0.228814,-0.410405,-0.862892,0.274338,-0.198547,-2.116541,1.458587,-1.369857,-0.322384,-1.070237,-1.070237
2,1.114065e+09,2.0,-0.772432,-0.633989,-0.228814,-0.410405,-0.921208,2.350954,0.328596,-2.017802,1.439687,-1.096576,0.873268,-1.070237,-1.070237
3,1.114065e+09,2.0,-0.776940,-0.675009,4.370355,-0.410405,-1.355928,-0.679017,-0.198547,-1.920184,1.420290,0.009920,1.405383,-1.070237,-1.070237
4,1.114065e+09,2.0,-0.700818,-0.720382,4.370355,-0.410405,-0.494440,-1.103780,-0.198547,-1.823714,1.400400,1.116415,0.873268,-1.070237,-1.070237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,1.135070e+09,2.0,-0.173808,-0.667451,-0.228814,-0.240272,0.279574,1.709091,-0.198547,0.762196,-0.811451,-0.604139,-1.281223,-1.043887,-1.043887
11051,1.135070e+09,2.0,-0.175710,-0.655189,-0.228814,-0.271205,0.743453,0.434804,-0.198547,0.727732,-0.844598,-1.369857,-0.322384,-1.043887,-1.043887
11052,1.135070e+09,2.0,-0.307042,-0.707934,-0.228814,-0.348538,0.094023,1.784604,4.967453,0.691334,-0.877565,-1.096576,0.873268,-1.043887,-1.043887
11053,1.135070e+09,2.0,-0.169877,-0.650937,-0.228814,-0.309872,0.019803,0.991714,-0.198547,0.653013,-0.910343,0.009920,1.405383,-1.043887,-1.043887


In [163]:
def split_sequence(sequence, input_window = 20, output_window = 7, target_index  = 2):
    x, y = list(), list()
    #print(sequence.shape)
    for day in range(sequence.shape[0]):
        end_ix = day + input_window
        if end_ix > (len(sequence)- output_window) -1:#
            break
        seq_x, seq_y = sequence[day:end_ix, :], sequence[end_ix:end_ix+output_window, 2]
        x.append(seq_x)
        y.append(seq_y)
    return np.array(x), np.array(y)

def make_sequence_data(time_data, no_time_data, input_window = 20, out_window = 7):
    x_time_batch, x_notime_batch, y_batch = list(), list(), list()
    
    for time_idx, time in enumerate(['evening', 'lunch', 'morning']):
        x_time = []
        x_notime = []
        y_time = []

        for loc in range(time_data.shape[0]):
            loc_code = time_data[loc,0,0,0]
            #print(time_data[loc_code,time_idx,0,0])
            x, y = split_sequence(time_data[loc,time_idx,:,:], input_window, out_window)
            notime = no_time_data.loc[loc_code, time]
            #print(no_time_data.loc[loc_code, time].shape)
            print(x.shape)
            break
            x_time.append(x)
            x_notime.append(notime)
            y_time.append(y)
        
        x_time = np.stack(np.concatenate(x_time))
        x_notime = np.stack(x_notime)
        y_time = np.stack(np.concatenate(y_time))

        x_time_batch.append(x_time)
        x_notime_batch.append(x_notime)
        y_batch.append(y_time)

    return x_time_batch, x_notime_batch, y_batch

In [164]:
x_time_train, x_notime_train, y_train = make_sequence_data(train_time, train_no_time)
x_time_train_eve, x_time_train_lunch, x_time_train_morning = x_time_train
x_notime_train_eve, x_notime_train_lunch, x_notime_train_morning = x_notime_train
y_train_eve, y_train_lunch, y_train_morning = y_train

(174, 20, 15)


ValueError: need at least one array to concatenate

In [146]:
x_time_train_eve

(55, 174)

In [127]:
y_train_lunch.shape

(9570, 7)

In [None]:
train_time, train_no_time = (-1, 20, time_features), (-1, no_time_features)

In [26]:
train_time[2,0,0,0]

2726066200.0

In [46]:
i = 3
print(train_time[i,0,0,0])
print(train_loc_index[i])
train_no_time.loc[train_loc_index].index[i]

2726061000.0
1135056000


(1135072000, 'evening')

In [35]:
train_loc_index

[1135062400,
 1135072000,
 2726064000,
 1135056000,
 1135066500,
 2726053000,
 2726069000,
 2711066000,
 2726066200,
 2726063000,
 1114068000,
 2726068000,
 1135060000,
 2726052000,
 1114062500,
 2711054500,
 1114052000,
 2726057000,
 2711065000,
 2726060200,
 1135059500,
 2726062000,
 1135070000,
 2726065200,
 2711051700,
 2726066100,
 1114067000,
 2726051000,
 2726059000,
 2726067000,
 1114061500,
 1135069500,
 2711064000,
 2726056000,
 1114066500,
 2726060100,
 2726061000,
 2711069000,
 2726065100,
 1135061200,
 1135067000,
 1114060500,
 1114055000,
 1135063000,
 2726055000,
 2711068000,
 1114059000,
 1135062500,
 1114065000,
 1135057000,
 1135061100,
 1114054000,
 2726054000,
 1114064500,
 2711067000]

In [28]:
list(set(train_time[:,0,0,0].astype(np.int64)))

[1135062400,
 1135072000,
 2726064000,
 1135056000,
 1135066500,
 2726053000,
 2726069000,
 2711066000,
 2726066200,
 2726063000,
 1114068000,
 2726068000,
 1135060000,
 2726052000,
 1114062500,
 2711054500,
 1114052000,
 2726057000,
 2711065000,
 2726060200,
 1135059500,
 2726062000,
 1135070000,
 2726065200,
 2711051700,
 2726066100,
 1114067000,
 2726051000,
 2726059000,
 2726067000,
 1114061500,
 1135069500,
 2711064000,
 2726056000,
 1114066500,
 2726060100,
 2726061000,
 2711069000,
 2726065100,
 1135061200,
 1135067000,
 1114060500,
 1114055000,
 1135063000,
 2726055000,
 2711068000,
 1114059000,
 1135062500,
 1114065000,
 1135057000,
 1135061100,
 1114054000,
 2726054000,
 1114064500,
 2711067000]