In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import StandardScaler
import gc

In [8]:
time_data = pd.read_csv('data/original/time_data.txt', sep  = ' ') 
time_data = df2npy(time_data)

# 아침 점심 저녁 분리
morning_data = time_data[:,2,:,:]
morning_data[:,:,:].shape

(69, 241, 13)

In [9]:
# tend, cycle 추가
morning_data = append_trend_cycle(morning_data)
lunch_data = append_trend_cycle(lunch_data)
evening_data = append_trend_cycle(evening_data)

ValueError: could not broadcast input array from shape (69,234,13) into shape (69,235,13)

In [None]:
morning_data.shape

In [7]:
# set constant
LOC_SIZE = 69
TIME_SIZE = 3
DATE_SIZE = 241
FEATURE_SIZE = 13
#set window size
INPUT_WINDOW = 20
OUTPUT_WINDOW = 7
ROLLSIZE = 7

def seq2cycle(seq):
    return pd.Series(seq).rolling(ROLLSIZE).mean()

# flow_tot -> flow_trend + flow_cycle 분리
def split_trend_cycle(flow_pop):
    new_flow_pop = np.zeros([LOC_SIZE, flow_pop.shape[1] - (ROLLSIZE-1), 2])

    for loc in range(LOC_SIZE):
        new_flow_pop[loc,:, 0] = seq2cycle(flow_pop[loc,:])[ROLLSIZE-1:]
        new_flow_pop[loc,:, 1] = flow_pop[loc, ROLLSIZE-1:] - new_flow_pop[loc,:, 0]
    print(new_flow_pop)
    return new_flow_pop

def append_trend_cycle(time_data):
    time_data_new = np.empty([LOC_SIZE, DATE_SIZE - (ROLLSIZE-1), FEATURE_SIZE + 2])
    time_data_new[:,:,:-2] = time_data[:,ROLLSIZE:,:]
    # trend, cycle 순서로 return
    time_data_new[:,:,-2:] = split_trend_cycle(time_data[:,:,2])# flow_pop의 index는 2
    return time_data_new

def df2npy(time_data):
    # make loc_list(dong code)
    loc_list = list(time_data.HDONG_CD.unique())

    # select features
    time_data = time_data[['flow_pop', 'HDONG_CD', 'time',
                        'card_use', 'holiday', 'day_corona', 'ondo', 'subdo',
                        'rain_snow', 'STD_YMD']]

    # change string time to int time
    time_data.time[time_data.time == 'morning'] = 0 # morning
    time_data.time[time_data.time == 'lunch'] = 1 # lunch
    time_data.time[time_data.time == 'evening'] = 2 # evening

    # to datetime
    time_data.STD_YMD = pd.to_datetime(time_data.STD_YMD)

    # make dayofyear weekday
    time_data['dayofyear'] = time_data.STD_YMD.dt.dayofyear
    time_data['weekday'] = time_data.STD_YMD.dt.weekday
    time_data['dayofyear_sin'] = np.sin(2 * np.pi * (time_data['dayofyear'])/365)
    time_data['dayofyear_cos'] = np.cos(2 * np.pi * (time_data['dayofyear'])/365)
    time_data['weekday_sin'] = np.sin(2 * np.pi * (time_data['weekday'])/7)# 월화수목금토일
    time_data['weekday_cos'] = np.cos(2 * np.pi * (time_data['weekday'])/7)

    # reselect features
    time_data = time_data[['HDONG_CD', 'time','flow_pop',
                        'card_use', 'holiday', 'day_corona', 'ondo', 'subdo',
                        'rain_snow', 'dayofyear_sin', 'dayofyear_cos', 'weekday_sin', 'weekday_cos']]

    # table -> matrix
    time_data = np.array(time_data).reshape(LOC_SIZE, TIME_SIZE, DATE_SIZE, FEATURE_SIZE)# 지역, 시간, 날짜, features
    return time_data

def scaleing_time(data, scaler = None):
    shape = data.shape
    data = data.reshape(-1, shape[-1])
    if scaler == None:
        scaler = StandardScaler()
        scaler.fit(data)
    scaled_data = scaler.transform(data)
    return scaler, scaled_data.reshape(shape)

def scaleing_no_time(data, scaler = None):
    df_index = data.index
    df_columns = data.columns
    if scaler == None:
        scaler = StandardScaler()
        scaler.fit(data.values.reshape(-1,data.shape[-1]))
    data = scaler.transform(data.values.reshape(-1,data.shape[-1]))
    data = pd.DataFrame(data, index = df_index, columns = df_columns)
    return scaler, data

def split_train_valid_test(time_data):
    # make_random 
    loc_index = [i for i in range(69)]
    random.seed(1015)
    random.shuffle(loc_index)

    # split time data
    train_time = time_data[loc_index[ :55], :201, :]
    valid_time_1 = time_data[loc_index[ :55], 201 -INPUT_WINDOW : -20, :] # train 지역& valid 기간
    valid_time_2 = time_data[loc_index[55:62], : -20, :] # valid 지역 & (train + valid) 기간
    test_time_1 = time_data[loc_index[:62], 221 -INPUT_WINDOW : , :] # train,valid 지역& test 기간
    test_time_2 = time_data[loc_index[62:], :, :] # test 지역 & (train + valid + test) 기간

    # set loc index
    train_loc_index = list(set(train_time[:,0,0].astype(np.int64)))
    valid_loc_index = list(set(valid_time_2[:,0,0].astype(np.int64)))
    test_loc_index = list(set(test_time_2[:,0,0].astype(np.int64)))

    #scaling - time # 지역별 스케일링
    time_scaler, train_time[:,:,2:] = scaleing_time(train_time[:,:,2:])
    _, valid_time_1[:,:,2:] = scaleing_time(valid_time_1[:,:,2:], time_scaler)
    _, valid_time_2[:,:,2:] = scaleing_time(valid_time_2[:,:,2:], time_scaler)
    _, test_time_1[:,:,2:] = scaleing_time(test_time_1[:,:,2:], time_scaler)
    _, test_time_2[:,:,2:] = scaleing_time(test_time_2[:,:,2:], time_scaler)

    train_valid_test = [train_time, valid_time_1, valid_time_2, test_time_1, test_time_2]
    train_valid_test_index = [train_loc_index, valid_loc_index, test_loc_index]

    return train_valid_test, train_valid_test_index, time_scaler

def split_notime_data(nontime_data, train_valid_test_index):
    train_loc_index, valid_loc_index, test_loc_index= train_valid_test_index
    # make no time data
    nontime_data = nontime_data[['HDONG_CD', 'time', 'tot_pop', 'age_80U', 'AREA']]
    nontime_data = nontime_data.groupby(['HDONG_CD']).sum()

    # split no time data
    train_no_time = nontime_data.loc[train_loc_index]
    valid_no_time = nontime_data.loc[valid_loc_index]
    test_no_time = nontime_data.loc[test_loc_index]

    # scaleing no time data*
    no_time_scaler, train_no_time = scaleing_no_time(train_no_time)
    _,              valid_no_time = scaleing_no_time(valid_no_time)
    _,              test_no_time  = scaleing_no_time(test_no_time)

    notime = [train_no_time, valid_no_time, test_no_time]
    return pd.concat(notime), no_time_scaler

def split_sequence(sequence, input_window = 20, output_window = 7, target_index  = 2):
    x, y = list(), list()
    #print(sequence.shape)
    for day in range(sequence.shape[0]):
        end_ix = day + input_window
        if end_ix > (len(sequence)- output_window) -1:#
            break
        seq_x, seq_y = sequence[day:end_ix, :], sequence[end_ix:end_ix+output_window, 2]
        x.append(seq_x)
        y.append(seq_y)
    return np.array(x), np.array(y)

def make_time_notime_data(time_data, notime_data, input_window = 20, out_window = 7):
    x_time_batch, x_notime_batch, y_batch = list(), list(), list()

    x_time = []
    x_notime = []
    y_time = []

    for loc in range(time_data.shape[0]):
        loc_code = time_data[loc,0,0]
        #print(time_data[loc_code,time_idx,0,0])
        x, y = split_sequence(time_data[loc,:,:], input_window, out_window)
        notime = notime_data.loc[loc_code]
        aug_notime = np.zeros([x.shape[0], notime.shape[0]])
        aug_notime[:,:] = notime
        
        x_time.append(x)
        x_notime.append(aug_notime)
        y_time.append(y)
    
    x_time = np.concatenate(x_time)
    x_notime = np.concatenate(x_notime)
    y_time = np.concatenate(y_time)

    print(x_time.shape)
    print(x_notime.shape)
    print(y_time.shape)

    return x_time, x_notime, y_time

In [None]:
# load data
time_data = pd.read_csv('data/original/time_data.txt', sep  = ' ') 
time_data = df2npy(time_data)

# 아침 점심 저녁 분리
morning_data = time_data[:,2,:,:]
lunch_data = time_data[:,1,:,:]
evening_data = time_data[:,0,:,:]

# tend, cycle 추가
morning_data = append_trend_cycle(morning_data)
lunch_data = append_trend_cycle(lunch_data)
evening_data = append_trend_cycle(evening_data)

# train_validation_test split  & scaling
morning_data, train_valid_test_loc_index, m_time_scaler = split_train_valid_test(morning_data)
# m_time_train, m_time_valid, m_time_test = morning_data
lunch_data, _, l_time_scaler = split_train_valid_test(lunch_data)
evening_data, _, e_time_scaler = split_train_valid_test(evening_data)

nontime_data = pd.read_csv('data/original/nontime_data.txt', sep = ' ')
notime, no_time_scaler = split_notime_data(nontime_data, train_valid_test_loc_index)
# notime_train, notime_valid, notime_test = notime

print('\n morning')
m_train_time, m_train_notime, m_train_y = make_time_notime_data(morning_data[0], notime)
m_valid_time, m_valid_notime, m_valid_y = make_time_notime_data(morning_data[1], notime)
m_test_time, m_test_notime, m_test_y = make_time_notime_data(morning_data[2], notime)

print('\n lunch')
l_train_time, l_train_notime, l_train_y = make_time_notime_data(lunch_data[0], notime)
l_valid_time, l_valid_notime, l_valid_y = make_time_notime_data(lunch_data[1], notime)
l_test_time, l_test_notime, l_test_y = make_time_notime_data(lunch_data[2], notime)

print('\n evening')
e_train_time, e_train_notime, e_train_y = make_time_notime_data(evening_data[0], notime)
e_valid_time, e_valid_notime, e_valid_y = make_time_notime_data(evening_data[1], notime)
e_test_time, e_test_notime, e_test_y = make_time_notime_data(evening_data[2], notime)