In [30]:
import torch.nn as nn
import torch

emb = nn.Embedding(num_embeddings=2000, embedding_dim=16)

In [31]:
labels = torch.tensor([[1,2], [3,3]], dtype=torch.int32).int()
labels

tensor([[1, 2],
        [3, 3]], dtype=torch.int32)

In [32]:
labels[:,0]

tensor([1, 3], dtype=torch.int32)

In [34]:
emb.weight

Parameter containing:
tensor([[-0.2070,  0.0868,  0.2793,  ...,  0.1696,  0.0606,  1.5118],
        [ 0.7185,  1.7097, -0.3803,  ..., -1.4022,  1.5750, -2.4132],
        [-0.6919, -0.4048,  0.5235,  ...,  0.2458, -1.1409, -1.1572],
        ...,
        [-0.0375,  1.1583, -0.4485,  ...,  0.6072,  0.5381, -0.7682],
        [ 1.0264,  0.3425,  0.2802,  ...,  1.2693, -0.2440,  1.7639],
        [-0.6822, -0.5398, -0.6730,  ..., -0.6008,  0.4224, -0.8169]],
       requires_grad=True)

In [33]:
emb(labels[:,0])

tensor([[ 0.7185,  1.7097, -0.3803, -0.9844, -2.1273,  1.0098, -1.0400,  0.6035,
          0.5341, -1.3867, -0.1350, -0.8774,  1.0256, -1.4022,  1.5750, -2.4132],
        [ 0.5814, -0.5005, -1.0239, -0.2890,  0.1840, -0.3994,  0.0467,  0.2294,
         -1.4097,  0.2304, -0.7351, -0.2720, -0.7556,  0.0195,  1.5493, -1.1773]],
       grad_fn=<EmbeddingBackward>)

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps_in, n_steps_out):
	"""station should be in axis0 (index), timestamp should be axis1 (columns)

	Args:
		sequences (_type_): _description_
		n_steps_in (_type_): _description_
		n_steps_out (_type_): _description_

	Returns:
		_type_: _description_
	"""
	size = sequences.shape[1]
	xs = np.empty((0,n_steps_in))
	ys = np.empty((0,n_steps_out))

	for idx in tqdm(range(1008-n_steps_in, size - (n_steps_in + n_steps_out))):
		x = sequences[:,idx:idx+n_steps_in]
		xs = np.vstack([xs, x])
		y = sequences[:, idx+n_steps_in:idx+n_steps_in+n_steps_out]
		ys = np.vstack([ys, y])
	return xs, ys


def station_features(station_array, station_df, n_windows):
	df = pd.DataFrame(data=station_array, columns=['station_name']).merge(station_df[['station_name', 'dcode']], how='left', on='station_name')
	name_encoder = {name:idx for idx, name in enumerate(df.station_name.unique())}
	dcode_encoder = {name:idx for idx, name in enumerate(df.dcode.unique())}

	df.station_name = df.station_name.map(name_encoder)
	df.dcode = df.dcode.map(dcode_encoder)

	return np.tile(df.values, (n_windows,1))


def time_features(time_idx, n_steps_in, n_steps_out, n_stations):
	df = pd.DataFrame(data=pd.to_datetime(time_idx), columns=['time'])	

	# df['seconds'] = df['time'].dt.hour.multiply(3600) + df['time'].dt.minute.multiply(60)
	# seconds_in_day = 24*60*60
	# df['sin_time'] = df['seconds'].divide(seconds_in_day).multiply(2*np.pi).map(np.sin)
	# df['cos_time'] = df['seconds'].divide(seconds_in_day).multiply(2*np.pi).map(np.cos)
	df['t_index']  = df['time'].dt.hour.multiply(60).add(df['time'].dt.minute).floordiv(30)
	df['dow'] = df['time'].dt.dayofweek
	df['weekend'] = df.dow.isin([5,6]).astype(np.int64)
	del df['time']

	ts = np.empty((0,n_steps_out,3))
	for idx in tqdm(range(1008-n_steps_in, len(time_idx) - (n_steps_in + n_steps_out))):
		t = df.values[np.newaxis, idx+n_steps_in:idx+n_steps_in+n_steps_out, :]
		ts = np.vstack([ts, t])

	return np.repeat(ts, n_stations, axis=0)

In [2]:
import pandas as pd

history = pd.read_csv('../data/input_table/history_by_station.csv', parse_dates=['time'])
station = pd.read_csv('../data/input_table/station_info.csv')


data = history.set_index('time').T.reset_index().rename(columns={'index':'station_name'})
data = data[data.station_name.isin(station.station_name)].set_index('station_name')

In [3]:
data.head()

time,2021-07-01 00:00:00,2021-07-01 00:30:00,2021-07-01 01:00:00,2021-07-01 01:30:00,2021-07-01 02:00:00,2021-07-01 02:30:00,2021-07-01 03:00:00,2021-07-01 03:30:00,2021-07-01 04:00:00,2021-07-01 04:30:00,...,2021-09-30 19:00:00,2021-09-30 19:30:00,2021-09-30 20:00:00,2021-09-30 20:30:00,2021-09-30 21:00:00,2021-09-30 21:30:00,2021-09-30 22:00:00,2021-09-30 22:30:00,2021-09-30 23:00:00,2021-09-30 23:30:00
station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(주)플러스 전용 전기버스 충전소,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.5,0.786111,1.0,1.0,1.0,1.0,0.541667,0.0,0.075,0.752778
2생활권 환승주차장1(A),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.327778,0.772222
2호선 양산역,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CGV 신대점,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
DMC아이파크 아파트,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
data.mean(axis=1).le(0.9)

station_name
LH강원본부           False
LH경기지사           False
LH경남본부           False
LH인천본부           False
가락2동 주민센터         True
                 ...  
회차지(월드컵경기장)       True
회차지(인라인스케이트장)    False
회천1동주민센터         False
효자종합시장공영주차장      False
후평1동행정복지센터       False
Length: 250, dtype: bool

In [16]:
data = data[data.mean(axis=1).le(0.8)]

In [17]:
data.index

Index(['가락2동 주민센터', '강남구청 공영주차장', '강동송파지사', '강변테크노마트 주차장', '거여2동 주민센터', '계양구청',
       '관악동작 견인차량 보관소', '교1동주민센터', '구로구청 주차장', '금천구청 지상 주차장', '남인천지사',
       '녹산산단 공영주차장', '대전역사박물관', '대평동 BRT전기버스 차고지', '도봉구청', '뚝도충전소',
       '매월동 전기버스 충전소', '빛가람 주민센터', '빛가람동 공용차고지', '상인대성스카이렉스 아파트', '오천읍사무소',
       '용산전자상가1 공영주차장', '은평평화공원 공영주차장', '을숙도 공영주차장', '인천업사이클 에코센터', '장등공용차고지',
       '전기공사협회 주차장', '중부산지사', '지식산업센터(동일테크노타운8차)', '첨단공용차고지', '한밭종합운동장 주차장',
       '한일병원', '회차지(대성공영주차장)', '회차지(완산체련공원)', '회차지(월드컵경기장)'],
      dtype='object', name='station_name')

In [18]:
name = data.index[42]
print(name)

df = data.T[[name]].reset_index().copy()
df['step'] = df.time.dt.hour.multiply(60).add(df['time'].dt.minute).floordiv(30)
df.groupby('step')[name].mean().plot()

IndexError: index 42 is out of bounds for axis 0 with size 35

In [19]:
def history_sequences(sequences, n_steps_in, n_steps_out):
	size = sequences.shape[1]
	hs = np.empty((0,3))

	for idx in tqdm(range(1008-n_steps_in, size - (n_steps_in + n_steps_out))):
		h = sequences[:, [idx+n_steps_in-336, idx+n_steps_in-672, idx+n_steps_in-1008]]
		hs = np.vstack([hs, h])
	return hs

In [20]:
station.nunique()

station_name    1899
dcode             18
dtype: int64

In [21]:
N_STEPS_IN = 12
N_STEPS_OUT = 6

n_windows = data.shape[1] - (N_STEPS_IN + N_STEPS_OUT)
n_stations = data.shape[0]

S = station_features(station_array=data.index, station_df=station, n_windows=n_windows)
T = time_features(data.columns, N_STEPS_IN, N_STEPS_OUT, n_stations)

R, Y = split_sequences(data.values, N_STEPS_IN, N_STEPS_OUT)
H = history_sequences(data.values, 12,6)

100%|██████████| 3402/3402 [00:00<00:00, 47337.76it/s]
100%|██████████| 3402/3402 [00:03<00:00, 946.17it/s] 
100%|██████████| 3402/3402 [00:00<00:00, 6229.09it/s] 


- X: input sequence
- S: station features (category -> embedding)
- T: Time features (sin / cos / dow / weekend)
- Y: output sequence

## 1. select output

- 최초실험 single output
- 예측시점 step2 (index=1) 30분 ~ 1시간 뒤 구간의 충전소 상태

In [22]:
T.shape, Y.shape

((119070, 6, 3), (119070, 6))

In [23]:
OUTPUT_IDX = 0

T = T[:,OUTPUT_IDX,:]
Y = Y[:,OUTPUT_IDX, np.newaxis]

In [24]:
T.shape, Y.shape

((119070, 3), (119070, 1))

## 2. sequential input 차원변환

-  input sequence ->  sequence length * input size

In [25]:
print(H.shape)
H = H[:, :, np.newaxis]
print(H.shape)

(119070, 3)
(119070, 3, 1)


In [26]:
print(R.shape)
R = R[:, :, np.newaxis]
print(R.shape)

(119070, 12)
(119070, 12, 1)


## 3. Dataset

In [27]:
import torch
from torch.utils.data import Dataset, DataLoader

class EvcDataset(Dataset):
    def __init__(self, rs, hs, ts, ss, ys):
        assert len(rs) == len(ys)

        self.rs = torch.tensor(rs).float()
        self.hs = torch.tensor(hs).float()
        self.ts = torch.tensor(ts).int()  # keep int dtype -> goes to embedding layer
        self.ss = torch.tensor(ss).int()  # keep int dtype -> goes to embedding layer
        self.ys = torch.tensor(ys).float()


    def __len__(self):
        return len(self.rs)

    def __getitem__(self, i):
        r, h, t, s, y = self.rs[i], self.hs[i], self.ts[i], self.ss[i], self.ys[i]
        return r, h, t, s, y

## 4. Model

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BaseHybrid(nn.Module):
    def __init__(self, hidden_size, station_embedding_dim, embedding_dim):
        super().__init__()
        self.lstm_r1 = nn.LSTM(1, hidden_size, 1)
        self.dropout_r1 = nn.Dropout(p=0.2)
        self.fc_r1 = nn.Linear(hidden_size, 16)
    
        self.lstm_h1 = nn.LSTM(1, hidden_size, 1)
        self.dropout_h1 = nn.Dropout(p=0.2)
        self.fc_h1 = nn.Linear(hidden_size, 16)

        self.station_embedding = nn.Embedding(num_embeddings=2000, embedding_dim=station_embedding_dim)
        self.timeslot_embedding = nn.Embedding(num_embeddings=48, embedding_dim=embedding_dim)
        self.dow_embedding = nn.Embedding(num_embeddings=7, embedding_dim=embedding_dim)
        self.we_embedding = nn.Embedding(num_embeddings=2, embedding_dim=embedding_dim)

        self.fc_b1 = nn.Linear(station_embedding_dim+ 3*embedding_dim, 128)
        self.fc_b2 = nn.Linear(128, 64)
        self.fc_b3 = nn.Linear(64, 64)

        self.fc_cat = nn.Linear(32+64, 64)
        self.dropout_cat = nn.Dropout(p=0.2)
        self.top = nn.Linear(64,1)

    def forward(self, r, h, t, s):
        # realtime sequence
        lstm_out_r, (hn, cn) = self.lstm_r1(r)
        last_state_r = lstm_out_r[:,-1,:]
        realtime_vec = self.dropout_r1(last_state_r)
        realtime_vec = F.relu(self.fc_r1(realtime_vec))
        
        # history sequence
        lstm_out_h, (hn, cn) = self.lstm_h1(h)
        last_state_h = lstm_out_h[:,-1,:]
        history_vec = self.dropout_h1(last_state_h)
        history_vec = F.relu(self.fc_h1(history_vec))

        # non-sequenctials
        station_vec = self.station_embedding(s[:,0])
        timeslot_vec = self.timeslot_embedding(t[:,0])
        dow_vec = self.dow_embedding(t[:,1])
        we_vec = self.we_embedding(t[:,2])

        fc_in = torch.cat((station_vec, timeslot_vec, dow_vec, we_vec), dim=1)
        feature_vec = F.relu(self.fc_b1(fc_in))
        feature_vec = F.relu(self.fc_b2(feature_vec))
        feature_vec = F.relu(self.fc_b3(feature_vec))

        # concatenation
        cat_vec = torch.cat((realtime_vec, history_vec, feature_vec), dim=1)
        fc_out = F.relu(self.fc_cat(cat_vec))
        fc_out = self.dropout_cat(fc_out)
        fc_out = self.top(fc_out)
        return torch.sigmoid(fc_out)


class GroupedHybrid(nn.Module):
    def __init__(self, hidden_size, embedding_dim):
        super().__init__()
        self.lstm_r1 = nn.LSTM(1, hidden_size, 1)
        self.dropout_r1 = nn.Dropout(p=0.2)
        self.fc_r1 = nn.Linear(hidden_size, 16)
    
        self.lstm_h1 = nn.LSTM(1, hidden_size, 1)
        self.dropout_h1 = nn.Dropout(p=0.2)
        self.fc_h1 = nn.Linear(hidden_size, 16)


        self.station_embedding = nn.Embedding(num_embeddings=2000, embedding_dim=embedding_dim)
        self.dcode_embedding = nn.Embedding(num_embeddings=20, embedding_dim=embedding_dim)
        self.fc_b1 = nn.Linear((embedding_dim*2)+3, 128)
        self.fc_b2 = nn.Linear(128, 64)
        self.fc_b3 = nn.Linear(64, 32)

        self.fc_cat = nn.Linear(16*2+32, 128) 
        self.dropout_cat = nn.Dropout(p=0.2)
        self.top = nn.Linear(128,1)

    def forward(self, r, h, t, s):
        # realtime sequence
        lstm_out_r, (hn, cn) = self.lstm_r1(r)
        last_state_r = lstm_out_r[:,-1,:]
        realtime_vec = self.dropout_r1(last_state_r)
        realtime_vec = F.relu(self.fc_r1(realtime_vec))
        
        # realtime sequence
        lstm_out_h, (hn, cn) = self.lstm_h1(h)
        last_state_h = lstm_out_h[:,-1,:]
        history_vec = self.dropout_r1(last_state_h)
        history_vec = F.relu(self.fc_h1(history_vec))

        # non-sequenctials
        station_vec = self.station_embedding(s[:,0])
        dcode_vec = self.dcode_embedding(s[:1])

        fc_in = torch.cat((station_vec, dcode_vec, t), dim=1)
        feature_vec = F.relu(self.fc_b1(fc_in))
        feature_vec = F.relu(self.fc_b2(feature_vec))
        feature_vec = F.relu(self.fc_b3(feature_vec))

        # concatenation
        cat_vec = torch.cat((history_vec, feature_vec), dim=1)
        fc_out = F.relu(self.fc_cat(cat_vec))
        fc_out = self.dropout_cat(fc_out)
        fc_out = self.top(fc_out)
        return torch.sigmoid(fc_out)

## 5.  Train

In [29]:
data.shape[0]

35

In [30]:
VALID_FRAC = 0.2
num_valid = int(3402 * VALID_FRAC) * data.shape[0]

trainset = EvcDataset(R[:-num_valid,], H[:-num_valid], T[:-num_valid,], S[:-num_valid,], Y[:-num_valid,])
validset = EvcDataset(R[-num_valid:,], H[-num_valid:,], T[-num_valid:,], S[-num_valid:,], Y[-num_valid:,])

In [31]:
R.shape, H.shape, T.shape, S.shape, Y.shape

((119070, 12, 1), (119070, 3, 1), (119070, 3), (153930, 2), (119070, 1))

In [32]:
len(trainset), len(validset)

(95270, 23800)

In [33]:
train_loader = DataLoader(trainset, batch_size=32, shuffle=True)
valid_loader = DataLoader(validset, batch_size=1024)

In [38]:
next(iter(train_loader))[0].shape

torch.Size([32, 12, 1])

In [40]:
r = next(iter(train_loader))[0]

In [41]:
r.shape

torch.Size([32, 12, 1])

In [51]:
rnn = nn.LSTM(1, 16, 1)
out, hidden = rnn(r)
out.shape

torch.Size([32, 12, 16])

In [52]:
out[:,-1,:]

tensor([[-8.0211e-03,  2.6962e-02, -1.2582e-01,  1.3827e-02,  3.8166e-03,
         -3.2350e-02,  2.8341e-03,  2.3226e-02,  2.1473e-02, -2.5127e-03,
          4.9307e-02,  1.6179e-02, -5.6218e-02, -1.5274e-02, -4.0569e-02,
         -5.1519e-02],
        [-1.5412e-02,  2.4869e-02, -1.9483e-01,  1.7900e-02,  2.1373e-02,
         -4.1099e-02, -4.5790e-03,  3.1797e-02,  2.9766e-02, -1.4371e-03,
          7.1979e-02,  1.6556e-02, -7.2634e-02, -2.1721e-02, -7.5205e-02,
         -7.3225e-02],
        [-2.0491e-02,  3.9258e-02, -2.5169e-01,  3.7159e-02,  4.2464e-03,
         -5.5926e-02, -9.7950e-04,  5.6211e-02,  2.2276e-02,  3.0211e-03,
          1.0131e-01,  3.7265e-02, -8.2557e-02, -3.1744e-02, -1.0936e-01,
         -7.5722e-02],
        [-2.4658e-02,  6.1908e-03, -2.5218e-01,  1.5830e-02,  4.7510e-02,
         -3.6343e-02, -2.3426e-02,  3.1158e-02,  3.1563e-02,  7.1803e-04,
          8.8331e-02,  4.2758e-03, -7.3128e-02, -2.5658e-02, -1.0889e-01,
         -8.3126e-02],
        [-2.4884e-02

In [53]:
rnn = nn.LSTM(1, 16, 1, batch_first=True)
out, hidden = rnn(r)
out.shape

torch.Size([32, 12, 16])

In [54]:
out[:,-1,:]

tensor([[-3.2015e-02,  6.9692e-02,  2.5801e-01,  9.5976e-02,  3.1905e-02,
          4.7648e-02, -1.0323e-01,  1.4276e-02,  1.4304e-02,  7.7959e-03,
          1.1104e-01,  7.8984e-03, -1.2351e-01,  1.8536e-02,  1.3944e-01,
          2.0081e-01],
        [-4.3550e-02,  5.0355e-02,  2.1877e-01,  5.8685e-02, -8.4632e-04,
          7.8423e-03, -1.2262e-01,  1.7988e-03, -2.0463e-02,  3.2695e-02,
          9.5255e-02,  5.3094e-02, -1.2216e-01,  4.7252e-03,  1.1579e-01,
          1.9031e-01],
        [-2.8334e-02,  7.6498e-02,  2.7718e-01,  9.7952e-02,  4.5652e-02,
          5.2140e-02, -9.5030e-02,  1.5746e-02,  2.4069e-02, -2.1760e-03,
          1.1701e-01, -2.9211e-03, -1.2693e-01,  2.4762e-02,  1.4707e-01,
          2.0077e-01],
        [-5.1653e-02,  3.3647e-02,  1.7348e-01,  3.5897e-02, -3.7829e-02,
         -1.6830e-02, -1.4453e-01, -2.4953e-03, -4.8511e-02,  5.0389e-02,
          8.2108e-02,  8.1897e-02, -1.1834e-01, -1.4323e-02,  9.7605e-02,
          1.8510e-01],
        [-2.8798e-02

In [132]:
import torch
from sklearn.metrics import r2_score

def train(model, train_dataloader, optim, epoch):
    model.train()
    criterion = nn.MSELoss()
    for b_i, (R, H, T, S, y) in enumerate(train_dataloader):

        optim.zero_grad()
        pred = model(R, H, T, S)
    
        loss = criterion(pred, y)
        loss.backward()
        optim.step()
        
        if b_i % 3000 == 0:
            print('epoch: {} [{}/{} ({:.0f}%)]\t training loss: {:.6f}'.format(
                epoch, b_i * len(R), len(train_dataloader.dataset),
                100 * b_i / len(train_dataloader), loss.item()
            ))

def calc_accuracy(y_true, y_pred):
    error = y_true - y_pred
    return 1- (torch.norm(error) / torch.norm(y_true))


def test(model, test_dataloader):
    model.eval()
    criterion = nn.MSELoss(reduction='sum')
    loss = 0

    with torch.no_grad():
        pred_total = torch.Tensor()
        y_total = torch.Tensor()

        for R, H, T, S, y in test_dataloader:
            pred = model(R, H, T, S)
            loss += criterion(pred, y).item()
            pred_total = torch.cat((pred_total, pred.flatten()), dim=0)
            y_total = torch.cat((y_total, y.flatten()), dim=0)

    loss /= len(test_dataloader.dataset)
    accuracy = calc_accuracy(y_total, pred_total)
    r2 = r2_score(y_total, pred_total)

    print('\nTest dataset:  Loss: {:.4f}, Accuracy: {:.4f}, R2: {:.4f}'.format(loss, accuracy, r2))

In [133]:
model = BaseHybrid(hidden_size=32, station_embedding_dim=30, embedding_dim=16)
optim = torch.optim.Adam(model.parameters(), lr=1e-2)

N_EPOCH = 10


for epoch in range(1,N_EPOCH+1):
    train(model, train_loader, optim, epoch)
    test(model, valid_loader)
    print()


Test dataset:  Loss: 0.1085, Accuracy: 0.6375, R2: -0.2256



KeyboardInterrupt: 

In [112]:
model_2 = GroupedHybrid(hidden_size=16, embedding_dim=16)
optim = torch.optim.Adam(model_2.parameters(), lr=1e-3)

N_EPOCH = 20

for epoch in range(1,N_EPOCH+1):
    train(model_2, train_loader, optim, epoch)
    test(model_2, valid_loader)
    print()


Test dataset:  Loss: 0.0788, Accuracy: 0.6909, R2: 0.1170


Test dataset:  Loss: 0.0779, Accuracy: 0.6927, R2: 0.1268


Test dataset:  Loss: 0.0775, Accuracy: 0.6934, R2: 0.1313


Test dataset:  Loss: 0.0777, Accuracy: 0.6931, R2: 0.1295



KeyboardInterrupt: 