In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

from utils import split_sequences, station_features, time_features
from dataset import EvcDataset
from basemodels import HistoricBase, RealtimeBase, MultiSeqBase, MultiSeqHybrid, MultiSeqUmap

### 1. Load Data

In [2]:
history = pd.read_csv('./data/input_table/history_by_station_pub.csv', parse_dates=['time'])
station_attributes = pd.read_csv('./data/input_table/pubstation_feature_scaled.csv')
station_embeddings = pd.read_csv('./data/input_table/pubstation_umap-embedding.csv')

sid_encoder = {name:idx for idx, name in enumerate(station_embeddings.sid)}
station_embeddings.sid = station_embeddings.sid.map(sid_encoder)
station_attributes.sid = station_attributes.sid.map(sid_encoder)


# transforms targer var. to binary indicator (1:high availabiltity, 0: low availability)
data = history.set_index('time').mask(lambda x: x < 0.5,  1).mask(lambda x: x != 1, 0)
data = data.T.reset_index().rename(columns={'index':'sid'})
data.sid = data.sid.map(sid_encoder)

data = data[data.sid.isin(station_attributes.sid)].set_index('sid')  # station feature가 있는 데이터로 한정
data = data[data.mean(axis=1).le(0.9)]  # False 라벨이 10% 이상 존재하는 데이터 사용
print(data.shape)

(75, 6624)


In [3]:
umap_embedding = torch.tensor(station_embeddings.drop(columns=['sid']).values).float()

### 2. feature generation

In [4]:
print('generating inputs...')
N_IN = 12
N_OUT = 6
N_HIST = 4

n_stations = data.shape[0]
n_windows = data.shape[1] - (N_OUT + 504*N_HIST)

R_seq, H_seq, Y_seq = split_sequences(sequences=data.values, n_steps_in=N_IN, n_steps_out=N_OUT, n_history=N_HIST)
T = time_features(time_idx=data.columns, n_steps_in=N_IN, n_steps_out=N_OUT, n_history=N_HIST, n_stations=n_stations)
S = station_features(station_array=data.index, station_df=station_attributes, n_windows=n_windows) 
print('done!')

generating inputs...
done!


### 3. Set dimension

In [5]:
R_seq = R_seq[:, :, np.newaxis]

OUTPUT_IDX = 1
H_seq = H_seq[:, OUTPUT_IDX, :, np.newaxis]
T = T[:,OUTPUT_IDX,:]
Y = Y_seq[:,OUTPUT_IDX, np.newaxis]

In [6]:
print(R_seq.shape, H_seq.shape, Y.shape, T.shape, S.shape)
print('done!')

(345150, 12, 1) (345150, 4, 1) (345150, 1) (345150, 3) (345150, 16)
done!


### 4. Split Train:Valid

In [7]:
TRAIN_FRAC = 0.9
n_train = int(data.shape[0] * TRAIN_FRAC * n_windows)

trainset = EvcDataset(R_seq[:n_train,], H_seq[:n_train], T[:n_train,], S[:n_train,], Y[:n_train,])
validset = EvcDataset(R_seq[n_train:,], H_seq[n_train:,], T[n_train:,], S[n_train:,], Y[n_train:,])
print(f'Trainset Size: {len(trainset)}, Validset Size: {len(validset)}')

Trainset Size: 310635, Validset Size: 34515


### 5. Dataloader

In [8]:
# with negative over sampling
weights = np.where(trainset[:][-1].flatten() == 0., 5, 1)  # 5배
num_samples = len(trainset)
sampler = torch.utils.data.WeightedRandomSampler(weights, num_samples, replacement=True, generator=None)
train_loader = DataLoader(trainset, batch_size=32, sampler=sampler)

# without sampling
# train_loader = DataLoader(trainset, batch_size=32, shuffle=True)
valid_loader = DataLoader(validset, batch_size=1024, shuffle=False)

### 6. Train Test Functions

In [9]:
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, roc_auc_score, balanced_accuracy_score

def train(model, train_dataloader, optim, epoch, verbose=0):
    model.train()
    criterion = nn.BCELoss()
    for b_i, (R, H, T, S, y) in enumerate(train_dataloader):
        optim.zero_grad()
        pred = model(R, H, T, S)
        loss = criterion(pred, y)
        loss.backward()
        optim.step()
        
        if verbose:
            if b_i % 3000 == 0:
                print('epoch: {} [{}/{} ({:.0f}%)]\t training loss: {:.6f}'.format(
                    epoch, b_i * len(R), len(train_dataloader.dataset),
                    100 * b_i / len(train_dataloader), loss.item()
                ))

def test(model, test_dataloader):
    model.eval()
    criterion = nn.MSELoss(reduction='sum')
    loss = 0

    with torch.no_grad():
        pred_total = torch.Tensor()
        y_total = torch.Tensor()

        for R, H, T, S, y in test_dataloader:
            pred = model(R, H, T, S)
            loss += criterion(pred, y).item()
            pred_total = torch.cat((pred_total, pred.flatten()), dim=0)
            y_total = torch.cat((y_total, y.flatten()), dim=0)

    loss /= len(test_dataloader.dataset)
    y_total = y_total.int().numpy()
    pred_total = pred_total.numpy()
    pred_label = np.where(pred_total > 0.5, 1, 0)


    recall = recall_score(y_total, pred_label)
    precision = precision_score(y_total, pred_label)
    f1 = f1_score(y_total, pred_label)
    accuracy = accuracy_score(y_total, pred_label)
    bal_accuracy = balanced_accuracy_score(y_total, pred_label)
    auc = roc_auc_score(y_total, pred_total)

    # print('Test dataset:  Loss: {:.4f}, Recall: {:.4f}, Precision: {:.4f}, F1: {:.4f}, Accuracy: {:.4f}, Balanced-Accuracy: {:.4f}, AUC: {:.4f}' \
    # .format(loss, recall, precision, f1, accuracy, bal_accuracy, auc))
    print('Test dataset:  Loss: {:.4f}, Accuracy: {:.4f}, Balanced-Accuracy: {:.4f}, AUC: {:.4f}' \
    .format(loss, accuracy, bal_accuracy, auc))


In [10]:
models = {'HistoricBase':HistoricBase, 'RealtimeBase':RealtimeBase, 'MultiSeqBase':MultiSeqBase, 'MultiSeqHybrid':MultiSeqHybrid, 'MultiSeqUmap':MultiSeqUmap}
for name, basemodel in models.items():
    print(f'-------{name}-------')
    if name == 'MultiSeqUmap':
        model = basemodel(hidden_size=16, embedding_dim=8, pretrained_embedding=umap_embedding)
    else:
        model = basemodel(hidden_size=16, embedding_dim=8)
    optim = torch.optim.Adam(model.parameters())

    N_EPOCH = 10
    for epoch in range(1,N_EPOCH+1):
        print(f'<<Epoch {epoch}>>', end='\t')
        train(model, train_loader, optim, epoch, verbose=0)
        test(model, valid_loader)

    # y_true = sample_data[4].flatten().detach().numpy()
    # y_pred = model(*sample_data[:4]).flatten().detach().numpy()
    # y_pred = np.where(y_pred > 0.5, 1, 0)

    # fig, ax = plt.subplots(figsize=(40,8))
    # ax.plot(y_true, color='g')
    # ax.plot(y_pred, color='r')
    # plt.savefig(f'./images/{name}_out-{OUTPUT_IDX}_result.png')

-------HistoricBase-------
<<Epoch 1>>	Test dataset:  Loss: 0.2014, Accuracy: 0.5960, Balanced-Accuracy: 0.6327, AUC: 0.6852
<<Epoch 2>>	Test dataset:  Loss: 0.1976, Accuracy: 0.6142, Balanced-Accuracy: 0.6381, AUC: 0.6868
<<Epoch 3>>	Test dataset:  Loss: 0.2043, Accuracy: 0.6246, Balanced-Accuracy: 0.6340, AUC: 0.6875
<<Epoch 4>>	Test dataset:  Loss: 0.2016, Accuracy: 0.6201, Balanced-Accuracy: 0.6350, AUC: 0.6870
<<Epoch 5>>	Test dataset:  Loss: 0.2022, Accuracy: 0.5934, Balanced-Accuracy: 0.6373, AUC: 0.6862
<<Epoch 6>>	Test dataset:  Loss: 0.2070, Accuracy: 0.6110, Balanced-Accuracy: 0.6384, AUC: 0.6856
<<Epoch 7>>	Test dataset:  Loss: 0.2169, Accuracy: 0.5966, Balanced-Accuracy: 0.6414, AUC: 0.6883
<<Epoch 8>>	Test dataset:  Loss: 0.2115, Accuracy: 0.5965, Balanced-Accuracy: 0.6349, AUC: 0.6830
<<Epoch 9>>	Test dataset:  Loss: 0.2024, Accuracy: 0.6077, Balanced-Accuracy: 0.6358, AUC: 0.6856
<<Epoch 10>>	Test dataset:  Loss: 0.2071, Accuracy: 0.6091, Balanced-Accuracy: 0.6393, AUC: