In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

from utils import split_sequences, station_features, time_features
from dataset import EvcDataset
from models import BaseHybrid, GatingHybrid
from basemodels import HistoricBase, RealtimeBase, MultiSeqBase, GatingSeqBase, GatingSeqEmbedding

In [3]:
def train(model, train_dataloader, optim, epoch, verbose=0):
    """Run one pass over ``train_dataloader``, updating ``model`` with MSE loss.

    Args:
        model: module called as ``model(R, H, T, S)`` for each batch.
        train_dataloader: iterable yielding ``(R, H, T, S, y)`` batches. When
            ``verbose`` is truthy it must also support ``len()`` and expose a
            ``dataset`` attribute (both are used for the progress line).
        optim: optimizer whose ``zero_grad()`` / ``step()`` are called per batch.
        epoch: value echoed in the progress line; not used otherwise.
        verbose: if truthy, print a progress line every 3000 batches.

    Returns:
        None. The model parameters are updated in place.
    """
    mse = nn.MSELoss()
    model.train()

    for batch_idx, batch in enumerate(train_dataloader):
        r_in, h_in, t_in, s_in, target = batch

        # Standard step: clear stale gradients, forward, backward, update.
        optim.zero_grad()
        batch_loss = mse(model(r_in, h_in, t_in, s_in), target)
        batch_loss.backward()
        optim.step()

        # Progress is only reported on every 3000th batch, and only on request.
        if not verbose or batch_idx % 3000 != 0:
            continue

        seen = batch_idx * len(r_in)
        total = len(train_dataloader.dataset)
        percent = 100 * batch_idx / len(train_dataloader)
        print('epoch: {} [{}/{} ({:.0f}%)]\t training loss: {:.6f}'.format(
            epoch, seen, total, percent, batch_loss.item()
        ))

def test(model, test_dataloader):
    """Evaluate ``model`` on ``test_dataloader`` and print summary metrics.

    Three numbers are printed on one line:
      * Loss: summed squared error divided by ``len(test_dataloader.dataset)``.
      * Accuracy: ``1 - ||y - pred|| / ||y||`` over all flattened values.
      * R2: ``sklearn.metrics.r2_score`` over all flattened values.

    Args:
        model: module called as ``model(R, H, T, S)`` for each batch.
        test_dataloader: iterable yielding ``(R, H, T, S, y)`` batches and
            exposing a ``dataset`` attribute that supports ``len()``.

    Returns:
        None. Results are printed only.

    Raises:
        ValueError: if ``test_dataloader`` yields no batches.
    """
    model.eval()
    criterion = nn.MSELoss(reduction='sum')
    loss = 0

    # Collect per-batch results and concatenate once after the loop; calling
    # torch.cat inside the loop re-copies everything accumulated so far.
    pred_batches = []
    y_batches = []

    with torch.no_grad():
        for R, H, T, S, y in test_dataloader:
            pred = model(R, H, T, S)
            loss += criterion(pred, y).item()
            # Move to CPU so the metrics below work whatever device the model
            # and the batches live on.
            pred_batches.append(pred.detach().flatten().cpu())
            y_batches.append(y.detach().flatten().cpu())

    if not pred_batches:
        raise ValueError('test_dataloader yielded no batches; nothing to evaluate')

    pred_total = torch.cat(pred_batches, dim=0)
    y_total = torch.cat(y_batches, dim=0)

    loss /= len(test_dataloader.dataset)
    error = y_total - pred_total
    # Relative-error based score; undefined (nan/inf) if every target is zero.
    accuracy = 1 - (torch.norm(error) / torch.norm(y_total))
    r2 = r2_score(y_total.numpy(), pred_total.numpy())

    print('Test dataset:  Loss: {:.4f}, Accuracy: {:.4f}, R2: {:.4f}'.format(loss, accuracy, r2))

In [6]:
# --- Load raw tables -------------------------------------------------------
# `history` has a `time` column (parsed as datetimes); `station` holds station metadata.
history = pd.read_csv('./data/input_table/history_by_station.csv', parse_dates=['time'])
station = pd.read_csv('./data/input_table/station_info.csv')
# Transpose so that each row is a station and each column a timestamp.
data = history.set_index('time').T.reset_index().rename(columns={'index':'station_name'})
# Keep only stations that also appear in the station metadata table.
data = data[data.station_name.isin(station.station_name)].set_index('station_name')
# Drop stations whose row mean exceeds 0.9.
# NOTE(review): presumably filters out stations that are almost always at the maximum value — confirm intent.
data = data[data.mean(axis=1).le(0.9)]
# NOTE(review): magic number — only the first 30 remaining stations are used.
data = data[:30]

print('generating inputs...')
N_STEPS_IN = 12
N_STEPS_OUT = 6
N_HISTORY = 4

n_stations = data.shape[0]
# NOTE(review): 336 is a magic number (presumably time steps per week, e.g. 7 * 48) and must
# agree with whatever split_sequences uses internally — TODO confirm.
n_windows = data.shape[1] - (N_STEPS_OUT + 336*N_HISTORY)
R, H, Y = split_sequences(data.values, N_STEPS_IN, N_STEPS_OUT, N_HISTORY)
T = time_features(data.columns, N_STEPS_IN, N_STEPS_OUT, N_HISTORY, n_stations)
S = station_features(station_array=data.index, station_df=station, n_windows=n_windows) 
print('done!')

# Append a trailing axis of size 1 to R.
R = R[:, :, np.newaxis]

# Select index OUTPUT_IDX along axis 1 of H, T and Y.
# NOTE(review): presumably this picks a single forecast step out of N_STEPS_OUT — confirm.
OUTPUT_IDX = 1
H = H[:, OUTPUT_IDX, :, np.newaxis]
T = T[:,OUTPUT_IDX,:]
Y = Y[:,OUTPUT_IDX, np.newaxis]

# Hold out the last VALID_FRAC of the rows for validation.
# NOTE(review): whether this is a hold-out in time or a hold-out of the last stations depends
# on the row ordering produced by split_sequences — confirm.
# NOTE(review): if num_valid were 0, `[:-num_valid]` would be empty and `[-num_valid:]` would be everything.
VALID_FRAC = 0.1
num_valid = int(data.shape[0] * VALID_FRAC * n_windows)

trainset = EvcDataset(R[:-num_valid,], H[:-num_valid], T[:-num_valid,], S[:-num_valid,], Y[:-num_valid,])
validset = EvcDataset(R[-num_valid:,], H[-num_valid:,], T[-num_valid:,], S[-num_valid:,], Y[-num_valid:,])
print(f'Trainset Size: {len(trainset)}, Validset Size: {len(validset)}')

generating inputs...
done!
Trainset Size: 82782, Validset Size: 9198


In [11]:
# Display the first training sample to sanity-check what EvcDataset returns per item.
trainset[0]

(tensor([[1.0000],
         [1.0000],
         [0.5556],
         [0.7556],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000]]),
 tensor([[1.],
         [1.],
         [1.],
         [1.]]),
 tensor([1, 3, 0], dtype=torch.int32),
 tensor([0, 0], dtype=torch.int32),
 tensor([1.]))

In [72]:
# Draw as many samples per epoch as the training set holds.
num_samples = len(trainset)

# Per-sample sampling weight: 1 where the last tensor returned by `trainset[:]`
# is exactly 0, and 0.1 everywhere else.
# NOTE(review): presumably that last tensor is the target y — confirm against EvcDataset.
weights = np.where(trainset[:][-1].flatten() == 0., 1, 0.1)

sampler = torch.utils.data.WeightedRandomSampler(
    weights=weights,
    num_samples=num_samples,
    replacement=True,
    generator=None,
)


In [98]:
# Training loader that draws batches of 32 through `sampler` rather than uniform shuffling.
# NOTE(review): requires `sampler` to already exist in the kernel when this cell runs.
train_loader = DataLoader(trainset, batch_size=32, sampler=sampler)

In [87]:
# Training loader with plain uniform shuffling, batch size 32.
# NOTE(review): this rebinds `train_loader`. When the notebook is run top to bottom it
# replaces the sampler-based loader created from the same `trainset` in an earlier cell.
# The execution counts suggest the cells were run out of order — confirm which loader is intended.
train_loader = DataLoader(trainset, batch_size=32, shuffle=True)