An attempt at fitting very basic models to the post-processing data:
1. Linear regression
2. Simple station embedding

In [None]:
import dask.dataframe as dd
import numpy as np
import os
import pathlib
import torch
import torch.nn as nn
import pandas as pd

In [None]:
# Root directory of the datasets, taken from the DATA_DIR environment variable.
_data_dir_env = os.getenv('DATA_DIR')
if _data_dir_env is None:
    # pathlib.Path(None) would fail with an opaque TypeError; say what is missing.
    raise RuntimeError('The DATA_DIR environment variable must be set.')
DATA_DIR = pathlib.Path(_data_dir_env)
# Sub-directory holding the parquet files that are loaded into `df`.
INPUT_DIR = DATA_DIR / '2021-03-17-ppdataset'

In [None]:
# Read every parquet file under INPUT_DIR with dask, then compute the result
# so that `df` holds the whole dataset in memory for the cells that follow.
parquet_glob = INPUT_DIR / '*.parquet'
df = dd.read_parquet(parquet_glob).compute()

In [None]:
# Derived columns:
#   step_td  - the 'step' column (seconds) converted to a timedelta
#   valid    - 'date' shifted by that lead time
#   error_2t - difference between the gdps_2t and obs_2t columns; used as the
#              regression target further down
# The lowercase 's' unit is used because the uppercase 'S' alias is deprecated
# in recent pandas releases.
df['step_td'] = pd.to_timedelta(df['step'], unit='s')
df['valid'] = df['date'] + df['step_td']
df['error_2t'] = df['gdps_2t'] - df['obs_2t']

In [None]:
df['step'].max()

In [None]:
# Seasonal signal: sine of the day-of-year of 'date', expressed as a fraction
# of a 366-day year, stored as float32.
# NOTE(review): a sine alone maps two different days of the year to the same
# value; consider adding a cosine term if that ambiguity matters.
year_fraction = df['date'].dt.dayofyear / 366
yearly_component = np.sin(year_fraction * 2 * np.pi).to_numpy().astype(np.float32)

In [None]:
# Diurnal signal: sine of the hour of the 'valid' timestamp, expressed as a
# fraction of a 24-hour day, stored as float32.
hour_fraction = df['valid'].dt.hour / 24
daily_component = np.sin(hour_fraction * 2 * np.pi).to_numpy().astype(np.float32)

In [None]:
# Lead time scaled by a fixed horizon of 237 hours expressed in seconds.
# NOTE(review): 237 h is presumably the longest lead time in the dataset -
# confirm against df['step'].max().
horizon_seconds = 237 * 60 * 60
step_component = (df['step'] / horizon_seconds).to_numpy()

In [None]:
# Pack the three temporal signals into one (n_rows, 3) tensor, one column each:
# column 0 = yearly, column 1 = daily, column 2 = lead time.
temporal_components = (yearly_component, daily_component, step_component)
temporal_embedding = torch.empty((len(df), len(temporal_components)))
for column_index, component in enumerate(temporal_components):
    temporal_embedding[:, column_index] = torch.from_numpy(component)

In [None]:
temporal_embedding[0:10]

In [None]:
temporal_embedding.mean(dim=0)

In [None]:
df['station'].value_counts().plot()

In [None]:
# Distinct station identifiers in sorted order (sorted() accepts the set directly).
station_list = sorted(set(df['station']))

In [None]:
len(station_list)

In [None]:
# Integer code per row identifying its station, obtained through a pandas
# categorical conversion of the 'station' column.
station_categories = df['station'].astype('category')
station_ids = station_categories.cat.codes.to_numpy()

In [None]:
# Tensor view of the integer station codes. Despite the name this holds codes,
# not learned embedding vectors; it is cast with .long()/.float() where needed.
station_embedding = torch.from_numpy(station_ids)

In [None]:
# Model inputs: every column whose name begins with 'gdps'.
feature_cols = [name for name in df.columns if name.startswith('gdps')]

In [None]:
feature_cols

In [None]:
# Uninitialised (n_rows, n_feature_columns) buffer; the loop in the following
# cell fills every column.
features = torch.empty(len(df), len(feature_cols))

In [None]:
# Copy each feature column of the DataFrame into its own column of `features`.
for column_index, column_name in enumerate(feature_cols):
    column_values = df[column_name].to_numpy()
    features[:, column_index] = torch.from_numpy(column_values)

In [None]:
features.shape

In [None]:
# Regression target: the 'error_2t' column as a tensor.
target_values = df['error_2t'].to_numpy()
ys = torch.from_numpy(target_values)

In [None]:
class WeatherInMemoryDataset:
    """Map-style dataset over pre-built, in-memory sample arrays.

    Indexing returns a ``(station, temporal, features, target)`` tuple taken
    from the same position of each of the four inputs.

    Args:
        station_embedding: per-sample station codes.
        temporal_embedding: per-sample temporal features.
        features: per-sample model inputs; its first dimension defines the
            dataset length.
        y: per-sample regression targets.

    Raises:
        ValueError: if the four inputs do not all have the same length.
    """

    def __init__(self, station_embedding, temporal_embedding, features, y):
        # Fail early: a length mismatch would otherwise only show up as an
        # index error somewhere in the middle of iteration.
        lengths = {
            'station_embedding': len(station_embedding),
            'temporal_embedding': len(temporal_embedding),
            'features': len(features),
            'y': len(y),
        }
        if len(set(lengths.values())) > 1:
            raise ValueError(f'All inputs must have the same length, got {lengths}')

        self.station = station_embedding
        self.temporal = temporal_embedding
        self.x = features
        self.y = y

    def __len__(self):
        """Return the number of samples (rows of the feature array)."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return the ``(station, temporal, features, target)`` tuple at ``idx``."""
        return self.station[idx], self.temporal[idx], self.x[idx], self.y[idx]

In [None]:
station_ids

In [None]:
# Station codes are cast to int64 so they can be used to index the model's
# station embedding table.
station_codes = torch.from_numpy(station_ids).long()
dataset = WeatherInMemoryDataset(
    station_embedding=station_codes,
    temporal_embedding=temporal_embedding,
    features=features,
    y=ys,
)

In [None]:
dataset[0]

In [None]:
station_ids.shape

In [None]:
station_embedding.float().shape

In [None]:
# Shuffled mini-batches of 20000 samples drawn from the in-memory dataset.
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=20000,
    shuffle=True,
)

In [None]:
# NOTE(review): when the notebook is run top to bottom, `x` has not been
# assigned yet at this point (it is first bound when a batch is unpacked
# further down), so this cell raises NameError.
x

In [None]:
class DumbWeatherModel(nn.Module):
    """Multilinear model mixing station, temporal and forecast features.

    A learned per-station vector and a linear projection of the temporal
    features are contracted with the input features through a 3-way kernel;
    the result is rescaled and shifted by station/time dependent terms.
    All parameters are double precision.

    Args:
        n_stations: number of rows in the station embedding table.
        n_features: number of input features per sample.
        station_dim: size of each station vector.
        temporal_dim: size of the projected temporal vector.
        temporal_features: number of raw temporal inputs per sample.
    """

    def __init__(self, n_stations=1678, n_features=20, station_dim=128,
                 temporal_dim=32, temporal_features=3):
        super().__init__()

        self.station_embedding = nn.Parameter(
            torch.randn(n_stations, station_dim).double() / station_dim
        )
        self.temporal_embedding = nn.Linear(
            in_features=temporal_features, out_features=temporal_dim
        ).double()

        self.kernel = nn.Parameter(
            torch.randn(station_dim, temporal_dim, n_features).double()
            / (station_dim * temporal_dim)
        )

        # Cast to double BEFORE wrapping in nn.Parameter: calling .double() on
        # a Parameter returns a plain tensor, which the module does not
        # register and which therefore never reaches the optimizer.
        self.bias = nn.Parameter(torch.randn(station_dim, temporal_dim).double())
        self.scale = nn.Parameter(torch.randn(station_dim, temporal_dim).double())

    def forward(self, station_code, temporal_code, x):
        """Predict one value per sample.

        Args:
            station_code: integer tensor of shape (batch,) indexing the
                station embedding table.
            temporal_code: double tensor of shape (batch, temporal_features).
            x: double tensor of shape (batch, n_features).

        Returns:
            Double tensor of shape (batch,).
        """
        station = self.station_embedding[station_code]
        temporal = self.temporal_embedding(temporal_code)

        # Contract kernel[i, j, k] with station[b, i], temporal[b, j], x[b, k].
        pred = torch.einsum('ijk,bi,bj,bk->b', self.kernel, station, temporal, x)
        # Bilinear forms station^T M temporal give a per-sample scalar each.
        bias = torch.einsum('bi,ij,bj->b', station, self.bias, temporal)
        scale = torch.einsum('bi,ij,bj->b', station, self.scale, temporal)

        return pred * scale + bias

In [None]:
# Instantiate the model; no constructor arguments are passed.
model = DumbWeatherModel()

In [None]:
# Pull a single batch from the loader.
b = next(iter(loader))

In [None]:
# Unpack the batch in the order returned by WeatherInMemoryDataset.__getitem__.
station_code, temporal_code, x, y = b

In [None]:
# Plain stochastic gradient descent over the model's registered parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

In [None]:
# `loss_fn` is not defined anywhere else in the notebook; mean squared error
# is used here as the standard loss for a scalar regression target.
loss_fn = nn.MSELoss()

for epoch in range(10):
    for b in loader:

        station_code, temporal_code, x, y = b

        # Gradients accumulate across backward() calls, so clear the ones
        # left over from the previous batch before computing new ones.
        optimizer.zero_grad()

        # The model's parameters are double precision, so inputs and target
        # are cast to match.
        y_hat = model(station_code, temporal_code.double(), x.double())
        loss = loss_fn(y_hat, y.double())

        print(loss.item())

        loss.backward()
        optimizer.step()

In [None]:
y_hat

In [None]:
y

In [None]:
model.bias

In [None]:
model.scale

In [None]:
model.station_embedding

In [None]:
model.temporal_embedding

In [None]:
model.temporal_embedding.weight