In [1]:
import pandas as pd
import numpy as np

weather = pd.read_csv("weather.csv", index_col="DATE")

In [2]:
null_pct = weather.apply(pd.isnull).sum()/weather.shape[0]
valid_columns = weather.columns[null_pct < .05]

In [3]:
weather = weather[valid_columns].copy()
weather.columns = weather.columns.str.lower()

In [4]:
weather = weather.ffill()

In [5]:
weather.index = pd.to_datetime(weather.index)

In [6]:
weather = weather[(weather.index.month != 2) | (weather.index.day != 28)]

In [7]:
weather = weather.ffill()

In [8]:
weather.shape

(19234, 7)

In [9]:
predictors = weather.columns[~weather.columns.isin(["name", "station"])]

In [10]:
import torch
torch.manual_seed(0)

DEVICE = "mps"
device = torch.device(DEVICE)

In [11]:
from torch.utils.data import Dataset
import math
from statistics import mean

SEQ_LEN = 365 * 2

class WeatherDataset(Dataset):
    def __init__(self, dataset, predictors):
        self.dataset = dataset.copy()
        self.width = SEQ_LEN
        self.predictors = predictors

    def __len__(self):
        return self.dataset.shape[0] - self.width
    
    def column_to_mat(self, col):
        return col.values.reshape((self.rows, self.columns))
        
    def __getitem__(self, idx):
        history = self.dataset.iloc[idx:(idx+self.width),:]
        target = history.iloc[-1]["tmax"]
        history = history[:-1][self.predictors]
        
        mats = torch.tensor(history.T.to_numpy(), dtype=torch.float32)
        return mats, target, idx + self.width - 1

In [12]:
data = WeatherDataset(weather, predictors)
train_size = int(0.7 * len(data))
valid_size = int(0.1 * len(data))
test_size = len(data) - (train_size + valid_size)
train_data, valid_data, test_data = torch.utils.data.random_split(data, [train_size, valid_size, test_size], generator=torch.Generator().manual_seed(1))

In [13]:
from torch.utils.data import DataLoader

BATCH_SIZE = 512
EPOCHS = 25

In [14]:
train = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=True)
test = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

In [16]:
from torch import nn

class NeuralNetwork(nn.Module):
    def __init__(self, seq_len):
        super(NeuralNetwork, self).__init__()
        
        self.hidden_layer = 256
        
        self.lstm = nn.Sequential(
            nn.LSTM(seq_len, self.hidden_layer, 1)
        )
        
        self.dense = nn.Sequential(
            nn.Linear(self.hidden_layer * len(predictors), 64),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        batch_size = x.shape[0]
        x, _ = self.lstm(x)
        x = x.view(batch_size, -1)
        x = self.dense(x)
        return x

In [17]:
model = NeuralNetwork(SEQ_LEN - 1).to(device)
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-7)

In [None]:
size = len(train.dataset)

for epoch in range(EPOCHS):
    for batch, (tensors, targets, _) in enumerate(train): 
        optimizer.zero_grad()
        
        tensors = tensors.to(device)
        pred = model(tensors).squeeze(1)
        
        targets = targets.float().to(device)
        
        loss = loss_fn(pred, targets)

        loss.backward()
        optimizer.step()
    
    losses = []
    with torch.no_grad():
        for batch, (tensors, targets, _) in enumerate(valid):

            tensors = tensors.to(device)
            pred = model(tensors).squeeze(1)
            
            targets = targets.float().to(device)
            loss = loss_fn(pred, targets)
            losses.append(loss.item())
    
    print(f"loss: {mean(losses):>7f}  [{epoch}]")

loss: 621765.156250  [0]
loss: 86471.001953  [1]
loss: 42469.603516  [2]
loss: 36846.265625  [3]
loss: 34507.838379  [4]
loss: 32460.539551  [5]
loss: 30683.089844  [6]
loss: 30081.360352  [7]
loss: 29648.869141  [8]
loss: 28867.395508  [9]
loss: 28875.926758  [10]
loss: 27989.248047  [11]
loss: 27857.603027  [12]
loss: 27452.366699  [13]
loss: 27604.967773  [14]
loss: 26981.446777  [15]
loss: 27087.321289  [16]
loss: 27017.665039  [17]
loss: 26929.313965  [18]
loss: 27013.280273  [19]
loss: 27029.138672  [20]
loss: 26658.986328  [21]


In [None]:
preds = []
targets = []
idxs = []

with torch.no_grad():
    for batch, (tensors, target, idx) in enumerate(test):
        
        tensors = tensors.to(device)
        pred = model(tensors).squeeze(1)
        
        targets.append(target)
        preds.append(pred)
        idxs.append(idx)

In [None]:
import numpy as np

preds = np.concatenate([p.cpu().numpy() for p in preds])
targets = np.concatenate([p.cpu().numpy() for p in targets])
idxs = np.concatenate([p.cpu().numpy() for p in idxs])

rows = weather.iloc[idxs,:]

In [None]:
preds

In [None]:
predictions = pd.DataFrame(dict(pred=preds, actual=targets), index=rows.index)
predictions = pd.concat([predictions, rows], axis=1)

In [None]:
predictions["se"] = ((preds - targets) ** 2)
print(predictions["se"].mean())

In [None]:
predictions