In [74]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler

# Bucharest record min and max temperatures, adjusted according to source
# Source: https://www.extremeweatherwatch.com/cities/bucharest/lowest-temperatures
BUC_MIN_TEMP = -35
BUC_MAX_TEMP = 45

N_LAGS = 7


def clean_data(df):
    # Set the date as the index
    index = pd.to_datetime(df[["Year", "Month", "Day"]])
    df.set_index(index, inplace=True)

    # Clean rows
    # Sea Level Pressure, gets dropped, too many 0, not enough relevant info
    corrupted = df[df["Sea_Level_Pressure"] == "True"]
    df.drop(index=corrupted.index, inplace=True)
    df.drop("Sea_Level_Pressure", axis=1, inplace=True)

    # Humidity Average, 11 rows with NaN, can drop them
    corrupted = df[df["Humidity_Avg"].isna()]
    df.drop(index=corrupted.index, inplace=True)

    # Precipitation Total, fill Nan with 0
    df["Precipitation_Total"].fillna("0", inplace=True)

    # Visibility Average, gets dropped, too many NaN values > 10%
    df.drop("Visibility_Avg", axis=1, inplace=True)

    # Wind Max, contains mostly NaN values, can drop
    # We have Wind Sustained Max which is clean
    df.drop("Wind_Max", axis=1, inplace=True)

    return df


def normalize_temp_data(df):
    # Create new dataframe
    scaled = df.copy(deep=True)

    # Scale with record min and max
    temp_scaler = MinMaxScaler()
    temp_scaler.fit(np.array((BUC_MIN_TEMP, BUC_MAX_TEMP)).reshape(-1, 1))

    cols = temp_cols
    temp_data = np.array(df[cols]).T
    scaled = df.copy(deep=True)
    temp_scaled = [temp_scaler.transform(X.reshape(-1, 1)) for X in temp_data]
    for idx, col in enumerate(cols):
        scaled[col] = temp_scaled[idx]
    return scaled, temp_scaler


def normalize_data(df):
    scaler = MinMaxScaler()
    cols = misc_cont_cols
    df[cols] = scaler.fit_transform(df[cols])
    return df, scaler


def build_set(ds, lags=10):
    train = []
    true = []
    for idx in range(len(ds) - lags):
        current = ds[idx : idx + lags]
        pred = ds[idx + lags][temp_avg_idx]
        train.append(current)
        true.append(pred)

    train = torch.cat([t for t in train]).reshape(-1, lags, ds.shape[1])
    true = torch.tensor(true).reshape(-1, 1)
    return train, true


df = pd.read_csv("data.csv")
df = clean_data(df)

# Boolean columns
cat_cols = [col for col in df.columns if col.startswith("Is")]
# Continous columns
cont_cols = list(
    set(df.columns) - set(cat_cols) - set(["Year", "Month", "Day"])
)
# Temperature columns
temp_cols = [col for col in cont_cols if col.startswith("Temperature")]
# Miscellaneous continuous columns -> Humidity, Precipitation, Wind
misc_cont_cols = list(set(cont_cols) - set(temp_cols))

# Set correct datatypes
df[cont_cols] = df[cont_cols].astype("float")

# Scale temperatures on the whole dataset
scaled, temp_scaler = normalize_temp_data(df)

valid_start = "2022-01-01"
train_df = scaled[:valid_start][:-1]
valid_df = scaled[valid_start:]

# Scale the rest of the continous columns
train_df, scaler = normalize_data(train_df)
valid_df[misc_cont_cols] = scaler.transform(valid_df[misc_cont_cols])

# Convert to tensors
train_cont = torch.tensor(np.stack([train_df[cont_cols]]), dtype=torch.float)
train_cat = torch.tensor(np.stack([train_df[cat_cols]]), dtype=torch.float)
valid_cont = torch.tensor(np.stack([valid_df[cont_cols]]), dtype=torch.float)
valid_cat = torch.tensor(np.stack([valid_df[cat_cols]]), dtype=torch.float)

# Create a train and valid dataset
train_ds = torch.cat((train_cont, train_cat), dim=2).squeeze()
valid_ds = torch.cat((valid_cont, valid_cat), dim=2).squeeze()

temp_avg_idx = cont_cols.index("Temperature_Avg")

x_train, y_train = build_set(train_ds, N_LAGS)

# Add the last N_LAGS elements to valid_ds to ensure continuity
x_valid, y_valid = build_set(torch.cat((train_ds[-N_LAGS:], valid_ds)), N_LAGS)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[misc_cont_cols] = scaler.transform(valid_df[misc_cont_cols])


In [76]:
train_ds[-N_LAGS:]

tensor([[0.6216, 0.3419, 0.0000, 0.6150, 0.4750, 0.5325, 0.3318, 1.0000, 0.0000,
         0.0000, 0.0000],
        [0.9459, 0.4791, 0.0000, 0.5725, 0.4250, 0.4475, 0.3079, 1.0000, 1.0000,
         0.0000, 0.0000],
        [0.9865, 0.4349, 0.0195, 0.4400, 0.4250, 0.4363, 0.3662, 1.0000, 0.0000,
         0.0000, 0.0000],
        [1.0000, 0.1465, 0.0332, 0.4412, 0.4250, 0.4338, 0.0867, 1.0000, 1.0000,
         0.0000, 1.0000],
        [0.9865, 0.1977, 0.0000, 0.4375, 0.4162, 0.4275, 0.1136, 1.0000, 1.0000,
         0.0000, 1.0000],
        [1.0000, 0.2070, 0.0371, 0.4638, 0.4275, 0.4475, 0.1390, 1.0000, 0.0000,
         0.0000, 0.0000],
        [1.0000, 0.2767, 0.0039, 0.5088, 0.4500, 0.4737, 0.2242, 1.0000, 0.0000,
         0.0000, 1.0000]])

In [77]:
valid_ds[:N_LAGS]

tensor([[0.9324, 0.1372, 0.0019, 0.6000, 0.4625, 0.5238, 0.0867, 1.0000, 0.0000,
         0.0000, 1.0000],
        [0.8514, 0.1605, 0.0000, 0.5625, 0.4350, 0.5075, 0.1659, 0.0000, 0.0000,
         0.0000, 1.0000],
        [0.8514, 0.2628, 0.0000, 0.5813, 0.4650, 0.5225, 0.2242, 0.0000, 0.0000,
         0.0000, 1.0000],
        [0.7027, 0.2279, 0.0000, 0.6012, 0.4663, 0.5200, 0.1659, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.7838, 0.2535, 0.0000, 0.6263, 0.4750, 0.5288, 0.2242, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.7297, 0.2535, 0.0000, 0.6388, 0.4500, 0.5312, 0.2242, 1.0000, 0.0000,
         0.0000, 0.0000],
        [0.7568, 0.3140, 0.0000, 0.5638, 0.4563, 0.4750, 0.3318, 1.0000, 0.0000,
         0.0000, 0.0000]])

In [78]:
x_train[-2:]

tensor([[[0.6892, 0.1093, 0.0000, 0.4375, 0.3500, 0.3862, 0.1136, 0.0000,
          0.0000, 0.0000, 0.0000],
         [0.6757, 0.3698, 0.0000, 0.5400, 0.3663, 0.4512, 0.3318, 0.0000,
          0.0000, 0.0000, 0.0000],
         [0.6216, 0.3419, 0.0000, 0.6150, 0.4750, 0.5325, 0.3318, 1.0000,
          0.0000, 0.0000, 0.0000],
         [0.9459, 0.4791, 0.0000, 0.5725, 0.4250, 0.4475, 0.3079, 1.0000,
          1.0000, 0.0000, 0.0000],
         [0.9865, 0.4349, 0.0195, 0.4400, 0.4250, 0.4363, 0.3662, 1.0000,
          0.0000, 0.0000, 0.0000],
         [1.0000, 0.1465, 0.0332, 0.4412, 0.4250, 0.4338, 0.0867, 1.0000,
          1.0000, 0.0000, 1.0000],
         [0.9865, 0.1977, 0.0000, 0.4375, 0.4162, 0.4275, 0.1136, 1.0000,
          1.0000, 0.0000, 1.0000]],

        [[0.6757, 0.3698, 0.0000, 0.5400, 0.3663, 0.4512, 0.3318, 0.0000,
          0.0000, 0.0000, 0.0000],
         [0.6216, 0.3419, 0.0000, 0.6150, 0.4750, 0.5325, 0.3318, 1.0000,
          0.0000, 0.0000, 0.0000],
         [0.9459,

In [79]:
y_train[-2:]

tensor([[0.4475],
        [0.4737]])

In [82]:
valid_ds[:N_LAGS]

tensor([[0.9324, 0.1372, 0.0019, 0.6000, 0.4625, 0.5238, 0.0867, 1.0000, 0.0000,
         0.0000, 1.0000],
        [0.8514, 0.1605, 0.0000, 0.5625, 0.4350, 0.5075, 0.1659, 0.0000, 0.0000,
         0.0000, 1.0000],
        [0.8514, 0.2628, 0.0000, 0.5813, 0.4650, 0.5225, 0.2242, 0.0000, 0.0000,
         0.0000, 1.0000],
        [0.7027, 0.2279, 0.0000, 0.6012, 0.4663, 0.5200, 0.1659, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.7838, 0.2535, 0.0000, 0.6263, 0.4750, 0.5288, 0.2242, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.7297, 0.2535, 0.0000, 0.6388, 0.4500, 0.5312, 0.2242, 1.0000, 0.0000,
         0.0000, 0.0000],
        [0.7568, 0.3140, 0.0000, 0.5638, 0.4563, 0.4750, 0.3318, 1.0000, 0.0000,
         0.0000, 0.0000]])

In [80]:
x_valid[:2]

tensor([[[0.6216, 0.3419, 0.0000, 0.6150, 0.4750, 0.5325, 0.3318, 1.0000,
          0.0000, 0.0000, 0.0000],
         [0.9459, 0.4791, 0.0000, 0.5725, 0.4250, 0.4475, 0.3079, 1.0000,
          1.0000, 0.0000, 0.0000],
         [0.9865, 0.4349, 0.0195, 0.4400, 0.4250, 0.4363, 0.3662, 1.0000,
          0.0000, 0.0000, 0.0000],
         [1.0000, 0.1465, 0.0332, 0.4412, 0.4250, 0.4338, 0.0867, 1.0000,
          1.0000, 0.0000, 1.0000],
         [0.9865, 0.1977, 0.0000, 0.4375, 0.4162, 0.4275, 0.1136, 1.0000,
          1.0000, 0.0000, 1.0000],
         [1.0000, 0.2070, 0.0371, 0.4638, 0.4275, 0.4475, 0.1390, 1.0000,
          0.0000, 0.0000, 0.0000],
         [1.0000, 0.2767, 0.0039, 0.5088, 0.4500, 0.4737, 0.2242, 1.0000,
          0.0000, 0.0000, 1.0000]],

        [[0.9459, 0.4791, 0.0000, 0.5725, 0.4250, 0.4475, 0.3079, 1.0000,
          1.0000, 0.0000, 0.0000],
         [0.9865, 0.4349, 0.0195, 0.4400, 0.4250, 0.4363, 0.3662, 1.0000,
          0.0000, 0.0000, 0.0000],
         [1.0000,

In [83]:
y_valid[:2]

tensor([[0.5238],
        [0.5075]])

In [84]:
print(x_train.shape)
print(x_valid.shape)

torch.Size([1788, 7, 11])
torch.Size([361, 7, 11])


## Build LSTM model

In [91]:
x_train[0].shape

torch.Size([7, 11])

In [90]:
y_train[0]

tensor([0.2925])

In [106]:
import torch.nn as nn

NUM_LAYERS = 3
HIDDEN_SIZE = 128

lstm = nn.LSTM(input_size=train_ds.shape[1], hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS)
hidden = (
    torch.zeros(NUM_LAYERS, HIDDEN_SIZE),
    torch.zeros(NUM_LAYERS, HIDDEN_SIZE),
)

out, hidden = lstm(x_train[0], hidden)

dropout = nn.Dropout(0.5)

out = dropout(out)

fc = nn.Linear(in_features=HIDDEN_SIZE, out_features=1)

out = fc(out)

In [107]:
out.shape

torch.Size([7, 1])

In [109]:
out

tensor([[-0.0717],
        [-0.0736],
        [-0.0753],
        [-0.0826],
        [-0.1186],
        [-0.0416],
        [-0.0884]], grad_fn=<AddmmBackward0>)

In [140]:
class MyLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=4, output_size=1, dropout=0.5):
        super().__init__()

        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, X, hidden):
        out, hidden = self.lstm(X, hidden)
        out = self.dropout(out)
        out = self.fc(out)
        # Only care about last prediction
        return out[-1], hidden

    def initialize_hidden(self):
        hidden = (
            torch.zeros(self.num_layers, self.hidden_size),
            torch.zeros(self.num_layers, self.hidden_size),
        )
        return hidden

In [141]:
model = MyLSTM(input_size=train_ds.shape[1])
model

MyLSTM(
  (lstm): LSTM(11, 128, num_layers=4, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [142]:
count = 0
for p in model.parameters():
    print(p.shape)
    if p.requires_grad:
        print(p.numel())
        count += p.numel()
print('---------\nNumber of parameters:', count)

torch.Size([512, 11])
5632
torch.Size([512, 128])
65536
torch.Size([512])
512
torch.Size([512])
512
torch.Size([512, 128])
65536
torch.Size([512, 128])
65536
torch.Size([512])
512
torch.Size([512])
512
torch.Size([512, 128])
65536
torch.Size([512, 128])
65536
torch.Size([512])
512
torch.Size([512])
512
torch.Size([512, 128])
65536
torch.Size([512, 128])
65536
torch.Size([512])
512
torch.Size([512])
512
torch.Size([1, 128])
128
torch.Size([1])
1
---------
Number of parameters: 468609


In [143]:
hidden = model.initialize_hidden()
out, hidden = model.forward(x_train[0], hidden)

In [144]:
out

tensor([-0.0869], grad_fn=<SelectBackward0>)

## Train the model