> ## US Drought & Meteorological Data Starter Notebook
This notebook will walk you trough loading the data and create a Dummy Classifier, showing a range of F1 scores that correspond to random predictions if given theclass priors.

## Loading the Data
In this section, we load the training and validation data into numpy arrays and visualize the drought classes and meteorological attributes.

We load the json files for training, validation and testing into the ``files`` dictionary.

In [4]:
import numpy as np
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
sns.set_style('white')

files = {}

for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        if 'train' in filename:
            files['train'] = os.path.join(dirname, filename)
        if 'valid' in filename:
            files['valid'] = os.path.join(dirname, filename)
        if 'test' in filename:
            files['test'] = os.path.join(dirname, filename)

The following classes exist, ranging from no drought (``None``), to extreme drought (``D4``).
This could be treated as a regression, ordinal or classification problem, but for now we will treat it as 5 distinct classes.

In [5]:
class2id = {
    'None': 0,
    'D0': 1,
    'D1': 2,
    'D2': 3,
    'D3': 4,
    'D4': 5,
}
id2class = {v: k for k, v in class2id.items()}

Let's also create a dictionary for the meteorological attributes.

In [6]:
valid_dict = json.load(open(files['valid'], 'r'))
_, first = next(iter(valid_dict['root'].items()))
attributes = sorted(first['values'].keys())

id2attr = {i: k for i, k in enumerate(attributes)}
attr2id = {v: k for k, v in id2attr.items()}

del valid_dict
attr2id

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Now we'll define a helper method to load the datasets. This just walks through the json and discards the few samples that are corrupted.

In [None]:
# load one of 'train', 'valid' or 'test'
def loadXY(dataset, shuffle=True, random_state=None):
    data_dict = json.load(open(files[dataset], 'r'))
    keys = sorted(list(data_dict['root'].keys()))
    
    if shuffle:
        if random_state is not None:
            np.random.seed(random_state)
        np.random.shuffle(keys)
        
    # float16 should be enough and saves some memory
    X = np.zeros([len(keys), 90, 18], dtype=np.float16)
    y = np.zeros([len(keys)], dtype=np.float16)
    # track how many samples are skipped
    skip_count = 0
    for i, key in tqdm(enumerate(keys), total=len(keys), desc=f'loading {dataset} dataset'):
        sample = data_dict['root'][key]
        input_arr = np.zeros([90, 18])
        try:
            for a, j in attr2id.items():
                input_arr[:,j] = sample['values'][a]
            X[i-skip_count] = input_arr
            y[i-skip_count] = float(class2id[sample['class']])
        except:
            skip_count += 1
    print(f'[{dataset}]: skipped {skip_count} samples ({round(skip_count/len(keys)*100, 3)}%), loaded {len(keys)-skip_count} samples')
    del data_dict
    return X, y

We now load the datasets, this will take a few minutes and use ~8GB of RAM.

In [None]:
X_train, y_train = loadXY('train', random_state=42)
X_valid, y_valid = loadXY('valid', random_state=42)

## LSTM
Let's train a simple LSTM on the data, treating this as a regression problem.

### Preprocessing and Loading

In [None]:
batch_size_factor = 3
batch_size = 256 * batch_size_factor

In [None]:
from sklearn.preprocessing import RobustScaler

scaler_dict = {}

for attr_id in id2attr.keys():
    scaler_dict[attr_id] = RobustScaler().fit(
        X_train[:,:,attr_id].reshape(-1, 1)
    )
    X_train[:,:,attr_id] = scaler_dict[attr_id].transform(X_train[:,:,attr_id].reshape(-1, 1)).reshape(-1, 90)

In [None]:
for attr_id in id2attr.keys():
    X_valid[:,:,attr_id] = scaler_dict[attr_id].transform(X_valid[:,:,attr_id].reshape(-1, 1)).reshape(-1, 90)

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
train_loader = DataLoader(train_data, shuffle=False, batch_size=batch_size, drop_last=True)

In [None]:
valid_data = TensorDataset(torch.tensor(X_valid), torch.tensor(y_valid))
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size, drop_last=True)

### Model

In [None]:
# hyper parameters
lr = 7e-5 * batch_size_factor
output_size = 1
hidden_dim = 512
dropout = 0.1
n_layers = 4
epochs = 10
clip = 5

In [None]:
import torch
from torch import nn

class DroughtNetLSTM(nn.Module):
    def __init__(self, output_size, num_input_features, hidden_dim, n_layers, drop_prob=0.2):
        super(DroughtNetLSTM, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.lstm = nn.LSTM(num_input_features, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.cuda().to(dtype=torch.float32)
        lstm_out, hidden = self.lstm(x, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (
            weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
            weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device)
        )
        return hidden

In [None]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print('using GPU')
else:
    device = torch.device("cpu")
    print('using CPU')


model = DroughtNetLSTM(output_size, len(id2attr), hidden_dim, n_layers, dropout)
model.to(device)

In [None]:
loss_function = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, steps_per_epoch=len(train_loader), epochs=epochs)

In [None]:
from sklearn.metrics import f1_score

counter = 0
valid_loss_min = np.Inf

torch.manual_seed(42)
np.random.seed(42)

model.train()

for i in range(epochs):
    h = model.init_hidden(batch_size)
    
    for k, (inputs, labels) in tqdm(enumerate(train_loader), desc=f'epoch {i+1}/{epochs}', total=len(train_loader)):
        counter += 1
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = loss_function(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        scheduler.step()
        
        if k == len(train_loader) - 1 or k == (len(train_loader) - 1) // 2:
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            labels = []
            preds = []
            for inp, lab in valid_loader:
                val_h = tuple([each.data for each in val_h])
                inp, lab = inp.to(device), lab.to(device)
                out, val_h = model(inp, val_h)
                val_loss = loss_function(out.squeeze(), lab.float())
                val_losses.append(val_loss.item())
                for l in lab:
                    labels.append(int(l))
                for p in out.round():
                    if p > 5:
                        p = 5
                    if p < 0:
                        p = 0
                    preds.append(int(p))
            
            # log data
            log_dict = {
                'loss': float(loss),
                'epoch': counter/len(train_loader),
                'step': counter,
                'lr': scheduler.get_last_lr()[0]
            }
            log_dict['validation_loss'] = np.mean(val_losses)
            log_dict[f'macro_f1'] = f1_score(labels, preds, average='macro')
            log_dict[f'micro_f1'] = f1_score(labels, preds, average='micro')
            for j, f1 in enumerate(f1_score(labels, preds, average=None)):
                log_dict[f'{id2class[j]}_f1'] = f1
            print(log_dict)
            
            model.train()
            
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

Best Macro F1 - **0.304**