# **Homework 1: COVID-19 Cases Prediction (Regression)**

Objectives:
* Solve a regression problem with deep neural networks (DNN).
* Understand basic DNN training tips.
* Familiarize yourself with PyTorch.

If you have any questions, please contact the TAs via TA hours, NTU COOL, or email to mlta-2023-spring@googlegroups.com

# Import packages

In [1]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

# Some Utility Functions

You do not need to modify this part.

In [2]:
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set)) 
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

def predict(test_loader, model, device):
    model.eval() # Set your model to evaluation mode.
    preds = []
    for x in tqdm(test_loader):
        x = x.to(device)                        
        with torch.no_grad():                   
            pred = model(x)                     
            preds.append(pred.detach().cpu())   
    preds = torch.cat(preds, dim=0).numpy()  
    return preds

# Dataset

In [3]:
class COVID19Dataset(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)

# Neural Network Model
Try out different model architectures by modifying the class below.

In [4]:
class My_Model(nn.Module):
    def __init__(self, input_dim):
        super(My_Model, self).__init__()
        # TODO: modify model's structure, be aware of dimensions. 
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1) # (B, 1) -> (B)
        return x

In [5]:
# 自定义多层GRU模型
class GRU_Model(nn.Module):
  def __init__(self, input_dim, hidden_size=16, num_layers=1):
    super().__init__()

    self.gru = nn.GRU(input_size=input_dim,
              hidden_size=hidden_size,
              num_layers=num_layers,
              batch_first=True)
    self.fc = nn.Sequential(
        nn.Linear(hidden_size, hidden_size // 2),
        nn.ReLU(),
        nn.Linear(hidden_size // 2, 1)
    )

  def forward(self, x):
    batch_size, feature_size = x.shape
    x = x.view(batch_size, 1, feature_size)
    output, _ = self.gru(x)
    output = output[:, -1, :]  # (batch_size, hidden_size)
    output = self.fc(output)  # (batch_size, 1)
    return output.squeeze(1) # (batch_size)

In [6]:
# setup model
class MLP_Model(nn.Module):
    """
    简单的MLP模型
    """
    def __init__(self, input_dim, hidden_layers_dim=[64, 32, 8]):
        super().__init__()
        self.layers = []
        for i in range(len(hidden_layers_dim)):
            if i == 0: 
                self.layers.append(nn.Linear(input_dim, hidden_layers_dim[i]))
            else: 
                self.layers.append(nn.Linear(hidden_layers_dim[i-1], hidden_layers_dim[i]))
            self.layers.append(nn.ReLU())
        self.layers.append(nn.Linear(hidden_layers_dim[-1], 1))
        self.layers = nn.Sequential(*self.layers)
        
    def forward(self, x):
        x = self.layers(x)  # [B, 1]
        x = x.squeeze(1)
        return x

# Feature Selection
Choose features you deem useful by modifying the function below.

In [7]:
def select_feat(train_data, valid_data, test_data, select_all=True):
    '''Selects useful features to perform regression'''
    y_train, y_valid = train_data[:,-1], valid_data[:,-1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:,:-1], valid_data[:,:-1], test_data

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        # [35:length - 1]
        feat_idx = list(range(35, raw_x_train.shape[1])) # TODO: Select suitable feature columns.
        
    return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid

# Training Loop

In [8]:
def trainer(train_loader, valid_loader, model, config, device):

    criterion = nn.MSELoss(reduction='mean') # Define your loss function, do not modify this.

    # Define your optimization algorithm. 
    # TODO: Please check https://pytorch.org/docs/stable/optim.html to get more available algorithms.
    # TODO: L2 regularization (optimizer(weight decay...) or implement by your self).
    # optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.7) 
    # change SGD to Adam
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])  
    writer = SummaryWriter() # Writer of tensoboard.

    if not os.path.isdir('./models'):
        os.mkdir('./models') # Create directory of saving models.

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.
        loss_record = []

        # tqdm is a package to visualize your training progress.
        train_pbar = tqdm(train_loader, position=0, leave=True)

        for x, y in train_pbar:
            optimizer.zero_grad()               # Set gradient to zero.
            x, y = x.to(device), y.to(device)   # Move your data to device. 
            pred = model(x)             
            loss = criterion(pred, y)
            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.
            step += 1
            loss_record.append(loss.detach().item())
            
            # Display current epoch number and loss on tqdm progress bar.
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': loss.detach().item()})

        mean_train_loss = sum(loss_record)/len(loss_record)
        writer.add_scalar('Loss/train', mean_train_loss, step)

        model.eval() # Set your model to evaluation mode.
        loss_record = []
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)

            loss_record.append(loss.item())
            
        mean_valid_loss = sum(loss_record)/len(loss_record)
        
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
            
        writer.add_scalar('Loss/valid', mean_valid_loss, step)

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path']) # Save your best model
            # print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return

# Configurations
`config` contains hyper-parameters for training and the path to save your model.

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 5201314,      # Your seed number, you can pick your lucky number. :)
    'select_all': False,   # Whether to use all features.
    'valid_ratio': 0.1,   # validation_size = train_size * valid_ratio
    'n_epochs': 1000,     # Number of epochs.            
    'batch_size': 256, 
    'learning_rate': 1e-5,              
    'weight_decay': 1e-5,  # L2 Regularization (weight decay).
    'early_stop': 600,    # If model has not improved for this many consecutive epochs, stop training.     
    'save_path': './models/model.ckpt'  # Your model will be saved here.
}

# Dataloader
Read data from files and set up training, validation, and testing sets. You do not need to modify this part.

In [10]:
# Set seed for reproducibility
same_seed(config['seed'])


# train_data size: 3009 x 89 (35 states + 18 features x 3 days) 
# test_data size: 997 x 88 (without last day's positive rate)
train_data, test_data = pd.read_csv('./covid_train.csv').values, pd.read_csv('./covid_test.csv').values
train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])

# Print out the data size.
print(f"""train_data size: {train_data.shape} 
valid_data size: {valid_data.shape} 
test_data size: {test_data.shape}""")

# Select features
x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])

# Print out the number of features.
print(f'number of features: {x_train.shape[1]}')

train_dataset, valid_dataset, test_dataset = COVID19Dataset(x_train, y_train), \
                                            COVID19Dataset(x_valid, y_valid), \
                                            COVID19Dataset(x_test)

# Pytorch data loader loads pytorch dataset into batches.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

train_data size: (2709, 89) 
valid_data size: (300, 89) 
test_data size: (997, 88)
number of features: 53


# Start training!

In [11]:
config['save_path'] = './models/mlp_model.ckpt'
config['n_epochs'] = 5000

mlp_model = MLP_Model(input_dim=x_train.shape[1]).to(device)
trainer(train_loader, valid_loader, mlp_model, config, device)

Epoch [1/5000]: Train loss: 297.8206, Valid loss: 329.5889
Epoch [101/5000]: Train loss: 71.1066, Valid loss: 67.7874
Epoch [201/5000]: Train loss: 39.7642, Valid loss: 40.7335
Epoch [301/5000]: Train loss: 14.3153, Valid loss: 15.9319
Epoch [401/5000]: Train loss: 8.2726, Valid loss: 9.7267
Epoch [501/5000]: Train loss: 7.0340, Valid loss: 8.1743
Epoch [601/5000]: Train loss: 5.6929, Valid loss: 6.8278
Epoch [701/5000]: Train loss: 4.3485, Valid loss: 4.9007
Epoch [801/5000]: Train loss: 3.0026, Valid loss: 3.9864
Epoch [901/5000]: Train loss: 2.1507, Valid loss: 2.4618
Epoch [1001/5000]: Train loss: 1.7536, Valid loss: 2.3425
Epoch [1101/5000]: Train loss: 1.5750, Valid loss: 1.7562
Epoch [1201/5000]: Train loss: 1.4672, Valid loss: 1.6212
Epoch [1301/5000]: Train loss: 1.4010, Valid loss: 1.5785
Epoch [1401/5000]: Train loss: 1.3464, Valid loss: 1.4413
Epoch [1501/5000]: Train loss: 1.2982, Valid loss: 1.8244
Epoch [1601/5000]: Train loss: 1.2702, Valid loss: 1.2780
Epoch [1701/5000

In [None]:
config['save_path'] = './models/my_model.ckpt'
model = My_Model(input_dim=x_train.shape[1]).to(device) # put your model and data on the same computation device.
trainer(train_loader, valid_loader, model, config, device)

In [None]:
config['save_path'] = './models/gru_model.ckpt'
model = GRU_Model(input_dim=x_train.shape[1]).to(device) # put your model and data on the same computation device.
trainer(train_loader, valid_loader, model, config, device)

# Testing
The predictions of your model on testing set will be stored at `pred.csv`.

In [19]:
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    id_list = []
    tested_positive_list = []
    for i, p in enumerate(preds):
        id_list.append(i)
        tested_positive_list.append(p)
    df = pd.DataFrame({'id': id_list, 'tested_positive': tested_positive_list})
    df.to_csv(file, index=False)
    

model = MLP_Model(input_dim=x_train.shape[1]).to(device)
model.load_state_dict(torch.load('models/mlp_model.ckpt'))
preds = predict(test_loader, model, device) 
save_pred(preds, 'pred.csv')

100%|██████████| 4/4 [00:00<00:00, 666.79it/s]


# Download

Run this block to download the `pred.csv` by clicking.

In [None]:
from IPython.display import FileLink
FileLink(r'pred.csv')

# Reference
This notebook uses code written by Heng-Jui Chang @ NTUEE (https://github.com/ga642381/ML2021-Spring/blob/main/HW01/HW01.ipynb)

# My Methods

## TSNE - 特征降维

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn import manifold

In [5]:
train_df = pd.read_csv('covid.train.csv')
feature, label = train_df.iloc[:, :-1], train_df.iloc[:, -1]

In [6]:
tsne = manifold.TSNE(2, random_state=22)
transformed_data = tsne.fit_transform(feature)

In [None]:
tsne_df = pd.DataFrame(np.column_stack((transformed_data, label)), columns=["x", "y", "targets"])
tsne_df.loc[:, "targets"] = tsne_df.targets.astype(int)
tsne_df

In [None]:
grid = sns.FacetGrid(tsne_df, hue="targets")
grid.map(plt.scatter, "x", "y").add_legend()

## 源码详解

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import random_split, Dataset

In [None]:
# 设置随机数种子（保证随机结果的一致性）
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [None]:
# 划分训练集和验证集（用于模型训练）
def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set)) 
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

In [None]:
# setup dataset
class MyDataset(Dataset):
    """
    x: Features
    y: Targets, if None, do prediction
    """
    def __init__(self, x, y=None):
        self.x = torch.FloatTensor(x)
        if y is not None:
            self.y = torch.FloatTensor(y)
        else:
            self.y = None
        
    def __getitem__(self, index):
        if self.y is not None:
            return self.x[index], self.y[index]
        else:
            return self.x[index]
    
    def __len__(self):
        return len(self.x)

In [12]:
# setup model
class MyModel(nn.Module):
    """
    简单的MLP模型
    """
    def __init__(self, input_dim, hidden_layers_dim=[64, 32, 8]):
        super().__init__()
        self.layers = []
        for i in range(len(hidden_layers_dim)):
            if i == 0:
                self.layers.append(nn.Linear(input_dim, hidden_layers_dim[i]))
            else:
                self.layers.append(nn.Linear(hidden_layers_dim[i-1], hidden_layers_dim[i]))
            self.layers.append(nn.ReLU())
        self.layers.append(nn.Linear(hidden_layers_dim[-1], 1))
        self.layers = nn.Sequential(*self.layers)
        
    def forward(self, x):
        x = self.layers(x)  # [B, 1]
        x = x.squeeze(1)
        return x

In [14]:
# trainer
model = MLP_Model(input_dim=x_train.shape[1]).to(device)
n_epochs = config['n_epochs']
criterion = nn.MSELoss(reduction='mean')  # define loss function
optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.7)  # define optimizer

for epoch in range(n_epochs):
    # train
    model.train()
    loss_record = []
    for X, y in train_loader:
        optimizer.zero_grad()
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()
        loss_record.append(loss.detach().item())  # loss value of a batch : loss.detach().item()
    mean_train_loss = sum(loss_record) / len(loss_record)
    
    print(f'Epoch [{epoch + 1:03d}/{n_epochs:03d}] Train loss: {mean_train_loss:.4f}')
    
    # evaluate
    model.eval()
    loss_record = []
    
    with torch.no_grad():
        for X, y in valid_loader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = criterion(pred, y)
            loss_record.append(loss.detach().item())  # loss value of a batch : loss.detach().item()
    mean_eval_loss = sum(loss_record) / len(loss_record)
    
    print(f'Epoch [{epoch + 1:03d}/{n_epochs:03d}] Eval loss: {mean_eval_loss:.4f}')

[Epoch 001/1000] Train loss: 168.5975
[Epoch 001/1000] Eval loss: 93.9501
[Epoch 002/1000] Train loss: 90.6762
[Epoch 002/1000] Eval loss: 82.7290
[Epoch 003/1000] Train loss: 83.9212
[Epoch 003/1000] Eval loss: 81.2391
[Epoch 004/1000] Train loss: 78.1389
[Epoch 004/1000] Eval loss: 72.4522
[Epoch 005/1000] Train loss: 73.2024
[Epoch 005/1000] Eval loss: 71.7948
[Epoch 006/1000] Train loss: 67.1212
[Epoch 006/1000] Eval loss: 65.0780
[Epoch 007/1000] Train loss: 62.9391
[Epoch 007/1000] Eval loss: 59.2695
[Epoch 008/1000] Train loss: 56.5817
[Epoch 008/1000] Eval loss: 51.1962
[Epoch 009/1000] Train loss: 50.4657
[Epoch 009/1000] Eval loss: 46.3813
[Epoch 010/1000] Train loss: 43.7894
[Epoch 010/1000] Eval loss: 40.8719
[Epoch 011/1000] Train loss: 37.2479
[Epoch 011/1000] Eval loss: 36.6337
[Epoch 012/1000] Train loss: 30.4682
[Epoch 012/1000] Eval loss: 26.8779
[Epoch 013/1000] Train loss: 24.5607
[Epoch 013/1000] Eval loss: 20.3865
[Epoch 014/1000] Train loss: 18.8649
[Epoch 014/10