<a href="https://colab.research.google.com/github/chuuuuu/machine_learning_2021/blob/main/homework/hw01/hw1_emsembler_average.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Homework 1: COVID-19 Cases Prediction (Regression)**

In [None]:
from os import path, makedirs
from google.colab import drive
drive.mount('/content/drive')
WORKSPACE = 'drive/MyDrive/ColabNotebooks/HW1'

if not path.exists(f'{WORKSPACE}/dataset'):
  makedirs(f'{WORKSPACE}/dataset')
  !gdown --id '19CCyCgJrUxtvgZF53vnctJiOJ23T5mqF' --output '{WORKSPACE}/dataset/covid.train.csv'
  !gdown --id '1CE240jLm2npU-tdz81-oVKEF3T2yfT1O' --output '{WORKSPACE}/dataset/covid.test.csv'

DATA_PATH = f'{WORKSPACE}/dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Seed NumPy and PyTorch (CPU and, when present, every CUDA device) and ask
# cuDNN for deterministic kernels so runs are repeatable.
myseed = 42069
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

# Directory that holds the checkpoint written to config['MODEL_PATH'].
os.makedirs('models', exist_ok=True)

# Global settings read (and, for 'STATE', updated) by the classes below.
config = {
    # Number of features kept by SelectKBest and the network's input width.
    # 'INPUT_DIM': 14,
    'INPUT_DIM': 90,
    # Input CSV files.
    'TRAIN_PATH': f'{DATA_PATH}/covid.train.csv',
    'TEST_PATH': f'{DATA_PATH}/covid.test.csv',
    # Where the best state dict is saved during training. Every Trainer uses
    # this same path.
    'MODEL_PATH': 'models/model.pth',
    # Output CSV with the averaged ensemble predictions.
    'PRED_PATH': 'pred.csv',

    # Upper bound on training epochs per model.
    'EPOCH_NUM': 30000,
    'BATCH_SIZE': 4096,
    # Fraction of the training data held out for validation.
    'VAL_RATIO': 0.1,
    # Name of the torch.optim class to instantiate, and its keyword arguments.
    'OPTIMIZER': 'Adam',
    'OPTIM_PARAMS': {
        'lr': 5e-2,
        # 'weight_decay': 1e-4,
    },
    # Learning rate is multiplied by this factor after every epoch
    # (1 leaves it unchanged) and never drops below MIN_LR.
    # 'DECAY_RATE': 0.999,
    'DECAY_RATE': 1,
    'MIN_LR': 1e-4,
    # Training stops once the count of consecutive epochs without a new best
    # validation loss exceeds this value.
    'EARLY_STOP': 300,
    # Number of Trainer instances in the ensemble.
    'MODEL_NUM': 3,
    # random_state for the train/val split; incremented after each split so
    # every ensemble member gets a different split.
    'STATE': 1,
}

In [None]:
class Drawer():
    '''Matplotlib helpers for inspecting training history and predictions.'''

    def plot_learning_curve(self, loss_record, title=''):
        '''Draw the train and val loss histories on a shared axis.

        loss_record: dict with 'train' and 'val' sequences of loss values.
        title: text appended to the figure title.
        '''
        train_losses = loss_record['train']
        val_losses = loss_record['val']

        train_steps = range(len(train_losses))
        # Spread the val points across the train axis when val was recorded
        # less often than train.
        stride = len(train_losses) // len(val_losses)
        val_steps = train_steps[::stride]

        figure(figsize=(6, 4))
        plt.plot(train_steps, train_losses, c='tab:red', label='train')
        plt.plot(val_steps, val_losses, c='tab:cyan', label='val')
        plt.ylim(0.0, 5.)
        plt.xlabel('Training steps')
        plt.ylabel('MSE loss')
        plt.title('Learning curve of {}'.format(title))
        plt.legend()
        plt.show()

    def plot_pred(self, dv_set, model, device, lim=35., preds=None, targets=None):
        '''Scatter predictions against ground truth with a y = x reference line.

        When either `preds` or `targets` is missing, both are computed by
        running `model` (in eval mode) over every batch of `dv_set`.
        `lim` is the upper bound of both axes.
        '''
        must_run_model = preds is None or targets is None
        if must_run_model:
            model.eval()
            pred_batches, target_batches = [], []
            with torch.no_grad():
                for x, y in dv_set:
                    x, y = x.to(device), y.to(device)
                    pred_batches.append(model(x).detach().cpu())
                    target_batches.append(y.detach().cpu())
            preds = torch.cat(pred_batches, dim=0).numpy()
            targets = torch.cat(target_batches, dim=0).numpy()

        axis_min = -0.2
        figure(figsize=(5, 5))
        plt.scatter(targets, preds, c='r', alpha=0.5)
        # Perfect predictions would fall on this diagonal.
        plt.plot([axis_min, lim], [axis_min, lim], c='b')
        plt.xlim(axis_min, lim)
        plt.ylim(axis_min, lim)
        plt.xlabel('ground truth value')
        plt.ylabel('predicted value')
        plt.title('Ground Truth v.s. Prediction')
        plt.show()

In [None]:
from sklearn.feature_selection import f_regression, SelectKBest, mutual_info_regression
from sklearn.model_selection import train_test_split

class DataManager():
    '''Loads the CSV data and remembers which feature columns were selected.

    Attributes set by __init__:
      data -- float32 array of the training CSV without header row / first column
      X    -- the selected feature columns of `data`
      y    -- the last column of `data` (regression target)
      cols -- indices of the selected feature columns
    '''

    def __init__(self):
        print('init data manager...')
        train_path = config['TRAIN_PATH']
        n_features = config['INPUT_DIM']

        self.data = self._read_numeric_csv(train_path)

        # Last column is the target, everything before it is a candidate feature.
        all_features = self.data[:, :-1]
        self.y = self.data[:, -1]

        # Keep the n_features columns ranked best by f_regression.
        selector = SelectKBest(f_regression, k=n_features)
        selector.fit(all_features, self.y)
        self.cols = selector.get_support(indices=True)

        self.X = all_features[:, self.cols]

    @staticmethod
    def _read_numeric_csv(csv_path):
        '''Read a CSV, drop its header row and first column, return float32 array.'''
        with open(csv_path, 'r') as f:
            rows = list(csv.reader(f))
        return np.array(rows[1:])[:, 1:].astype(np.float32)

    def get_train_data(self):
        '''Return (X_train, X_val, y_train, y_val) and advance config['STATE'].'''
        print('getting train data...')
        split = train_test_split(
            self.X,
            self.y,
            test_size=config['VAL_RATIO'],
            random_state=config['STATE'],
        )
        # Next caller gets a different split.
        config['STATE'] += 1

        X_train, X_val, y_train, y_val = split
        return X_train, X_val, y_train, y_val

    def get_test_data(self):
        '''Return the test features restricted to the columns chosen in __init__.'''
        print('getting test data...')
        test_matrix = self._read_numeric_csv(config['TEST_PATH'])
        return test_matrix[:, self.cols]


In [None]:
class CovidDataset(Dataset):
    '''Wraps NumPy features (and optional targets) as float tensors.

    With targets, indexing yields (features, target); without, features only.
    '''

    def __init__(self, X, y=None):
        print('init dataset...')
        self.X = torch.from_numpy(X).float()
        self.y = None if y is None else torch.from_numpy(y).float()

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features

    def __len__(self):
        return len(self.X)

In [None]:
from torchsummary import summary

class NeuralNet(nn.Module):
    '''Small regression network: BatchNorm -> Linear -> ReLU -> Linear(1).

    The input width comes from config['INPUT_DIM']; the loss is mean MSE.
    '''

    def __init__(self):
        print('init neural net...')
        super().__init__()

        in_features = config['INPUT_DIM']
        hidden_units = 16

        # Author's experiment notes for input_dim == 14 (presumably
        # hidden width: score -- the metric is not shown here):
        #   12: 0.844
        #   16: 0.860
        #   14: 0.855
        layers = [
            nn.BatchNorm1d(in_features),
            nn.Linear(in_features, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        self.net = nn.Sequential(*layers)

        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        '''Run the network and drop the trailing size-1 output dimension.'''
        out = self.net(x)
        return out.squeeze(1)

    def get_loss(self, y_pred, y):
        '''Mean squared error between predictions and targets.'''
        return self.criterion(y_pred, y)

    def summary(self):
        '''Print a torchsummary layer overview for one input of width INPUT_DIM.'''
        input_shape = (config['INPUT_DIM'], )
        summary(self, input_shape)

In [None]:
class Trainer():
    '''Owns one model together with its data split, optimizer and loss history.

    Construction builds the model, a DataManager with a fresh train/val split,
    the plotting helper and the optimizer, all driven by the module-level
    `config` dict.
    '''

    def __init__(self):
        print('init trainer')
        self.set_device()
        self.set_model()
        self.set_data_loader()
        self.set_drawer()
        self.set_optim()

        # One entry per epoch, appended by train().
        self.loss_record = {'train': [], 'val': []}

    def set_drawer(self):
        '''Create the plotting helper.'''
        self.drawer = Drawer()

    def draw_learning_curve(self):
        '''Plot the recorded train/val loss history.'''
        self.drawer.plot_learning_curve(self.loss_record, title='deep model')

    def draw_val_results(self):
        '''Rebuild the model from the checkpoint file and plot val predictions.

        NOTE: this replaces self.model with a new instance; the optimizer
        created in set_optim() still refers to the previous model's parameters.
        '''
        MODEL_PATH = config['MODEL_PATH']
        del self.model
        self.set_model()

        ckpt = torch.load(MODEL_PATH, map_location='cpu')
        self.model.load_state_dict(ckpt)
        self.drawer.plot_pred(self.val_loader, self.model, self.device)

    def pred_y_test(self):
        '''Return the model's predictions for the test CSV as a NumPy array.'''
        print('predicting...')
        BATCH_SIZE = config['BATCH_SIZE']

        X_test = self.dataManager.get_test_data()
        test_loader = DataLoader(X_test, BATCH_SIZE, False, drop_last=False, num_workers=0, pin_memory=True)

        self.model.eval()
        y_preds = []
        for x in test_loader:
            x = x.to(self.device)
            with torch.no_grad():
                y_pred = self.model(x)
                y_preds.append(y_pred.detach().cpu())

        y_preds = torch.cat(y_preds, dim=0).numpy()
        return y_preds

    def set_device(self):
        ''' Get device (if GPU is available, use GPU) '''
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def set_data_loader(self):
        '''Create the DataManager and the train (shuffled) / val (ordered) loaders.'''
        BATCH_SIZE = config['BATCH_SIZE']
        self.dataManager = DataManager()
        X_train, X_val, y_train, y_val = self.dataManager.get_train_data()

        train_set = CovidDataset(X_train, y_train)
        self.train_loader = DataLoader(train_set, BATCH_SIZE, True, drop_last=False, num_workers=0, pin_memory=True)

        val_set = CovidDataset(X_val, y_val)
        self.val_loader = DataLoader(val_set, BATCH_SIZE, False, drop_last=False, num_workers=0, pin_memory=True)

    def set_model(self):
        '''Instantiate a new NeuralNet on the selected device.'''
        self.model = NeuralNet().to(self.device)

    def set_optim(self):
        '''Build the optimizer named by config['OPTIMIZER'] with config['OPTIM_PARAMS'].'''
        OPTIMIZER = config['OPTIMIZER']
        OPTIM_PARAMS = config['OPTIM_PARAMS']

        self.optimizer = getattr(torch.optim, OPTIMIZER)(self.model.parameters(), **OPTIM_PARAMS)

    def train(self):
        '''Train with early stopping, then restore the best checkpoint.

        After every epoch the full train and val losses are recorded. Whenever
        the val loss reaches a new minimum the state dict is written to
        config['MODEL_PATH']. Training stops after config['EPOCH_NUM'] epochs or
        once more than config['EARLY_STOP'] consecutive epochs bring no
        improvement.
        '''
        EPOCH_NUM = config['EPOCH_NUM']
        MODEL_PATH = config['MODEL_PATH']
        EARLY_STOP = config['EARLY_STOP']

        min_val_loss = float('inf')
        # Must exist before the loop: the first epoch may already fail to
        # improve (e.g. a NaN val loss never compares below inf).
        early_stop_cnt = 0
        epochs_run = 0
        # Whether THIS run wrote MODEL_PATH. The path is shared by all
        # trainers, so an existing file may belong to another model.
        checkpoint_saved = False

        for epoch in range(EPOCH_NUM):
            epochs_run = epoch + 1
            self.model.train()
            for x, y in self.train_loader:
                self.optimizer.zero_grad()
                x, y = x.to(self.device), y.to(self.device)
                y_pred = self.model(x)
                loss = self.model.get_loss(y_pred, y)
                loss.backward()
                self.optimizer.step()

            self.update_lr()
            val_loss = self.get_loss(self.val_loader)
            train_loss = self.get_loss(self.train_loader)
            if epoch % 100 == 0:
                print(f'epoch: {epoch+1}, train_loss: {train_loss}, val_loss: {val_loss}')

            if val_loss < min_val_loss:
                min_val_loss = val_loss
                print(f'Saving model, epoch: {epoch+1}, train_loss: {train_loss}, val_loss: {val_loss}')
                torch.save(self.model.state_dict(), MODEL_PATH)
                checkpoint_saved = True
                early_stop_cnt = 0
            else:
                early_stop_cnt += 1

            self.loss_record['val'].append(val_loss)
            self.loss_record['train'].append(train_loss)

            if early_stop_cnt > EARLY_STOP:
                break

        if checkpoint_saved:
            # Restore the weights that achieved the best val loss.
            ckpt = torch.load(MODEL_PATH, map_location='cpu')
            self.model.load_state_dict(ckpt)
        else:
            print('warning: no checkpoint was saved during this run; keeping current weights')

        print(f'finished training after {epochs_run} epochs')
        self.model.summary()

    def update_lr(self):
        '''Multiply every param group's lr by DECAY_RATE, floored at MIN_LR.'''
        DECAY_RATE = config['DECAY_RATE']
        MIN_LR = config['MIN_LR']
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = max(MIN_LR, param_group['lr'] * DECAY_RATE)

    def get_loss(self, loader):
        '''Return the model's loss averaged over every sample in `loader`.

        The per-batch mean loss is weighted by batch size so a smaller final
        batch does not skew the result.
        '''
        self.model.eval()
        total_loss = 0
        for x, y in loader:
            x, y = x.to(self.device), y.to(self.device)
            with torch.no_grad():
                y_pred = self.model(x)
                loss = self.model.get_loss(y_pred, y)

            total_loss += loss.detach().cpu().item() * len(x)
        total_loss /= len(loader.dataset)

        return total_loss

In [None]:
class Emssembler():
    '''Averaging ensemble of independently trained Trainer instances.

    (The class name's spelling is kept as-is because other cells refer to it.)
    '''

    def __init__(self):
        '''Create config['MODEL_NUM'] trainers, each with its own data split.'''
        MODEL_NUM = config['MODEL_NUM']
        self.trainers = [Trainer() for _ in range(MODEL_NUM)]

    def train(self):
        '''Train every ensemble member in turn.'''
        for trainer in self.trainers:
            trainer.train()

    def pred(self):
        '''Average the members' test predictions and write them to PRED_PATH.

        The output CSV has a header row `id,tested_positive` followed by one
        row per test sample. Returns the averaged prediction array.

        Raises:
            ValueError: if the ensemble holds no trainers.
        '''
        PRED_PATH = config['PRED_PATH']

        if not self.trainers:
            raise ValueError('cannot predict with an empty ensemble')

        # Accumulate without in-place ops so no trainer's returned array is
        # modified.
        preds_sum = None
        for trainer in self.trainers:
            member_pred = trainer.pred_y_test()
            preds_sum = member_pred if preds_sum is None else preds_sum + member_pred

        # Divide by the number of models actually held, which stays correct
        # even if config['MODEL_NUM'] changed after construction.
        y_preds = preds_sum / len(self.trainers)

        # newline='' lets the csv module control line endings itself.
        with open(PRED_PATH, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['id', 'tested_positive'])
            for i, p in enumerate(y_preds):
                writer.writerow([i, p])

        return y_preds


In [None]:
# Single-model workflow, kept for reference (disabled):
# trainer = Trainer()
# trainer.train()
# trainer.draw_learning_curve()
# trainer.draw_val_results()
# trainer.pred_y_test()

# Ensemble workflow: build config['MODEL_NUM'] trainers, train each one, then
# write their averaged test predictions to config['PRED_PATH'].
emssembler = Emssembler()
emssembler.train()
emssembler.pred()
# Echo the settings used. config['STATE'] has been incremented once per
# train/val split by this point.
print(f'config: {config}')



init trainer
init neural net...
init data manager...
getting train data...
init dataset...
init dataset...
init trainer
init neural net...
init data manager...
getting train data...
init dataset...
init dataset...
init trainer
init neural net...
init data manager...
getting train data...
init dataset...
init dataset...
epoch: 1, train_loss: 212.09690856933594, val_loss: 209.4459991455078
Saving model, epoch: 1, train_loss: 212.09690856933594, val_loss: 209.4459991455078
Saving model, epoch: 2, train_loss: 159.7740936279297, val_loss: 158.03463745117188
Saving model, epoch: 3, train_loss: 85.24370574951172, val_loss: 83.52495574951172
Saving model, epoch: 4, train_loss: 33.01015853881836, val_loss: 31.402835845947266
Saving model, epoch: 10, train_loss: 20.869855880737305, val_loss: 20.821613311767578
Saving model, epoch: 18, train_loss: 20.45854949951172, val_loss: 19.47454071044922
Saving model, epoch: 19, train_loss: 15.57339859008789, val_loss: 14.793313026428223
Saving model, epoch