In [None]:
import os
import random
import time
from pprint import pprint
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

import torch
import torch.functional as F
from torch import nn
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader

from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    CosineAnnealingLR,
    ReduceLROnPlateau,
    ExponentialLR
)

In [None]:
# Notebook-wide switches read by the training / inference functions below.
USE_TPU = False  # when True, the `xm` (XLA) code paths are taken for device selection and optimizer steps
EXP_DIR = Path("./")  # directory where model checkpoints are written

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's hash seed and ``random`` module, NumPy's global generator
    and PyTorch (CPU and current CUDA device), and puts cuDNN in
    deterministic, non-benchmarking mode.

    Args:
        seed: integer seed applied to all generators.
    """
    # Python-level sources of randomness.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)

    # Numerical libraries.
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
# Load the raw train and test tables from the competition input directory.
train_df = pd.read_csv("../input/ventilator-pressure-prediction/train.csv")
test_df = pd.read_csv("../input/ventilator-pressure-prediction/test.csv")

In [None]:
# Preview the first rows of the raw training table.
train_df.head()

In [None]:
def make_feature(train_df, test_df):
    """Add per-breath engineered features to the train and test frames.

    Both frames must contain the columns ``breath_id``, ``time_step``,
    ``u_in`` and ``u_out``. The inputs are not modified; new frames with the
    same number of rows are returned.

    Added columns:
        time_from_u_out_change: ``time_step`` minus the ``time_step`` of the
            first row in the breath where ``u_out`` increases by 1. NaN for
            breaths where ``u_out`` never increases.
        u_in_cumsum: running sum of ``u_in`` within the breath.
        u_in_diff: change in ``u_in`` divided by change in ``time_step``
            within the breath (0 on the first row of each breath).
        area: running sum of ``time_step * u_in`` within the breath.
        u_in_lag2: ``u_in`` shifted by two rows within the breath (0 where
            no such row exists).

    Returns:
        Tuple ``(train_df, test_df)`` with the extra columns.
    """

    def _make_feature_per_dataset(df):
        # Rows where u_out steps up by exactly 1 relative to the previous row
        # of the same breath.
        is_rising_edge = df.groupby("breath_id")["u_out"].diff() == 1
        u_out_change_time = (
            df.loc[is_rising_edge, ["breath_id", "time_step"]]
            # Keep a single change time per breath. Without this, a breath
            # with several rising edges would have each of its rows
            # duplicated by the merge below, silently changing the row count.
            .drop_duplicates(subset="breath_id", keep="first")
            .rename(columns={"time_step": "u_out_change_time_step"})
        )
        df = df.merge(u_out_change_time, on="breath_id", how="left")
        df["time_from_u_out_change"] = df["time_step"] - df["u_out_change_time_step"]
        df = df.drop(columns=["u_out_change_time_step"])

        df["u_in_cumsum"] = df.groupby("breath_id")["u_in"].cumsum()
        # First row of each breath has no predecessor -> NaN -> 0.
        df["u_in_diff"] = (
            df.groupby("breath_id")["u_in"].diff()
            / df.groupby("breath_id")["time_step"].diff()
        ).fillna(0)

        df["area"] = df["time_step"] * df["u_in"]
        df["area"] = df.groupby("breath_id")["area"].cumsum()
        df["u_in_lag2"] = df.groupby("breath_id")["u_in"].shift(2).fillna(0)

        return df

    train_df = _make_feature_per_dataset(train_df)
    test_df = _make_feature_per_dataset(test_df)

    return train_df, test_df


In [None]:
def normalize_feature(train_df, valid_df, test_df):
    """Standardize the continuous feature columns using training statistics.

    A single ``StandardScaler`` is fitted on ``train_df`` and then applied to
    all three frames. The listed columns are overwritten in the frames that
    were passed in, and the same frames are returned.

    Returns:
        Tuple ``(train_df, valid_df, test_df)``.
    """
    cols = [
        "u_in",
        "u_in_cumsum",
        "time_step",
        "time_from_u_out_change",
        "u_in_diff",
        "area",
        "u_in_lag2",
    ]

    # Fit on the training split only so validation/test statistics never
    # influence the scaling.
    scaler = StandardScaler().fit(train_df[cols])
    for frame in (train_df, valid_df, test_df):
        frame[cols] = scaler.transform(frame[cols])

    return train_df, valid_df, test_df

In [None]:
class PressureDataset(Dataset):
    """Dataset yielding one item per ``breath_id``.

    Each item is a dict of tensors: ``"global"`` holds the integer-encoded
    ``R`` and ``C`` values of the breath, and every name in ``seq_features``
    maps to that column's values for the breath as a float tensor. When
    ``is_train`` is true the item is a ``(features, target)`` pair where the
    target is the breath's ``pressure`` column.
    """

    def __init__(self, df, seq_features, is_train=True):
        """Pre-group ``df`` by breath so that item lookup is a dict access.

        Args:
            df: frame with ``id``, ``breath_id``, ``R``, ``C``, every column
                named in ``seq_features`` and, if ``is_train``, ``pressure``.
            seq_features: names of the per-row feature columns to expose.
            is_train: whether to also load the ``pressure`` targets.
        """
        self.ids = df["id"].values
        self.breath_ids = df["breath_id"].unique()
        self.seq_features = seq_features
        self.is_train = is_train

        # Raw R / C value -> embedding index.
        self.r_dict = {5: 0, 20: 1, 50: 2}
        self.c_dict = {10: 0, 20: 1, 50: 2}

        by_breath = df.groupby("breath_id")

        # R and C are read from the first row of each breath.
        self.R_dict = by_breath["R"].first().map(self.r_dict).to_dict()
        self.C_dict = by_breath["C"].first().map(self.c_dict).to_dict()

        def _arrays_per_breath(column):
            # {breath_id: ndarray of that breath's values for `column`}
            return by_breath[column].apply(lambda s: s.values).to_dict()

        self.seq_features_arr_dict = {
            feat: _arrays_per_breath(feat) for feat in self.seq_features
        }

        if is_train:
            self.target_arr_dict = _arrays_per_breath("pressure")

    def __len__(self):
        """Number of distinct breaths."""
        return len(self.breath_ids)

    def __getitem__(self, idx):
        """Return the feature dict (and target when training) for breath ``idx``."""
        bid = self.breath_ids[idx]

        features = {
            "global": torch.tensor(
                [self.R_dict[bid], self.C_dict[bid]], dtype=torch.long
            )
        }
        features.update(
            (feat, torch.tensor(self.seq_features_arr_dict[feat][bid], dtype=torch.float))
            for feat in self.seq_features
        )

        if not self.is_train:
            return features

        target = torch.tensor(self.target_arr_dict[bid], dtype=torch.float)
        return features, target

In [None]:
class RNNModel(nn.Module):
    """Bidirectional-LSTM regressor producing one value per time step.

    Input is the feature dict built by ``PressureDataset`` after batching:
    ``features["global"]`` is a long tensor indexed as ``[:, 0]`` (R index)
    and ``[:, 1]`` (C index); every name in ``seq_features`` maps to a
    ``(batch, seq_len)`` float tensor.

    Output has shape ``(batch, seq_len, 1)``.
    """

    def __init__(self, seq_features, pred_len=80, seq_len=80, device="cpu", n_hidden=128):
        """Build the layers.

        Args:
            seq_features: ordered names of the per-step input features.
            pred_len: stored for reference; the forward pass derives the
                sequence length from the input instead.
            seq_len: stored for reference only.
            device: stored for reference only; the caller moves the module.
            n_hidden: width of the per-step projection and of each LSTM
                direction.
        """
        super(RNNModel, self).__init__()

        self.seq_features = seq_features
        self.seq_feature_len = len(seq_features)
        self.pred_len = pred_len
        self.seq_len = seq_len
        self.device = device

        # Per-step projection of the raw features into the hidden width.
        self.seq_linear = nn.Sequential(
            nn.Linear(self.seq_feature_len, n_hidden),
            nn.LayerNorm(n_hidden),
            nn.ReLU(),
        )

        # R and C each take one of three values -> 8-dim embeddings.
        self.r_emb = nn.Embedding(3, 8)
        self.c_emb = nn.Embedding(3, 8)

        self.encoder_rnn = nn.LSTM(
            num_layers=4,
            input_size=n_hidden,
            hidden_size=n_hidden,
            batch_first=True,
            bidirectional=True,
        )

        # Not used in the forward pass. Kept so that the parameter set (and
        # therefore saved state_dict keys and seeded initialisation order)
        # stays identical to existing checkpoints.
        self.decoder_rnn_cell = nn.LSTMCell(
            input_size=n_hidden,
            hidden_size=n_hidden,
        )

        # Input width: both LSTM directions + R embedding + C embedding.
        self.decoder_out = nn.Sequential(
            nn.Linear(n_hidden*2 + 8*2, n_hidden),
            nn.LayerNorm(n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 1),
        )

    def forward(self, features):
        """Predict one value per time step.

        Defined as ``forward`` (not ``__call__``) so that ``nn.Module``'s own
        ``__call__`` machinery (hooks, wrappers) keeps working;
        ``model(features)`` behaves as before.

        Args:
            features: dict of batched tensors as described on the class.

        Returns:
            Tensor of shape ``(batch, seq_len, 1)``.
        """
        # global: (batch, 16)
        global_hidden = torch.cat([
            self.r_emb(features["global"][:, 0]),
            self.c_emb(features["global"][:, 1]),
        ], dim=-1)

        # sequence: (batch, seq_len, n_features)
        seq_input = torch.cat([
            features[f].unsqueeze(-1) for f in self.seq_features
        ], dim=-1)
        seq_hidden = self.seq_linear(seq_input)  # (batch, seq_len, n_hidden)

        hidden, _ = self.encoder_rnn(seq_hidden)  # (batch, seq_len, 2 * n_hidden)

        # Broadcast the per-breath embedding over the actual sequence length
        # rather than the configured pred_len, so inputs of any length work.
        steps = hidden.size(1)
        hidden = torch.cat([
            hidden, global_hidden.unsqueeze(1).expand(-1, steps, -1)
        ], dim=-1)

        pred = self.decoder_out(hidden)

        return pred

In [None]:
# debug
# Disabled smoke test: builds a dataset, pulls one batch and runs a forward
# pass. Flip the guard to 1 to run it. It uses raw columns only, so it works
# before any feature engineering has been applied to train_df.
if 0:
    debug_seq_features = ["u_in", "u_out"]

    dataset = PressureDataset(train_df, seq_features=debug_seq_features, is_train=True)

    loader = DataLoader(
        dataset,
        batch_size=8,
        pin_memory=True,
        drop_last=False,
        shuffle=True
    )

    model = RNNModel(seq_features=debug_seq_features)

    # A single batch is enough to check shapes.
    features, target = next(iter(loader))

    pred = model(features)

In [None]:
def train_one_epoch(model, loss_fn, data_loader, optimizer, config, device, scaler=None):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: module called as ``model(features)``.
        loss_fn: callable ``loss_fn(preds, targets)`` on flattened tensors.
        data_loader: yields ``(features, targets)`` where ``features`` is a
            dict of tensors; must expose ``.dataset`` with a length.
        optimizer: optimizer over ``model``'s parameters.
        config: only ``config.batch_size`` is read, for the progress line.
        device: device the batch tensors are moved to.
        scaler: optional gradient scaler; when given, the forward pass runs
            under CUDA autocast and the step goes through the scaler.

    Returns:
        Mean of the per-batch losses weighted by batch size, i.e. the average
        loss per item actually seen (assuming ``loss_fn`` returns a batch
        mean). 0.0 if the loader yielded nothing.
    """
    weighted_loss_sum = 0.0
    seen_items = 0
    epoch_data_num = len(data_loader.dataset)

    model.train()

    for iter_i, (features, targets) in enumerate(data_loader):
        # input
        features = {k : v.to(device) for k, v in features.items()}
        targets = targets.to(device)
        batch_size = len(targets)

        # zero grad
        optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            if scaler is not None:
                # Referenced through `torch` because no `amp` name is
                # imported in this notebook.
                with torch.cuda.amp.autocast():
                    preds = model(features)
            else:
                preds = model(features)

            preds = preds.view(-1)
            targets = targets.view(-1)

            loss = loss_fn(preds, targets)
            print(f"\rTrain: {iter_i*config.batch_size} / {epoch_data_num}", end='')
            if USE_TPU:
                # NOTE(review): `xm` is not imported in the visible notebook;
                # this path needs it to be provided before USE_TPU is enabled.
                loss.backward()
                xm.optimizer_step(optimizer)
            elif scaler is not None:
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

            # Weight each batch mean by its size so a short batch (or batches
            # dropped by the loader) does not skew the epoch average.
            weighted_loss_sum += loss.item() * batch_size
            seen_items += batch_size

    epoch_loss_per_data = weighted_loss_sum / seen_items if seen_items else 0.0
    print(f"\r", end="")
    print("\r                         ")
    return epoch_loss_per_data

In [None]:
def valid_one_epoch(model, loss_fn, data_loader, config, device):
    """Evaluate ``model`` over ``data_loader`` without gradient tracking.

    Args:
        model: module called as ``model(features)``; switched to eval mode.
        loss_fn: callable ``loss_fn(preds, targets)`` on flattened tensors.
        data_loader: yields ``(features, targets)`` where ``features`` is a
            dict of tensors; must expose ``.dataset`` with a length.
        config: only ``config.batch_size`` is read, for the progress line.
        device: device the batch tensors are moved to.

    Returns:
        Tuple ``(epoch_loss_per_data, val_preds, val_targets)``: the
        batch-size-weighted mean loss (assuming ``loss_fn`` returns a batch
        mean), and the flattened predictions and targets of all batches
        concatenated in loader order as NumPy arrays.
    """
    weighted_loss_sum = 0.0
    seen_items = 0
    epoch_data_num = len(data_loader.dataset)
    pred_list = []
    target_list = []

    model.eval()
    for iter_i, (features, targets) in enumerate(data_loader):
        # input
        features = {k : v.to(device) for k, v in features.items()}
        targets = targets.to(device)
        # `features` is a dict, so the batch size comes from the targets.
        batch_size = len(targets)

        with torch.no_grad():
            preds = model(features)
            preds = preds.view(-1)
            targets = targets.view(-1)

            # Accumulate the validation loss so callers (e.g. a
            # plateau-based scheduler) receive a real value, not a constant 0.
            loss = loss_fn(preds, targets)
            weighted_loss_sum += loss.item() * batch_size
            seen_items += batch_size
            print(f"\rVal: {iter_i*config.batch_size} / {epoch_data_num}", end='')

        pred_list.append(preds.detach().cpu().numpy())
        target_list.append(targets.detach().cpu().numpy())

    epoch_loss_per_data = weighted_loss_sum / seen_items if seen_items else 0.0
    val_preds = np.concatenate(pred_list, axis=0)
    val_targets = np.concatenate(target_list, axis=0)
    print(f"\r", end="")
    print("\r                         ")
    return epoch_loss_per_data, val_preds, val_targets

In [None]:
def train_run(train_df, valid_df, config, model_prefix="", save_best_model=True, save_model=True):
    """Train an ``RNNModel`` on one train/validation split.

    Each epoch runs ``train_one_epoch`` then ``valid_one_epoch``, scores the
    validation predictions with mean absolute error restricted to rows where
    ``valid_df["u_out"] == 0``, steps the scheduler and optionally
    checkpoints.

    Args:
        train_df: training rows (features already engineered/normalised).
        valid_df: validation rows; must include ``u_out`` and ``pressure``.
        config: object providing ``seed``, ``seq_features``, ``batch_size``,
            ``num_workers``, ``lr``, ``SchedulerClass``, ``scheduler_params``,
            ``use_fp16`` and ``n_epoch``.
        model_prefix: prefix for checkpoint file names under ``EXP_DIR``.
        save_best_model: write a checkpoint whenever the validation score
            improves, deleting the previous best checkpoint.
        save_model: write a ``last-checkpoint`` file after the final epoch.

    Returns:
        Tuple ``(best_val_score, results_list, val_preds_list,
        best_model_path)``. ``results_list`` holds one dict per epoch
        (``epoch``, ``lr``, ``train_loss``, ``mae``); ``val_preds_list``
        holds the validation predictions of every epoch;
        ``best_model_path`` is ``None`` if no best checkpoint was written.
    """
    set_seed(config.seed)

    if USE_TPU:
        device = xm.xla_device()
    else:
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu"
        )

    print(f"train run device : {device}")

    ###################################
    # Model
    ###################################
    model = RNNModel(seq_features=config.seq_features, device=device)
    model.to(device)

    ###################################
    # Make data
    ###################################
    train_dataset = PressureDataset(train_df, seq_features=config.seq_features, is_train=True)
    valid_dataset = PressureDataset(valid_df, seq_features=config.seq_features, is_train=True)

    # data loader
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=True,
        drop_last=True
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
    )

    ##################
    # Optimizer
    ##################
    if USE_TPU:
        lr = config.lr * xm.xrt_world_size()
    else:
        lr = config.lr
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    ##################
    # lr scheduler
    ##################
    scheduler = config.SchedulerClass(
        optimizer, **config.scheduler_params
    )

    ##################
    # loss function
    ##################
    loss_fn = nn.L1Loss()

    ###############################
    # train epoch loop
    ###############################
    best_model_path = None
    best_val_score = 10000
    best_val_epoch = -1

    results_list = []
    val_preds_list = []
    old_model_save_path = None

    # Referenced through `torch` because no `amp` name is imported in this
    # notebook; the bare name raised NameError as soon as use_fp16 was set.
    scaler = torch.cuda.amp.GradScaler() if config.use_fp16 else None

    # Plain boolean ndarray so it indexes the NumPy prediction arrays
    # positionally, independent of valid_df's index labels.
    val_calc_indices = (valid_df["u_out"] == 0).values

    for epoch in range(config.n_epoch):

        # train loop
        train_epoch_loss = train_one_epoch(model, loss_fn, train_loader, optimizer, config, device, scaler=scaler)

        # valid loop
        valid_epoch_loss, val_preds, val_targets = valid_one_epoch(model, loss_fn, valid_loader, config, device)

        # calc metric
        val_score = mean_absolute_error(val_targets[val_calc_indices], val_preds[val_calc_indices])

        # Record the learning rate that was in effect during this epoch,
        # i.e. before the scheduler advances.
        lr = optimizer.param_groups[0]['lr']

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(valid_epoch_loss)
        else:
            scheduler.step()

        # save results
        results = {
            "epoch": epoch + 1,
            "lr": lr,
            "train_loss": train_epoch_loss,
            "mae": val_score
        }

        results_list.append(results)

        print(f"\r", end="")
        pprint(results)

        val_preds_list.append(val_preds)

        if val_score < best_val_score:
            best_val_score = val_score
            best_val_epoch = epoch

            if save_best_model:

                # Only the most recent best checkpoint is kept on disk.
                if old_model_save_path is not None and old_model_save_path.exists():
                    os.remove(old_model_save_path)

                model_save_path = EXP_DIR / f"{model_prefix}best-checkpoint-{str(epoch).zfill(3)}epoch.bin"
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'epoch': epoch,
                }, model_save_path)

                old_model_save_path = model_save_path
                best_model_path = model_save_path

        if epoch == config.n_epoch -1 and save_model:
            model_save_path = EXP_DIR / f"{model_prefix}last-checkpoint.bin"
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'epoch': epoch,
            }, model_save_path)

    return best_val_score, results_list, val_preds_list, best_model_path

In [None]:
def test_predict(test_df, model_path):
    """Predict on ``test_df`` with the checkpoint stored at ``model_path``.

    Reads the module-level ``config`` for the seed, feature list, batch size
    and worker count.

    Args:
        test_df: frame accepted by ``PressureDataset`` with
            ``is_train=False``.
        model_path: checkpoint file containing a ``"model_state_dict"`` entry.

    Returns:
        1-D NumPy array with the flattened predictions of all batches, in
        loader order.
    """
    set_seed(config.seed)

    device = (
        xm.xla_device()
        if USE_TPU
        else torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    )

    # Rebuild the architecture, then restore the trained weights.
    model = RNNModel(seq_features=config.seq_features, device=device)
    model.to(device)
    checkpoint = torch.load(
        model_path,
        # Deserialize tensors onto CPU storage; load_state_dict copies them
        # into the model's parameters.
        map_location=lambda storage, loc: storage,
    )
    model.load_state_dict(checkpoint["model_state_dict"])

    # Unshuffled loader so output order follows dataset order.
    test_loader = DataLoader(
        PressureDataset(test_df, seq_features=config.seq_features, is_train=False),
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
    )
    epoch_data_num = len(test_loader.dataset)

    model.eval()
    batch_preds = []
    for iter_i, features in enumerate(test_loader):
        features = {name: tensor.to(device) for name, tensor in features.items()}

        with torch.no_grad():
            preds = model(features).view(-1)
            print(f"\rTest: {iter_i*config.batch_size} / {epoch_data_num}", end='')

        batch_preds.append(preds.detach().cpu().numpy())

    return np.concatenate(batch_preds, axis=0)

In [None]:
class Config:
    """Hyper-parameters shared by training and inference."""

    seed = 2021

    # DataLoader settings.
    num_workers = 4
    batch_size = 128

    # Optimisation schedule.
    n_epoch = 150
    lr = 5e-3

    SchedulerClass = CosineAnnealingLR
    scheduler_params = dict(
        # Tied to n_epoch so the cosine schedule always spans exactly one
        # training run instead of a separately maintained literal.
        T_max=n_epoch,
        eta_min=1e-5
    )

    n_cv_fold = 5
    use_fp16 = False  # when True, train_run creates a gradient scaler

    # Per-time-step input columns fed to the model, in order.
    seq_features = [
        "u_in",
        "u_out",
        "time_from_u_out_change",
        "u_in_cumsum",
        "u_in_diff",
        "area",
        "u_in_lag2"
    ]


config = Config()

In [None]:
# Cross-validation splitter; the fold loop groups rows by breath_id.
folds = GroupKFold(n_splits=config.n_cv_fold)
# Prediction buffers, one slot per row.
# NOTE(review): oof_preds and importance_list are not written to anywhere in the visible cells.
oof_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))
importance_list = []

# Replace the raw frames with their feature-engineered versions.
train_df, test_df = make_feature(train_df, test_df)

In [None]:
# Number of CV folds to actually train before stopping; 1 reproduces the
# previous early exit after the first fold.
MAX_FOLDS_TO_RUN = 1

model_num = 0
# Reset the accumulator here so re-running this cell starts from zeros
# instead of adding onto the already-averaged predictions of a previous run.
test_preds = np.zeros(len(test_df))

for fold_ix, (trn_idx, val_idx) in enumerate(folds.split(train_df, groups=train_df["breath_id"])):
    print(f"Fold {fold_ix}")

    _train_df = train_df.iloc[trn_idx].reset_index(drop=True)
    _valid_df = train_df.iloc[val_idx].reset_index(drop=True)

    # normalize_feature overwrites columns in place, so hand it a copy of
    # the shared test frame.
    _train_df, _valid_df, _test_df = normalize_feature(_train_df, _valid_df, test_df.copy())

    best_val_score, results_list, val_preds_list, best_model_path = train_run(
        _train_df, _valid_df,
        config, model_prefix=f"fold{fold_ix}", save_best_model=True, save_model=True
    )
    model_num += 1

    # Accumulate this fold's test predictions; averaged after the loop.
    test_preds += test_predict(_test_df, best_model_path)

    if model_num >= MAX_FOLDS_TO_RUN:
        break

test_preds = test_preds / model_num

In [None]:
# List the working directory; checkpoints are written here because EXP_DIR is "./".
!ls

In [None]:
# Fill the sample submission's pressure column with the averaged test predictions.
# NOTE(review): assumes test_preds is in the same row order as the sample submission — confirm.
sub_df = pd.read_csv("../input/ventilator-pressure-prediction/sample_submission.csv")
sub_df["pressure"] = test_preds

In [None]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv("submission.csv", index=False)