In [1]:
import numpy as np
import pandas as pd

import time
from tqdm import tqdm
from pathlib import Path
import multiprocessing as mp

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold

from transformers import get_cosine_schedule_with_warmup

import torch 
from torch import nn, optim
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader



# General Settings

In [2]:
# Directory the competition CSVs (train_df.csv, test_df.csv, submission.csv) are read from.
INPUT_PATH = Path('/kaggle/input/lets-surpass-the-hosts-bayesian-model')
# Directory the per-fold CSVs are read back from by the training loop.
OUTPUT_PATH = Path('/kaggle/working')

# Columns fed to the model as inputs.
ORIGINAL_FEATURES = ['A', 'B', 'E', 'F', 'G']

# Name of the label column.
TARGET = 'Target'

# Used as the DataLoader worker count.
N_CORES = mp.cpu_count()
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
NUM_FEATURES = len(ORIGINAL_FEATURES)
BATCH_SIZE = 32
# NOTE: N_EPOCHS, LEARNING_RATE and WEIGHT_DECAY are reassigned in a later cell
# before training starts, so the values here are not the ones actually used.
N_EPOCHS = 5
# NOTE(review): not referenced anywhere else in the visible notebook; presumably
# the warm-up step count for a learning-rate scheduler - confirm or remove.
N_WARMUPS = 80
LEARNING_RATE = 0.004
WEIGHT_DECAY = 0.01
# Seed passed to torch.manual_seed each time a DataLoader is built.
SEED = 252

# Load Data

In [3]:
# Read the labelled training table and store the label column as integers.
# The column is addressed through TARGET so its name lives in a single place.
train = pd.read_csv(INPUT_PATH / 'train_df.csv')
train[TARGET] = train[TARGET].astype(int)

# Split Folds

In [4]:
def prepare_folds(train, features, target, n_splits=10, output_dir=None):
    """Write stratified cross-validation folds (plus the test table) to CSV.

    For every fold ``k`` (numbered from 1) three files are written:
    ``df_train_fold_{k}.csv``, ``df_val_fold_{k}.csv`` and
    ``test_fold_{k}.csv``. The test table is read from
    ``INPUT_PATH / 'test_df.csv'`` and gets a dummy ``target`` column of
    zeros so it has the same columns the data loaders expect.

    Args:
        train: DataFrame holding the feature columns and the label column.
        features: list of feature column names.
        target: name of the label column used for stratification.
        n_splits: number of stratified folds (default 10).
        output_dir: directory to write the CSVs into; ``None`` keeps the
            original behaviour of writing to the current working directory.

    Returns:
        None. The function only prints progress and writes files.
    """
    X = train[features]
    y = train[target]
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
    out_dir = Path('.') if output_dir is None else Path(output_dir)

    # The test table is identical for every fold, so load and label it once.
    test = pd.read_csv(INPUT_PATH / 'test_df.csv')
    test[target] = 0

    for fold, (train_indices, val_indices) in enumerate(skf.split(X, y), start=1):
        print(f'Preparing fold {fold} ...')
        # skf.split yields *positional* indices; iloc selects by position and
        # therefore stays correct even when train has a non-default index.
        df_train = train.iloc[train_indices].reset_index(drop=True)
        df_val = train.iloc[val_indices].reset_index(drop=True)

        df_train.to_csv(out_dir / f'df_train_fold_{fold}.csv', index=False)
        df_val.to_csv(out_dir / f'df_val_fold_{fold}.csv', index=False)
        test.to_csv(out_dir / f'test_fold_{fold}.csv', index=False)

In [5]:
# Write the per-fold train / validation / test CSVs that the training loop reads back.
prepare_folds(train, ORIGINAL_FEATURES, TARGET)

Preparing fold 1 ...
Preparing fold 2 ...
Preparing fold 3 ...
Preparing fold 4 ...
Preparing fold 5 ...
Preparing fold 6 ...
Preparing fold 7 ...
Preparing fold 8 ...
Preparing fold 9 ...
Preparing fold 10 ...


# Dataset and DataLoader

In [6]:
class SpamDataset(Dataset):
    """In-memory dataset pairing a float feature matrix with integer labels."""

    def __init__(self, features, targets):
        # Convert once up front so per-item access is plain tensor indexing.
        self.features = torch.tensor(features, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.int64)

    def __len__(self):
        """Number of samples, taken from the first dimension of the labels."""
        return self.targets.size(0)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.features[index], self.targets[index]


def get_dataloader(df, feature_names, target_name, batch_size, mode):
    """Wrap the given columns of ``df`` in a ``SpamDataset`` and a ``DataLoader``.

    Args:
        df: DataFrame containing the feature and target columns.
        feature_names: list of feature column names.
        target_name: name of the label column.
        batch_size: number of samples per batch.
        mode: ``'train'`` enables shuffling and drops the last partial batch;
            any other value yields an ordered loader that keeps every sample.

    Returns:
        A ``DataLoader`` over the selected columns.

    Note:
        Calls ``torch.manual_seed(SEED)``, which resets the global torch RNG.
    """
    # Shuffling and dropping the tail batch are both tied to training mode.
    is_training = mode == 'train'

    torch.manual_seed(SEED)
    dataset = SpamDataset(
        features=df[feature_names].to_numpy(),
        targets=df[target_name].to_numpy(),
    )

    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=is_training,
        drop_last=is_training,
        num_workers=N_CORES,
    )

# Model

In [7]:
class GatedLinearUnit(nn.Module):
    """Linear projection multiplied element-wise by a sigmoid gate of the same input."""

    def __init__(self, input_size):
        super().__init__()
        self.linear = nn.Linear(input_size, input_size)
        gate_layers = [nn.Linear(input_size, input_size), nn.Sigmoid()]
        self.gate = nn.Sequential(*gate_layers)

    def forward(self, x):
        """Return ``linear(x) * sigmoid(gate_linear(x))``."""
        candidate = self.linear(x)
        gate_values = self.gate(x)
        return candidate * gate_values
    
    
class GatedResidualNetwork(nn.Module):
    """Two-layer transform with a gated output, a skip connection and layer norm.

    When the last dimension of the input differs from ``hidden_size`` the
    skip path goes through ``feature_projection`` so the two branches can be
    added.
    """

    def __init__(self, input_size, hidden_size, dropout):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        transform_layers = [
            nn.Linear(input_size, hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, hidden_size),
            nn.Dropout(dropout),
            GatedLinearUnit(hidden_size),
        ]
        self.grn = nn.Sequential(*transform_layers)

        self.layer_norm = nn.LayerNorm(hidden_size)
        self.feature_projection = nn.Linear(input_size, hidden_size)

    def forward(self, inputs):
        """Return ``layer_norm(grn(inputs) + skip)`` with ``skip`` projected if needed."""
        transformed = self.grn(inputs)
        needs_projection = inputs.shape[-1] != self.hidden_size
        skip = self.feature_projection(inputs) if needs_projection else inputs
        return self.layer_norm(transformed + skip)
    
class VariableSelectionNetwork(nn.Module):
    """Combine per-feature embeddings using softmax weights learned from all of them.

    Each feature embedding goes through its own ``GatedResidualNetwork``;
    a further network over the concatenated embeddings produces one softmax
    weight per feature, and the output is the weighted sum of the processed
    embeddings.
    """

    def __init__(self, num_features, dense_units, hidden_size, dropout):
        super().__init__()
        self.num_features = num_features
        self.hidden_size = hidden_size

        # One independent GRN per input feature.
        self.grns = nn.ModuleList(
            [GatedResidualNetwork(dense_units, hidden_size, dropout) for _ in range(num_features)]
        )

        # Weighting branch: sees every feature embedding at once.
        self.grn_concat = GatedResidualNetwork(num_features * dense_units, hidden_size, dropout)
        self.softmax = nn.Sequential(
            nn.Linear(hidden_size, num_features),
            nn.Softmax(dim=-1),
        )

    def forward(self, inputs):
        """Return the weighted sum of the processed feature embeddings.

        ``inputs`` is a sequence of tensors that are concatenated along dim 1
        (assumes each is ``(batch, dense_units)`` - TODO confirm for other ranks).
        """
        # Selection weights come first, then a trailing axis for the matmul.
        flat = torch.cat(inputs, dim=1)
        weights = self.softmax(self.grn_concat(flat)).unsqueeze(-1)

        processed = torch.stack(
            [self.grns[position](feature) for position, feature in enumerate(inputs)],
            dim=1,
        )

        # Row-vector of weights times the stacked embeddings, minus the singleton axis.
        return (weights.transpose(2, 1) @ processed).squeeze(dim=1)
    
class VariableSelectionFlow(nn.Module):
    """Embed each scalar input feature and pass the embeddings to variable selection.

    The input's last dimension is split into single-column tensors; column
    ``i`` is embedded by its own ``nn.Linear(1, dense_units)`` and the list of
    embeddings is handed to a ``VariableSelectionNetwork``.

    Args:
        num_features: number of input columns (one embedding layer each).
        hidden_size: hidden width of the variable selection network.
        dense_units: width of each per-feature embedding.
        dropout: dropout rate forwarded to the variable selection network.
    """

    def __init__(self, num_features, hidden_size, dense_units, dropout):
        super().__init__()
        self.variable_selection = VariableSelectionNetwork(num_features, dense_units, hidden_size, dropout)
        self.dense_list = nn.ModuleList(
            [nn.Linear(1, dense_units) for _ in range(num_features)]
        )

    def split(self, x):
        """Split ``x`` into width-1 chunks along its last dimension.

        Defined as a method rather than a lambda attribute so that module
        instances remain picklable.
        """
        return torch.split(x, 1, dim=-1)

    def forward(self, inputs):
        """Embed every input column and return the variable selection output."""
        columns = self.split(inputs)
        embedded = [linear(column) for column, linear in zip(columns, self.dense_list)]
        return self.variable_selection(embedded)

In [8]:
class Net(nn.Module):
    """Variable selection flow followed by a two-logit linear classifier.

    All ``nn.Linear`` weights in the model are re-initialised with Xavier
    uniform initialisation at construction time; biases are left untouched.
    """

    def __init__(self, num_features, dense_units, hidden_size, dropout):
        super().__init__()
        self.num_features = num_features
        self.dense_units = dense_units
        self.hidden_size = hidden_size
        self.dropout = dropout

        selection = VariableSelectionFlow(num_features, hidden_size, dense_units, dropout)
        classifier = nn.Linear(hidden_size, 2)
        # The attribute name (including its spelling) determines the state_dict
        # keys, so it is kept exactly as is.
        self.variable_slection_flows = nn.Sequential(selection, classifier)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-initialise the weight of ``module`` if it is an ``nn.Linear``."""
        if not isinstance(module, nn.Linear):
            return
        nn.init.xavier_uniform_(module.weight)

    def forward(self, x):
        """Return the two-class logits for input ``x``."""
        return self.variable_slection_flows(x)

# Training

In [9]:
def fit(model, optimizer, scheduler, epochs, train_dataloader, val_dataloader):
    """Train ``model`` with CUDA mixed precision and print validation AUC per epoch.

    Args:
        model: network returning two-class logits for a batch of features.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: optional learning-rate scheduler; when given it is stepped
            once per batch, after the optimizer step. ``None`` disables it.
        epochs: number of passes over ``train_dataloader``.
        train_dataloader: iterable of ``(features, targets)`` batches.
        val_dataloader: iterable of ``(features, targets)`` batches, or
            ``None`` to skip validation.

    Returns:
        ``(model, val_score)`` when ``val_dataloader`` is given, where
        ``val_score`` is the ROC AUC (a float) after the last epoch, or
        ``None`` if no epoch ran; otherwise just ``model``. The model is left
        in eval mode.
    """
    start_time = time.time()
    scaler = GradScaler()
    # Bound up front so the return below is valid even when epochs == 0.
    val_score = None

    for epoch in range(epochs):

        model.train()

        for features, targets in train_dataloader:
            features = features.to(DEVICE)
            targets = targets.to(DEVICE)
            with autocast():
                logits = model(features)
                loss = F.cross_entropy(logits, targets)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            # The schedule advances only after the optimizer has stepped.
            if scheduler is not None:
                scheduler.step()

        if val_dataloader is not None:
            # Collect per-batch results and concatenate once at the end.
            score_chunks = []
            label_chunks = []

            with torch.inference_mode():

                model.eval()

                for features, targets in val_dataloader:
                    features = features.to(DEVICE)
                    with autocast():
                        logits = model(features).detach().cpu().type(torch.float)
                        # Probability of the positive class (column 1).
                        probs = F.softmax(logits, dim=-1)[:, 1]
                    score_chunks.append(probs)
                    label_chunks.append(targets)

                y_scores = torch.cat(score_chunks)
                y_true = torch.cat(label_chunks)

                # float() accepts both numpy scalars and plain Python floats.
                val_score = float(roc_auc_score(y_true, y_scores))
                print('Validation score (AUC):', val_score)

    elapsed = (time.time() - start_time) / 60
    print(f'Total training time: {elapsed:.3f} min')

    model.eval()

    if val_dataloader is not None:
        return model, val_score
    else:
        return model

In [10]:
# Final training configuration. These reassignments replace the N_EPOCHS,
# LEARNING_RATE and WEIGHT_DECAY values set in the general settings cell.
N_EPOCHS = 8
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0
# One full cross-validation run is trained per seed; test predictions are
# averaged over all seeds.
SEEDS = [789, 279, 318, 2001, 1976, 1966, 1994, 252]

In [11]:
# Number of cross-validation folds. Must equal the number of splits written by
# prepare_folds, whose fold files are numbered 1..N_FOLDS.
N_FOLDS = 10

# Running average of the test probabilities over all seeds (becomes a tensor
# after the first seed) and the mean validation AUC of each seed.
SEED_PREDS = 0
SEED_SCORES = []

for seed in SEEDS:
    print('Seed', seed)

    val_scores = []
    # Running average of the test probabilities over the folds of this seed.
    Y_SCORES = 0

    for fold in range(1, N_FOLDS + 1):
        print(f'Starting fold {fold} ...')
        df_train = pd.read_csv(OUTPUT_PATH / f'df_train_fold_{fold}.csv')
        df_val = pd.read_csv(OUTPUT_PATH / f'df_val_fold_{fold}.csv')
        test = pd.read_csv(OUTPUT_PATH / f'test_fold_{fold}.csv')

        loader_kwargs = dict(
            feature_names=ORIGINAL_FEATURES,
            target_name=TARGET,
            batch_size=BATCH_SIZE,
        )
        train_dataloader = get_dataloader(df_train, mode='train', **loader_kwargs)
        val_dataloader = get_dataloader(df_val, mode='val', **loader_kwargs)
        test_dataloader = get_dataloader(test, mode='test', **loader_kwargs)

        # Seed after the loaders are built (get_dataloader reseeds the global
        # RNG itself) so that model initialisation depends on `seed`.
        torch.manual_seed(seed)
        model = Net(
            num_features=NUM_FEATURES,
            dense_units=8,
            hidden_size=16,
            dropout=0.05
        )

        model.to(DEVICE)

        optimizer = optim.AdamW(
            model.parameters(),
            lr=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY
        )

        model, val_score = fit(model=model,
                               optimizer=optimizer,
                               scheduler=None,
                               epochs=N_EPOCHS,
                               train_dataloader=train_dataloader,
                               val_dataloader=val_dataloader)

        val_scores.append(val_score)

        # Positive-class probabilities on the test set for this fold's model.
        pred_chunks = []
        with torch.inference_mode():
            model.eval()
            for features, _ in test_dataloader:
                features = features.to(DEVICE)
                with autocast():
                    logits = model(features).detach().cpu().type(torch.float)
                    probs = F.softmax(logits, dim=-1)[:, 1]
                pred_chunks.append(probs)
            y_scores = torch.cat(pred_chunks)

        Y_SCORES += y_scores / N_FOLDS

    SEED_PREDS += Y_SCORES / len(SEEDS)
    SEED_SCORES.append(np.mean(val_scores))

Seed 789
Starting fold 1 ...
Validation score (AUC): 0.8086734693877551
Validation score (AUC): 0.8418367346938775
Validation score (AUC): 0.8596938775510203
Validation score (AUC): 0.8571428571428572
Validation score (AUC): 0.8367346938775511
Validation score (AUC): 0.8443877551020408
Validation score (AUC): 0.854591836734694
Validation score (AUC): 0.8545918367346939
Total training time: 0.054 min
Starting fold 2 ...
Validation score (AUC): 0.7704081632653061
Validation score (AUC): 0.8061224489795918
Validation score (AUC): 0.7882653061224489
Validation score (AUC): 0.7755102040816326
Validation score (AUC): 0.7551020408163265
Validation score (AUC): 0.7576530612244898
Validation score (AUC): 0.7372448979591837
Validation score (AUC): 0.7423469387755102
Total training time: 0.048 min
Starting fold 3 ...
Validation score (AUC): 0.7627551020408162
Validation score (AUC): 0.8163265306122449
Validation score (AUC): 0.8239795918367346
Validation score (AUC): 0.826530612244898
Validation 

In [12]:
# Mean and standard deviation of the per-seed scores; each entry of
# SEED_SCORES is the mean validation AUC over the folds of one seed.
print(np.mean(SEED_SCORES))
print(np.std(SEED_SCORES))

# Last expression of the cell: shows the individual per-seed scores.
SEED_SCORES

0.8171630527210885
0.016486137137366824


[0.8361243386243388,
 0.8008427815570673,
 0.7837836986646509,
 0.8285333207357016,
 0.8219532627865961,
 0.8244633408919124,
 0.8107451499118167,
 0.830858528596624]

In [13]:
# Load the submission file shipped with the input data, overwrite its target
# column with the seed- and fold-averaged test probabilities, and save it
# under the same file name in the current working directory.
sub = pd.read_csv(INPUT_PATH / 'submission.csv')
sub[TARGET] = SEED_PREDS
sub.to_csv('submission.csv', index=False)
# Last expression of the cell: display the resulting table.
sub

Unnamed: 0,id,Target
0,420,0.553706
1,421,0.758907
2,422,0.904092
3,423,0.927545
4,424,0.916706
...,...,...
275,695,0.913177
276,696,0.619613
277,697,0.914858
278,698,0.600448
