# Imports

In [1]:
import sys
sys.path.append('/kaggle/input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange
import lightgbm as lgb
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import time
from tqdm import tqdm
from pathlib import Path
import multiprocessing as mp

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from transformers import get_cosine_schedule_with_warmup

import torch 
from torch import nn, optim
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader



In [2]:
INPUT_PATH = Path('/kaggle/input/icr-identify-age-related-conditions')
OUTPUT_PATH = Path('/kaggle/working')
ORIGINAL_FEATURES = ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN', 'BP',
       'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU',
       'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 'EB',
       'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI', 'FL',
       'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL']

TARGET = 'Class'

N_CORES = mp.cpu_count()
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
NUM_FEATURES = len(ORIGINAL_FEATURES)
BATCH_SIZE = 32
N_EPOCHS = 20
N_WARMUPS = 0
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.005
SEED = 252

# Read data

In [3]:
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv').drop(['Id'], axis=1)
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

# Evaluation Metric

In [4]:
import torch

def score(y_true, y_pred):

    # Calculate the number of observations for each class
    N_0 = torch.sum(1 - y_true)
    N_1 = torch.sum(y_true)
    
    # Calculate the predicted probabilities for each class
    p_1 = torch.clamp(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    
    # Calculate the average log loss for each class
    log_loss_0 = -torch.sum((1 - y_true) * torch.log(p_0)) / N_0
    log_loss_1 = -torch.sum(y_true * torch.log(p_1)) / N_1
    
    # Return the (not further weighted) average of the averages
    a = (log_loss_0 + log_loss_1) / 2
    
    return a


In [5]:
def balanced_log_loss(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    # y_true: correct labels 0, 1
    # y_pred: predicted probabilities of class=1
    # Implements the Evaluation equation with w_0 = w_1 = 1.
    # Calculate the number of observations for each class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # Calculate the predicted probabilities for each class
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    # Calculate the average log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1
    # return the (not further weighted) average of the averages
    a = (log_loss_0 + log_loss_1)/2
    return 'Loss', a, False

# 5-seed ensemble

In [6]:
features = ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN', 'BP',
       'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU',
       'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 'EB',
       'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI', 'FL',
       'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL']

train['EJ'] = train['EJ'].str.strip()
test['EJ'] = test['EJ'].str.strip()

train['EJ'] = train['EJ'].map({'A': 1, 'B': 0})
test['EJ'] = test['EJ'].map({'A': 1, 'B': 0})

In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
X = train[features]
y = train['Class']

for fold, (_, val_idx) in enumerate(skf.split(X, y)):
    train.loc[train.index.isin(val_idx), 'fold'] = fold + 1
train['fold'] = train['fold'].astype(int)

In [8]:
def prepare_folds(train, features, target):
    X = train[features]
    y = train[target]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    scaler = StandardScaler()
    
    fold = 0
    for train_indices, val_indices in skf.split(X, y):
        fold += 1
        print(f'Preparing fold {fold} ...')
        df_train = train.loc[train.index.isin(train_indices)].reset_index(drop=True)
        df_val = train.loc[train.index.isin(val_indices)].reset_index(drop=True)
        
        df_train[features] = scaler.fit_transform(df_train[features])
        df_val[features] = scaler.transform(df_val[features])
        
        df_train.to_csv(f'df_train_fold_{fold}.csv', index=False)
        df_val.to_csv(f'df_val_fold_{fold}.csv', index=False)

In [9]:
prepare_folds(train, ORIGINAL_FEATURES, TARGET)

Preparing fold 1 ...
Preparing fold 2 ...
Preparing fold 3 ...
Preparing fold 4 ...
Preparing fold 5 ...


In [10]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='median')

In [11]:
class SpamDataset(Dataset):
    def __init__(self, features, targets):
        self.features = torch.tensor(features, dtype=torch.float)
        self.targets = torch.tensor(targets, dtype=torch.long)

    def __getitem__(self, index):
        X = self.features[index]
        y = self.targets[index]
        return X, y

    def __len__(self):
        return self.targets.shape[0]


def get_dataloader(features, targets, 
                       feature_names, 
                       target_name,
                       batch_size,
                       mode):
    if mode == 'train':
        shuffle = True
        drop_last = True
    else:
        shuffle = False
        drop_last = False
    
    torch.manual_seed(SEED)
    train_dataset = SpamDataset(
        features=features, 
        targets=targets
    )
    
    data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=N_CORES
    )
    
    return data_loader

In [12]:
class GatedLinearUnit(nn.Module):
    def __init__(self, input_size):
        super(GatedLinearUnit, self).__init__()
        self.linear = nn.Linear(input_size, input_size)
        self.gate = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        return self.linear(x) * self.gate(x)
    
    
class GatedResidualNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, dropout):
        super(GatedResidualNetwork, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.grn = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, hidden_size),
            nn.Dropout(dropout),
            GatedLinearUnit(hidden_size),
        )
        
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.feature_projection  = nn.Linear(input_size, hidden_size)
        
    def forward(self, inputs):
        x = self.grn(inputs)
        if inputs.shape[-1] != self.hidden_size:
            inputs = self.feature_projection(inputs)
        x = self.layer_norm(x + inputs)
        return x
    
class VariableSelectionNetwork(nn.Module):
    def __init__(self, num_features, dense_units, hidden_size, dropout):
        super(VariableSelectionNetwork, self).__init__()
        self.num_features = num_features
        self.hidden_size = hidden_size
        self.grns = nn.ModuleList()
        for _ in range(num_features):
            self.grns.append(GatedResidualNetwork(dense_units, hidden_size, dropout))
        
        
        self.grn_concat = GatedResidualNetwork(num_features*dense_units,  hidden_size, dropout)
        self.softmax = nn.Sequential(
            nn.Linear(hidden_size, num_features),
            nn.Softmax(dim=-1)
        )
        
    def forward(self, inputs):
        v = torch.cat(inputs, dim=1)
        v = self.grn_concat(v)
        v = self.softmax(v)
        v = torch.unsqueeze(v, dim=-1)
        
        x = []
        for idx, input_ in enumerate(inputs):
            x.append(self.grns[idx](input_))
        x = torch.stack(x, dim=1)
        
        out = (v.transpose(2, 1) @ x).squeeze(dim=1)
        return out
    
class VariableSelectionFlow(nn.Module):
    def __init__(self, num_features, hidden_size, dense_units, dropout):
        super(VariableSelectionFlow, self).__init__()
        self.variable_selection = VariableSelectionNetwork(num_features, dense_units, hidden_size, dropout)
        self.split = lambda x: torch.split(x, 1, dim=-1)
        self.dense_list = nn.ModuleList(
            [
            nn.Linear(1, dense_units) 
            for _ in range(num_features)
            ]
        )
        
        
    def forward(self, inputs):
        split_inputs = self.split(inputs)
        x = []
        for split_input, linear in zip(split_inputs, self.dense_list):
            x.append(linear(split_input))
            
        return self.variable_selection(x)
class Net(nn.Module):
    def __init__(self, num_features, dense_units, hidden_sizes, dropouts):
        super(Net, self).__init__()
        self.num_features = num_features
        self.dense_units = dense_units
        self.hidden_size_1 = hidden_sizes[0]
        self.hidden_size_2 = hidden_sizes[1]
        self.hidden_size_3 = hidden_sizes[2]

        self.dropout_1 = dropouts[0]
        self.dropout_2 = dropouts[1]
        self.dropout_3 = dropouts[2]
        
        self.variable_slection_flows = nn.Sequential(
            VariableSelectionFlow(num_features, self.hidden_size_1, dense_units, self.dropout_1),
            VariableSelectionFlow(self.hidden_size_1, self.hidden_size_2, dense_units, self.dropout_2),
            VariableSelectionFlow(self.hidden_size_2, self.hidden_size_3, dense_units, self.dropout_3),
            nn.Linear(self.hidden_size_3, 2)
        )
        
    def forward(self, x):
        logits = self.variable_slection_flows(x)
        return logits

In [13]:
def fit(model, optimizer, scheduler, epochs, train_dataloader, val_dataloader):

    start_time = time.time()
    scaler = GradScaler()

    for epoch in range(epochs):

        model.train()
        
        for batch_idx, (features, targets) in enumerate(train_dataloader):
            features = features.to(DEVICE)
            targets = targets.to(DEVICE)
            with autocast():
                logits = model(features)
                probs = F.softmax(logits, dim=-1)[:, 1]
                loss = score(targets, probs)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

            if not batch_idx % 10:
                print(
                    f'Epoch: {epoch + 1}/{epochs}'
                    f' | Batch: {batch_idx}/{len(train_dataloader)}'
                    f' | Loss: {loss.detach().cpu().item():.4f}')

        if val_dataloader is not None:
            y_scores = torch.tensor([])
            y_true = torch.tensor([])

            with torch.inference_mode():

                model.eval()

                for batch_idx, (features, targets) in enumerate(val_dataloader):
                    features = features.to(DEVICE)
                    with autocast():
                        logits = model(features).detach().cpu().type(torch.float)
                        probs = F.softmax(logits, dim=-1)[:, 1]
                        y_scores = torch.cat([y_scores, probs])
                        y_true = torch.cat([y_true, targets])

                val_score = balanced_log_loss(y_true, y_scores)
                print('Validation score (AUC):', val_score)

    elapsed = (time.time() - start_time) / 60
    print(f'Total training time: {elapsed:.3f} min')

    model.eval()

    if val_dataloader is not None:
        return model, val_score
    else:
        return model

In [14]:
N_EPOCHS = 15
N_WARMUPS = 100
WEIGHT_DECAY = 0.005
LEARNING_RATE = 0.01

In [15]:
# val_scores = []
# for fold in range(1, 6):
#     df_train = pd.read_csv(OUTPUT_PATH  / f'df_train_fold_{fold}.csv')
#     df_val = pd.read_csv(OUTPUT_PATH  / f'df_val_fold_{fold}.csv')
#     imp = SimpleImputer(strategy='median')
#     X_train = df_train[ORIGINAL_FEATURES]
#     y_train = df_train[TARGET]
#     X_val = df_val[ORIGINAL_FEATURES]
#     y_val = df_val[TARGET]
    
#     X_train, X_val = imp.fit_transform(X_train), imp.transform(X_val)
    
#     train_dataloader = get_dataloader(
#         X_train, y_train,
#         feature_names=ORIGINAL_FEATURES, 
#         target_name=TARGET,
#         batch_size=BATCH_SIZE,
#         mode='train'
#     )
    
#     val_dataloader = get_dataloader(
#         X_val, y_val, 
#         feature_names=ORIGINAL_FEATURES, 
#         target_name=TARGET,
#         batch_size=BATCH_SIZE,
#         mode='val'
#     )
    
#     torch.manual_seed(SEED)
#     model = Net(
#         num_features=NUM_FEATURES, 
#         dense_units=8,
#         hidden_sizes=[32, 32, 32],
#         dropouts=[0.75, 0.5, 0.1]
#     )
    
#     model.to(DEVICE)
#     model.train()
    
#     optimizer = optim.AdamW(
#         model.parameters(), 
#         lr=LEARNING_RATE,
#         weight_decay=WEIGHT_DECAY
#     )
    
#     scheduler = get_cosine_schedule_with_warmup(
#         optimizer=optimizer, 
#         num_warmup_steps=N_WARMUPS,
#         num_training_steps=len(train_dataloader)*N_EPOCHS
#     )
    
#     model, val_score = fit(model=model,
#                            optimizer=optimizer,
#                            scheduler=scheduler,
#                            epochs=N_EPOCHS,
#                            train_dataloader=train_dataloader,
#                            val_dataloader=val_dataloader)
    
#     val_scores.append(val_score)

In [16]:
test['Class'] = 0

In [17]:
X_train = train[ORIGINAL_FEATURES]
y_train = train[TARGET]
X_test = test[ORIGINAL_FEATURES]
y_test = test[TARGET]

X_train, X_test = imp.fit_transform(X_train), imp.transform(X_test)

train_dataloader = get_dataloader(
    X_train, y_train,
    feature_names=ORIGINAL_FEATURES, 
    target_name=TARGET,
    batch_size=BATCH_SIZE,
    mode='train'
)

test_dataloader = get_dataloader(
    X_test, y_test, 
    feature_names=ORIGINAL_FEATURES, 
    target_name=TARGET,
    batch_size=BATCH_SIZE,
    mode='val'
)

torch.manual_seed(SEED)
model = Net(
    num_features=NUM_FEATURES, 
    dense_units=8,
    hidden_sizes=[32, 32, 32],
    dropouts=[0.75, 0.5, 0.25]
)

model.to(DEVICE)
model.train()

optimizer = optim.AdamW(
    model.parameters(), 
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY
)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer, 
    num_warmup_steps=N_WARMUPS,
    num_training_steps=len(train_dataloader)*N_EPOCHS
)

model = fit(model=model,
           optimizer=optimizer,
           scheduler=scheduler,
           epochs=N_EPOCHS,
           train_dataloader=train_dataloader,
           val_dataloader=None)

Epoch: 1/15 | Batch: 0/19 | Loss: 0.7048
Epoch: 1/15 | Batch: 10/19 | Loss: 0.6967
Epoch: 2/15 | Batch: 0/19 | Loss: 0.6925
Epoch: 2/15 | Batch: 10/19 | Loss: 0.7254
Epoch: 3/15 | Batch: 0/19 | Loss: 0.5173
Epoch: 3/15 | Batch: 10/19 | Loss: 0.7123
Epoch: 4/15 | Batch: 0/19 | Loss: 0.4894
Epoch: 4/15 | Batch: 10/19 | Loss: 0.2931
Epoch: 5/15 | Batch: 0/19 | Loss: 0.4056
Epoch: 5/15 | Batch: 10/19 | Loss: 0.4076
Epoch: 6/15 | Batch: 0/19 | Loss: 0.4935
Epoch: 6/15 | Batch: 10/19 | Loss: 0.3445
Epoch: 7/15 | Batch: 0/19 | Loss: 0.5423
Epoch: 7/15 | Batch: 10/19 | Loss: 0.5617
Epoch: 8/15 | Batch: 0/19 | Loss: 0.3421
Epoch: 8/15 | Batch: 10/19 | Loss: 0.7682
Epoch: 9/15 | Batch: 0/19 | Loss: 0.5203
Epoch: 9/15 | Batch: 10/19 | Loss: 0.3110
Epoch: 10/15 | Batch: 0/19 | Loss: 0.4536
Epoch: 10/15 | Batch: 10/19 | Loss: 0.5030
Epoch: 11/15 | Batch: 0/19 | Loss: 0.5255
Epoch: 11/15 | Batch: 10/19 | Loss: 0.2635
Epoch: 12/15 | Batch: 0/19 | Loss: 0.3376
Epoch: 12/15 | Batch: 10/19 | Loss: 0.461

In [18]:
y_scores = torch.tensor([])

with torch.inference_mode():

    model.eval()

    for batch_idx, (features, targets) in enumerate(test_dataloader):
        features = features.to(DEVICE)
        with autocast():
            logits = model(features).detach().cpu().type(torch.float)
            probs = F.softmax(logits, dim=-1)
            y_scores = torch.cat([y_scores, probs])

# Submission

In [19]:
sub = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
sub[['class_0', 'class_1']] = y_scores
sub.to_csv('submission.csv', index=False)
print('Submission file saved!')
sub

Submission file saved!


Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.923802,0.076198
1,010ebe33f668,0.923802,0.076198
2,02fa521e1838,0.923802,0.076198
3,040e15f562a2,0.923802,0.076198
4,046e85c7cc7f,0.923802,0.076198
