In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import os
import copy

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

### Introduction and Credit

I had a lot of fun with this competition, and I'm disappointed it's over because I was making really good progress towards the end. My final submission got up to 0.798, but it finished scoring after the competition had ended. I'm sure I could have gotten this well above 0.8 if I had the time. There are still a number of ideas I didn't get the chance to implement. 

The following kernels and github were instrumental in building this model.

https://www.kaggle.com/wangsg/a-self-attentive-model-for-knowledge-tracing

https://www.kaggle.com/mpware/sakt-fork

https://github.com/arshadshk/SAINT-pytorch

## Load data

The poorly named train-df-saint-not-binned contains a pre processed feather version of train_df. In that kernel, I calculate all the interesting features, then create artificial users for any user with more than seq_len interactions. So if a user had 160 interactions, I split this into two users: One with the final 100 interactions, and one with the first 60 interactions. This strategy seemed to work well when applied to my SAKT fork, so I kept it with my SAINT model. As a result of this, instead of 39000 users, the model trains on 120000 "users" or so. 

Given more time, I would have liked to experiment with other ways of creating sequences for the model.

In [2]:
%%time

train_df = pd.read_feather('../input/train-df-saint-not-binned-everything/train_not_binned')
train_df['prior_question_elapsed_time'].fillna(0, inplace=True)
train_df['prior_question_elapsed_time'] /= 1000
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].round()
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].astype('int16')

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99271300 entries, 0 to 99271299
Data columns (total 9 columns):
 #   Column                       Dtype
---  ------                       -----
 0   timestamp                    int64
 1   user_id                      int64
 2   content_id                   int16
 3   answered_correctly           int8 
 4   prior_question_elapsed_time  int16
 5   part                         int8 
 6   lag1                         int16
 7   lag2                         int16
 8   lag3                         int16
dtypes: int16(5), int64(2), int8(2)
memory usage: 2.6 GB
CPU times: user 2.24 s, sys: 3.16 s, total: 5.4 s
Wall time: 21.4 s


## Preprocess

In [3]:
TARGET = 'answered_correctly'
MAX_SEQ = 100
# content_ids = train_df["content_id"].unique()
NUM_QUESTIONS = len(train_df["content_id"].unique()) + 1
NUM_USERS = len(train_df['user_id'].unique())
NUM_LAG1S = train_df['lag1'].max() + 1
NUM_LAG2S = train_df['lag2'].max() + 1
NUM_LAG3S = train_df['lag3'].max() + 1
ELAPSED_TIMES = train_df['prior_question_elapsed_time'].max() + 1
MODEL_BEST = 'model_best.pt'
BS = 1024

In [4]:
%%time

# Creates a Series with user_ids as indices and a tuple of all the content_ids and answered_correctlys as lists
def create_group(df):
    return df[['user_id', 'content_id', 'answered_correctly', 'lag1', 'lag2', 'lag3', 'part', 'prior_question_elapsed_time']].groupby('user_id').apply(lambda r: [
        r['content_id'].values,
        r['answered_correctly'].values,
        r['lag1'].values,
        r['lag2'].values,
        r['lag3'].values,
        r['part'].values,
        r['prior_question_elapsed_time'].values])

train_group = create_group(train_df)

del train_df
gc.collect()

CPU times: user 5min 9s, sys: 7.19 s, total: 5min 16s
Wall time: 5min 22s


0

In [5]:
valid_group = train_group.sample(frac=0.03)
train_group = train_group.drop(valid_group.index).reset_index(drop=True)
valid_group.reset_index(drop=True, inplace=True)
train_group.shape, valid_group.shape

((1194644,), (36948,))

In [6]:
class SAINTDataset(Dataset):
    def __init__(self, user_sequences, num_questions, subset='train', max_seq=100, min_seq=10):
        super(SAINTDataset, self).__init__()
        self.max_seq = max_seq
        self.num_questions = num_questions
        self.user_sequences = user_sequences
        self.subset = subset

        self.user_ids = []
        for user_id in user_sequences.index:
            q, _, _, _, _, _, _ = user_sequences[user_id]
            if len(q) < min_seq:
                continue
            self.user_ids.append(user_id)

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        # question_id, answered_correctly, lag1, lag2, lag3, part, elapsed_time
        q_, qa_, l1_, l2_, l3_, p_, el_ = self.user_sequences[user_id]
        seq_len = len(q_)

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        l1 = np.zeros(self.max_seq, dtype=int)
        l2 = np.zeros(self.max_seq, dtype=int)
        l3 = np.zeros(self.max_seq, dtype=int)
        p = np.zeros(self.max_seq, dtype=int)
        el = np.zeros(self.max_seq, dtype=int)
        
#         # If there are more questions answered than max_seq, take the last max_seq sequences
        if seq_len >= self.max_seq:
            q[:] = q_[-self.max_seq:]
            qa[:] = qa_[-self.max_seq:]
            l1[:] = l1_[-self.max_seq:]
            l2[:] = l2_[-self.max_seq:]
            l3[:] = l3_[-self.max_seq:]
            p[:] = p_[-self.max_seq:]
            el[:] = el_[-self.max_seq:]
        # If not, map our user_sequences to the tail end of q and qa, the start will be padded with zeros
        else:
            q[-seq_len:] = q_
            qa[-seq_len:] = qa_
            l1[-seq_len:] = l1_
            l2[-seq_len:] = l2_
            l3[-seq_len:] = l3_
            el[-seq_len:] = el_
        
        r = np.zeros(self.max_seq, dtype=int)
        r[1:] = qa[:-1].copy()
        
        return q, r, qa, l1, l2, l3, p, el 

## Define model

In [7]:
class FFN(nn.Module):
    def __init__(self, dim=128):
        super().__init__()
        self.layer1 = nn.Linear(dim, dim)
        self.layer2 = nn.Linear(dim, dim)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        return self.layer2(   self.relu(   self.layer1(x)))

    
def future_mask(seq_length):
    future_mask = np.triu(np.ones((seq_length, seq_length)), k=1).astype('bool')
    return torch.from_numpy(future_mask)


class Encoder(nn.Module):
    def __init__(self, n_in, seq_len=100, embed_dim=128, nheads=4):
        super().__init__()
        self.seq_len = seq_len

        self.part_embed = nn.Embedding(10, embed_dim)
        
        self.e_embed = nn.Embedding(n_in, embed_dim)
        self.e_pos_embed = nn.Embedding(seq_len, embed_dim)
        self.e_norm = nn.LayerNorm(embed_dim)
        
        self.e_multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=nheads, dropout=0.2)
        self.m_norm = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim)
    
    def forward(self, e, p, first_block=True):
        
        if first_block:
            e = self.e_embed(e)
            p = self.part_embed(p)
            e = e + p
         
        pos = torch.arange(self.seq_len).unsqueeze(0).to(device)
        e_pos = self.e_pos_embed(pos)
        e = e + e_pos
        e = self.e_norm(e)
        e = e.permute(1,0,2) #[bs, s_len, embed] => [s_len, bs, embed]     
        n = e.shape[0]
        
        att_mask = future_mask(n).to(device)
        att_out, _ = self.e_multi_att(e, e, e, attn_mask=att_mask)
        m = e + att_out
        m = m.permute(1,0,2)
        
        o = m + self.ffn(self.m_norm(m))
        
        return o
    
class Decoder(nn.Module):
    def __init__(self, n_in, seq_len=100, embed_dim=128, nheads=4):
        super().__init__()
        self.seq_len = seq_len
        
        self.r_embed = nn.Embedding(n_in, embed_dim)
        self.r_pos_embed = nn.Embedding(seq_len, embed_dim)
        self.r_norm = nn.LayerNorm(embed_dim)
        
        self.l1_embed = nn.Embedding(NUM_LAG1S, embed_dim)
        self.l2_embed = nn.Embedding(NUM_LAG2S, embed_dim)
        self.l3_embed = nn.Embedding(NUM_LAG3S, embed_dim)
        self.el_t_embed = nn.Embedding(ELAPSED_TIMES, embed_dim)
        
        self.r_multi_att1 = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=4, dropout=0.2)
        self.r_multi_att2 = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=4, dropout=0.2)
        self.ffn = FFN(embed_dim)
        
        self.r_norm1 = nn.LayerNorm(embed_dim)
        self.r_norm2 = nn.LayerNorm(embed_dim)
        self.r_norm3 = nn.LayerNorm(embed_dim)

    
    def forward(self, r, o, l1, l2, l3, el, first_block=True):
        
        if first_block:
            r = self.r_embed(r)
            l1 = self.l1_embed(l1)
            l2 = self.l2_embed(l2)
            l3 = self.l3_embed(l3)
            el = self.el_t_embed(el)

            r = r + l1 + l2 + l3 + el
  
        pos = torch.arange(self.seq_len).unsqueeze(0).to(device)
        r_pos_embed = self.r_pos_embed(pos)
        r = r + r_pos_embed
        r = self.r_norm1(r) 
        r = r.permute(1,0,2)   
        n = r.shape[0]
   
        att_out1, _ = self.r_multi_att1(r, r, r, attn_mask=future_mask(n).to(device))
        m1 = r + att_out1

        o = o.permute(1,0,2)
        o = self.r_norm2(o)
        att_out2, _ = self.r_multi_att2(m1, o, o, attn_mask=future_mask(n).to(device))
        
        m2 = att_out2 + m1
        m2 = m2.permute(1,0,2)        
        m2 = self.r_norm3(m2)
        
        l = m2 + self.ffn(m2)
        
        return l

# This is an altered version from https://github.com/arshadshk/SAINT-pytorch
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

class SAINT(nn.Module):
    def __init__(self, dim_model, num_en, num_de, heads_en, total_ex, total_in, heads_de, seq_len):
        super().__init__()
        
        self.num_en = num_en
        self.num_de = num_de

        self.encoder = get_clones( Encoder(n_in=total_ex, seq_len=seq_len, embed_dim=dim_model, nheads=heads_en) , num_en)
        self.decoder = get_clones( Decoder(n_in=total_in, seq_len=seq_len, embed_dim=dim_model, nheads=heads_de) , num_de)

        self.out = nn.Linear(in_features= dim_model , out_features=1)
    
    def forward(self, in_ex, in_in, l1, l2, l3, p, el):
        
        ## pass through each of the encoder blocks in sequence
        first_block = True
        for x in range(self.num_en):
            if x>=1:
                first_block = False
            in_ex = self.encoder[x](in_ex, p, first_block=first_block)
        
        ## pass through each decoder blocks in sequence
        first_block = True
        for x in range(self.num_de):
            if x>=1:
                first_block = False
            in_in = self.decoder[x]( in_in , in_ex, l1, l2, l3, el, first_block=first_block )

        ## Output layer
        in_in = torch.sigmoid( self.out( in_in ) )
        return in_in.squeeze(-1)

In [8]:
# train_iterator is our dataloader, criterion is nn.BCEWithLogitsLoss
def train_epoch(model, train_iterator, optim, criterion, device="cpu"):
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(train_iterator)
    for item in tbar:
        e = item[0].to(device).long()
        r = item[1].to(device).long()
        label = item[2].to(device).float()
        l1 = item[3].to(device).long()
        l2 = item[4].to(device).long()
        l3 = item[5].to(device).long()
        p = item[6].to(device).long()
        el = item[7].to(device).long()

        # Zero the gradients in the optimizer
        optim.zero_grad()
        # The results of one forward pass
        output = model(e, r, l1, l2, l3, p, el)
        # Calculate the loss
        loss = criterion(output, torch.sigmoid(label))
        # Calculate the gradients with respect to the loss
        loss.backward()
        # Adjust the parameters to minimize the loss based on these gradients
        optim.step()
        # Add our loss to the list of losses
        train_loss.append(loss.item())

        output = output[:, -1]
        label = label[:, -1] 
        pred = (output >= 0.5).long()
         
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(loss))

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(train_loss)

    return loss, acc, auc

In [9]:
# https://www.kaggle.com/mpware/sakt-fork
def valid_epoch(model, valid_iterator, criterion, device="cpu"):
    model.eval()

    valid_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    #tbar = tqdm(valid_iterator)
    for item in valid_iterator: # tbar:
        e = item[0].to(device).long()
        r = item[1].to(device).long()
        label = item[2].to(device).float()
        l1 = item[3].to(device).long()
        l2 = item[4].to(device).long()
        l3 = item[5].to(device).long()
        p = item[6].to(device).long()
        el = item[7].to(device).long()

        with torch.no_grad():
            output = model(e, r, l1, l2, l3, p, el)
        loss = criterion(output, torch.sigmoid(label))
        valid_loss.append(loss.item())

        output = output[:, -1] # (BS, 1)
        label = label[:, -1] 
        pred = (output >= 0.5).long()
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(valid_loss)

    return loss, acc, auc


In [10]:
gc.collect()
train_dataset = SAINTDataset(train_group, NUM_QUESTIONS, max_seq=MAX_SEQ)
train_dataloader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=8)

valid_dataset = SAINTDataset(valid_group, NUM_QUESTIONS, max_seq=MAX_SEQ, subset='valid')
valid_dataloader = DataLoader(valid_dataset, batch_size=BS, shuffle=False, num_workers=8)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SAINT(dim_model=128,
            num_en=2,
            num_de=2,
            heads_en=4,
            heads_de=4,
            total_ex=NUM_QUESTIONS, 
            total_in=2,
            seq_len=100
            )

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss() 

model.to(device)
criterion.to(device)

BCELoss()

In [12]:
gc.collect()
epochs = 30
history = []
auc_max = -np.inf

for epoch in range(1, epochs+1):
    train_loss, train_acc, train_auc = train_epoch(model, train_dataloader, optimizer, criterion, device)
    print(f'Epoch {epoch}, train_loss: {train_loss:5f}, train_acc: {train_acc:5f}, train_auc: {train_auc:5f}')
    valid_loss, valid_acc, valid_auc = valid_epoch(model, valid_dataloader, criterion, device)
    print(f'Epoch {epoch}, valid_loss: {valid_loss:5f}, valid_acc: {valid_acc:5f}, valid_auc: {valid_auc:5f}')
    
    lr = optimizer.param_groups[0]['lr']
    history.append({"epoch":epoch, "lr": lr, **{"train_auc": train_auc, "train_acc": train_acc}, **{"valid_auc": valid_auc, "valid_acc": valid_acc}})
    if valid_auc > auc_max:
        print("Epoch#%s, valid loss %.4f, Metric loss improved from %.4f to %.4f, saving model ..." % (epoch, valid_loss, auc_max, valid_auc))
        auc_max = valid_auc
        torch.save(model.state_dict(), MODEL_BEST)
    

loss - 0.6507: 100%|██████████| 1152/1152 [06:22<00:00,  3.01it/s]


Epoch 1, train_loss: 0.653833, train_acc: 0.617966, train_auc: 0.671983


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 1, valid_loss: 0.651383, valid_acc: 0.618819, valid_auc: 0.748260
Epoch#1, valid loss 0.6514, Metric loss improved from -inf to 0.7483, saving model ...


loss - 0.6512: 100%|██████████| 1152/1152 [06:24<00:00,  3.00it/s]


Epoch 2, train_loss: 0.650869, train_acc: 0.617858, train_auc: 0.766011


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 2, valid_loss: 0.650544, valid_acc: 0.618572, valid_auc: 0.771194
Epoch#2, valid loss 0.6505, Metric loss improved from 0.7483 to 0.7712, saving model ...


loss - 0.6458: 100%|██████████| 1152/1152 [06:25<00:00,  2.99it/s]


Epoch 3, train_loss: 0.650610, train_acc: 0.617843, train_auc: 0.773138


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 3, valid_loss: 0.650478, valid_acc: 0.618572, valid_auc: 0.773283
Epoch#3, valid loss 0.6505, Metric loss improved from 0.7712 to 0.7733, saving model ...


loss - 0.6514: 100%|██████████| 1152/1152 [06:27<00:00,  2.98it/s]


Epoch 4, train_loss: 0.650538, train_acc: 0.617842, train_auc: 0.775381


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 4, valid_loss: 0.650616, valid_acc: 0.618572, valid_auc: 0.775502
Epoch#4, valid loss 0.6506, Metric loss improved from 0.7733 to 0.7755, saving model ...


loss - 0.6508: 100%|██████████| 1152/1152 [06:23<00:00,  3.00it/s]


Epoch 5, train_loss: 0.650474, train_acc: 0.617842, train_auc: 0.777310


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 5, valid_loss: 0.650332, valid_acc: 0.618572, valid_auc: 0.777450
Epoch#5, valid loss 0.6503, Metric loss improved from 0.7755 to 0.7774, saving model ...


loss - 0.6515: 100%|██████████| 1152/1152 [06:25<00:00,  2.99it/s]


Epoch 6, train_loss: 0.650410, train_acc: 0.617843, train_auc: 0.779347


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 6, valid_loss: 0.650256, valid_acc: 0.618572, valid_auc: 0.780301
Epoch#6, valid loss 0.6503, Metric loss improved from 0.7774 to 0.7803, saving model ...


loss - 0.6537: 100%|██████████| 1152/1152 [06:26<00:00,  2.98it/s]


Epoch 7, train_loss: 0.650350, train_acc: 0.617843, train_auc: 0.781085


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 7, valid_loss: 0.650202, valid_acc: 0.618572, valid_auc: 0.781699
Epoch#7, valid loss 0.6502, Metric loss improved from 0.7803 to 0.7817, saving model ...


loss - 0.6512: 100%|██████████| 1152/1152 [06:26<00:00,  2.98it/s]


Epoch 8, train_loss: 0.650295, train_acc: 0.617847, train_auc: 0.782559


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 8, valid_loss: 0.650162, valid_acc: 0.618600, valid_auc: 0.783768
Epoch#8, valid loss 0.6502, Metric loss improved from 0.7817 to 0.7838, saving model ...


loss - 0.6505: 100%|██████████| 1152/1152 [06:27<00:00,  2.97it/s]


Epoch 9, train_loss: 0.650250, train_acc: 0.617870, train_auc: 0.783840


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 9, valid_loss: 0.650199, valid_acc: 0.618627, valid_auc: 0.784117
Epoch#9, valid loss 0.6502, Metric loss improved from 0.7838 to 0.7841, saving model ...


loss - 0.6509: 100%|██████████| 1152/1152 [06:23<00:00,  3.00it/s]


Epoch 10, train_loss: 0.650213, train_acc: 0.617886, train_auc: 0.784853


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 10, valid_loss: 0.650088, valid_acc: 0.618600, valid_auc: 0.784829
Epoch#10, valid loss 0.6501, Metric loss improved from 0.7841 to 0.7848, saving model ...


loss - 0.6502: 100%|██████████| 1152/1152 [06:25<00:00,  2.99it/s]


Epoch 11, train_loss: 0.650179, train_acc: 0.617914, train_auc: 0.785792


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 11, valid_loss: 0.650058, valid_acc: 0.618655, valid_auc: 0.785818
Epoch#11, valid loss 0.6501, Metric loss improved from 0.7848 to 0.7858, saving model ...


loss - 0.6490: 100%|██████████| 1152/1152 [06:25<00:00,  2.99it/s]


Epoch 12, train_loss: 0.650143, train_acc: 0.617917, train_auc: 0.787003


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 12, valid_loss: 0.650055, valid_acc: 0.618737, valid_auc: 0.786386
Epoch#12, valid loss 0.6501, Metric loss improved from 0.7858 to 0.7864, saving model ...


loss - 0.6461: 100%|██████████| 1152/1152 [06:25<00:00,  2.99it/s]


Epoch 13, train_loss: 0.650103, train_acc: 0.617935, train_auc: 0.788054


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 13, valid_loss: 0.650068, valid_acc: 0.618682, valid_auc: 0.788126
Epoch#13, valid loss 0.6501, Metric loss improved from 0.7864 to 0.7881, saving model ...


loss - 0.6506: 100%|██████████| 1152/1152 [06:25<00:00,  2.99it/s]


Epoch 14, train_loss: 0.650064, train_acc: 0.617966, train_auc: 0.789713


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 14, valid_loss: 0.649931, valid_acc: 0.618682, valid_auc: 0.790307
Epoch#14, valid loss 0.6499, Metric loss improved from 0.7881 to 0.7903, saving model ...


loss - 0.6483: 100%|██████████| 1152/1152 [06:24<00:00,  3.00it/s]


Epoch 15, train_loss: 0.650020, train_acc: 0.617975, train_auc: 0.791462


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 15, valid_loss: 0.649909, valid_acc: 0.618655, valid_auc: 0.791436
Epoch#15, valid loss 0.6499, Metric loss improved from 0.7903 to 0.7914, saving model ...


loss - 0.6507: 100%|██████████| 1152/1152 [06:26<00:00,  2.98it/s]


Epoch 16, train_loss: 0.649988, train_acc: 0.617979, train_auc: 0.792574


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 16, valid_loss: 0.649919, valid_acc: 0.618627, valid_auc: 0.792077
Epoch#16, valid loss 0.6499, Metric loss improved from 0.7914 to 0.7921, saving model ...


loss - 0.6507: 100%|██████████| 1152/1152 [06:27<00:00,  2.98it/s]


Epoch 17, train_loss: 0.649963, train_acc: 0.617986, train_auc: 0.793241


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 17, valid_loss: 0.649927, valid_acc: 0.618956, valid_auc: 0.792794
Epoch#17, valid loss 0.6499, Metric loss improved from 0.7921 to 0.7928, saving model ...


loss - 0.6495: 100%|██████████| 1152/1152 [06:26<00:00,  2.98it/s]


Epoch 18, train_loss: 0.649941, train_acc: 0.618001, train_auc: 0.794060


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 18, valid_loss: 0.649872, valid_acc: 0.618792, valid_auc: 0.793126
Epoch#18, valid loss 0.6499, Metric loss improved from 0.7928 to 0.7931, saving model ...


loss - 0.6491: 100%|██████████| 1152/1152 [06:24<00:00,  3.00it/s]


Epoch 19, train_loss: 0.649923, train_acc: 0.618017, train_auc: 0.794631


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 19, valid_loss: 0.649834, valid_acc: 0.618792, valid_auc: 0.794109
Epoch#19, valid loss 0.6498, Metric loss improved from 0.7931 to 0.7941, saving model ...


loss - 0.6478: 100%|██████████| 1152/1152 [06:24<00:00,  3.00it/s]


Epoch 20, train_loss: 0.649906, train_acc: 0.618027, train_auc: 0.795168


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 20, valid_loss: 0.649817, valid_acc: 0.618682, valid_auc: 0.794606
Epoch#20, valid loss 0.6498, Metric loss improved from 0.7941 to 0.7946, saving model ...


loss - 0.6503: 100%|██████████| 1152/1152 [06:27<00:00,  2.97it/s]


Epoch 21, train_loss: 0.649894, train_acc: 0.618048, train_auc: 0.795468


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 21, valid_loss: 0.649867, valid_acc: 0.618655, valid_auc: 0.794548


loss - 0.6459: 100%|██████████| 1152/1152 [06:23<00:00,  3.00it/s]


Epoch 22, train_loss: 0.649878, train_acc: 0.618024, train_auc: 0.795970


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 22, valid_loss: 0.649820, valid_acc: 0.618627, valid_auc: 0.795031
Epoch#22, valid loss 0.6498, Metric loss improved from 0.7946 to 0.7950, saving model ...


loss - 0.6482: 100%|██████████| 1152/1152 [06:27<00:00,  2.97it/s]


Epoch 23, train_loss: 0.649868, train_acc: 0.618066, train_auc: 0.796461


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 23, valid_loss: 0.649803, valid_acc: 0.618819, valid_auc: 0.795249
Epoch#23, valid loss 0.6498, Metric loss improved from 0.7950 to 0.7952, saving model ...


loss - 0.6475: 100%|██████████| 1152/1152 [06:24<00:00,  3.00it/s]


Epoch 24, train_loss: 0.649856, train_acc: 0.618057, train_auc: 0.796574


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 24, valid_loss: 0.649781, valid_acc: 0.618737, valid_auc: 0.795268
Epoch#24, valid loss 0.6498, Metric loss improved from 0.7952 to 0.7953, saving model ...


loss - 0.6515: 100%|██████████| 1152/1152 [06:24<00:00,  2.99it/s]


Epoch 25, train_loss: 0.649849, train_acc: 0.618092, train_auc: 0.797146


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 25, valid_loss: 0.649765, valid_acc: 0.618874, valid_auc: 0.796104
Epoch#25, valid loss 0.6498, Metric loss improved from 0.7953 to 0.7961, saving model ...


loss - 0.6512: 100%|██████████| 1152/1152 [06:23<00:00,  3.00it/s]


Epoch 26, train_loss: 0.649839, train_acc: 0.618151, train_auc: 0.797234


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 26, valid_loss: 0.649788, valid_acc: 0.618764, valid_auc: 0.795673


loss - 0.6474: 100%|██████████| 1152/1152 [06:24<00:00,  3.00it/s]


Epoch 27, train_loss: 0.649827, train_acc: 0.618137, train_auc: 0.797675


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 27, valid_loss: 0.649788, valid_acc: 0.618847, valid_auc: 0.796373
Epoch#27, valid loss 0.6498, Metric loss improved from 0.7961 to 0.7964, saving model ...


loss - 0.6501: 100%|██████████| 1152/1152 [06:26<00:00,  2.98it/s]


Epoch 28, train_loss: 0.649819, train_acc: 0.618154, train_auc: 0.797909


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 28, valid_loss: 0.649768, valid_acc: 0.618764, valid_auc: 0.796082


loss - 0.6499: 100%|██████████| 1152/1152 [06:23<00:00,  3.00it/s]


Epoch 29, train_loss: 0.649811, train_acc: 0.618123, train_auc: 0.798239


  0%|          | 0/1152 [00:00<?, ?it/s]

Epoch 29, valid_loss: 0.649754, valid_acc: 0.618764, valid_auc: 0.796355


loss - 0.6492: 100%|██████████| 1152/1152 [06:25<00:00,  2.99it/s]


Epoch 30, train_loss: 0.649803, train_acc: 0.618170, train_auc: 0.798510
Epoch 30, valid_loss: 0.649733, valid_acc: 0.618792, valid_auc: 0.796974
Epoch#30, valid loss 0.6497, Metric loss improved from 0.7964 to 0.7970, saving model ...


### Conclusion

Although I'm happy with how the competition went, I still think I could have improved on my model quite a bit. 

I had issues using a continuous representation of the prior_question_elapsed_time feature, so in the end I just left the categorical version. The SAINT+ paper found that this feature worked marginally better with a continuous representation, so I would have liked to get that working.

I'm also sure that my lag features could have been engineered better. Since several questions were given together in a bundle, they had the same timestamps. So in using a simple lag such as lag = t<sub>n</sub> - t<sub>n-1</sub>, you get a lot of lag=0, which doesn't give much signal to the model. It also feeds the model sequences that can change, depending on how the group was formed. So I would have liked to experiment with this a bit.

Finally, there was still a gap between my LB score and my training scores. So I'm sure there were still some issues to resolve on that front.

If anyone has any comments or questions, I'd love to hear them. Thanks for looking at my kernel!