In [1]:
# !pip install datasets
# !pip install torch
# !pip install pyarrow
# !pip install tokenizers


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
# from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders
from pathlib import Path
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import random
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
%env TOKENIZERS_PARALLELISM = false

env: TOKENIZERS_PARALLELISM=false


In [4]:
from datasets import load_dataset
import ast

def prepare_dataset(batch_size=32, max_length=128, train_size=10000, val_size=1000, test_size=1000):
    dataset = load_dataset("PedroCJardim/QASports")

    train_dataset = dataset["train"].select(range(min(train_size, len(dataset["train"]))))
    val_dataset = dataset["validation"].select(range(min(val_size, len(dataset["validation"]))))
    test_dataset = dataset["test"].select(range(min(test_size, len(dataset["test"]))))

    train_context = pd.Series(train_dataset["context"])
    val_context = pd.Series(val_dataset["context"])
    # test_context = pd.Series(test_dataset["context"])

    # full_df = pd.concat([train_context,val_context,test_context])
    full_df = pd.concat([train_context,val_context])
    full = list(full_df)
    random.shuffle(full)

    def preprocess_function(examples):
        questions_preprocess = [q.strip() for q in examples["question"] if q]
        answers_preprocess = [ast.literal_eval(a)["text"] for a in examples["answer"] if a]

        df_preprocess = pd.DataFrame(questions_preprocess, columns=['questions'])
        df_preprocess['answers'] = answers_preprocess
        df_preprocess = df_preprocess[df_preprocess["answers"] != ""]
        return df_preprocess

    train_dataset = preprocess_function(train_dataset)
    val_dataset = preprocess_function(val_dataset)
    test_dataset = preprocess_function(test_dataset)
    
    # Reset the index of the DataFrame
    train_dataset = train_dataset.reset_index(drop=True)
    val_dataset = val_dataset.reset_index(drop=True)
    test_dataset = test_dataset.reset_index(drop=True)


    return train_dataset, val_dataset, test_dataset, full

train_dataset, val_dataset, test_dataset, full = prepare_dataset()

In [5]:
train_dataset

Unnamed: 0,questions,answers
0,How many field goals did Kobe Bryant score?,9936
1,Which Lakers forward scored a Finals record 61...,Elgin Baylor
2,Who owns the building?,City of Memphis
3,"Who was the team's president, front office man...",Rick Pitino
4,What college did Marbury commit to?,Georgia Tech
...,...,...
6518,How many points did Beverly score in a win ove...,16
6519,When was the 2017 NBA All-Star Game played?,"February 19, 2017"
6520,"On what date did Westbrook score 20 points, 14...",October 27
6521,Who made a hard box out after a free throw on ...,Plenette Pierson


In [6]:
# train BERT tokenizer on vocabulary from the sportsQA dataset "context" column
bert_tokenizer = Tokenizer(WordPiece(unk_token="<unk>"))
bert_tokenizer.normalizer = normalizers.Sequence([Lowercase()])
bert_tokenizer.pre_tokenizer = Whitespace()
bert_tokenizer.decoder = decoders.WordPiece()
trainer = WordPieceTrainer(special_tokens=["<unk>","<pad>","<bos>","<eos>"])
bert_tokenizer.train_from_iterator(full,trainer)
bert_tokenizer.enable_padding(
    pad_id=bert_tokenizer.token_to_id('<pad>'),
    length=128,
    pad_token='<pad>'
)
bert_tokenizer.enable_truncation(128)

base = Path('tokenizer',)
base.mkdir(exist_ok=True,parents=True)
bert_tokenizer.save(str(base / 'sportsQA_context.json'))






In [7]:
x = bert_tokenizer.encode(f"<bos>{train_dataset.loc[1,'questions']}<eos>")
for a,b in zip(x.ids, x.tokens):
    if b!= '<pad>':
        print(f'{a} : {b}')
    
print('\n',bert_tokenizer.decode(x.ids),'\n\n')

x = bert_tokenizer.encode(f"<bos>{train_dataset.loc[1,'answers']}<eos>")
for a,b in zip(x.ids, x.tokens):
    if b!= '<pad>':
        print(f'{a} : {b}')
    
print('\n',bert_tokenizer.decode(x.ids))

2 : <bos>
1217 : which
793 : lakers
973 : forward
1385 : scored
40 : a
756 : finals
773 : record
1380 : 61
697 : points
33 : ?
3 : <eos>

 which lakers forward scored a finals record 61 points? 


2 : <bos>
5708 : elgin
2413 : baylor
3 : <eos>

 elgin baylor


In [8]:
class Dataset:
    def __init__(self,df):
        self.df = df
    def __len__(self,):
        return len(self.df)
    def __getitem__(self,idx):
        sample = self.df.iloc[idx,:]
        en,lang2 = sample['questions'], sample['answers']
        start_token = "<bos>"
        qn = bert_tokenizer.encode(f'<bos>{en.strip()}<eos>').ids
        ans = bert_tokenizer.encode(f'<bos>{lang2.strip()}<eos>').ids
        ans_shift = ans.copy()
        ans_shift[:-1] = ans[1:]
        ans_shift[-1] = bert_tokenizer.token_to_id('<pad>')
        
        qn = torch.tensor(qn,dtype=torch.long)
        ans = torch.tensor(ans,dtype=torch.long)
        ans_shift = torch.tensor(ans_shift,dtype=torch.long)
        ans_shift[ans_shift==1]=-100
        return qn,ans,ans_shift

In [9]:
train_ds = Dataset(train_dataset)
val_ds = Dataset(val_dataset)

In [10]:
# https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py
class RMSNorm(nn.Module):
    def __init__(self, d, p=-1., eps=1e-8, bias=False):
        """
            Root Mean Square Layer Normalization
        :param d: model size
        :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled)
        :param eps:  epsilon value, default 1e-8
        :param bias: whether use bias term for RMSNorm, disabled by
            default because RMSNorm doesn't enforce re-centering invariance.
        """
        super(RMSNorm, self).__init__()

        self.eps = eps
        self.d = d
        self.p = p
        self.bias = bias

        self.scale = nn.Parameter(torch.ones(d))
        self.register_parameter("scale", self.scale)

        if self.bias:
            self.offset = nn.Parameter(torch.zeros(d))
            self.register_parameter("offset", self.offset)

    def forward(self, x):
        if self.p < 0. or self.p > 1.:
            norm_x = x.norm(2, dim=-1, keepdim=True)
            d_x = self.d
        else:
            partial_size = int(self.d * self.p)
            partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1)

            norm_x = partial_x.norm(2, dim=-1, keepdim=True)
            d_x = partial_size

        rms_x = norm_x * d_x ** (-1. / 2)
        x_normed = x / (rms_x + self.eps)

        if self.bias:
            return self.scale * x_normed + self.offset

        return self.scale * x_normed


class MultiheadAttention(nn.Module):
    def __init__(self, dim, n_heads, dropout=0.):
        super().__init__()
        self.dim = dim
        self.n_heads = n_heads
        assert dim % n_heads == 0, 'dim should be div by n_heads'
        self.head_dim = self.dim // self.n_heads
        self.q = nn.Linear(dim, dim, bias=False)
        self.k = nn.Linear(dim, dim, bias=False)
        self.v = nn.Linear(dim, dim, bias=False)
        self.attn_dropout = nn.Dropout(dropout)
        self.scale = self.head_dim ** -0.5
        self.out_proj = nn.Linear(dim, dim, bias=False)
        
        # Add weight matrix W
        self.W = nn.Parameter(torch.Tensor(self.head_dim, self.head_dim))
        nn.init.xavier_uniform_(self.W)

    def forward(self, q, k, v, mask=None):
        batch, t, c = q.shape
        q = self.q(q)
        k = self.k(k)
        v = self.v(v)
        q = q.view(batch, q.size(1), self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        k = k.view(batch, k.size(1), self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        v = v.view(batch, v.size(1), self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        # Apply weight matrix W
        q = torch.matmul(q, self.W)
        k = torch.matmul(k, self.W)
        
        qkT = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        qkT = self.attn_dropout(qkT)
        
        if mask is not None:
            mask = mask.to(dtype=qkT.dtype, device=qkT.device)
            a, b = qkT.size(-2), qkT.size(-1)
            qkT = qkT.masked_fill(mask[:, :, :a, :b] == 0, float('-inf'))
            
        qkT = F.softmax(qkT, dim=-1)
            
        attn = torch.matmul(qkT, v)
        attn = attn.permute(0, 2, 1, 3).contiguous().view(batch, t, c)
        out = self.out_proj(attn)
        return out


    


class FeedForward(nn.Module):
    def __init__(self,dim,dropout=0.):
        super().__init__()
        self.feed_forward = nn.Sequential(
            nn.Linear(dim,dim*4,bias=False),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(dim*4,dim,bias=False)
        )
        
    def forward(self, x):
        return self.feed_forward(x)
    


class EncoderBlock(nn.Module):
    def __init__(self, dim, n_heads, attn_dropout=0., mlp_dropout=0.):
        super().__init__()
        self.attn = MultiheadAttention(dim,n_heads,attn_dropout)
        self.ffd = FeedForward(dim,mlp_dropout)
        self.ln_1 = RMSNorm(dim)
        self.ln_2 = RMSNorm(dim)
        
    def forward(self,x,mask=None):
        x = self.ln_1(x)
        x = x + self.attn(x,x,x,mask)
        x = self.ln_2(x)
        x = x + self.ffd(x)
        return x
    


class DecoderBlock(nn.Module):
    def __init__(self, dim, n_heads, attn_dropout=0., mlp_dropout=0.):
        super().__init__()
        self.self_attn = MultiheadAttention(dim,n_heads,attn_dropout)
        self.cross_attn = MultiheadAttention(dim,n_heads,attn_dropout)
        self.ln_1 = RMSNorm(dim)
        self.ln_2 = RMSNorm(dim)
        self.ln_3 = RMSNorm(dim)
        self.ffd = FeedForward(dim,mlp_dropout)
        
    def forward(self, x, enc_out, src_mask, tgt_mask):
        x = self.ln_1(x)
        x = x + self.self_attn(x,x,x,tgt_mask)
        x = self.ln_2(x)
        x = x + self.cross_attn(x,enc_out,enc_out,src_mask) # decoder: q, encoder: k,v
        x = self.ln_3(x)
        x = x + self.ffd(x)
        
        return x
    


class Embedding(nn.Module):
    def __init__(self,vocab_size,max_len,dim):
        super().__init__()
        self.max_len = max_len
        self.class_embedding = nn.Embedding(vocab_size,dim)
        self.pos_embedding = nn.Embedding(max_len,dim)
    def forward(self,x):
        x = self.class_embedding(x)
        pos = torch.arange(0,x.size(1),device=x.device)
        x = x + self.pos_embedding(pos)
        return x
    


class Seq2SeqTransformer(nn.Module):
    def __init__(self, config):
        
        super().__init__()
        
        self.embedding = Embedding(config['vocab_size'],config['max_len'],config['dim'])
        
        self.depth = config['depth']
        self.encoders = nn.ModuleList([
            EncoderBlock(
                dim=config['dim'],
                n_heads=config['n_heads'],
                attn_dropout=config['attn_dropout'],
                mlp_dropout=config['mlp_dropout']
            ) for _ in range(self.depth)
        ])
        self.decoders = nn.ModuleList([
            DecoderBlock(
                dim=config['dim'],
                n_heads=config['n_heads'],
                attn_dropout=config['attn_dropout'],
                mlp_dropout=config['mlp_dropout']
            ) for _ in range(self.depth)
        ])
        
        self.ln_f = RMSNorm(config['dim'])
        self.lm_head = nn.Linear(config['dim'],config['vocab_size'],bias=False)
        
        self.embedding.class_embedding.weight = self.lm_head.weight
        
        self.pad_token_id = config['pad_token_id']
        self.register_buffer('tgt_mask',torch.tril(torch.ones(1,1,config['max_len'],config['max_len'])))
    
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    
    def create_src_mask(self,src):
        return (src != self.pad_token_id).unsqueeze(1).unsqueeze(2) # N, 1, 1, src_len
    
    def forward(self, src, tgt, labels=None):
        
        src_mask = self.create_src_mask(src)
        
        enc_out = self.embedding(src)
        dec_out = self.embedding(tgt)
        
        for i in range(self.depth):
            enc_out = self.encoders[i](enc_out,mask=src_mask)
            dec_out = self.decoders[i](dec_out,enc_out,src_mask=src_mask,tgt_mask=self.tgt_mask)
            
        dec_out = self.ln_f(dec_out)
        
        if labels is not None:
            lm_logits = self.lm_head(dec_out)
            loss = F.cross_entropy(lm_logits.view(-1, lm_logits.shape[-1]), labels.view(-1))
            return loss
        
        lm_logits = self.lm_head(dec_out[:,[-1],:])
        return lm_logits
    
    def generate(self,src,max_tokens=80,temperature=1.0,deterministic=False,eos=5,bos=None):
        tgt = torch.ones(1,1).long() * bos
        tgt = tgt.to(src.device)
        for _ in range(max_tokens):
            out = self(src,tgt)
            out = out[:,-1,:] / temperature
            probs = F.softmax(out,dim=-1)
            if deterministic:
                next_token = torch.argmax(probs,dim=-1,keepdim=True)
            else:
                next_token = torch.multinomial(probs,num_samples=1)
            tgt = torch.cat([tgt,next_token],dim=1)
            if next_token.item() == eos:
                break
            
        return tgt.cpu().flatten()

In [11]:
config = {
    'dim': 768,
    'n_heads': 12,
    'attn_dropout': 0.1,
    'mlp_dropout': 0.1,
    'depth': 6,
    'vocab_size': bert_tokenizer.get_vocab_size(),  # Set to tokenizer vocabulary size
    'max_len': 128,
    'pad_token_id': bert_tokenizer.token_to_id('<pad>')
}

In [12]:
model = Seq2SeqTransformer(config).to('cuda')
print(sum([p.numel() for p in model.parameters() if p.requires_grad]))

122326272


In [13]:
print(model)

Seq2SeqTransformer(
  (embedding): Embedding(
    (class_embedding): Embedding(30000, 768)
    (pos_embedding): Embedding(128, 768)
  )
  (encoders): ModuleList(
    (0-5): 6 x EncoderBlock(
      (attn): MultiheadAttention(
        (q): Linear(in_features=768, out_features=768, bias=False)
        (k): Linear(in_features=768, out_features=768, bias=False)
        (v): Linear(in_features=768, out_features=768, bias=False)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=False)
      )
      (ffd): FeedForward(
        (feed_forward): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=False)
          (1): Dropout(p=0.1, inplace=False)
          (2): GELU(approximate='none')
          (3): Linear(in_features=3072, out_features=768, bias=False)
        )
      )
      (ln_1): RMSNorm()
      (ln_2): RMSNorm()
    )
  )
  (decoders): ModuleList(
    (0-5): 6 x DecoderBlock(
      (self_attn): M

In [14]:
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=128,shuffle=True,pin_memory=True,num_workers=2)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=128,shuffle=False,pin_memory=True,num_workers=2)
print(len(train_dl), len(val_dl))


51 6


In [15]:
test_dataset.head()

Unnamed: 0,questions,answers
0,How many.429.000.875 3.7 2002 Minnesota 3 1?,14.3
1,Who punched Kent Benson?,Abdul-Jabbar
2,How many regular-season games did the Huskies ...,126
3,In what year did Mitchell pledge her allegianc...,2013
4,When did the NBL merge with the BAA?,"August 3, 1949"


In [16]:
test_samples = [(test_dataset.loc[i,'questions'],test_dataset.loc[i,'answers']) for i in range(len(test_dataset))]


In [17]:
epochs = 32
train_losses = []
valid_losses = []
best_val_loss = 1e9

all_tl = []
all_lr = []

optim = torch.optim.Adam(model.parameters(),lr=1e-4)
sched = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optim,
    T_0=250,
    eta_min=1e-8
)

scaler = GradScaler()

  scaler = GradScaler()


In [None]:
for ep in tqdm(range(epochs)):
    model.train()
    trl = 0.
    tprog = tqdm(enumerate(train_dl),total=len(train_dl))
    for i, batch in tprog:
        with autocast():
            src, tgt, labels = [b.to('cuda') for b in batch]
            loss = model(src,tgt,labels)
            scaler.scale(loss).backward()
            scaler.unscale_(optim)
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)
            scaler.step(optim)
            scaler.update()
            optim.zero_grad()
            sched.step(ep + i / len(train_dl))
            all_lr.append(sched.get_last_lr())
            trl += loss.item()
            all_tl.append(loss.item())
            tprog.set_description(f'train step loss: {loss.item():.4f}')
    train_losses.append(trl/len(train_dl))
    
    gc.collect()
    torch.cuda.empty_cache()
        
    model.eval()
    with torch.no_grad():
        vrl = 0.
        vprog = tqdm(enumerate(val_dl),total=len(val_dl))
        for i, batch in vprog:
            with autocast():
                src, tgt, labels = [b.to('cuda') for b in batch]
                loss = model(src,tgt,labels)
                vrl += loss.item()
                vprog.set_description(f'valid step loss: {loss.item():.4f}')
        vloss = vrl/len(val_dl)
        valid_losses.append(vloss)
        print(f'epoch {ep} | train_loss: {train_losses[-1]:.4f} valid_loss: {valid_losses[-1]:.4f}')
        
        if vloss < best_val_loss:
            best_val_loss = vloss
            
            print('saving best model...')
            sd = model.state_dict()
            torch.save(sd,'./kaggle_checkpoint/best_model_mul.pt')
            
        print('saving epoch checkpoint...')
        sd = model.state_dict()
        torch.save(sd,f'./kaggle_checkpoint/checkpoint_model_epoch{ep}.pt')
        # TODO: finish fixing this part of the code (tokenizing stuff)
        print('predicting with current epoch model...')
        for (src,tgt) in random.choices(test_samples,k=5):
            input_ids = bert_tokenizer.encode(f"<bos>{src}<eos>").ids
            input_ids = torch.tensor(input_ids,dtype=torch.long).unsqueeze(0).to('cuda')

            bos = bert_tokenizer.token_to_id('<bos>')
            
            tgt_out = model.generate(input_ids,bos=bos,deterministic=True)
            tgt_out = bert_tokenizer.decode(tgt_out.numpy())
            print(f'\nQuestion: {src} \nAnswer: {tgt} \nModelAnswer: {tgt_out}\n')

        # for (src,tgt,lang_id) in random.choices(test_samples,k=5):
        #     input_ids = bert_tokenizer.encode(f"<s-en>{src}</s>").ids
        #     input_ids = torch.tensor(input_ids,dtype=torch.long).unsqueeze(0).to('cuda')
        #     if lang_id == 'hi':
        #         bos = bert_tokenizer.token_to_id('<s-hi>')
        #     else:
        #         bos = bert_tokenizer.token_to_id('<s-te>')
        #     tgt_out = model.generate(input_ids,bos=bos,deterministic=True)
        #     tgt_out = bert_tokenizer.decode(tgt_out.numpy())
        #     print(f'\nEN: {src} \n{lang_id.upper()}: {tgt} \n{lang_id.upper()}_MODEL: {tgt_out}\n')

        print('-'*30,'\n\n')
    
    gc.collect()
    torch.cuda.empty_cache()

  0%|          | 0/32 [00:00<?, ?it/s]
  with autocast():

train step loss: 10.5693:   0%|          | 0/51 [00:01<?, ?it/s][A
train step loss: 10.5693:   2%|▏         | 1/51 [00:01<00:55,  1.12s/it][A
train step loss: 7.8923:   2%|▏         | 1/51 [00:01<00:55,  1.12s/it] [A
train step loss: 7.8923:   4%|▍         | 2/51 [00:01<00:33,  1.44it/s][A
train step loss: 8.2716:   4%|▍         | 2/51 [00:01<00:33,  1.44it/s][A
train step loss: 8.2716:   6%|▌         | 3/51 [00:01<00:26,  1.81it/s][A
train step loss: 8.1590:   6%|▌         | 3/51 [00:02<00:26,  1.81it/s][A
train step loss: 8.1590:   8%|▊         | 4/51 [00:02<00:23,  2.04it/s][A
train step loss: 7.8142:   8%|▊         | 4/51 [00:02<00:23,  2.04it/s][A
train step loss: 7.8142:  10%|▉         | 5/51 [00:02<00:21,  2.19it/s][A
train step loss: 8.0011:  10%|▉         | 5/51 [00:03<00:21,  2.19it/s][A
train step loss: 8.0011:  12%|█▏        | 6/51 [00:03<00:19,  2.29it/s][A
train step loss: 7.8115:  12%|█▏        | 6/51

epoch 0 | train_loss: 6.6060 valid_loss: 5.2709
saving best model...
saving epoch checkpoint...
predicting with current epoch model...

Question: How did the Warriors and Celtics tie their regular season games? 
Answer: 1-1 
ModelAnswer: three


Question: Who is the best player in the region? 
Answer: Alexander 
ModelAnswer: the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the


Question: How many minutes of play did Christensen play? 
Answer: 9 
ModelAnswer: 


Question: How many points did Leonard score in the win over the Miami Heat on January 24? 
Answer: 33 
ModelAnswer: three



  3%|▎         | 1/32 [00:29<15:13, 29.47s/it]


Question: What is the name of the station for the state of Connecticut? 
Answer: WCTX 
ModelAnswer: the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 4.9527:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 4.9527:   2%|▏         | 1/51 [00:00<00:22,  2.23it/s][A
train step loss: 5.0791:   2%|▏         | 1/51 [00:00<00:22,  2.23it/s][A
train step loss: 5.0791:   4%|▍         | 2/51 [00:00<00:20,  2.39it/s][A
train step loss: 5.2907:   4%|▍         | 2/51 [00:01<00:20,  2.39it/s][A
train step loss: 5.2907:   6%|▌         | 3/51 [00:01<00:19,  2.43it/s][A
train step loss: 5.3386:   6%|▌         | 3/51 [00:01<00:19,  2.43it/s][A
train step loss: 5.3386:   8%|▊         | 4/51 [00:01<00:19,  2.46it/s][A
train step loss: 5.1287:   8%|▊         | 4/51 [00:02<00:19,  2.46it/s][A
train step loss: 5.1287:  10%|▉         | 5/51 [00:02<00:18,  2.48it/s][A
train step loss: 5.3034:  10%|▉         | 5/51 [00:02<00:18,  2.48it/s][A
train step loss: 5.3034:  12%|█▏        | 6/51 [00:02<00:18,  2.48it/s][A
train step loss: 5.1297:  12%|█▏        | 6/51 [00:02<00:18,  2.4

epoch 1 | train_loss: 4.8581 valid_loss: 4.7011
saving best model...
saving epoch checkpoint...
predicting with current epoch model...

Question: Who led the team throughout the year? 
Answer: Ron Artest 
ModelAnswer: the celtics


Question: Who was the eventual national champion of the 2005-06 season? 
Answer: North Carolina 
ModelAnswer: the celtics


Question: How many spectators did the arena originally hold? 
Answer: 20,340 
ModelAnswer: three


Question: What was Wall's single-season franchise assists record with No. 802 in the third quarter? 
Answer: 801 
ModelAnswer: the celtics



  6%|▋         | 2/32 [00:58<14:35, 29.19s/it]


Question: What company became the new owner of the Leafs and Maple Leaf Gardens? 
Answer: MLG Ventures 
ModelAnswer: the celtics

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 4.5957:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 4.5957:   2%|▏         | 1/51 [00:00<00:23,  2.13it/s][A
train step loss: 4.2754:   2%|▏         | 1/51 [00:00<00:23,  2.13it/s][A
train step loss: 4.2754:   4%|▍         | 2/51 [00:00<00:20,  2.34it/s][A
train step loss: 4.2230:   4%|▍         | 2/51 [00:01<00:20,  2.34it/s][A
train step loss: 4.2230:   6%|▌         | 3/51 [00:01<00:20,  2.38it/s][A
train step loss: 4.4591:   6%|▌         | 3/51 [00:01<00:20,  2.38it/s][A
train step loss: 4.4591:   8%|▊         | 4/51 [00:01<00:19,  2.43it/s][A
train step loss: 4.3017:   8%|▊         | 4/51 [00:02<00:19,  2.43it/s][A
train step loss: 4.3017:  10%|▉         | 5/51 [00:02<00:18,  2.46it/s][A
train step loss: 3.9811:  10%|▉         | 5/51 [00:02<00:18,  2.46it/s][A
train step loss: 3.9811:  12%|█▏        | 6/51 [00:02<00:18,  2.47it/s][A
train step loss: 4.4653:  12%|█▏        | 6/51 [00:02<00:18,  2.4

epoch 2 | train_loss: 4.1115 valid_loss: 4.4701
saving best model...
saving epoch checkpoint...
predicting with current epoch model...

Question: How many rebounds per game does Hannah have? 
Answer: 11.1 
ModelAnswer: two


Question: What did he say in an interview? 
Answer: We will win Game 2 
ModelAnswer: a a a a a a a


Question: What was John Ferriols' nickname? 
Answer: MVP 
ModelAnswer: $ 1


Question: How many championships did Kerr win as head coach of the Golden State Warriors? 
Answer: four 
ModelAnswer: three



  9%|▉         | 3/32 [01:26<13:57, 28.89s/it]


Question: In what state did the Opening L 88-80 (OT) to Western Kentucky occur? 
Answer: Bowling Green, Kentucky 
ModelAnswer: game 5

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 3.7144:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 3.7144:   2%|▏         | 1/51 [00:00<00:24,  2.06it/s][A
train step loss: 3.5552:   2%|▏         | 1/51 [00:00<00:24,  2.06it/s][A
train step loss: 3.5552:   4%|▍         | 2/51 [00:00<00:21,  2.30it/s][A
train step loss: 3.8355:   4%|▍         | 2/51 [00:01<00:21,  2.30it/s][A
train step loss: 3.8355:   6%|▌         | 3/51 [00:01<00:20,  2.39it/s][A
train step loss: 3.5650:   6%|▌         | 3/51 [00:01<00:20,  2.39it/s][A
train step loss: 3.5650:   8%|▊         | 4/51 [00:01<00:19,  2.43it/s][A
train step loss: 3.3733:   8%|▊         | 4/51 [00:02<00:19,  2.43it/s][A
train step loss: 3.3733:  10%|▉         | 5/51 [00:02<00:18,  2.46it/s][A
train step loss: 3.3997:  10%|▉         | 5/51 [00:02<00:18,  2.46it/s][A
train step loss: 3.3997:  12%|█▏        | 6/51 [00:02<00:18,  2.47it/s][A
train step loss: 3.5587:  12%|█▏        | 6/51 [00:02<00:18,  2.4

epoch 3 | train_loss: 3.5180 valid_loss: 4.4614
saving best model...
saving epoch checkpoint...
predicting with current epoch model...

Question: Who was the former Phoenix head coach? 
Answer: The Washington Wizards 
ModelAnswer: bill russell


Question: When did the NBL merge with the BAA? 
Answer: August 3, 1949 
ModelAnswer: october 27, 2008


Question: How many games did the home team lose in the first 5 games of the series? 
Answer: each 
ModelAnswer: four


Question: Who runs the Mark Cuban Foundation? 
Answer: Brian Cuban 
ModelAnswer: bill russell



 12%|█▎        | 4/32 [01:55<13:21, 28.62s/it]


Question: When did he play in the Globetrotters game at Madison Square Garden? 
Answer: February 2008 
ModelAnswer: january 7, 2017

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 3.0125:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 3.0125:   2%|▏         | 1/51 [00:00<00:24,  2.05it/s][A
train step loss: 2.9546:   2%|▏         | 1/51 [00:00<00:24,  2.05it/s][A
train step loss: 2.9546:   4%|▍         | 2/51 [00:00<00:21,  2.29it/s][A
train step loss: 3.0567:   4%|▍         | 2/51 [00:01<00:21,  2.29it/s][A
train step loss: 3.0567:   6%|▌         | 3/51 [00:01<00:20,  2.39it/s][A
train step loss: 3.1371:   6%|▌         | 3/51 [00:01<00:20,  2.39it/s][A
train step loss: 3.1371:   8%|▊         | 4/51 [00:01<00:19,  2.43it/s][A
train step loss: 2.9903:   8%|▊         | 4/51 [00:02<00:19,  2.43it/s][A
train step loss: 2.9903:  10%|▉         | 5/51 [00:02<00:18,  2.45it/s][A
train step loss: 3.0621:  10%|▉         | 5/51 [00:02<00:18,  2.45it/s][A
train step loss: 3.0621:  12%|█▏        | 6/51 [00:02<00:18,  2.46it/s][A
train step loss: 2.9102:  12%|█▏        | 6/51 [00:02<00:18,  2.4

epoch 4 | train_loss: 2.9727 valid_loss: 4.4022
saving best model...
saving epoch checkpoint...
predicting with current epoch model...

Question: Where are the final three rounds held? 
Answer: St. Joseph Civic Arena 
ModelAnswer: the arena in the arena arena and the year and the the the


Question: Who was the first MVP to foul out of an NBA Finals game? 
Answer: Curry 
ModelAnswer: san antonio spurs


Question: When did the NBL merge with the BAA? 
Answer: August 3, 1949 
ModelAnswer: august 4, 2010


Question: What two players were sent to Vancouver in a trade? 
Answer: Bobby Hurley and Michael Smith 
ModelAnswer: a player of the year year and the year of the year and the year and and and and the year and and and the year and the year and the year year and and and and the year year year year and the year and the year year year year and the year and the year and the year year year year year



 16%|█▌        | 5/32 [02:24<12:55, 28.71s/it]


Question: What is another name for the Adelaide 36ers? 
Answer: Sixers 
ModelAnswer: the celtics

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 2.5714:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 2.5714:   2%|▏         | 1/51 [00:00<00:22,  2.23it/s][A
train step loss: 2.4100:   2%|▏         | 1/51 [00:00<00:22,  2.23it/s][A
train step loss: 2.4100:   4%|▍         | 2/51 [00:00<00:20,  2.38it/s][A
train step loss: 2.4161:   4%|▍         | 2/51 [00:01<00:20,  2.38it/s][A
train step loss: 2.4161:   6%|▌         | 3/51 [00:01<00:19,  2.43it/s][A
train step loss: 2.6064:   6%|▌         | 3/51 [00:01<00:19,  2.43it/s][A
train step loss: 2.6064:   8%|▊         | 4/51 [00:01<00:19,  2.46it/s][A
train step loss: 2.3606:   8%|▊         | 4/51 [00:02<00:19,  2.46it/s][A
train step loss: 2.3606:  10%|▉         | 5/51 [00:02<00:18,  2.48it/s][A
train step loss: 2.3930:  10%|▉         | 5/51 [00:02<00:18,  2.48it/s][A
train step loss: 2.3930:  12%|█▏        | 6/51 [00:02<00:18,  2.49it/s][A
train step loss: 2.5299:  12%|█▏        | 6/51 [00:02<00:18,  2.4

epoch 5 | train_loss: 2.4656 valid_loss: 4.5119
saving epoch checkpoint...
predicting with current epoch model...

Question: What was Drummond's height? 
Answer: 6 ft 10 in 
ModelAnswer: 6 ft 5 in


Question: In what city is the championship game played? 
Answer: Kansas City 
ModelAnswer: detroit


Question: Who was the team's lead physician? 
Answer: Dr. Tony Daly 
ModelAnswer: san antonio spurs


Question: What is the NBA Regular season Year Team GP GS MPG? 
Answer: Career high 
ModelAnswer: 2010 - 11



 19%|█▉        | 6/32 [02:52<12:22, 28.55s/it]


Question: How much was Sharman ordered to pay in damages? 
Answer: $250,000 
ModelAnswer: $ 1. 5 million

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 2.0062:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 2.0062:   2%|▏         | 1/51 [00:00<00:23,  2.16it/s][A
train step loss: 1.9007:   2%|▏         | 1/51 [00:00<00:23,  2.16it/s][A
train step loss: 1.9007:   4%|▍         | 2/51 [00:00<00:20,  2.36it/s][A
train step loss: 1.8947:   4%|▍         | 2/51 [00:01<00:20,  2.36it/s][A
train step loss: 1.8947:   6%|▌         | 3/51 [00:01<00:19,  2.42it/s][A
train step loss: 1.9304:   6%|▌         | 3/51 [00:01<00:19,  2.42it/s][A
train step loss: 1.9304:   8%|▊         | 4/51 [00:01<00:19,  2.45it/s][A
train step loss: 1.9321:   8%|▊         | 4/51 [00:02<00:19,  2.45it/s][A
train step loss: 1.9321:  10%|▉         | 5/51 [00:02<00:18,  2.47it/s][A
train step loss: 2.0518:  10%|▉         | 5/51 [00:02<00:18,  2.47it/s][A
train step loss: 2.0518:  12%|█▏        | 6/51 [00:02<00:18,  2.48it/s][A
train step loss: 1.8849:  12%|█▏        | 6/51 [00:02<00:18,  2.4

epoch 6 | train_loss: 1.9439 valid_loss: 4.5486
saving epoch checkpoint...
predicting with current epoch model...

Question: Who was the last person to hit a buzzer beater before Lillard? 
Answer: John Stockton 
ModelAnswer: bob mcadoo


Question: How large is the facility called? 
Answer: 132,000-square-foot 
ModelAnswer: 30 and the year


Question: What would happen if the ninth seed finished the regular season within four games of the eighth seed? 
Answer: A possible play-in tournament 
ModelAnswer: the big ten conference and his his his his his his his his his. 2 2 2 2 2 2, and the game game game game game game game game game game


Question: Who set a new Kentucky scoring record in a road game against Tennessee? 
Answer: Jodie Meeks 
ModelAnswer: the new york knicks



 22%|██▏       | 7/32 [03:20<11:47, 28.29s/it]


Question: What type of template is used by the SMU? 
Answer: Template:N 
ModelAnswer: center

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 1.5014:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 1.5014:   2%|▏         | 1/51 [00:00<00:21,  2.28it/s][A
train step loss: 1.5205:   2%|▏         | 1/51 [00:00<00:21,  2.28it/s][A
train step loss: 1.5205:   4%|▍         | 2/51 [00:00<00:20,  2.41it/s][A
train step loss: 1.3929:   4%|▍         | 2/51 [00:01<00:20,  2.41it/s][A
train step loss: 1.3929:   6%|▌         | 3/51 [00:01<00:19,  2.45it/s][A
train step loss: 1.3144:   6%|▌         | 3/51 [00:01<00:19,  2.45it/s][A
train step loss: 1.3144:   8%|▊         | 4/51 [00:01<00:19,  2.47it/s][A
train step loss: 1.4375:   8%|▊         | 4/51 [00:02<00:19,  2.47it/s][A
train step loss: 1.4375:  10%|▉         | 5/51 [00:02<00:18,  2.48it/s][A
train step loss: 1.4057:  10%|▉         | 5/51 [00:02<00:18,  2.48it/s][A
train step loss: 1.4057:  12%|█▏        | 6/51 [00:02<00:18,  2.49it/s][A
train step loss: 1.4617:  12%|█▏        | 6/51 [00:02<00:18,  2.4

epoch 7 | train_loss: 1.4379 valid_loss: 4.6034
saving epoch checkpoint...
predicting with current epoch model...

Question: When was the downtown restaurant opened? 
Answer: 1967 
ModelAnswer: october 11, 2012


Question: What is Charleston Southern affiliated with? 
Answer: South Carolina Baptist Convention 
ModelAnswer: his man


Question: How many SEC Tournament championships does Auburn win? 
Answer: four 
ModelAnswer: four


Question: How many rebounds did Holiday have for the West? 
Answer: five 
ModelAnswer: 4. 5



 25%|██▌       | 8/32 [03:48<11:16, 28.18s/it]


Question: What was Jefferson traded to the Utah Jazz for? 
Answer: two future first round picks and center Kosta Koufos 
ModelAnswer: $ 24 million

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 0.9664:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 0.9664:   2%|▏         | 1/51 [00:00<00:22,  2.21it/s][A
train step loss: 1.0125:   2%|▏         | 1/51 [00:00<00:22,  2.21it/s][A
train step loss: 1.0125:   4%|▍         | 2/51 [00:00<00:20,  2.38it/s][A
train step loss: 0.9342:   4%|▍         | 2/51 [00:01<00:20,  2.38it/s][A
train step loss: 0.9342:   6%|▌         | 3/51 [00:01<00:19,  2.44it/s][A
train step loss: 0.9614:   6%|▌         | 3/51 [00:01<00:19,  2.44it/s][A
train step loss: 0.9614:   8%|▊         | 4/51 [00:01<00:19,  2.46it/s][A
train step loss: 1.0164:   8%|▊         | 4/51 [00:02<00:19,  2.46it/s][A
train step loss: 1.0164:  10%|▉         | 5/51 [00:02<00:18,  2.48it/s][A
train step loss: 0.9463:  10%|▉         | 5/51 [00:02<00:18,  2.48it/s][A
train step loss: 0.9463:  12%|█▏        | 6/51 [00:02<00:18,  2.49it/s][A
train step loss: 0.9218:  12%|█▏        | 6/51 [00:02<00:18,  2.4

epoch 8 | train_loss: 0.9868 valid_loss: 4.6919
saving epoch checkpoint...
predicting with current epoch model...

Question: What did Carter-Williams do during the 2012 offseason? 
Answer: worked on his game 
ModelAnswer: a 24


Question: How many of the 74 championships in NBA history are the Celtics and Lakers tied for? 
Answer: 34 
ModelAnswer: eight


Question: What was the name given to the Monday edition of Inside? 
Answer: Players Only 
ModelAnswer: the 2


Question: What is the finepharm AZS KK Jelenia Gora season 2006/2007 league table? 
Answer: 1 
ModelAnswer: the 2019



 28%|██▊       | 9/32 [04:16<10:48, 28.18s/it]


Question: What is the 83rd season of service for the Arena? 
Answer: 2008-09 season 
ModelAnswer: the game 5

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 0.6631:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 0.6631:   2%|▏         | 1/51 [00:00<00:22,  2.18it/s][A
train step loss: 0.6237:   2%|▏         | 1/51 [00:00<00:22,  2.18it/s][A
train step loss: 0.6237:   4%|▍         | 2/51 [00:00<00:20,  2.37it/s][A
train step loss: 0.6261:   4%|▍         | 2/51 [00:01<00:20,  2.37it/s][A
train step loss: 0.6261:   6%|▌         | 3/51 [00:01<00:19,  2.43it/s][A
train step loss: 0.5631:   6%|▌         | 3/51 [00:01<00:19,  2.43it/s][A
train step loss: 0.5631:   8%|▊         | 4/51 [00:01<00:19,  2.46it/s][A
train step loss: 0.5641:   8%|▊         | 4/51 [00:02<00:19,  2.46it/s][A
train step loss: 0.5641:  10%|▉         | 5/51 [00:02<00:18,  2.48it/s][A
train step loss: 0.5881:  10%|▉         | 5/51 [00:02<00:18,  2.48it/s][A
train step loss: 0.5881:  12%|█▏        | 6/51 [00:02<00:18,  2.49it/s][A
train step loss: 0.6939:  12%|█▏        | 6/51 [00:02<00:18,  2.4

epoch 9 | train_loss: 0.6191 valid_loss: 4.7664
saving epoch checkpoint...
predicting with current epoch model...

Question: What format was the first time the Finals were played? 
Answer: best-of-five 
ModelAnswer: 1988


Question: What was the name of the sports arena from 1971 to 1972? 
Answer: San Diego Sports Arena 
ModelAnswer: boston celtics


Question: Who narrated the show? 
Answer: Tom Brokaw 
ModelAnswer: tom heinsohn


Question: When did Cooke become a full-time coach? 
Answer: February 1897 
ModelAnswer: january 15, 1994



 31%|███▏      | 10/32 [04:43<10:16, 28.01s/it]


Question: What was Webber's career-high for the 2000-01 season? 
Answer: 27 
ModelAnswer: 2 points - field, and 2 rebounds, 3 1973 - 74 53 31. 631

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 0.3984:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 0.3984:   2%|▏         | 1/51 [00:00<00:23,  2.10it/s][A
train step loss: 0.3557:   2%|▏         | 1/51 [00:00<00:23,  2.10it/s][A
train step loss: 0.3557:   4%|▍         | 2/51 [00:00<00:21,  2.33it/s][A
train step loss: 0.3553:   4%|▍         | 2/51 [00:01<00:21,  2.33it/s][A
train step loss: 0.3553:   6%|▌         | 3/51 [00:01<00:19,  2.40it/s][A
train step loss: 0.3796:   6%|▌         | 3/51 [00:01<00:19,  2.40it/s][A
train step loss: 0.3796:   8%|▊         | 4/51 [00:01<00:19,  2.45it/s][A
train step loss: 0.3631:   8%|▊         | 4/51 [00:02<00:19,  2.45it/s][A
train step loss: 0.3631:  10%|▉         | 5/51 [00:02<00:18,  2.47it/s][A
train step loss: 0.3622:  10%|▉         | 5/51 [00:02<00:18,  2.47it/s][A
train step loss: 0.3622:  12%|█▏        | 6/51 [00:02<00:18,  2.48it/s][A
train step loss: 0.3388:  12%|█▏        | 6/51 [00:02<00:18,  2.4

epoch 10 | train_loss: 0.3542 valid_loss: 4.9106
saving epoch checkpoint...
predicting with current epoch model...

Question: On what date had he scored 20 points and a career-high 14 assists? 
Answer: March 26, 2017 
ModelAnswer: december 15, 2010


Question: What is part of a dynamic urban center? 
Answer: entertainment 
ModelAnswer: los angeles clippers and center center and center center


Question: What was the European Player of the Year award in 2014? 
Answer: Euroscar 
ModelAnswer: 2015 - 16


Question: Who is the best player in the region? 
Answer: Alexander 
ModelAnswer: michael jordan



 34%|███▍      | 11/32 [05:11<09:49, 28.05s/it]


Question: What was his average per game per game? 
Answer: 15.6 points 
ModelAnswer: 6. 7

------------------------------ 





  0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 0.2438:   0%|          | 0/51 [00:00<?, ?it/s][A
train step loss: 0.2438:   2%|▏         | 1/51 [00:00<00:23,  2.14it/s][A
train step loss: 0.2098:   2%|▏         | 1/51 [00:00<00:23,  2.14it/s][A
train step loss: 0.2098:   4%|▍         | 2/51 [00:00<00:20,  2.35it/s][A
train step loss: 0.1974:   4%|▍         | 2/51 [00:01<00:20,  2.35it/s][A
train step loss: 0.1974:   6%|▌         | 3/51 [00:01<00:19,  2.41it/s][A
train step loss: 0.2433:   6%|▌         | 3/51 [00:01<00:19,  2.41it/s][A
train step loss: 0.2433:   8%|▊         | 4/51 [00:01<00:19,  2.45it/s][A
train step loss: 0.2130:   8%|▊         | 4/51 [00:02<00:19,  2.45it/s][A
train step loss: 0.2130:  10%|▉         | 5/51 [00:02<00:18,  2.47it/s][A
train step loss: 0.2393:  10%|▉         | 5/51 [00:02<00:18,  2.47it/s][A
train step loss: 0.2393:  12%|█▏        | 6/51 [00:02<00:18,  2.48it/s][A