In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from copy import deepcopy

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer

In [2]:
import os
import torch
import random
import numpy as np

def seed_everything(seed):
    global SEED
    SEED = seed
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def train(model, train_dataset, val_dataset=None, epochs=16, batch_size=(64, None), criterion=nn.MSELoss(), lr=(1e-3, 1e-6), weight_decay=0.01, metrics=None, device=None):
    torch.cuda.empty_cache()
    gc.collect()

    if not device:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size[0], shuffle=True, num_workers=4)
    if val_dataset:
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size[1], shuffle=False, num_workers=4)
    optimizer = optim.AdamW(model.parameters(), lr=lr[0], weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_dataset) * epochs, lr[1])

    if val_dataset:
        best_loss = np.inf
        best_model = deepcopy(model)
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        train_losses = []
        pb = tqdm(train_dataloader)
        if metrics:
            trues = []
            preds = []
        model.train()
        for (x, y) in pb:
            for i in range(len(x)):
                x[i] = x[i].to(device)
            y = y.to(device)
            optimizer.zero_grad()
            pred = model(*x)
            train_loss = criterion(pred, y)
            train_loss.backward()
            optimizer.step()
            scheduler.step()
            train_losses.append(train_loss.item())
            if metrics:
                trues += y.tolist()
                preds += pred.tolist()
            pb.set_description(f'Train      | loss={np.mean(train_losses):.4f}')
        if metrics:
            metrics_info = ''
            for metric_name, metric in metrics.items():
                metrics_info += f'{metric_name}={metric(trues, preds):.4f}; '
            print('Metrics:', metrics_info[:-2])
        if val_dataset:
            model.eval()
            with torch.no_grad():
                val_losses = []
                if metrics:
                    trues = []
                    preds = []
                pb = tqdm(val_dataloader)
                for (x, y) in pb:
                    for i in range(len(x)):
                        x[i] = x[i].to(device)
                    y = y.to(device)
                    pred = model(*x)
                    val_loss = criterion(pred, y)
                    val_losses.append(val_loss.item())
                    if metrics:
                        trues += y.tolist()
                        preds += pred.tolist()
                    pb.set_description(f'Validation | loss={np.mean(val_losses):.4f}')
                if metrics:
                    metrics_info = ''
                    for metric_name, metric in metrics.items():
                        metrics_info += f'{metric_name}={metric(trues, preds):.4f}; '
                    print('Metrics:', metrics_info[:-2])
                if np.mean(val_losses) < best_loss:
                    best_loss = np.mean(val_losses)
                    best_model = deepcopy(model)
        print()
    
    torch.cuda.empty_cache()
    gc.collect()

    if val_dataset:
        return best_model

In [4]:
train_df = pd.read_csv('data/external/train.csv')

classes = ['yes', 'no', 'insufficient information']
train_df['answer_'] = train_df['answer'].apply(
    lambda x: classes.index(x) if x in classes else len(classes)
)

train_df

Unnamed: 0,questions,answer,answer_
0,Between the TechCrunch report on Sam Bankman-F...,yes,0
1,Between the report from The Verge on Apple's d...,no,1
2,Between the Polygon article published on Septe...,yes,0
3,Did the reporting on player actions in sports ...,no,1
4,Does the Sporting News article claim that Caes...,yes,0
...,...,...,...
1976,Did Engadget fail to report a discount on the ...,no,1
1977,"Between the TechCrunch article on December 7, ...",yes,0
1978,Did the FOX News - Entertainment article attri...,yes,0
1979,"Which company, covered by The Verge for exclus...",valve,3


In [5]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_length=256):
        self.texts = list(texts)
        if targets is not None:
            self.targets = list(targets)
        else:
            self.targets = None
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx], 
            padding="max_length",
            truncation=True, 
            max_length=self.max_length,
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        if self.targets is not None:
            target = torch.tensor(self.targets[idx], dtype=torch.int64)
        else:
            target = torch.nan

        return [[input_ids, attention_mask], target]

In [6]:
class BERTModelEncoder(nn.Module):
    def __init__(self, backbone_model):
        super().__init__()
        self.bert = AutoModel.from_pretrained(backbone_model)
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 4)
    
    def embed(self, input_ids, attention_mask):
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = x.pooler_output
        return x

    def forward(self, input_ids, attention_mask):
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = x.pooler_output
        x = self.dropout(x)
        x = self.regressor(x)
        return x

In [7]:
def acc(y_true, y_pred):
    y_pred = np.array(y_pred)
    y_pred = np.argmax(y_pred, axis=-1)
    return accuracy_score(y_true, y_pred)

In [8]:
backbone_model = 'google-bert/bert-large-cased'

tokenizer = AutoTokenizer.from_pretrained(backbone_model)
model = BERTModelEncoder(backbone_model).to(device)

In [9]:
df_texts = train_df[['questions', 'answer_']].copy().dropna()

train_texts, val_texts, train_targets, val_targets = \
    train_test_split(df_texts['questions'], df_texts['answer_'].values,
                     test_size=0.1, shuffle=True, random_state=SEED)

train_dataset = TextClassificationDataset(train_texts, train_targets, tokenizer)
val_dataset = TextClassificationDataset(val_texts, val_targets, tokenizer)

In [10]:
best_model = train(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    epochs=24,
    batch_size=(8, 8),
    criterion=nn.CrossEntropyLoss(),
    lr=(3e-5, 1e-6),
    weight_decay=0.01,
    metrics={'acc': acc},
    device=device
)

torch.save(best_model.state_dict(), 'data/working/bme_model.pth')
best_model.bert.save_pretrained('data/working/base_bert_model')
with open('data/working/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

Epoch 1/24


Train      | loss=0.6534: 100%|██████████| 223/223 [01:19<00:00,  2.81it/s]


Metrics: acc=0.7031


Validation | loss=0.4483: 100%|██████████| 25/25 [00:02<00:00,  9.34it/s]


Metrics: acc=0.8040

Epoch 2/24


Train      | loss=0.4047: 100%|██████████| 223/223 [01:20<00:00,  2.78it/s]


Metrics: acc=0.8199


Validation | loss=0.3853: 100%|██████████| 25/25 [00:02<00:00,  9.33it/s]


Metrics: acc=0.8392

Epoch 3/24


Train      | loss=0.2962: 100%|██████████| 223/223 [01:20<00:00,  2.78it/s]


Metrics: acc=0.8743


Validation | loss=0.3676: 100%|██████████| 25/25 [00:02<00:00,  9.34it/s]


Metrics: acc=0.8442

Epoch 4/24


Train      | loss=0.2752: 100%|██████████| 223/223 [01:19<00:00,  2.80it/s]


Metrics: acc=0.8911


Validation | loss=0.3779: 100%|██████████| 25/25 [00:02<00:00,  9.50it/s]


Metrics: acc=0.8593

Epoch 5/24


Train      | loss=0.2163: 100%|██████████| 223/223 [01:18<00:00,  2.83it/s]


Metrics: acc=0.9181


Validation | loss=0.4025: 100%|██████████| 25/25 [00:02<00:00,  9.63it/s]


Metrics: acc=0.8492

Epoch 6/24


Train      | loss=0.1718: 100%|██████████| 223/223 [01:15<00:00,  2.94it/s]


Metrics: acc=0.9315


Validation | loss=0.4400: 100%|██████████| 25/25 [00:02<00:00,  9.59it/s]


Metrics: acc=0.8593

Epoch 7/24


Train      | loss=0.1258: 100%|██████████| 223/223 [01:18<00:00,  2.84it/s]


Metrics: acc=0.9495


Validation | loss=0.4401: 100%|██████████| 25/25 [00:02<00:00,  9.48it/s]


Metrics: acc=0.8945

Epoch 8/24


Train      | loss=0.0984: 100%|██████████| 223/223 [01:20<00:00,  2.76it/s]


Metrics: acc=0.9686


Validation | loss=0.5826: 100%|██████████| 25/25 [00:02<00:00,  9.33it/s]


Metrics: acc=0.8593

Epoch 9/24


Train      | loss=0.0787: 100%|██████████| 223/223 [01:21<00:00,  2.74it/s]


Metrics: acc=0.9714


Validation | loss=0.5268: 100%|██████████| 25/25 [00:02<00:00,  9.34it/s]


Metrics: acc=0.8794

Epoch 10/24


Train      | loss=0.0668: 100%|██████████| 223/223 [01:20<00:00,  2.76it/s]


Metrics: acc=0.9776


Validation | loss=0.4627: 100%|██████████| 25/25 [00:02<00:00,  9.27it/s]


Metrics: acc=0.8643

Epoch 11/24


Train      | loss=0.0704: 100%|██████████| 223/223 [01:21<00:00,  2.75it/s]


Metrics: acc=0.9747


Validation | loss=0.5853: 100%|██████████| 25/25 [00:02<00:00,  9.27it/s]


Metrics: acc=0.8442

Epoch 12/24


Train      | loss=0.1572: 100%|██████████| 223/223 [01:20<00:00,  2.76it/s]


Metrics: acc=0.9540


Validation | loss=0.4664: 100%|██████████| 25/25 [00:02<00:00,  9.14it/s]


Metrics: acc=0.8593

Epoch 13/24


Train      | loss=0.2270: 100%|██████████| 223/223 [01:21<00:00,  2.75it/s]


Metrics: acc=0.9355


Validation | loss=0.6465: 100%|██████████| 25/25 [00:02<00:00,  9.23it/s]


Metrics: acc=0.8191

Epoch 14/24


Train      | loss=0.1587: 100%|██████████| 223/223 [01:21<00:00,  2.74it/s]


Metrics: acc=0.9551


Validation | loss=0.5681: 100%|██████████| 25/25 [00:02<00:00,  9.31it/s]


Metrics: acc=0.8794

Epoch 15/24


Train      | loss=0.0749: 100%|██████████| 223/223 [01:18<00:00,  2.84it/s]


Metrics: acc=0.9764


Validation | loss=0.5208: 100%|██████████| 25/25 [00:02<00:00,  9.67it/s]


Metrics: acc=0.8543

Epoch 16/24


Train      | loss=0.0841: 100%|██████████| 223/223 [01:19<00:00,  2.82it/s]


Metrics: acc=0.9781


Validation | loss=0.4847: 100%|██████████| 25/25 [00:02<00:00,  9.49it/s]


Metrics: acc=0.8593

Epoch 17/24


Train      | loss=0.0400: 100%|██████████| 223/223 [01:19<00:00,  2.80it/s]


Metrics: acc=0.9848


Validation | loss=0.7824: 100%|██████████| 25/25 [00:02<00:00,  9.78it/s]


Metrics: acc=0.8593

Epoch 18/24


Train      | loss=0.0271: 100%|██████████| 223/223 [01:19<00:00,  2.82it/s]


Metrics: acc=0.9910


Validation | loss=0.7414: 100%|██████████| 25/25 [00:02<00:00,  9.46it/s]


Metrics: acc=0.8543

Epoch 19/24


Train      | loss=0.0124: 100%|██████████| 223/223 [01:20<00:00,  2.77it/s]


Metrics: acc=0.9938


Validation | loss=0.8206: 100%|██████████| 25/25 [00:02<00:00,  9.28it/s]


Metrics: acc=0.8643

Epoch 20/24


Train      | loss=0.0264: 100%|██████████| 223/223 [01:20<00:00,  2.77it/s]


Metrics: acc=0.9910


Validation | loss=0.8853: 100%|██████████| 25/25 [00:02<00:00,  9.11it/s]


Metrics: acc=0.8693

Epoch 21/24


Train      | loss=0.0565: 100%|██████████| 223/223 [01:20<00:00,  2.77it/s]


Metrics: acc=0.9848


Validation | loss=0.6856: 100%|██████████| 25/25 [00:02<00:00,  9.35it/s]


Metrics: acc=0.8693

Epoch 22/24


Train      | loss=0.0733: 100%|██████████| 223/223 [01:19<00:00,  2.79it/s]


Metrics: acc=0.9804


Validation | loss=0.6706: 100%|██████████| 25/25 [00:02<00:00,  9.15it/s]


Metrics: acc=0.8191

Epoch 23/24


Train      | loss=0.9489: 100%|██████████| 223/223 [01:20<00:00,  2.78it/s]


Metrics: acc=0.5584


Validation | loss=1.3238: 100%|██████████| 25/25 [00:02<00:00,  9.65it/s]


Metrics: acc=0.2915

Epoch 24/24


Train      | loss=1.3656: 100%|██████████| 223/223 [01:17<00:00,  2.86it/s]


Metrics: acc=0.3401


Validation | loss=1.3253: 100%|██████████| 25/25 [00:02<00:00,  9.67it/s]


Metrics: acc=0.3668



In [11]:
test_df = pd.read_csv('data/external/test.csv')
test_df

Unnamed: 0,ID,questions
0,0,"After the TechCrunch report on November 18, 20..."
1,1,Considering the information from an article by...
2,2,Considering the information from an article in...
3,3,Was Owen Teale's career impact discussed in Th...
4,4,What company developed the world’s first succe...
...,...,...
843,843,Who was the CEO of Alameda and former girlfrie...
844,844,Does the article from The Verge suggest that G...
845,845,Does the Polygon article suggest that 'The Pos...
846,846,Between the report by FOX News - Health on 'Pe...


In [12]:
test_dataset = TextClassificationDataset(test_df['questions'], None, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [14]:
preds = []
model.eval()
with torch.no_grad():
    for (x, y) in tqdm(test_dataloader, desc='Processing BERT...'):
        for i in range(len(x)):
            x[i] = x[i].to(device)
        ans = best_model(*x)
        preds += ans.tolist()
preds = np.array(preds)
preds = np.argmax(preds, axis=-1)
preds

Processing BERT...: 100%|██████████| 106/106 [00:11<00:00,  9.59it/s]


array([0, 2, 2, 0, 3, 1, 3, 1, 0, 3, 1, 0, 0, 0, 0, 0, 1, 1, 0, 3, 3, 3,
       3, 1, 0, 3, 3, 3, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0,
       2, 3, 0, 3, 2, 3, 1, 0, 2, 0, 0, 1, 3, 3, 1, 0, 0, 0, 3, 3, 3, 3,
       3, 3, 0, 0, 0, 3, 0, 3, 3, 2, 0, 3, 1, 3, 2, 1, 1, 1, 3, 0, 0, 0,
       3, 1, 1, 2, 3, 1, 0, 0, 3, 0, 0, 0, 0, 3, 2, 1, 3, 0, 3, 2, 3, 3,
       3, 2, 0, 0, 1, 1, 1, 2, 1, 1, 3, 3, 0, 3, 2, 1, 0, 0, 0, 3, 3, 3,
       1, 3, 2, 3, 3, 3, 3, 0, 3, 3, 3, 0, 0, 0, 1, 3, 0, 1, 3, 3, 0, 3,
       3, 3, 3, 1, 3, 0, 2, 3, 3, 3, 1, 0, 3, 3, 0, 3, 3, 3, 1, 1, 3, 0,
       3, 0, 1, 3, 1, 3, 1, 0, 1, 3, 3, 1, 0, 3, 2, 3, 1, 1, 0, 0, 3, 3,
       0, 0, 3, 0, 3, 0, 0, 1, 0, 1, 3, 2, 1, 3, 0, 1, 0, 0, 3, 3, 3, 2,
       3, 3, 0, 3, 3, 3, 3, 0, 3, 0, 0, 3, 2, 3, 0, 1, 3, 3, 0, 3, 3, 3,
       3, 3, 0, 3, 3, 0, 3, 1, 1, 2, 3, 3, 0, 2, 3, 0, 1, 3, 1, 3, 3, 0,
       3, 0, 0, 1, 2, 0, 1, 0, 3, 0, 2, 0, 3, 1, 0, 2, 0, 0, 3, 3, 3, 0,
       0, 1, 3, 3, 3, 0, 3, 3, 3, 3, 0, 1, 1, 0, 2,

In [15]:
test_df['answer_'] = preds
test_df['answer'] = test_df['answer_'].apply(
    lambda x: classes[x] if x < len(classes) else 'sam bankman-fried'
)
test_df

Unnamed: 0,ID,questions,answer_,answer
0,0,"After the TechCrunch report on November 18, 20...",0,yes
1,1,Considering the information from an article by...,2,insufficient information
2,2,Considering the information from an article in...,2,insufficient information
3,3,Was Owen Teale's career impact discussed in Th...,0,yes
4,4,What company developed the world’s first succe...,3,sam bankman-fried
...,...,...,...,...
843,843,Who was the CEO of Alameda and former girlfrie...,3,sam bankman-fried
844,844,Does the article from The Verge suggest that G...,0,yes
845,845,Does the Polygon article suggest that 'The Pos...,1,no
846,846,Between the report by FOX News - Health on 'Pe...,3,sam bankman-fried


In [18]:
test_df[['ID', 'answer']].to_csv('data/submissions/submission.csv', index=False)