In [1]:
import re
import os
import tqdm

In [2]:
import pandas as pd
import torch
import numpy as np
import sklearn

In [3]:
import transformers

In [77]:
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Tokenizer for the 'albert-base-v2' checkpoint; bert_encode reads this module-level name.
tokenizer = transformers.AlbertTokenizer.from_pretrained('albert-base-v2')

In [6]:
class Model(torch.nn.Module):
    """Pretrained ALBERT encoder followed by a single-output linear head."""

    def __init__(self, pretrained="albert-base-v2", hidden_size=768):
        """Load the encoder and build the linear head.

        Args:
            pretrained: checkpoint name handed to ``AlbertModel.from_pretrained``.
            hidden_size: input width of the linear head.
        """
        super().__init__()
        self.base_model = transformers.AlbertModel.from_pretrained(pretrained)
        self.linear = torch.nn.Linear(hidden_size, 1)

    def forward(self, ids, masks):
        """Apply the linear head to element 1 of the encoder's output.

        Args:
            ids: token ids passed to the encoder.
            masks: attention mask passed to the encoder as ``attention_mask``.

        Returns:
            The output of the linear head (last dimension of size 1).
        """
        encoder_out = self.base_model(ids, attention_mask=masks)
        # presumably the pooled sentence representation — confirm against the
        # installed transformers version
        pooled = encoder_out[1]
        return self.linear(pooled)

In [7]:
# Read the train and test CSV files from the working directory.
# TODO: decide whether a preprocessed version of the training data should be used instead.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [8]:
def bert_encode(text, max_len=512):
    """Encode ``text`` as fixed-length token ids plus a padding mask.

    The text is tokenized with the module-level ``tokenizer``, truncated so it
    fits alongside the ``[CLS]`` and ``[SEP]`` markers, converted to ids, and
    right-padded with id ``0`` up to ``max_len``.

    Args:
        text: raw input string.
        max_len: total output length, including the two special tokens.

    Returns:
        ``(tokens, pad_masks)``: two lists of length ``max_len``; ``pad_masks``
        is 1 at real-token positions and 0 at padding positions.

    Raises:
        ValueError: if ``max_len`` is below 2. There would be no room for the
            two special tokens, and the negative slice/padding arithmetic would
            silently return sequences longer than ``max_len``.
    """
    if max_len < 2:
        raise ValueError(
            f"max_len must be at least 2 to hold [CLS] and [SEP], got {max_len}"
        )

    pieces = tokenizer.tokenize(text)[:max_len - 2]
    input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
    tokens = list(tokenizer.convert_tokens_to_ids(input_sequence))

    padding_len = max_len - len(input_sequence)
    tokens.extend([0] * padding_len)
    pad_masks = [1] * len(input_sequence) + [0] * padding_len

    return tokens, pad_masks

In [9]:
# Number of leading rows of df_train used for training; the remaining rows form the validation split.
N_TRAIN = 6000

In [10]:
# Positional split: the first N_TRAIN rows train the model, the rest validate it.
train_texts = df_train["text"].iloc[:N_TRAIN]
val_texts = df_train["text"].iloc[N_TRAIN:]
train_targets = df_train["target"].iloc[:N_TRAIN]
val_targets = df_train["target"].iloc[N_TRAIN:]

In [11]:
def build_tokens(texts):
    """Encode every text with ``bert_encode`` and stack the results.

    Args:
        texts: iterable of raw strings.

    Returns:
        ``(token_ids, pad_masks)`` as two NumPy arrays, one row per input text.
    """
    encoded = [bert_encode(text) for text in texts]
    token_rows = [ids for ids, _ in encoded]
    mask_rows = [mask for _, mask in encoded]
    return np.array(token_rows), np.array(mask_rows)

In [12]:
# Encode the training texts into token-id and padding-mask arrays.
train_tokens, train_pad_masks = build_tokens(train_texts)

In [71]:
class TrainSet(torch.utils.data.Dataset):
    """Labelled dataset yielding ``((tokens, masks), target)`` items.

    Args:
        tokens: indexable collection of token-id rows.
        pad_masks: indexable collection of padding-mask rows, aligned with ``tokens``.
        targets: labels aligned with ``tokens``; any array-like (list, NumPy
            array, pandas Series) is accepted.
    """

    def __init__(self, tokens, pad_masks, targets):
        super().__init__()
        self.tokens = tokens
        self.pad_masks = pad_masks
        # Keep targets as a plain array so __getitem__ indexes by position.
        # A pandas Series is looked up by *label*, so a slice whose index does
        # not start at 0 (e.g. rows N_TRAIN onwards) would raise KeyError.
        self.targets = np.asarray(targets)

    def __getitem__(self, index):
        """Return ``((tokens, masks), target)`` for the item at position ``index``."""
        tokens = self.tokens[index]
        masks = self.pad_masks[index]
        target = self.targets[index]

        return (tokens, masks), target

    def __len__(self):
        """Number of items, taken from ``tokens``."""
        return len(self.tokens)

class EvalSet(torch.utils.data.Dataset):
    """Label-free dataset yielding ``(tokens, masks)`` pairs."""

    def __init__(self, tokens, pad_masks):
        super().__init__()
        self.tokens, self.pad_masks = tokens, pad_masks

    def __len__(self):
        """Number of items, taken from ``tokens``."""
        return len(self.tokens)

    def __getitem__(self, index):
        """Return the token row and mask row stored at ``index``."""
        token_row = self.tokens[index]
        mask_row = self.pad_masks[index]
        return token_row, mask_row

In [72]:
# Bundle the encoded training texts with their labels.
train_dataset = TrainSet(train_tokens, train_pad_masks, train_targets)

In [16]:
# Mini-batch size used for training.
BATCH_SIZE = 4

# Shuffled loader over the training set.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

## train

In [17]:
# Instantiate the classifier with its default arguments.
model = Model()

In [18]:
# Total number of elements across parameters that have requires_grad set.
sum(param.numel() for param in model.parameters() if param.requires_grad)

11684353

In [19]:
# Loss applied to the model's raw single-value output (Model.forward applies no sigmoid).
criterion = torch.nn.BCEWithLogitsLoss()

In [20]:
# Move the model and the loss module to the selected device.
model.to(DEVICE)
criterion.to(DEVICE)

BCEWithLogitsLoss()

In [21]:
# Adam over every model parameter (encoder and head) with a learning rate of 1e-5.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [22]:
# Number of full passes over train_dataloader in the training loop.
EPOCHS = 2

In [23]:
# Train the model for EPOCHS passes over the training data.
model.train()
torch.cuda.empty_cache()

n_train_batches = len(train_dataloader)
for epoch in range(EPOCHS):
    for i, ((tokens, masks), target) in enumerate(train_dataloader):
        tokens = tokens.long().to(DEVICE)
        masks = masks.long().to(DEVICE)
        # The model emits one value per example in a trailing dimension of
        # size 1; give the labels the same trailing dimension for the loss.
        target = target[:, None].float().to(DEVICE)

        output = model(tokens, masks)
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report completed batches (i + 1) so the bar reaches 100% at epoch end
        # instead of stopping one batch short.
        print(f"\rEpoch: {epoch+1}/{EPOCHS}, "
              f"{(i + 1) / n_train_batches * 100:.1f}% "
              f"loss: {loss.item():.2f}", end='')
    print()

Epoch: 1/2, 99.9% loss: 0.35
Epoch: 2/2, 99.9% loss: 0.37


In [24]:
# Persist the trained weights (state_dict only) to albert.pt in the working directory.
torch.save(model.state_dict(), "albert.pt")

## eval

In [38]:
def accuracy(y_actual, output):
    """Fraction of entries where ``output > 0`` agrees with ``y_actual``.

    Args:
        y_actual: array of true labels.
        output: array of raw model outputs, thresholded at zero.

    Returns:
        Number of matching entries divided by ``y_actual.shape[0]``.
    """
    predicted = output > 0
    n_correct = np.sum(y_actual == predicted).astype('int')
    return n_correct / y_actual.shape[0]

In [66]:
def output_to_pred(output):
    """Threshold raw outputs at zero: True where ``output`` is strictly positive."""
    is_positive = output > 0
    return is_positive

In [73]:
# Encode the validation texts and wrap them in an unlabelled dataset.
val_tokens, val_pad_masks = build_tokens(val_texts)
val_dataset = EvalSet(val_tokens, val_pad_masks)

# Keep the original row order so outputs line up with val_targets.
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=3,
    shuffle=False,
)

In [60]:
# Overrides the earlier DEVICE setting, so every following cell runs on the CPU.
# NOTE(review): looks like a leftover from an interactive session — confirm
# whether evaluation should really run on the CPU even when CUDA is available.
DEVICE = torch.device("cpu")

In [79]:
# Move the model to the currently selected DEVICE before running inference.
model = model.to(DEVICE)

In [63]:
# Run the model over the validation split and collect its raw outputs.
model.eval()
torch.cuda.empty_cache()

val_chunks = []
n_val_batches = len(val_dataloader)
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    # val_dataloader wraps an EvalSet, which yields (tokens, masks) without a
    # target, so only those two values are unpacked here.
    for i, (tokens, masks) in enumerate(val_dataloader):
        output = model(tokens.long().to(DEVICE),
                       masks.long().to(DEVICE))
        # reshape(-1) always gives a 1-D array; squeeze() would return a 0-d
        # array for a final batch of size 1, which np.concatenate rejects.
        val_chunks.append(output.detach().cpu().numpy().reshape(-1))
        print('\r%0.2f%%' % ((i + 1) / n_val_batches * 100), end='')

# Concatenate once at the end; the empty float64 seed keeps the original result
# dtype and makes an empty dataloader yield an empty array.
outputs = np.concatenate([np.array([])] + val_chunks)

99.81%

In [67]:
# Boolean predictions for the validation split (True where the raw output is > 0).
preds = output_to_pred(outputs)

In [64]:
# Validation accuracy: val_targets compared against outputs thresholded at zero.
accuracy(val_targets.to_numpy().squeeze(), outputs)

0.8226906385616863

In [68]:
# `import sklearn` alone does not guarantee that the `metrics` submodule is
# loaded, so import it explicitly instead of relying on another package having
# imported it as a side effect.
import sklearn.metrics

# F1 score of the thresholded predictions on the validation split.
sklearn.metrics.f1_score(val_targets.to_numpy().squeeze(), preds)

0.811842105263158

In [76]:
# Encode the test texts and wrap them in an unlabelled dataset.
test_tokens, test_pad_masks = build_tokens(df_test.text)
test_dataset = EvalSet(test_tokens, test_pad_masks)

# Keep the original row order so predictions stay aligned with the test rows.
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=3,
    shuffle=False,
)

In [80]:
# Run the model over the test set and collect its raw outputs.
model.eval()
torch.cuda.empty_cache()

test_chunks = []
n_test_batches = len(test_dataloader)
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    for i, (tokens, masks) in enumerate(test_dataloader):
        output = model(tokens.long().to(DEVICE),
                       masks.long().to(DEVICE))
        # reshape(-1) always gives a 1-D array; squeeze() would return a 0-d
        # array for a final batch of size 1, which np.concatenate rejects.
        test_chunks.append(output.detach().cpu().numpy().reshape(-1))
        # Progress is measured against this loop's own counter and loader, not
        # a stale `i` or the validation loader.
        print('\r%0.2f%%' % ((i + 1) / n_test_batches * 100), end='')

# Concatenate once at the end; the empty float64 seed keeps the original result
# dtype and makes an empty dataloader yield an empty array.
outputs = np.concatenate([np.array([])] + test_chunks)

In [83]:
# Load the sample submission file; its "target" column is overwritten with model predictions.
df_submission = pd.read_csv("sample_submission.csv")

In [94]:
# Write the thresholded test outputs into the "target" column as integers.
# assumes outputs has one entry per submission row in the same order
# (test_dataloader uses shuffle=False) — TODO confirm row alignment with df_test
df_submission.loc[:, "target"] = output_to_pred(outputs).astype("int")

In [95]:
# Preview the first rows of the submission frame.
df_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [96]:
# Write the submission to CSV without the DataFrame index column.
df_submission.to_csv("albert1_submission.csv", index=False)