In [0]:
# Colab-only: mount Google Drive at /content/gdrive.
# NOTE(review): the output paths in the config cell start with the relative
# prefix 'gdrive/My Drive/...', so they presumably rely on the working
# directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
!pip install pytorch-pretrained-bert pytorch-nlp

In [0]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
import pandas as pd
import io
import numpy as np
from sklearn.preprocessing import LabelEncoder
import time

In [0]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Show which accelerator is in use. get_device_name() raises when no CUDA
# device is present, so only query it on a GPU runtime.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'

# Config

In [0]:
# --- Input data -------------------------------------------------------------
# Tab-separated files with the columns read by get_data().
# NOTE(review): both URLs carry a `token=` query parameter -- confirm these
# are not live credentials and that they are still valid.
TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/f52d4a981c5bd45436951f4474759b684ff59fa7/data/dumb-train-task2-TC-with-spans.txt?token=AD7GEDPEQYQYNPM2FMX7AH26NNPCE'
TEST_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/f52d4a981c5bd45436951f4474759b684ff59fa7/data/dev-task2-TC-with-spans-with-repetition.txt?token=AD7GEDJAL6XK2C6SXLIDCYK6NNPBG'

# --- Model / training hyperparameters ---------------------------------------
MAX_LEN = 256          # length every token-id sequence is padded/truncated to
BATCH_SIZE = 12
LEARNING_RATE = 2e-5
WARMUP = .1            # passed to the optimizer as `warmup`
N_EPOCHS = 4  # 2-4 recommended
BERT_MODEL = 'bert-base-cased'

# --- Saving the per-span logits ---------------------------------------------
SAVE_LAYER_REP = True  # write the logits of each span to the BERT_FILE_* files
ROUNDING_ACC = 9       # decimal places kept when the logits are written out

# --- Derived settings -------------------------------------------------------
UNCASED = 'uncased' in BERT_MODEL
FILE_PREFIX = 'gdrive/My Drive/colab_projects/'
# One timestamp per run keeps all output files of a run together.
now = time.strftime("%Y%m%d-%H%M%S", time.localtime())
PREDICTIONS_FILE = f'{FILE_PREFIX}semeval-predictions/labels_bert_{now}.txt'
LOG_FILE = f'{FILE_PREFIX}semeval-predictions/log_bert_{now}.txt'
BERT_FILE_TRAIN = f'{FILE_PREFIX}data/tc_train_{now}.tsv'
BERT_FILE_TEST = f'{FILE_PREFIX}data/tc_test_{now}.tsv'

# Data

In [0]:
def get_data(url, training=True):
    """Load a tab-separated span file and wrap it in a BERT-ready DataLoader.

    The file is read without a header; its first five columns are taken as
    document_id, label, span_start, span_end and text.

    Args:
        url: Path or URL of the TSV file (anything pandas.read_csv accepts).
        training: If True, the label column is encoded with a freshly fitted
            LabelEncoder and the batches are drawn in random order. If
            False, labels are ignored and the batches keep the file order.

    Returns:
        A tuple (df, label_encoder, dataloader, spans):
        - df: the DataFrame that was read.
        - label_encoder: the fitted LabelEncoder (None if not training).
        - dataloader: yields (input_ids, attention_mask, labels, span_ids)
          when training and (input_ids, attention_mask) otherwise.
        - spans: list of the raw span texts in file order; the span_ids in
          the training batches index into this list.
    """
    df = pd.read_csv(url, sep='\t', quoting=3, header=None,
                     usecols=[0, 1, 2, 3, 4],
                     names=['document_id', 'label', 'span_start', 'span_end',
                            'text'])
    spans = df.text.tolist()

    labels = None
    label_encoder = None
    if training:
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(df['label'])

    # Lower-casing must only be applied for the uncased checkpoints; a cased
    # model has to see the original casing.
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL,
                                              do_lower_case=UNCASED)

    # Truncate the word pieces *before* adding the special tokens so that
    # over-long spans still end in [SEP] instead of having it cut off.
    max_content_len = MAX_LEN - 2
    tokenized_texts = [
        ['[CLS]'] + tokenizer.tokenize(span)[:max_content_len] + ['[SEP]']
        for span in spans
    ]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                              truncating="post", padding="post")
    input_ids = torch.tensor(input_ids)

    # 1.0 for real tokens, 0.0 for padding (padding positions hold id 0).
    attention_masks = (input_ids > 0).float()

    if training:
        # The span ids let the caller map shuffled batch items back to their
        # position in `spans`.
        span_ids = list(range(len(spans)))
        data = TensorDataset(input_ids,
                             attention_masks,
                             torch.tensor(labels),
                             torch.tensor(span_ids))
        sampler = RandomSampler(data)
    else:
        data = TensorDataset(input_ids, attention_masks)
        sampler = SequentialSampler(data)

    dataloader = DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)

    return df, label_encoder, dataloader, spans

In [0]:
# Training split: keep the fitted label encoder (needed later to map the
# predicted class indices back to label strings), the shuffled dataloader and
# the raw span texts.
_, label_encoder, train_dataloader, spans_train = get_data(TRAIN_URL)
# Test split: no labels are encoded; the dataloader keeps the file order.
test_df, _, test_dataloader, spans_test = get_data(TEST_URL, training=False)

# Training the model

In [0]:
# Size the classification head from the labels actually seen in the training
# data so that it always matches the label encoder used to decode predictions.
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=len(label_encoder.classes_))
# Move the model to the device selected earlier (GPU if available, else CPU).
model.to(device)

In [0]:
param_optimizer = list(model.named_parameters())
# Biases and LayerNorm parameters are excluded from weight decay.
# 'gamma'/'beta' are kept for checkpoints/library versions that still use the
# old LayerNorm parameter names; newer ones call them LayerNorm.weight/.bias.
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
# BertAdam reads the per-group key 'weight_decay'; with any other key name the
# group silently falls back to the optimizer's default decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# Total number of optimizer steps the training loop will take. BertAdam only
# applies its warmup schedule when t_total is given.
num_train_steps = len(train_dataloader) * N_EPOCHS

# This variable contains all of the hyperparameter information our training loop needs
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=WARMUP,
                     t_total=num_train_steps)

In [0]:
# (span_id, span_text, stringified logits) for every training span, collected
# during the last epoch when SAVE_LAYER_REP is set.
entries = []
train_loss_steps = []   # loss of every optimisation step
train_loss_epochs = []  # mean step loss of every epoch

for epoch in range(1, N_EPOCHS + 1):
    print('Epoch', epoch)
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)  # Move the batch to the device

        b_input_ids, b_input_mask, b_labels, b_span_ids = batch

        optimizer.zero_grad()  # Clear out the gradients (by default they accumulate)
        # Forward pass; with labels given the model returns the loss
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        loss.backward()  # Backward pass
        optimizer.step()  # Update parameters and take a step using the computed gradient

        # Update tracking variables (read the loss value only once)
        step_loss = loss.item()
        train_loss_steps.append(step_loss)
        tr_loss += step_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        if SAVE_LAYER_REP and epoch == N_EPOCHS:
            # Save predictions (pre-softmax) for this batch, taken from the
            # model state right after the update above. Run in eval mode and
            # without autograd so the stored logits are not perturbed by
            # dropout (matching how the test logits are produced) and no
            # unused graph is built.
            model.eval()
            with torch.no_grad():
                layers = model(b_input_ids, token_type_ids=None,
                               attention_mask=b_input_mask)
            model.train()

            batch_span_ids = b_span_ids.tolist()
            batch_logits = layers.cpu().numpy()
            for span_id, predictions in zip(batch_span_ids, batch_logits):
                values = [round(x, ROUNDING_ACC) for x in predictions]
                entries.append((span_id,
                                spans_train[span_id],
                                str(values)))

    # Guard against an empty dataloader instead of dividing by zero.
    epoch_loss = tr_loss / nb_tr_steps if nb_tr_steps else float('nan')
    print("Train loss: {}".format(epoch_loss))
    train_loss_epochs.append(epoch_loss)

In [0]:
# Write the training-span logits collected in the last epoch, restored to the
# original span order (the batches were shuffled, so sort by span id).
if SAVE_LAYER_REP:
    entries.sort(key=lambda item: item[0])
    with open(BERT_FILE_TRAIN, 'w', encoding='utf-8') as f:
        f.writelines('1\tclass\t' + span_text + '\t' + logits_str + '\n'
                     for _, span_text, logits_str in entries)

# Record the hyperparameters and the per-epoch training loss of this run.
log_lines = [
    'MAX_LEN: ' + str(MAX_LEN),
    'BATCH_SIZE: ' + str(BATCH_SIZE),
    'LEARNING_RATE: ' + str(LEARNING_RATE),
    'WARMUP: ' + str(WARMUP),
    'N_EPOCHS: ' + str(N_EPOCHS),
    'BERT_MODEL: ' + BERT_MODEL,
    'TRAIN LOSS BY EPOCH: ' + str(train_loss_epochs),
]
with open(LOG_FILE, 'w', encoding='utf-8') as f:
    # Newline between entries, none after the last one.
    f.write('\n'.join(log_lines))

# Predictions

In [0]:
# Switch to evaluation mode for inference.
model.eval()

# One array of logits per test batch, in dataloader order.
preds = []

# No gradients are needed for prediction; disabling them saves memory and time.
with torch.no_grad():
    for batch in test_dataloader:
        # Move the batch tensors to the device
        b_input_ids, b_input_mask = (t.to(device) for t in batch)

        # Forward pass without labels returns the logits
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)

        # Keep the logits on the CPU as a numpy array
        logits = logits.cpu().numpy()
        preds.append(logits)

In [0]:
# Un-batch the logits: one row of class scores per test span.
predictions = []
for batch_logits in preds:
    predictions.extend(batch_logits)

# Index of the highest-scoring class for every span.
flat_predictions = np.argmax(predictions, axis=1).flatten()

if SAVE_LAYER_REP:
    # Same line format as the training representation file.
    with open(BERT_FILE_TEST, 'w', encoding='utf-8') as f:
        for pred, span in zip(predictions, spans_test):
            values = [round(x, ROUNDING_ACC) for x in pred]
            f.write('1\tclass\t' + span + '\t' + str(values) + '\n')

# Map the class indices back to label strings and write the prediction file
# (the frame without its text column, no header, no index).
predicted_labels = label_encoder.inverse_transform(flat_predictions)
test_df['label'] = predicted_labels
del test_df['text']
test_df.to_csv(PREDICTIONS_FILE, header=False, index=False, sep='\t')