In [0]:
import tensorflow as tf

# Ask TensorFlow for the name of the first visible GPU and stop right away
# when the runtime does not expose the expected device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [0]:
import pandas as pd

In [0]:
# Install the BERT wrapper (pytorch-pretrained-bert) and pytorch-nlp into the runtime.
# NOTE(review): no versions are pinned, so a fresh run may pull different releases — consider pinning.
!pip install pytorch-pretrained-bert pytorch-nlp

In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [0]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices visible to PyTorch (0 on a CPU-only runtime).
n_gpu = torch.cuda.device_count()
# Cell output: the name of GPU 0. torch.cuda.get_device_name raises when no
# CUDA device exists, so only query it when CUDA is actually available —
# this keeps the cell consistent with the CPU fallback chosen for `device`.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu (no CUDA device available)"

In [0]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# The TSV files written later in this notebook are stored below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

## Load Dataset

In [0]:
# Tab-separated train / dev files, fetched from raw.githubusercontent.com at a fixed commit.
# NOTE(review): both URLs embed a `?token=` query parameter. Access tokens should not be
# hard-coded in a notebook; presumably these expire, which would break a fresh run — confirm
# and load them from the environment or a local copy of the data instead.
TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/f52d4a981c5bd45436951f4474759b684ff59fa7/data/dumb-train-task2-TC-with-spans.txt?token=AD7GEDPEQYQYNPM2FMX7AH26NNPCE'
DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/f52d4a981c5bd45436951f4474759b684ff59fa7/data/dev-task2-TC-with-spans-with-repetition.txt?token=AD7GEDJAL6XK2C6SXLIDCYK6NNPBG'

# When True, the model outputs for the train and dev spans are written to TSV files.
SAVE_LAYER_REP = True

In [0]:
# Load the headerless, tab-separated training file and name its five columns.
train_df = pd.read_csv(
    TRAIN_URL,
    sep='\t',
    header=None,
    names=["document_id", "label", 'span_start', 'span_end', "text"],
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are kept as plain text
)

In [0]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary first, then store each row's integer code
# in a new 'label_int' column.
LE = LabelEncoder()
LE.fit(train_df['label'])
train_df['label_int'] = LE.transform(train_df['label'])

In [0]:
# Shape of the training frame as (rows, columns).
train_df.shape

In [0]:
# Preview the first 10 training rows, including the new 'label_int' column.
train_df.head(10)

In [0]:
# Largest integer label id in the training data (LabelEncoder ids start at 0).
max(train_df["label_int"])

In [0]:
# Integer class ids, aligned row by row with the sentences built below
labels = train_df.label_int.values

# Wrap every span text in the [CLS] ... [SEP] markers that BERT expects
sentences = ["[CLS] " + text + " [SEP]" for text in train_df.text.values]

## Inputs

In [0]:
# Tokenizer matching the cased checkpoint; lower-casing is switched off explicitly
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

# Split every wrapped sentence into tokens
tokenized_texts = list(map(tokenizer.tokenize, sentences))
print("Tokenize the first sentence:")
print(tokenized_texts[0])

In [0]:
# Maximum number of token ids per sequence. It is passed as `maxlen` to pad_sequences,
# which pads shorter sequences and truncates longer ones to this length.
# NOTE(review): an earlier version of this comment said the longest training sequence is
# 47 tokens; nothing in this notebook checks that for this dataset — confirm before relying on it.
# In the original paper, the authors used a length of 512.
MAX_LEN = 128

In [0]:
# Look up the vocabulary index of every token, one id list per sentence
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

In [0]:
# Bring every id list to exactly MAX_LEN entries: padding is appended at the end,
# and sequences that are too long lose their tail.
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)

In [0]:
# Shape of the padded id matrix: (number of sequences, MAX_LEN).
input_ids.shape

In [0]:
# Raw span texts; looked up by index when the model outputs are written to disk
spans = train_df.text.tolist()
# One integer id per span (its position in `spans`), stored as a tensor so it can
# travel through the TensorDataset next to the inputs
span_ids = torch.tensor(list(range(len(spans))))

Create the attention masks

In [0]:
# Attention masks: 1.0 for every real token (id > 0), 0.0 for padding (id 0)
attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in input_ids
]

In [0]:
# Use train_test_split to split our data into train and validation sets for training

# train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
#                                                             random_state=2018, test_size=0.1)
# train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
#                                              random_state=2018, test_size=0.1)

In [0]:
# No validation split is held out: the full data set is used for training
train_inputs, train_labels, train_masks = input_ids, labels, attention_masks

In [0]:
# Convert all of our data into torch tensors, the required datatype for our model

# Wrap ids, labels and masks in torch tensors (in that order)
train_inputs, train_labels, train_masks = (
    torch.tensor(values)
    for values in (train_inputs, train_labels, train_masks)
)

In [0]:
# Mini-batch size; for fine-tuning BERT the authors recommend 16 or 32
batch_size = 16

# Bundle ids, masks, labels and span indices so every batch keeps them aligned,
# and draw the training examples in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels, span_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

## Train Model

In [0]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
# The size of that layer is taken from the fitted label encoder instead of a
# hard-coded 14, so it always matches the integer labels produced by `LE`.
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(LE.classes_))
# Move the model to the device selected earlier (model.cuda() would fail on a
# CPU-only runtime even though `device` falls back to the CPU).
model.to(device)

In [0]:
# Split the parameters into two groups: those that receive weight decay and
# those that do not (biases and LayerNorm parameters).
param_optimizer = list(model.named_parameters())
# 'gamma'/'beta' are the legacy LayerNorm parameter names; current
# pytorch_pretrained_bert models expose them as 'LayerNorm.weight' and
# 'LayerNorm.bias' (the latter is already matched by 'bias').
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
# BertAdam reads the per-group key 'weight_decay'. The former key
# 'weight_decay_rate' is not read by the optimizer, so both groups silently
# fell back to the optimizer-wide default decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [0]:
# Optimizer over the two parameter groups, holding the learning rate and warmup setting.
# NOTE(review): no `t_total` is passed here — confirm whether `warmup` has any effect without it.
optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=0.1)

In [0]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose argmax equals the label.

    preds:  2-D array of per-class scores, one row per example.
    labels: array of integer class ids; it is flattened before comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    gold = labels.flatten()
    n_correct = np.sum(predicted == gold)
    return n_correct / len(gold)

In [0]:
# Fine-tune the model for `n_epochs` epochs. When SAVE_LAYER_REP is set, the
# model outputs (pre-softmax) for every training span are written to OUT_FILE
# during the final epoch, one line per span:
#     <batch step> \t class \t <span text> \t <list of output values>
# NOTE: `spans` must still hold the training spans when this cell runs; a
# later cell rebinds that name to the dev spans.

# Number of training epochs (authors recommend between 2 and 4)
n_epochs = 4

# Per-batch training loss, kept for plotting
train_loss_set = []

if SAVE_LAYER_REP:
    ROUNDING_ACC = 9
    OUT_FILE = '/content/gdrive/My Drive/colab_projects/data/bertforseq-train.tsv'

# Open the dump file only when requested. The try/finally below guarantees the
# handle is closed (and its buffer flushed) even if training raises or is
# interrupted part-way through.
f = open(OUT_FILE, 'w', encoding='utf-8') if SAVE_LAYER_REP else None

try:
    # for epoch in trange(1, n_epochs + 1, desc="Epoch"):
    for epoch in range(1, n_epochs + 1):
        print('Epoch', epoch)

        # Set our model to training mode (as opposed to evaluation mode)
        model.train()

        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # Train the data for one epoch
        for step, batch in enumerate(train_dataloader):
            # Add batch to GPU
            batch = tuple(tensor.to(device) for tensor in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels, b_span_ids = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass (returns the loss because labels are supplied)
            loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            # Read the scalar once instead of calling .item() twice
            loss_value = loss.item()
            train_loss_set.append(loss_value)
            # Backward pass
            loss.backward()
            # Update parameters and take a step using the computed gradient
            optimizer.step()

            # Update tracking variables
            tr_loss += loss_value
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

            if f is not None and epoch == n_epochs:
                # Save predictions (pre-softmax). Run this extra forward pass in
                # evaluation mode and without autograd: dropout would otherwise
                # add noise to the saved values (the dev-set dump is produced in
                # eval mode), and no gradients are needed here.
                model.eval()
                with torch.no_grad():
                    layers = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                # Back to training mode for the next batch
                model.train()

                b_span_ids = b_span_ids.tolist()
                for entry in range(layers.size(0)):
                    predictions = layers[entry].detach().cpu().numpy()
                    values = [round(x, ROUNDING_ACC) for x in predictions]
                    f.write(str(step) + '\t' + 'class' + '\t')
                    f.write(spans[b_span_ids[entry]] + '\t')
                    f.write(str(values) + '\n')

        print("Train loss: {}".format(tr_loss/nb_tr_steps))
finally:
    if f is not None:
        f.close()
    
  # # Validation

  # # Put model in evaluation mode to evaluate loss on the validation set
  # model.eval()

  # # Tracking variables 
  # eval_loss, eval_accuracy = 0, 0
  # nb_eval_steps, nb_eval_examples = 0, 0

  # # Evaluate data for one epoch
  # for batch in validation_dataloader:
  #   # Add batch to GPU
  #   batch = tuple(t.to(device) for t in batch)
  #   # Unpack the inputs from our dataloader
  #   b_input_ids, b_input_mask, b_labels = batch
  #   # Telling the model not to compute or store gradients, saving memory and speeding up validation
  #   with torch.no_grad():
  #     # Forward pass, calculate logit predictions
  #     logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
  #   # Move logits and labels to CPU
  #   logits = logits.detach().cpu().numpy()
  #   label_ids = b_labels.to('cpu').numpy()

  #   tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
  #   eval_accuracy += tmp_eval_accuracy
  #   nb_eval_steps += 1

  # print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

In [0]:
# plt.figure(figsize=(15,8))
# plt.title("Training loss")
# plt.xlabel("Batch")
# plt.ylabel("Loss")
# plt.plot(train_loss_set)
# plt.show()

## Predictions

In [0]:
# Load columns 0, 2, 3 and 4 of the headerless dev file and name them
test_df = pd.read_csv(
    DEV_URL,
    sep='\t',
    header=None,
    usecols=[0, 2, 3, 4],
    names=["document_id", 'span_start', 'span_end', "text"],
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are kept as plain text
)
test_df.head()

In [0]:
# Wrap every dev span text in the [CLS] ... [SEP] markers that BERT expects
test_sentences = ["[CLS] " + text + " [SEP]" for text in test_df.text.values]

In [0]:
# Tokenize the dev sentences (this rebinds `tokenized_texts` from the training data)
tokenized_texts = list(map(tokenizer.tokenize, test_sentences))

In [0]:
# Map the tokens to vocabulary ids, then pad / truncate every sequence to MAX_LEN
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)

# Attention masks: 1.0 for every real token (id > 0), 0.0 for padding (id 0)
attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in input_ids
]

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)

# Dev span texts in file order; used when the dev outputs are written to disk
spans = test_df.text.tolist()

In [0]:
# Iterate over the dev inputs in their original order so that the outputs
# line up with the rows of the dev file.
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(
    prediction_data,
    batch_size=batch_size,
    sampler=prediction_sampler,
)

In [0]:
# Run the fine-tuned model over the dev set.

# Evaluation mode (as opposed to training mode)
model.eval()

# One array of logits per batch, in dataloader order
preds = []

for batch in prediction_dataloader:
    # Move both tensors of the batch to the target device, then unpack them
    batch = tuple(tensor.to(device) for tensor in batch)
    b_input_ids, b_input_mask = batch

    # No gradients are needed for prediction
    with torch.no_grad():
        logits = model(b_input_ids, attention_mask=b_input_mask, token_type_ids=None)

    # Bring the logits back to the CPU as a numpy array and keep them
    logits = logits.detach().cpu().numpy()
    preds.append(logits)

In [0]:
# Flatten the per-batch logit arrays into one list with one row per example
predictions = []
for batch_logits in preds:
    predictions.extend(batch_logits)

# Index of the highest logit for every example
flat_predictions = np.argmax(predictions, axis=1).flatten()
flat_predictions[:50]

In [0]:
# Write the dev-set model outputs, one line per span:
#     1 \t class \t <span text> \t <list of output values>
if SAVE_LAYER_REP:
    ROUNDING_ACC = 9
    OUT_FILE = '/content/gdrive/My Drive/colab_projects/data/bertforseq-dev.tsv'
    with open(OUT_FILE, 'w', encoding='utf-8') as f:
        for span, pred in zip(spans, predictions):
            values = [round(x, ROUNDING_ACC) for x in pred]
            f.write('1\tclass\t' + span + '\t' + str(values) + '\n')

In [0]:
# Reload the dev file, this time with all six columns
test_df = pd.read_csv(
    DEV_URL,
    sep='\t',
    header=None,
    names=["document_id", "label", "from_idx", "to_idx", "text", "rep"],
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are kept as plain text
)
# Turn the predicted class indices back into label strings and overwrite the label column
predicted_labels = LE.inverse_transform(flat_predictions)
test_df["label"] = predicted_labels
test_df.head(50)

In [0]:
# Remove the columns that are not written to the output file. Rebinding through
# drop(..., errors="ignore") instead of `del` keeps this cell re-runnable: a second
# execution no longer raises KeyError for columns that were already removed.
test_df = test_df.drop(columns=["text", "rep"], errors="ignore")
# Write the remaining columns as a headerless, index-free TSV file
test_df.to_csv("dev-task-TC.txt", sep='\t', header=False, index=False)