In [1]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [0]:
import pandas as pd

In [3]:
!pip install pytorch-pretrained-bert pytorch-nlp

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 3.4MB/s 
[?25hCollecting pytorch-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/4f/51/f0ee1efb75f7cc2e3065c5da1363d6be2eec79691b2821594f3f2329528c/pytorch_nlp-0.5.0-py3-none-any.whl (90kB)
[K     |████████████████████████████████| 92kB 7.1MB/s 
Installing collected packages: pytorch-pretrained-bert, pytorch-nlp
Successfully installed pytorch-nlp-0.5.0 pytorch-pretrained-bert-0.6.2


In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

Using TensorFlow backend.


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla K80'

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


## Load Dataset

In [0]:
TRAIN_URL = "https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-task2-TC-with-spans.txt?token=AF75TYYAVJ7QAUVYP67GZ7S6EZDRW"
DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-task2-TC-with-spans.txt?token=AF75TY7AZTCVWJ4YHDZFW7S6EZDVO'

In [0]:
train_df = pd.read_csv(TRAIN_URL, sep='\t', quoting=3, usecols=[0, 1, 4], header=None, names=["document_id", "label", "text"])

In [0]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
train_df['label_int'] = LE.fit_transform(train_df['label'])

In [10]:
train_df.shape

(6129, 4)

In [11]:
train_df.head(10)

Unnamed: 0,document_id,label,text,label_int
0,111111111,Appeal_to_Authority,The next transmission could be more pronounced...,0
1,111111111,Appeal_to_Authority,when (the plague) comes again it starts from m...,0
2,111111111,Doubt,appeared,5
3,111111111,Repetition,"a very, very different",10
4,111111111,Appeal_to_fear-prejudice,He also pointed to the presence of the pneumon...,1
5,111111111,Appeal_to_fear-prejudice,but warned that the danger was not over,1
6,111111111,Appeal_to_fear-prejudice,the magnitude in the next transmission could b...,1
7,111111111,Appeal_to_fear-prejudice,it could even spill over into neighbouring cou...,1
8,111111112,Slogans,Stop Islamization of America,11
9,111111112,Black-and-White_Fallacy,We condemn all those whose behaviours and view...,3


In [12]:
max(train_df["label_int"])

13

In [0]:
# Create sentence and label lists
sentences = train_df.text.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = train_df.label_int.values

## Inputs

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

100%|██████████| 213450/213450 [00:00<00:00, 2464995.91B/s]


Tokenize the first sentence:
['[CLS]', 'The', 'next', 'transmission', 'could', 'be', 'more', 'pronounced', 'or', 'stronger', '[SEP]']


In [0]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = 128

In [0]:
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [0]:
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [18]:
input_ids.shape

(6129, 128)

Create the attention masks

In [0]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [0]:
# Use train_test_split to split our data into train and validation sets for training

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [0]:
train_inputs = input_ids
train_labels = labels
train_masks = attention_masks

In [0]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
# validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
# validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
# validation_masks = torch.tensor(validation_masks)

In [0]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# validation_sampler = SequentialSampler(validation_data)
# validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

## Train Model

In [24]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 

model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=14)
model.cuda()

100%|██████████| 404400730/404400730 [00:08<00:00, 50174693.55B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [0]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [26]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

t_total value of -1 results in schedule not being applied


In [0]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [28]:
t = [] 

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # # Validation

  # # Put model in evaluation mode to evaluate loss on the validation set
  # model.eval()

  # # Tracking variables 
  # eval_loss, eval_accuracy = 0, 0
  # nb_eval_steps, nb_eval_examples = 0, 0

  # # Evaluate data for one epoch
  # for batch in validation_dataloader:
  #   # Add batch to GPU
  #   batch = tuple(t.to(device) for t in batch)
  #   # Unpack the inputs from our dataloader
  #   b_input_ids, b_input_mask, b_labels = batch
  #   # Telling the model not to compute or store gradients, saving memory and speeding up validation
  #   with torch.no_grad():
  #     # Forward pass, calculate logit predictions
  #     logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
  #   # Move logits and labels to CPU
  #   logits = logits.detach().cpu().numpy()
  #   label_ids = b_labels.to('cpu').numpy()

  #   tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
  #   eval_accuracy += tmp_eval_accuracy
  #   nb_eval_steps += 1

  # print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:  25%|██▌       | 1/4 [04:38<13:54, 278.14s/it]

Train loss: 1.506809282116592


Epoch:  50%|█████     | 2/4 [09:15<09:16, 278.02s/it]

Train loss: 0.9378166234431168


Epoch:  75%|███████▌  | 3/4 [13:54<04:38, 278.11s/it]

Train loss: 0.5815918151444445


Epoch: 100%|██████████| 4/4 [18:32<00:00, 278.30s/it]

Train loss: 0.36470406925460946





In [0]:
# plt.figure(figsize=(15,8))
# plt.title("Training loss")
# plt.xlabel("Batch")
# plt.ylabel("Loss")
# plt.plot(train_loss_set)
# plt.show()

## Predictions

In [0]:
test_df = pd.read_csv(DEV_URL, sep='\t', quoting=3, usecols=[0, 1, 4], header=None, names=["document_id", "label", "text"])

In [31]:
test_df.head()

Unnamed: 0,document_id,label,text
0,730093263,?,white
1,730093263,?,black
2,730093263,?,“true American heroes.”
3,730093263,?,black
4,730093263,?,"If these two men had survived, and Quentin Lam..."


In [0]:
# Create sentence and label lists
test_sentences = test_df.text.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
test_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in test_sentences]

In [0]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in test_sentences]

In [0]:
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []

In [0]:
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)

In [0]:
batch_size = 16  


prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [0]:
# Prediction on test set

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions = []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  
  # Store predictions and true labels
  predictions.append(logits)

In [0]:
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

In [39]:
flat_predictions[:50]

array([10, 10,  7, 10, 13,  8,  9,  8,  6,  1,  8,  6,  8, 10,  8,  8,  1,
        1,  1,  5,  6,  1,  8,  1,  1,  1,  8,  8,  8,  8,  8,  8,  8,  6,
        6,  8,  6,  8,  9,  9,  8,  4,  8,  8,  9,  8,  6,  4,  2,  1])

In [0]:
test_df = pd.read_csv(DEV_URL, sep='\t', quoting=3, header=None, names=["document_id", "label", "from_idx", "to_idx", "text"])

In [0]:
predicted_labels = LE.inverse_transform(flat_predictions)

In [0]:
test_df["label"] = predicted_labels

In [43]:
test_df.head(50)

Unnamed: 0,document_id,label,from_idx,to_idx,text
0,730093263,Repetition,123,128,white
1,730093263,Repetition,352,357,black
2,730093263,Flag-Waving,1370,1393,“true American heroes.”
3,730093263,Repetition,2434,2439,black
4,730093263,"Whataboutism,Straw_Men,Red_Herring",2699,2807,"If these two men had survived, and Quentin Lam..."
5,730093263,Loaded_Language,2458,2487,"""Black Murders Of White Cops"""
6,730246508,"Name_Calling,Labeling",1654,1676,"""true American heroes"""
7,730246508,Loaded_Language,4557,4573,terrible tragedy
8,730246508,"Exaggeration,Minimisation",4406,4412,finest
9,730246508,Appeal_to_fear-prejudice,3840,3948,And quite frankly there’s a special place in h...


In [0]:
del test_df["text"]

In [0]:
test_df.to_csv("dev-task-TC.out", sep='\t', header=False, index=False)