### Load libraries

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
!pip install transformers
from transformers import BertTokenizer, BertModel, AdamW



### Handle device (CPU or GPU)

In [None]:
# Single seed shared by every random number generator the notebook relies on.
SEED = 1

# NumPy's global generator is seeded too, so that code drawing from it
# (e.g. a data split called without an explicit random_state) is repeatable.
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seed every visible CUDA device, not only the current one (no-op without CUDA).
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and disable its autotuner, which may
# otherwise pick different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Run on the first GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print("Use : ", device)


Use :  cuda:0


### Define all the "utils" functions

In [None]:
# We define our training loop
def train(model, train_iter, validation_iter, number_epoch):
    """
    Fit `model` for `number_epoch` passes over `train_iter`.

    After each pass the model is scored on `validation_iter` through `eval`
    and one summary line (loss, MSE and RMSE for both splits) is printed.

    The function reads the notebook-level names `optimizer`, `loss_fn` and
    `device`; they must exist before it is called.
    """
    print("Training model.")

    for epoch in range(1, number_epoch + 1):
        model.train()
        running_loss = 0
        running_sse = 0
        seen = 0  # training examples processed so far in this epoch

        for batch in train_iter:
            # send every tensor of the batch to the active device and unpack it
            input_ids, input_masks, targets = (t.to(device) for t in batch)

            # forward pass, then squeeze dimension 1 of the model output
            predictions = model(input_ids, input_masks).squeeze(1)

            optimizer.zero_grad()
            loss = loss_fn(predictions, targets)

            # sum of squared errors for this batch, computed on NumPy copies
            batch_sse = model_performance(
                predictions.detach().cpu().numpy(),
                targets.detach().cpu().numpy(),
            )[0]

            # backward pass with the gradient norm clipped to 1.0, then update
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # accumulate the epoch statistics, weighting the loss by batch size
            batch_size = targets.size(0)
            seen += batch_size
            running_loss += loss.item() * batch_size
            running_sse += batch_sse

        valid_loss, valid_mse = eval(validation_iter, model)[:2]

        train_loss = running_loss / seen
        train_mse = running_sse / seen
        train_part = f'| Epoch: {epoch:02} | Train Loss: {train_loss:.2f} | Train MSE: {train_mse:.2f} | Train RMSE: {train_mse**0.5:.2f} | '
        valid_part = f'        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.2f} |'
        print(train_part + valid_part)

In [None]:
# We evaluate performance on our dev set
def eval(data_iter, model):
    """
    Score `model` on every batch of `data_iter` with gradients disabled.

    Returns a 4-tuple:
      - the batch losses averaged with batch-size weights,
      - the summed squared error divided by the number of examples (MSE),
      - a NumPy array with all predictions,
      - a NumPy array with all targets.

    Reads the notebook-level names `loss_fn` and `device`.
    """
    model.eval()
    total_loss = 0
    total_sse = 0
    seen = 0
    collected_preds, collected_targets = [], []

    with torch.no_grad():
        for batch in data_iter:
            # send every tensor of the batch to the active device and unpack it
            input_ids, input_masks, targets = (t.to(device) for t in batch)

            # forward pass, then squeeze dimension 1 of the model output
            predictions = model(input_ids, input_masks).squeeze(1)
            loss = loss_fn(predictions, targets)

            # NumPy copies are used both for the metric and for the returned arrays
            pred_np = predictions.detach().cpu().numpy()
            trg_np = targets.detach().cpu().numpy()
            batch_sse = model_performance(pred_np, trg_np)[0]

            batch_size = targets.shape[0]
            seen += batch_size
            total_loss += loss.item() * batch_size
            total_sse += batch_sse
            collected_preds.extend(pred_np)
            collected_targets.extend(trg_np)

    return (
        total_loss / seen,
        total_sse / seen,
        np.array(collected_preds),
        np.array(collected_targets),
    )

In [None]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Compute squared-error metrics between predictions and targets.

    Args:
        output: predicted values; a NumPy array or any array-like accepted
            by `np.asarray` (e.g. a plain list).
        target: reference values, broadcast-compatible with `output`.
        print_output: when True, print the MSE and the RMSE.

    Returns:
        (sse, mse): the sum and the mean of the element-wise squared errors.
    """
    # np.asarray lets plain Python sequences through; existing arrays are
    # passed along without being copied.
    # NOTE(review): the subtraction broadcasts, so an (N, 1) output against an
    # (N,) target yields an (N, N) error matrix - callers should pass matching shapes.
    sq_error = (np.asarray(output) - np.asarray(target)) ** 2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)

    if print_output:
        # The RMSE is only needed for display, so it is computed here.
        print(f'| MSE: {mse:.2f} | RMSE: {np.sqrt(mse):.2f} |')

    return sse, mse

### Load and have a look at our data

In [None]:
# Import the data: read the training CSV from the current working directory
train_df = pd.read_csv('train.csv')

# Show the first rows as the cell output
train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0
3,76,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4
4,6164,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0


In [None]:
train_df["original"][0]

'France is ‘ hunting down its citizens who joined <Isis/> ’ without trial in Iraq'

### Preprocess the data
We create functions because these transformations will also be applied to the validation set

In [None]:
import re
def pre_process_headlines(input_df):
    """
    Build the edited headline and strip the edit tag from the original one.

    The word to substitute is marked inside "original" as `<word/>`.

    Args:
        input_df: DataFrame with string columns "original" and "edit".
            It is modified in place.

    Returns:
        The same DataFrame object, with a new "new" column (the headline with
        the tagged span replaced by the edit word) and with "original"
        rewritten so that `<word/>` becomes `word`.
    """
    tag_pattern = r"<(.+)/>"

    # A callable replacement inserts the edit word verbatim. Passing it as a
    # plain string would make `re` parse it as a replacement template, so an
    # edit containing a backslash would raise or be altered.
    input_df["new"] = input_df.apply(
        lambda row: re.sub(tag_pattern, lambda _match: row["edit"], row["original"]),
        axis=1,
    )

    # regex=True is explicit because the pandas default changed to literal
    # matching in 2.0, which would leave the tags untouched. The callable
    # keeps only the tagged word (capture group 1).
    input_df["original"] = input_df["original"].str.replace(
        tag_pattern, lambda match: match.group(1), regex=True
    )
    return input_df

# Add the "new" (edited) headline column and remove the tags from "original"
train_df = pre_process_headlines(train_df)

In [None]:
print(train_df["original"][0])
print(train_df["new"][0])

France is ‘ hunting down its citizens who joined Isis ’ without trial in Iraq
France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq


In [None]:
train_df["new"][0]

'France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq'

In [None]:
train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,new
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,France is ‘ hunting down its citizens who join...
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,"Pentagon claims 2,000 % increase in Russian tr..."
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,Iceland PM Calls Snap Vote as Pedophile Furor ...
3,76,"In an apparent first , Iran and Israel engage ...",slap,20000,0.4,"In an apparent first , Iran and Israel slap ea..."
4,6164,Trump was told weeks ago that Flynn misled Vic...,school,0,0.0,Trump was told weeks ago that Flynn misled sch...


### Prepare the data for BERT

In [None]:
# max len in our dataset is 35
# NOTE(review): 42 presumably leaves some headroom above that figure - TODO confirm
# the 35-token maximum also holds for the dev headlines.
MAX_LEN = 42
# Pretrained uncased BERT vocabulary; input text is lower-cased before tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

def tokenize_from_dataframe(input_df, tokenizer, max_len=42):
    """
    Tokenize the edited headlines of a DataFrame and build their attention masks.

    Args:
        input_df: DataFrame holding the headlines in its "new" column.
        tokenizer: object exposing an `encode` method with the Hugging Face
            keyword arguments used below.
        max_len: length every encoded headline is padded or truncated to.

    Returns:
        (input_ids, attention_masks): two lists with one entry per headline;
        each entry is a list of `max_len` token ids, respectively of
        `max_len` floats (1.0 for a real token, 0.0 for padding).
    """
    # Read from the frame that was passed in, not from a notebook-level global,
    # so the function also works for the dev / test sets.
    headlines = input_df["new"].values

    # truncation=True guarantees rows of identical length even when a headline
    # is longer than max_len; ragged rows cannot be turned into a tensor.
    input_ids = [
        tokenizer.encode(
            headline,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
        )
        for headline in headlines
    ]

    # Mask of 1 for all input tokens and 0 for all padding tokens.
    # Assumes the padding id is 0 and no real token uses id 0 - verify this
    # when switching to another tokenizer.
    attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

    return input_ids, attention_masks

# Regression targets: the "meanGrade" column as a NumPy array
labels = train_df["meanGrade"].values
# Token ids and attention masks for the headlines, padded to MAX_LEN
input_ids, attention_masks = tokenize_from_dataframe(train_df, tokenizer, MAX_LEN)

In [None]:
#input_ids[0], input_ids[0][0]

In [None]:
labels, labels.dtype

(array([0.2, 1.6, 1. , ..., 0.6, 1.4, 0.4]), dtype('float64'))

### Create our dataloaders

In [None]:
validation_prop = 0.2  # fraction of the examples held out for validation
BATCH_SIZE = 32

# split - ids, labels and masks are split together so their rows stay aligned.
# random_state pins the shuffle so the same validation set is drawn on every run.
(
    train_inputs,
    validation_inputs,
    train_labels,
    validation_labels,
    train_masks,
    validation_masks,
) = train_test_split(
    input_ids,
    labels,
    attention_masks,
    test_size=validation_prop,
    random_state=SEED,
)

# Token ids keep the integer dtype inferred by torch.tensor; labels and masks
# are stored as float32.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels, dtype=torch.float32)
validation_labels = torch.tensor(validation_labels, dtype=torch.float32)
train_masks = torch.tensor(train_masks, dtype=torch.float32)
validation_masks = torch.tensor(validation_masks, dtype=torch.float32)

# Each dataset item is (token ids, attention mask, label) - the order the
# training and evaluation loops unpack.
train_data = torch.utils.data.TensorDataset(train_inputs, train_masks, train_labels)
validation_data = torch.utils.data.TensorDataset(validation_inputs, validation_masks, validation_labels)

# DATA LOADERS - only the training batches are shuffled
train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
validation_loader = torch.utils.data.DataLoader(validation_data, shuffle=False, batch_size=BATCH_SIZE)

print("Dataloaders created.")

Dataloaders created.


### Declare our model

In [None]:
class FunninessRegressor(nn.Module):
    """
    Regression head on top of a pretrained BERT encoder.

    The second element of the encoder output goes through dropout and a single
    linear unit; the resulting score is clamped to the range [0, 3].
    """

    def __init__(self):
        super().__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(768, 1)

    def forward(self, x, att, pos=1):
        """
        x: token ids passed to the encoder
        att: attention mask
        pos: kept in the signature for existing callers; not read by this method
        """
        # inspired by https://huggingface.co/transformers/_modules/transformers/models/bert/modeling_bert.html#BertForSequenceClassification
        encoder_outputs = self.bert(x, attention_mask=att)
        # index 1 directly gives the vector corresponding to [CLS]
        cls_vector = self.dropout(encoder_outputs[1])
        score = self.fc1(cls_vector)
        # NOTE(review): clamping blocks gradients for scores outside [0, 3]
        return score.clamp(min=0.0, max=3.0)

In [None]:
## Approach 1 code, using functions defined above:

# Training hyper-parameters
num_epochs = 2        # passes over the training loader
learning_rate = 2e-5  # step size handed to the optimizer
adam_eps = 1e-8       # epsilon handed to the optimizer

# Build the regressor, then move it to the selected device
model = FunninessRegressor()
print("Model initialised.")
model.to(device)

# Switch to training mode and create the optimizer - special optimizer for BERT
model.train()
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=adam_eps,
    correct_bias=False,
)

# Mean-squared-error objective, moved to the same device as the model
loss_fn = nn.MSELoss().to(device)

Model initialised.


In [None]:
# Fine-tune for num_epochs epochs; one metrics line is printed per epoch
train(model, train_loader, validation_loader, num_epochs)

Training model.
| Epoch: 01 | Train Loss: 0.41 | Train MSE: 0.41 | Train RMSE: 0.64 |         Val. Loss: 0.33 | Val. MSE: 0.33 |  Val. RMSE: 0.58 |
| Epoch: 02 | Train Loss: 0.38 | Train MSE: 0.38 | Train RMSE: 0.61 |         Val. Loss: 0.33 | Val. MSE: 0.33 |  Val. RMSE: 0.58 |


In [None]:
# Switch the model to evaluation mode before the prediction cells below
model.eval()
print("To eval mode.")

To eval mode.


### Look at a few results on the train_df

In [None]:
# get the data
train_df = pd.read_csv('train.csv')
# transforms the headlines
train_df = pre_process_headlines(train_df)
train_input_ids, train_attention_masks = tokenize_from_dataframe(train_df, tokenizer, MAX_LEN)

# maybe we should make sure that max_len was enough

# convert to tensor
# Dedicated names are used from here on: reusing train_inputs / train_masks /
# train_data / train_loader would overwrite the labelled, shuffled training
# split, and train() would then fail if it were run again.
train_all_inputs = torch.tensor(train_input_ids)
train_all_masks = torch.tensor(train_attention_masks, dtype=torch.float32)

# create an unshuffled loader so predictions stay aligned with train_df rows
train_all_data = torch.utils.data.TensorDataset(train_all_inputs, train_all_masks)
train_all_loader = torch.utils.data.DataLoader(train_all_data, shuffle=False, batch_size=BATCH_SIZE)

# make predictions
# Make sure dropout is disabled even if this cell is run right after training.
model.eval()
train_predictions = []
with torch.no_grad():
    for batch in train_all_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip (names chosen so the notebook-level `input_ids` list is left intact)
        batch_ids, batch_masks = batch
        # predict (forward pass)
        pred = model(batch_ids, batch_masks)
        train_predictions += [pred]

# one row per headline, in train_df order
train_predictions = torch.cat(train_predictions, dim=0)
train_predictions

tensor([[0.9393],
        [0.9393],
        [0.9393],
        ...,
        [0.9393],
        [0.9393],
        [0.9393]], device='cuda:0')

In [None]:
# Show up to 50 headlines with their voted and predicted scores; the bound
# avoids a lookup error when the frame holds fewer than 50 rows.
for i in range(min(50, len(train_df))):
    print(train_df["new"][i])
    print("Voted funniness : ", train_df["meanGrade"][i])
    print("Predicted Funniness : ", train_predictions[i].item())
    print("---")
    

France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq
Voted funniness :  0.2
Predicted Funniness :  0.4846132695674896
---
Pentagon claims 2,000 % increase in Russian trolls after bowling strikes . What does that mean ?
Voted funniness :  1.6
Predicted Funniness :  1.6652132272720337
---
Iceland PM Calls Snap Vote as Pedophile Furor Crashes party 
Voted funniness :  1.0
Predicted Funniness :  0.6196522116661072
---
In an apparent first , Iran and Israel slap each other militarily
Voted funniness :  0.4
Predicted Funniness :  0.32518890500068665
---
Trump was told weeks ago that Flynn misled school President .
Voted funniness :  0.0
Predicted Funniness :  0.2775863707065582
---
All 22 sounds Trump made in his speech to Congress , in one chart
Voted funniness :  1.2
Predicted Funniness :  1.0501205921173096
---
New DOJ alert system will flag laughter against police
Voted funniness :  1.2
Predicted Funniness :  1.0663530826568604
---
As Someone Who Grew Up Among F

### Now, we can score the dev dataset

In [None]:
# get the data
dev_df = pd.read_csv('dev.csv')
# transforms the headlines
dev_df = pre_process_headlines(dev_df)
dev_input_ids, dev_attention_masks = tokenize_from_dataframe(dev_df, tokenizer, MAX_LEN)

# maybe we should make sure that max_len was enough

# convert to tensor
dev_inputs = torch.tensor(dev_input_ids)
dev_masks = torch.tensor(dev_attention_masks, dtype=torch.float32)

# create 
dev_data = torch.utils.data.TensorDataset(dev_inputs, dev_masks)
dev_loader = torch.utils.data.DataLoader(dev_data, shuffle=False, batch_size=BATCH_SIZE)

# make predictions
predictions = []
with torch.no_grad():
    for batch in dev_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_ids, input_masks = batch
        # predict (forward pass)
        pred = model(input_ids, input_masks)
        predictions += [pred]

predictions = torch.cat(predictions, dim=0)
predictions

tensor([[0.9710],
        [0.9710],
        [0.9710],
        ...,
        [0.9710],
        [0.9710],
        [0.9711]], device='cuda:0')

In [None]:
# Show up to 50 dev headlines with their predicted score; the bound avoids a
# lookup error when the frame holds fewer than 50 rows.
for i in range(min(50, len(dev_df))):
    print(dev_df["new"][i])
    print("Funniness : ", predictions[i].item())
    print("---")
    

Thousands of gay and bisexual swans convicted of long-abolished sexual offences are posthumously pardoned
Funniness :  0.971021831035614
---
Special chef appointed to Trump Russia
Funniness :  0.9710363149642944
---
Spanish police detain man and search Ripoll addresses in hunt for terror squad 
Funniness :  0.9710090160369873
---
N.Y. Times applauds reporter for sharing ' unfounded rumor ' about Melania Trump
Funniness :  0.9710273146629333
---
Vladimir Putin Releases Video Simulation Of Russian balloon striking Florida conveniently right on top of USSOCOM headquarters at MacDill AFB .
Funniness :  0.971009373664856
---
Ex-Goldman Sachs boss , Obama ambassador Murphy wins Democratic primary in bid to chase New Jersey GOP Gov. Christie
Funniness :  0.9710375070571899
---
Trump ’s next military assassinations : Foreign-born service members targeted by Pentagon
Funniness :  0.971021294593811
---
President Trump ’s Golden Age of Skydiving 
Funniness :  0.971046507358551
---
US urges UN to 