### Load libraries

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
!pip install transformers
from transformers import BertTokenizer, BertModel, AdamW

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 14.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 56.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 43.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=d0f4e36be5

### Handle device (CPU or GPU)

In [3]:
SEED = 1

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print("Use : ", device)


Use :  cuda:0


### Define all the "utils" functions

In [4]:
# We define our training loop
def train(model, train_iter, validation_iter, number_epoch):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch
    """

    print("Training model.")

    for epoch in range(1, number_epoch + 1):
        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0  # Observations used for training so far

        for batch in train_iter:
            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # unzip
            input_ids, input_masks, input_toktypes, targets = batch
            # predict (forward pass)
            predictions = model(input_ids, input_masks, input_toktypes)
            predictions = predictions.squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, targets)

            sse, __ = model_performance(predictions.detach().cpu().numpy(), targets.detach().cpu().numpy())

            # Backpropagate and optimize
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # update metrics
            no_observations = no_observations + targets.shape[0]
            epoch_loss += loss.item() * targets.shape[0]
            epoch_sse += sse

        valid_loss, valid_mse, __, __ = eval(validation_iter, model)

        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} | \
        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.2f} |')

In [5]:
# We evaluate performance on our dev set
def eval(data_iter, model):
    """
    Evaluating model performance on the dev set
    """
    model.eval()
    epoch_loss = 0
    epoch_sse = 0
    pred_all = []
    trg_all = []
    no_observations = 0

    with torch.no_grad():
        for batch in data_iter:
            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # unzip
            input_ids, input_masks, input_toktypes, targets = batch
            # predict (forward pass)
            predictions = model(input_ids, input_masks, input_toktypes)

            predictions = predictions.squeeze(1)
            loss = loss_fn(predictions, targets)

            # We get the mse
            pred, trg = predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()
            sse, __ = model_performance(pred, trg)

            no_observations = no_observations + targets.shape[0]
            epoch_loss += loss.item() * targets.shape[0]
            epoch_sse += sse
            pred_all.extend(pred)
            trg_all.extend(trg)

    return epoch_loss/no_observations, epoch_sse/no_observations, np.array(pred_all), np.array(trg_all)

In [6]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Returns SSE and MSE per batch (printing the MSE and the RMSE)
    """

    sq_error = (output - target)**2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)
    rmse = np.sqrt(mse)

    if print_output:
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

### Load and have a look at our data

In [10]:
# Import the data
train_df = pd.read_csv('train.csv')

train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0
3,76,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4
4,6164,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0


In [11]:
train_df["original"][0]

'France is ‘ hunting down its citizens who joined <Isis/> ’ without trial in Iraq'

### Preprocess the data
We create functions because these transformations will also be applied to the validation set

In [12]:
import re
def pre_process_headlines(input_df):
    """
    Create the new headline and remove the tags
    """
    input_df["new"] = input_df.apply(
        lambda x: re.sub(r"<.+/>", x["edit"], x["original"]), axis=1
    )

    input_df["original"] = input_df["original"].str.replace(r"<(.+)/>", "\g<1>")
    return input_df

train_df = pre_process_headlines(train_df)

In [13]:
print(train_df["original"][0])
print(train_df["new"][0])

France is ‘ hunting down its citizens who joined Isis ’ without trial in Iraq
France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq


In [14]:
train_df["new"][0]

'France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq'

In [15]:
train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,new
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,France is ‘ hunting down its citizens who join...
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,"Pentagon claims 2,000 % increase in Russian tr..."
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,Iceland PM Calls Snap Vote as Pedophile Furor ...
3,76,"In an apparent first , Iran and Israel engage ...",slap,20000,0.4,"In an apparent first , Iran and Israel slap ea..."
4,6164,Trump was told weeks ago that Flynn misled Vic...,school,0,0.0,Trump was told weeks ago that Flynn misled sch...


In [37]:
# train_df

### Prepare the data for BERT

In [16]:
train_df["original"].values + " [SEP] " + train_df["new"].values

array(['France is ‘ hunting down its citizens who joined Isis ’ without trial in Iraq [SEP] France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq',
       'Pentagon claims 2,000 % increase in Russian trolls after Syria strikes . What does that mean ? [SEP] Pentagon claims 2,000 % increase in Russian trolls after bowling strikes . What does that mean ?',
       'Iceland PM Calls Snap Vote as Pedophile Furor Crashes Coalition  [SEP] Iceland PM Calls Snap Vote as Pedophile Furor Crashes party ',
       ...,
       "Cruise line Carnival Corp. joins the fight against Bermuda 's same-sex marriage ban [SEP] Cruise line Carnival Corp. joins the fight against Bermuda 's same-sex raisin ban",
       'Columbia police hunt woman seen with gun near University of Missouri campus [SEP] Columbia police hunt woman seen with cake near University of Missouri campus',
       "Here 's What 's In The House-Approved Health Care Bill [SEP] Here 's What 's In The House-Approved Health 

In [17]:
# max len in our dataset is 35
MAX_LEN = 42
concat = True
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

def tokenize_from_dataframe(input_df, tokenizer, max_len=42, concat=False):
    """
    Get the headline, tokenize, create the attention mask and return.
    """
    if concat:
        max_len = 2 * max_len
        headlines = input_df["original"].values + " [SEP] " + input_df["new"].values
    else:
        headlines = input_df["new"].values

    # create input ids
    input_ids = [tokenizer.encode(headline, add_special_tokens=True, max_length=max_len, padding='max_length') for headline in headlines]

    ## Create attention and segment mask
    attention_masks = []
    token_type_ids = []
    for seq in input_ids:
        attention_mask = [float(i>0) for i in seq]
        attention_mask = []
        segment_mask = []
        seen_sep = False
        for i in seq:
            attention_mask += [float(i>0)]
            segment_mask += [int(seen_sep)]
            if i == 102:
                seen_sep = True
        attention_masks += [attention_mask]
        token_type_ids += [segment_mask]

    ## Create a mask of 1 for all input tokens and 0 for all padding tokens
    #attention_masks = [[float(i>0) for i in seq] for seq in input_ids]

    return input_ids, attention_masks, token_type_ids

labels = train_df["meanGrade"].values
input_ids, attention_masks, token_type_ids = tokenize_from_dataframe(train_df, tokenizer, MAX_LEN, concat)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [18]:
#input_ids

In [19]:
# max_len = 42
# one_headline = train_df["original"].values[0]
# modif_headline = train_df["new"].values[0]
# concat_headline = one_headline + " [SEP] " + modif_headline
# test = "france is hunting down"
# test2 = "france is hunting down [SEP] france is hunting down"

# encoded_headline = tokenizer.encode(one_headline, add_special_tokens=True, max_length=max_len, padding='max_length')
# encoded_modif_headline = tokenizer.encode(modif_headline, add_special_tokens=True, max_length=max_len, padding='max_length')
# encoded_concat_headline = tokenizer.encode(concat_headline, add_special_tokens=True, max_length=max_len, padding='max_length')
# encoded_test = tokenizer.encode(test, add_special_tokens=True, max_length=max_len, padding='max_length')
# encoded_test2 = tokenizer.encode(test2, add_special_tokens=True, max_length=max_len, padding='max_length')

# print("original headline : ", one_headline)
# print("Modified headline : ", modif_headline)
# print("Modified headline : ", concat_headline)
# print("Test headline : ", test)
# print("Test2 headline : ", test2)
# print("Original headline encoded : ", np.array(encoded_headline))
# print("Modified headline encoded : ", np.array(encoded_modif_headline))
# print("Concat headline encoded : ", np.array(encoded_concat_headline))
# print("Test headline encoded : ", np.array(encoded_test))
# print("Test 2 headline encoded : ", np.array(encoded_test2))

In [20]:
labels.mean()

0.9355712114933001

In [21]:
#input_ids[0], input_ids[0][0]

In [22]:
labels, labels.dtype

(array([0.2, 1.6, 1. , ..., 0.6, 1.4, 0.4]), dtype('float64'))

### Create our dataloaders

In [23]:
validation_prop = 0.2
BATCH_SIZE = 32

# split
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks, train_token_type_ids, validation_token_type_ids = train_test_split(input_ids, labels, attention_masks, token_type_ids, test_size=validation_prop)

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels, dtype=torch.float32)
validation_labels = torch.tensor(validation_labels, dtype=torch.float32)
train_masks = torch.tensor(train_masks, dtype=torch.float32)
validation_masks = torch.tensor(validation_masks, dtype=torch.float32)
train_token_type_ids = torch.tensor(train_token_type_ids, dtype=torch.long)
validation_token_type_ids = torch.tensor(validation_token_type_ids, dtype=torch.long)

train_data = torch.utils.data.TensorDataset(train_inputs, train_masks, train_token_type_ids, train_labels)
validation_data = torch.utils.data.TensorDataset(validation_inputs, validation_masks, validation_token_type_ids, validation_labels)

# DATA LOADERS
train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
validation_loader = torch.utils.data.DataLoader(validation_data, shuffle=False, batch_size=BATCH_SIZE)

print("Dataloaders created.")

Dataloaders created.


### Declare our model

In [25]:
class FunninessRegressor(nn.Module):

    def __init__(self):
        super(FunninessRegressor, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.1)
        self.fc1 = torch.nn.Linear(768, 1)

    def forward(self, x, att, token_type_ids):
        """
        att: attention mask
        """
        # inspired by https://huggingface.co/transformers/_modules/transformers/models/bert/modeling_bert.html#BertForSequenceClassification
        outputs = self.bert(x, token_type_ids=token_type_ids, attention_mask=att)
        pooled_output = outputs[1] # directly gives the CLS corresponding vector
        pooled_output = self.dropout(pooled_output)
        out = torch.clamp(self.fc1(pooled_output), min=0.0, max=3.0)
        return out

In [26]:
## Approach 1 code, using functions defined above:

# Number of epochs
num_epochs = 2 #2
learning_rate = 2e-6 #2e-6
adam_eps = 1e-8

# define model
model = FunninessRegressor()
print("Model initialised.")

model.to(device)

# define optimizer - special optimizer for BERT
model.train()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_eps, correct_bias=False)

loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


Model initialised.


In [27]:
train(model, train_loader, validation_loader, num_epochs)

Training model.
| Epoch: 01 | Train Loss: 0.47 | Train MSE: 0.47 | Train RMSE: 0.68 |         Val. Loss: 0.31 | Val. MSE: 0.31 |  Val. RMSE: 0.56 |
| Epoch: 02 | Train Loss: 0.30 | Train MSE: 0.30 | Train RMSE: 0.55 |         Val. Loss: 0.30 | Val. MSE: 0.30 |  Val. RMSE: 0.54 |


In [28]:
model.eval()
print("To eval mode.")

To eval mode.


### Look at a few results on the train_loader

In [29]:
mean_value = 0.9355712114933001

In [30]:
torch.ones((2, 2)) * mean_value

tensor([[0.9356, 0.9356],
        [0.9356, 0.9356]])

In [31]:
# make predictions
train_predictions = []
model_perf = 0
mean_perf = 0
true_labels = []
with torch.no_grad():
    for batch in train_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_ids, input_masks, input_toktypes, targets = batch
        # predict (forward pass)
        pred = model(input_ids, input_masks, input_toktypes)
        train_predictions += [pred]
        true_labels += [targets]

        mean_predictions = torch.ones_like(pred) * mean_value
        mean_predictions = mean_predictions.to(device)

        # for the model
        # get scores
        pred = pred.squeeze(1)
        train_loss = loss_fn(pred, targets)

        # We get the mse
        pred, trg = pred.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse, __ = model_performance(pred, trg)
        model_perf += sse

        # for the mean prediction
        
        # get scores
        mean_predictions = mean_predictions.squeeze(1)
        mean_loss = loss_fn(mean_predictions, targets)

        # We get the mse
        pred_mean, trg_mean = mean_predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse_mean, __ = model_performance(pred_mean, trg_mean)
        mean_perf += sse_mean


train_predictions = torch.cat(train_predictions, dim=0)
true_labels = torch.cat(true_labels, dim=0).unsqueeze(1)
check_values = torch.cat((train_predictions, true_labels), dim=1)
print("Performances : --- Model : {} --- --- Mean predictor : {} ---".format(model_perf, mean_perf))
print("Values : ", check_values)

Performances : --- Model : 1920.6084697246552 --- --- Mean predictor : 2621.7857179641724 ---
Values :  tensor([[0.6450, 1.0000],
        [0.8564, 0.8000],
        [1.2365, 1.4000],
        ...,
        [0.6389, 0.4000],
        [0.6657, 0.0000],
        [0.8402, 0.8000]], device='cuda:0')


In [35]:
print((2645 - 2020)/2645, (642 - 561)/642)
print((2614 - 2002)/2614, (673-598)/673)
print("With new version")
print((2634 - 2267)/2634, (653-634)/653)
print((2667 - 1809)/2667, (619-586)/619)
print((2651 - 1787)/2651, (635-555)/635)
print((2621 - 1920)/2621, (666-573)/666)

0.23629489603024575 0.1261682242990654
0.234123947972456 0.11144130757800892
With new version
0.13933181473044798 0.02909647779479326
0.3217097862767154 0.05331179321486268
0.32591474915126367 0.12598425196850394
0.26745516978252576 0.13963963963963963


In [33]:
# make predictions
validation_predictions = []
model_perf = 0
mean_perf = 0
true_labels = []
with torch.no_grad():
    for batch in validation_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_ids, input_masks, input_toktypes, targets = batch
        # predict (forward pass)
        pred = model(input_ids, input_masks, input_toktypes)
        validation_predictions += [pred]
        true_labels += [targets]

        mean_predictions = torch.ones_like(pred) * mean_value
        mean_predictions = mean_predictions.to(device)

        # for the model
        # get scores
        pred = pred.squeeze(1)
        validation_loss = loss_fn(pred, targets)

        # We get the mse
        pred, trg = pred.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse, __ = model_performance(pred, trg)
        model_perf += sse

        # for the mean prediction
        
        # get scores
        mean_predictions = mean_predictions.squeeze(1)
        mean_loss = loss_fn(mean_predictions, targets)

        # We get the mse
        pred_mean, trg_mean = mean_predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse_mean, __ = model_performance(pred_mean, trg_mean)
        mean_perf += sse_mean


validation_predictions = torch.cat(validation_predictions, dim=0)
true_labels = torch.cat(true_labels, dim=0).unsqueeze(1)
check_values = torch.cat((validation_predictions, true_labels), dim=1)
print("Performances : --- Model : {} --- --- Mean predictor : {} ---".format(model_perf, mean_perf))
print("Values : ", check_values)

Performances : --- Model : 573.1840415000916 --- --- Mean predictor : 665.7237586975098 ---
Values :  tensor([[1.5418, 0.6000],
        [1.0082, 0.8000],
        [0.9202, 1.4000],
        ...,
        [0.6346, 0.2000],
        [0.9323, 1.2000],
        [0.5633, 0.8000]], device='cuda:0')


### Now, we can score the dev dataset

In [34]:
# get the data
dev_df = pd.read_csv('dev.csv')
# transforms the headlines
dev_df = pre_process_headlines(dev_df)
dev_input_ids, dev_attention_masks = tokenize_from_dataframe(dev_df, tokenizer, MAX_LEN)

# maybe we should make sure that max_len was enough

# convert to tensor
dev_inputs = torch.tensor(dev_input_ids)
dev_masks = torch.tensor(dev_attention_masks, dtype=torch.float32)

# create 
dev_data = torch.utils.data.TensorDataset(dev_inputs, dev_masks)
dev_loader = torch.utils.data.DataLoader(dev_data, shuffle=False, batch_size=BATCH_SIZE)

# make predictions
predictions = []
with torch.no_grad():
    for batch in dev_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_ids, input_masks = batch
        # predict (forward pass)
        pred = model(input_ids, input_masks)
        predictions += [pred]

predictions = torch.cat(predictions, dim=0)
predictions

ValueError: ignored

In [None]:
for i in range(50):
    print(dev_df["new"][i])
    print("Funniness : ", predictions[i].item())
    print("---")
    