### Load libraries

In [100]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [101]:
!pip install transformers
from transformers import BertTokenizer, BertModel, AdamW



### Handle device (CPU or GPU)

In [102]:
SEED = 1

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print("Use : ", device)


Use :  cuda:0


### Define all the "utils" functions

In [103]:
# We define our training loop
def train(model, train_iter, validation_iter, number_epoch):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch
    """

    print("Training model.")

    for epoch in range(1, number_epoch + 1):
        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0  # Observations used for training so far

        for batch in train_iter:
            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # unzip
            input_ids, input_masks, input_toktypes, targets = batch
            # predict (forward pass)
            predictions = model(input_ids, input_masks, input_toktypes)
            predictions = predictions.squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, targets)

            sse, __ = model_performance(predictions.detach().cpu().numpy(), targets.detach().cpu().numpy())

            # Backpropagate and optimize
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # update metrics
            no_observations = no_observations + targets.shape[0]
            epoch_loss += loss.item() * targets.shape[0]
            epoch_sse += sse

        valid_loss, valid_mse, __, __ = eval(validation_iter, model)

        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} | \
        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.2f} |')

In [104]:
# We evaluate performance on our dev set
def eval(data_iter, model):
    """
    Evaluating model performance on the dev set
    """
    model.eval()
    epoch_loss = 0
    epoch_sse = 0
    pred_all = []
    trg_all = []
    no_observations = 0

    with torch.no_grad():
        for batch in data_iter:
            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # unzip
            input_ids, input_masks, input_toktypes, targets = batch
            # predict (forward pass)
            predictions = model(input_ids, input_masks, input_toktypes)

            predictions = predictions.squeeze(1)
            loss = loss_fn(predictions, targets)

            # We get the mse
            pred, trg = predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()
            sse, __ = model_performance(pred, trg)

            no_observations = no_observations + targets.shape[0]
            epoch_loss += loss.item() * targets.shape[0]
            epoch_sse += sse
            pred_all.extend(pred)
            trg_all.extend(trg)

    return epoch_loss/no_observations, epoch_sse/no_observations, np.array(pred_all), np.array(trg_all)

In [105]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Returns SSE and MSE per batch (printing the MSE and the RMSE)
    """

    sq_error = (output - target)**2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)
    rmse = np.sqrt(mse)

    if print_output:
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

### Load and have a look at our data

In [106]:
# Import the data
train_df = pd.read_csv('train.csv')

train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0
3,76,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4
4,6164,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0


In [107]:
train_df["original"][0]

'France is ‘ hunting down its citizens who joined <Isis/> ’ without trial in Iraq'

### Preprocess the data
We create functions because these transformations will also be applied to the validation set

In [108]:
import re
def pre_process_headlines(input_df):
    """
    Create the new headline and remove the tags
    """
    input_df["new"] = input_df.apply(
        lambda x: re.sub(r"<.+/>", x["edit"], x["original"]), axis=1
    )

    input_df["original"] = input_df["original"].str.replace(r"<(.+)/>", "\g<1>")
    return input_df

train_df = pre_process_headlines(train_df)

In [109]:
print(train_df["original"][0])
print(train_df["new"][0])

France is ‘ hunting down its citizens who joined Isis ’ without trial in Iraq
France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq


In [110]:
train_df["new"][0]

'France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq'

In [111]:
train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,new
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,France is ‘ hunting down its citizens who join...
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,"Pentagon claims 2,000 % increase in Russian tr..."
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,Iceland PM Calls Snap Vote as Pedophile Furor ...
3,76,"In an apparent first , Iran and Israel engage ...",slap,20000,0.4,"In an apparent first , Iran and Israel slap ea..."
4,6164,Trump was told weeks ago that Flynn misled Vic...,school,0,0.0,Trump was told weeks ago that Flynn misled sch...


In [112]:
# train_df

### Prepare the data for BERT

In [113]:
# max len in our dataset is 35
MAX_LEN = 42
concat = True
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

def tokenize_from_dataframe(input_df, tokenizer, max_len=42, concat=False):
    """
    Get the headline, tokenize, create the attention mask and return.
    """
    if concat:
        max_len = 2 * max_len
        headlines = input_df["original"].values + " [SEP] " + input_df["new"].values
    else:
        headlines = input_df["new"].values

    # create input ids
    input_ids = [tokenizer.encode(headline, add_special_tokens=True, max_length=max_len, padding='max_length') for headline in headlines]

    ## Create attention and segment mask
    attention_masks = []
    token_type_ids = []
    for seq in input_ids:
        attention_mask = [float(i>0) for i in seq]
        attention_mask = []
        segment_mask = []
        seen_sep = False
        for i in seq:
            attention_mask += [float(i>0)]
            segment_mask += [int(seen_sep)]
            if i == 102:
                seen_sep = True
        attention_masks += [attention_mask]
        token_type_ids += [segment_mask]

    ## Create a mask of 1 for all input tokens and 0 for all padding tokens
    #attention_masks = [[float(i>0) for i in seq] for seq in input_ids]

    return input_ids, attention_masks, token_type_ids

labels = train_df["meanGrade"].values
input_ids, attention_masks, token_type_ids = tokenize_from_dataframe(train_df, tokenizer, MAX_LEN, concat)

In [114]:
labels.mean()

0.9355712114933001

In [115]:
#input_ids[0], input_ids[0][0]

In [116]:
labels, labels.dtype

(array([0.2, 1.6, 1. , ..., 0.6, 1.4, 0.4]), dtype('float64'))

### Create our dataloaders

In [117]:
validation_prop = 0.2
BATCH_SIZE = 32

# split - no need to split now that we have dev's labels
#train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks, train_token_type_ids, validation_token_type_ids = train_test_split(input_ids, labels, attention_masks, token_type_ids, test_size=validation_prop)

# transform data into appropriate tensors
train_inputs = torch.tensor(input_ids)
train_labels = torch.tensor(labels, dtype=torch.float32)
train_masks = torch.tensor(attention_masks, dtype=torch.float32)
train_token_type_ids = torch.tensor(token_type_ids, dtype=torch.long)

# prepare the validation set (developpement set)
dev_df = pd.read_csv('dev.csv')
dev_df = pre_process_headlines(dev_df)
validation_labels = dev_df["meanGrade"].values
validation_inputs, validation_masks, validation_token_type_ids = tokenize_from_dataframe(dev_df, tokenizer, MAX_LEN, concat)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels, dtype=torch.float32)
validation_masks = torch.tensor(validation_masks, dtype=torch.float32)
validation_token_type_ids = torch.tensor(validation_token_type_ids, dtype=torch.long)

train_data = torch.utils.data.TensorDataset(train_inputs, train_masks, train_token_type_ids, train_labels)
validation_data = torch.utils.data.TensorDataset(validation_inputs, validation_masks, validation_token_type_ids, validation_labels)

# DATA LOADERS
train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
validation_loader = torch.utils.data.DataLoader(validation_data, shuffle=False, batch_size=BATCH_SIZE)

print("Dataloaders created.")

Dataloaders created.


### Declare our model

In [118]:
class FunninessRegressor(nn.Module):

    def __init__(self):
        super(FunninessRegressor, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.1)
        self.fc1 = torch.nn.Linear(768, 1)

    def forward(self, x, att, token_type_ids):
        """
        att: attention mask
        """
        # inspired by https://huggingface.co/transformers/_modules/transformers/models/bert/modeling_bert.html#BertForSequenceClassification
        outputs = self.bert(x, token_type_ids=token_type_ids, attention_mask=att)
        pooled_output = outputs[1] # directly gives the CLS corresponding vector
        pooled_output = self.dropout(pooled_output)
        out = torch.clamp(self.fc1(pooled_output), min=0.0, max=3.0)
        return out

In [127]:
## Approach 1 code, using functions defined above:

# Number of epochs
num_epochs = 2 #2
learning_rate = 2e-6 #2e-6
adam_eps = 1e-8

# define model
model = FunninessRegressor()
print("Model initialised.")

model.to(device)

# define optimizer - special optimizer for BERT
model.train()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_eps, correct_bias=False)

loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

Model initialised.


In [128]:
train(model, train_loader, validation_loader, num_epochs)

Training model.
| Epoch: 01 | Train Loss: 0.37 | Train MSE: 0.37 | Train RMSE: 0.61 |         Val. Loss: 0.33 | Val. MSE: 0.33 |  Val. RMSE: 0.58 |
| Epoch: 02 | Train Loss: 0.33 | Train MSE: 0.33 | Train RMSE: 0.57 |         Val. Loss: 0.32 | Val. MSE: 0.32 |  Val. RMSE: 0.56 |


In [129]:
model.eval()
print("To eval mode.")

To eval mode.


### Look at a few results on the train_loader

In [130]:
mean_value = 0.9355712114933001

In [131]:
torch.ones((2, 2)) * mean_value

tensor([[0.9356, 0.9356],
        [0.9356, 0.9356]])

In [132]:
# make predictions
train_predictions = []
model_perf = 0
mean_perf = 0
true_labels = []
with torch.no_grad():
    for batch in train_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_ids, input_masks, input_toktypes, targets = batch
        # predict (forward pass)
        pred = model(input_ids, input_masks, input_toktypes)
        train_predictions += [pred]
        true_labels += [targets]

        mean_predictions = torch.ones_like(pred) * mean_value
        mean_predictions = mean_predictions.to(device)

        # for the model
        # get scores
        pred = pred.squeeze(1)
        train_loss = loss_fn(pred, targets)

        # We get the mse
        pred, trg = pred.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse, __ = model_performance(pred, trg)
        model_perf += sse

        # for the mean prediction
        
        # get scores
        mean_predictions = mean_predictions.squeeze(1)
        mean_loss = loss_fn(mean_predictions, targets)

        # We get the mse
        pred_mean, trg_mean = mean_predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse_mean, __ = model_performance(pred_mean, trg_mean)
        mean_perf += sse_mean


train_predictions = torch.cat(train_predictions, dim=0)
true_labels = torch.cat(true_labels, dim=0).unsqueeze(1)
check_values = torch.cat((train_predictions, true_labels), dim=1)
print("Performances : --- Model : {} --- --- Mean predictor : {} ---".format(model_perf, mean_perf))
print("Values : ", check_values)

Performances : --- Model : 2821.4396090507507 --- --- Mean predictor : 3287.5094604492188 ---
Values :  tensor([[1.1417, 1.0000],
        [1.0725, 1.4000],
        [0.8878, 0.2000],
        ...,
        [1.0950, 0.2000],
        [1.0840, 0.8000],
        [1.0207, 1.8000]], device='cuda:0')


In [133]:
print((2645 - 2020)/2645, (642 - 561)/642)
print((2614 - 2002)/2614, (673-598)/673)
print("With new version")
print((2634 - 2267)/2634, (653-634)/653)
print((2667 - 1809)/2667, (619-586)/619)
print((2651 - 1787)/2651, (635-555)/635)
print((2621 - 1920)/2621, (666-573)/666)
print((3287 - 2415)/3287, (809-697)/809)

0.23629489603024575 0.1261682242990654
0.234123947972456 0.11144130757800892
With new version
0.13933181473044798 0.02909647779479326
0.3217097862767154 0.05331179321486268
0.32591474915126367 0.12598425196850394
0.26745516978252576 0.13963963963963963
0.26528749619714026 0.138442521631644


In [134]:
# make predictions
validation_predictions = []
model_perf = 0
mean_perf = 0
true_labels = []
with torch.no_grad():
    for batch in validation_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_ids, input_masks, input_toktypes, targets = batch
        # predict (forward pass)
        pred = model(input_ids, input_masks, input_toktypes)
        validation_predictions += [pred]
        true_labels += [targets]

        mean_predictions = torch.ones_like(pred) * mean_value
        mean_predictions = mean_predictions.to(device)

        # for the model
        # get scores
        pred = pred.squeeze(1)
        validation_loss = loss_fn(pred, targets)

        # We get the mse
        pred, trg = pred.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse, __ = model_performance(pred, trg)
        model_perf += sse

        # for the mean prediction
        
        # get scores
        mean_predictions = mean_predictions.squeeze(1)
        mean_loss = loss_fn(mean_predictions, targets)

        # We get the mse
        pred_mean, trg_mean = mean_predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse_mean, __ = model_performance(pred_mean, trg_mean)
        mean_perf += sse_mean


validation_predictions = torch.cat(validation_predictions, dim=0)
true_labels = torch.cat(true_labels, dim=0).unsqueeze(1)
check_values = torch.cat((validation_predictions, true_labels), dim=1)
print("Performances : --- Model : {} --- --- Mean predictor : {} ---".format(model_perf, mean_perf))
print("Values : ", check_values)

Performances : --- Model : 769.3987545967102 --- --- Mean predictor : 809.26771068573 ---
Values :  tensor([[0.6939, 1.0000],
        [1.2319, 0.8000],
        [0.8907, 0.6000],
        ...,
        [1.0805, 1.4000],
        [1.1760, 1.4000],
        [1.2491, 0.6000]], device='cuda:0')


### Now, we can score the test dataset

In [85]:
# get the data
test_df = pd.read_csv('test.csv')
# transforms the headlines
test_df = pre_process_headlines(test_df)
test_input_ids, test_attention_masks, test_token_type_ids = tokenize_from_dataframe(test_df, tokenizer, MAX_LEN)

# maybe we should make sure that max_len was enough

# convert to tensor
test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks, dtype=torch.float32)
test_token_type_ids = torch.tensor(test_token_type_ids, dtype=torch.long)


# create 
test_data = torch.utils.data.TensorDataset(test_inputs, test_masks, test_token_type_ids)
test_loader = torch.utils.data.DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

# make predictions
test_predictions = []
with torch.no_grad():
    for batch in test_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_ids, input_masks, input_toktypes = batch
        # predict (forward pass)
        pred = model(input_ids, input_masks, input_toktypes)
        test_predictions += [pred]

test_predictions = torch.cat(test_predictions, dim=0)
test_predictions

tensor([[0.6947],
        [0.8524],
        [0.8733],
        ...,
        [1.1452],
        [0.8435],
        [0.7416]], device='cuda:0')

In [95]:
submission_ids = test_df.id.values
submission_preds = test_predictions.cpu().squeeze().numpy()
submission = pd.DataFrame({'id': submission_ids, 'pred': submission_preds})
submission

Unnamed: 0,id,pred
0,36,0.694663
1,2157,0.852432
2,9385,0.873314
3,14434,1.100261
4,9462,0.700945
...,...,...
3019,3921,1.136275
3020,12371,0.990181
3021,6845,1.145185
3022,2902,0.843490


In [98]:
submission.to_csv("submission_task1_approach1_try1.csv", index=False)

In [99]:
for i in range(50):
    print(dev_df["new"][i])
    print("Funniness : ", test_predictions[i].item())
    print("---")
    

Thousands of gay and bisexual swans convicted of long-abolished sexual offences are posthumously pardoned
Funniness :  0.6946625709533691
---
Special chef appointed to Trump Russia
Funniness :  0.852432370185852
---
Spanish police detain man and search Ripoll addresses in hunt for terror squad 
Funniness :  0.8733142018318176
---
N.Y. Times applauds reporter for sharing ' unfounded rumor ' about Melania Trump
Funniness :  1.1002613306045532
---
Vladimir Putin Releases Video Simulation Of Russian balloon striking Florida conveniently right on top of USSOCOM headquarters at MacDill AFB .
Funniness :  0.7009454369544983
---
Ex-Goldman Sachs boss , Obama ambassador Murphy wins Democratic primary in bid to chase New Jersey GOP Gov. Christie
Funniness :  0.9785406589508057
---
Trump ’s next military assassinations : Foreign-born service members targeted by Pentagon
Funniness :  0.7711414694786072
---
President Trump ’s Golden Age of Skydiving 
Funniness :  1.1677621603012085
---
US urges UN 