In [1]:
# import libraries

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, random_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import codecs

In [2]:
# set random seed and device

SEED = 1

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [3]:
# load datasets

train_df = pd.read_csv('train.csv')
dev_df = pd.read_csv('dev.csv')

In [37]:
# We define our training loop
def train(train_iter, dev_iter, model, number_epoch):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch
    """

    
    print("Training model.")

    for epoch in range(1, number_epoch+1):

        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0  # Observations used for training so far

        for batch in train_iter:

            feature, target = batch

            feature, target = feature.to(device), target.to(device)

            model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]

            predictions = model(feature).squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, target)

            sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()*target.shape[0]
            epoch_sse += sse

        valid_loss, valid_mse, __, __ = eval(dev_iter, model)

        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} | \
        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.2f} |')

In [41]:
# We evaluate performance on our dev set
def eval(data_iter, model):
    """
    Evaluating model performance on the dev set
    """
    model.eval()
    epoch_loss = 0
    epoch_sse = 0
    pred_all = []
    trg_all = []
    no_observations = 0

    with torch.no_grad():
        for batch in data_iter:
            feature, target = batch

            feature, target = feature.to(device), target.to(device)

            model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]

            predictions = model(feature).squeeze(1)
            loss = loss_fn(predictions, target)

            # We get the mse
            pred, trg = predictions.detach().cpu().numpy(), target.detach().cpu().numpy()
            sse, __ = model_performance(pred, trg)

            epoch_loss += loss.item()*target.shape[0]
            epoch_sse += sse
            pred_all.extend(pred)
            trg_all.extend(trg)

    return epoch_loss/no_observations, epoch_sse/no_observations, np.array(pred_all), np.array(trg_all)

In [4]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Returns SSE and MSE per batch (printing the MSE and the RMSE)
    """

    sq_error = (output - target)**2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)
    rmse = np.sqrt(mse)

    if print_output:
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

In [5]:
def create_vocab(data):
    """
    Creating a corpus of all the tokens used
    """
    tokenized_corpus = [] # Let us put the tokenized corpus in a list

    for sentence in data:

        tokenized_sentence = []

        for token in sentence.split(' '): # simplest split is

            tokenized_sentence.append(token)

        tokenized_corpus.append(tokenized_sentence)

    # Create single list of all vocabulary
    vocabulary = []  # Let us put all the tokens (mostly words) appearing in the vocabulary in a list

    for sentence in tokenized_corpus:

        for token in sentence:

            if token not in vocabulary:

                if True:
                    vocabulary.append(token)

    return vocabulary, tokenized_corpus

# Approach 2

## a. TF-IDF code (given) and baseline

In [6]:
train_proportion = 0.8
train_and_dev = train_df['edit']

training_data, dev_data, training_y, dev_y = train_test_split(train_df['edit'], train_df['meanGrade'],
                                                                        test_size=(1-train_proportion),
                                                                        random_state=42)

# We train a Tf-idf model
count_vect = CountVectorizer(stop_words='english')
train_counts = count_vect.fit_transform(training_data)
transformer = TfidfTransformer().fit(train_counts)
train_counts = transformer.transform(train_counts)
regression_model = LinearRegression().fit(train_counts, training_y)

# Train predictions
predicted_train = regression_model.predict(train_counts)

# Calculate Tf-idf using train and dev, and validate model on dev:
test_and_test_counts = count_vect.transform(train_and_dev)
transformer = TfidfTransformer().fit(test_and_test_counts)

test_counts = count_vect.transform(dev_data)

test_counts = transformer.transform(test_counts)

# Dev predictions
predicted = regression_model.predict(test_counts)

# We run the evaluation:
print("\nTrain performance:")
sse, mse = model_performance(predicted_train, training_y, True)

print("\nDev performance:")
sse, mse = model_performance(predicted, dev_y, True)


Train performance:
| MSE: 0.13 | RMSE: 0.37 |

Dev performance:
| MSE: 0.36 | RMSE: 0.60 |


In [7]:
# baseline for the task

pred_baseline = torch.zeros(len(dev_y)) + np.mean(training_y)
print("\nBaseline performance:")
sse, mse = model_performance(pred_baseline, dev_y, True)


Baseline performance:
| MSE: 0.34 | RMSE: 0.58 |


## b. BERT-like model
First attempt at training a BERT-like transformer model. We thought we would create a model from scratch using the given libraries, train it on a smaller but dedicated dataset, and implement it in place of BERT. We went for a RoBERTa-like model. All elements of the model are identified with the suffix "_2b"

Largely inspired by https://towardsdatascience.com/transformers-retraining-roberta-base-using-the-roberta-mlm-procedure-7422160d5764 

In [8]:
# mount google drive to work in colab

from google.colab import drive
drive.mount("/content/drive")

!mkdir "/content/drive/My Drive/abc/aBERTc2"

Mounted at /content/drive
mkdir: cannot create directory ‘/content/drive/My Drive/abc/aBERTc2’: File exists


In [9]:
# extract txt from original csv dataset
# commented out as only needed the first time

# import pandas as pd
# abcnews = pd.read_csv('/content/drive/My Drive/abc/abcnews.csv')

# with open("/content/drive/My Drive/abc/abcheads.txt", "a") as file:
#   for line in abcnews['headline_text']:
#     line += "\n"
#     file.write(line)

In [10]:
# import premade tokenizer (byte-level BPE)

from transformers import RobertaTokenizer

tokenizer_2b = RobertaTokenizer.from_pretrained('roberta-base')

ModuleNotFoundError: ignored

In [None]:
# prepare txt dataset with tokenizer

from transformers import LineByLineTextDataset

dataset_2b = LineByLineTextDataset(
    tokenizer=tokenizer_2b,
    file_path="/content/drive/My Drive/abc/abcheads.txt",
    block_size=128,
)

In [None]:
# prepare data collator (speeds up things, not sure what)

from transformers import DataCollatorForLanguageModeling

data_collator_2b = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_2b, mlm=True, mlm_probability=0.15
)

In [None]:
# create Roberta model from scratch, just prepare config

from transformers import RobertaConfig, RobertaForMaskedLM

config_2b = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

model_2b = RobertaForMaskedLM(config=config_2b)

In [None]:
# set up training configurations

from transformers import Trainer, TrainingArguments

training_args_2b = TrainingArguments(
    output_dir="/content/drive/My Drive/abc/aBERTc2",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    seed=1
)

trainer_2b = Trainer(
    model=model_2b,
    args=training_args_2b,
    data_collator=data_collator_2b,
    train_dataset=dataset_2b
)

In [None]:
# verify model parameters ~ 84M

model_2b.num_parameters()

In [None]:
# train and save trained model

trainer_2b.train()
trainer_2b.save_model("/content/drive/My Drive/abc/aBERTc2")

In [None]:
# sanity check on model performance

from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="/content/drive/My Drive/abc/aBERTc2",
    tokenizer="roberta-base"
)
fill_mask("Send these <mask> back!")

At this point we tried replacing the newly trained RoBERTa model in the pipeline we had designed for Approach 1.

(Follows Samy's code and explanation of what didn't work)

## c. FFN with word embeddings
As second attempt we tried a modified version of a 2-layered feed-forward neural networks, modified from the 2nd lab of the course. We substituted the classification layer at the top with another linear layer in order to make it appropriate for the regression. We used all the words from the original headline + the edited word, stripped of its punctuation.

In [11]:
# recreate tok_corpus, word2idx and related functions, with punctuation removal

import re
re_punctuation_string = '[\s,/.\'<>]'

def get_tokenized_corpus(corpus):
  tokenized_corpus = []

  for sentence in corpus:
    tokenized_sentence = []
    for token in re.split(' ', sentence): 
      tokenized_sentence.append(token)
    tokenized_corpus.append(tokenized_sentence)
 
  return tokenized_corpus


def get_word2idx(tokenized_corpus):
  vocabulary = []
  for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)
  
  word2idx = {w: idx+1 for (idx, w) in enumerate(vocabulary)}
  word2idx['<pad>'] = 0
  
  return word2idx


def get_model_inputs(tokenized_corpus, word2idx, labels):
  vectorized_sents = [[word2idx[tok] for tok in sent if tok in word2idx] for sent in tokenized_corpus]
  sent_lengths = [len(sent) for sent in vectorized_sents]
  max_len = max(sent_lengths)
  sent_tensor = torch.zeros((len(vectorized_sents), max_len)).long()
  for idx, (sent, sentlen) in enumerate(zip(vectorized_sents, sent_lengths)):
    sent_tensor[idx, :sentlen] = torch.LongTensor(sent)
  label_tensor = torch.FloatTensor(labels)
  return sent_tensor, label_tensor

In [15]:
# construct network model

class FFNN_2c(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size):  
        super(FFNN_2c, self).__init__()

        # padding_idx argument makes sure that the 0-th token in the vocabulary
        # is used for padding purposes i.e. its embedding will be a 0-vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.relu2 = nn.ReLU()  
    
    def forward(self, x):
        # x has shape (batch_size, max_sent_len)

        embedded = self.embedding(x)
        sent_lens = x.ne(0).sum(1, keepdims=True)
        averaged = embedded.sum(1) / sent_lens

        out = self.fc1(averaged)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = torch.clamp(out, max=3.0)
        return out

In [13]:
# prepare traning set

tokenized_corpus = get_tokenized_corpus(train_df["original"].str.cat(train_df["edit"], sep=" ").tolist())
word2idx = get_word2idx(tokenized_corpus)
train_sent_tensor, train_label_tensor = get_model_inputs(tokenized_corpus, word2idx, train_df['meanGrade'])

In [18]:
# run model on training dataset

EPOCHS = 10
LRATE = 0.5
EMBEDDING_DIM = 80
HIDDEN_DIM = 80

model_2c = FFNN_2c(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx))
optimizer_2c = torch.optim.SGD(model_2c.parameters(), lr=LRATE)
loss_fn = nn.MSELoss()

feature = train_sent_tensor
target = train_label_tensor

for epoch in range(1, EPOCHS + 1):
  model_2c.train()
  optimizer_2c.zero_grad()
  
  predictions = model_2c(feature).squeeze(1)
  loss = loss_fn(predictions, target)
  train_loss = loss.item()
  
  loss.backward()
  optimizer_2c.step()
  
  print(f'| Epoch: {epoch:02} | Train Loss: {train_loss:.3f}')

| Epoch: 01 | Train Loss: 1.205
| Epoch: 02 | Train Loss: 0.786
| Epoch: 03 | Train Loss: 0.580
| Epoch: 04 | Train Loss: 0.544
| Epoch: 05 | Train Loss: 0.398
| Epoch: 06 | Train Loss: 0.367
| Epoch: 07 | Train Loss: 0.351
| Epoch: 08 | Train Loss: 0.347
| Epoch: 09 | Train Loss: 0.345
| Epoch: 10 | Train Loss: 0.344


In [19]:
# prepare validation dataset

dev_tokenized_corpus = get_tokenized_corpus(dev_df["original"].str.cat(dev_df["edit"], sep=" ").tolist())
dev_sent_tensor, dev_label_tensor = get_model_inputs(dev_tokenized_corpus, word2idx, dev_df['meanGrade'])

In [20]:
# optimise hyperparameters on validation set

EPOCHS = 30
LRATE = 0.5
EMBEDDING_DIM = 50
HIDDEN_DIM = 50
OUTPUT_DIM = 1

model_2c = FFNN_2c(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx))
optimizer_2c = torch.optim.SGD(model_2c.parameters(), lr=LRATE)
loss_fn = nn.MSELoss()

feature_train = train_sent_tensor
target_train = train_label_tensor

feature_valid = dev_sent_tensor
target_valid = dev_label_tensor

print(f'Will train for {EPOCHS} epochs')
for epoch in range(1, EPOCHS + 1):
  model_2c.train()
  optimizer_2c.zero_grad()
  predictions = model_2c(feature_train).squeeze(1)

  loss = loss_fn(predictions, target_train)
  train_loss = loss.item()

  loss.backward()
  optimizer_2c.step()
  
  model_2c.eval()
  with torch.no_grad():
    predictions_valid = model_2c(feature_valid).squeeze(1)
    valid_loss = loss_fn(predictions_valid, target_valid).item()
  
  print(f'| Epoch: {epoch:02} | Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f} |')

Will train for 30 epochs
| Epoch: 01 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 02 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 03 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 04 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 05 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 06 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 07 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 08 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 09 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 10 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 11 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 12 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 13 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 14 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 15 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 16 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 17 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 18 | Train Loss: 1.216 | Val. Loss: 1.208 |
| Epoch: 19 | Train L

## d. Freestyle approach with text-based engineered features

In [21]:
punctuations = '''!()-[]{};:‘’"\,.?@#$%^&*_~'''
sex_words = ["orgy", "dick", "suck", "underwear", "sex", "sexual", "massage", "flirt", "kiss", "ass", "penis"]

def create_engineered_features(train_df, punctuations, sex_words):
    train_df["lower_case"] = train_df.apply(
            lambda x: x["original"].lower(), axis=1
        )
    train_df["edit_lower"] = train_df.apply(
            lambda x: x["edit"].lower(), axis=1
        )
    train_df["nb_words"] = train_df.apply(
            lambda x: len(x["lower_case"].split(" ")), axis=1
        )
    train_df["nb_caracters"] = train_df.apply(
            lambda x: len(x["lower_case"]), axis=1
        )
    train_df["nb_ponctuations"] = train_df.apply(
            lambda x: sum([1 if char in punctuations else 0 for char in x["lower_case"]]), axis=1
        )
    train_df["edit_position"] = train_df.apply(
            lambda x: x["lower_case"].find("<"), axis=1
        )
    train_df["edit_rel_position"] = train_df.apply(
            lambda x: x["edit_position"]/x["nb_caracters"], axis=1
        )
    train_df["len_edit"] = train_df.apply(
            lambda x: len(x["edit_lower"]), axis=1
        )
    # i need trump
    train_df["trump_in_original"] = train_df.apply(
            lambda x: float(x["lower_case"].find("trump") != -1), axis=1
        )
    train_df["trump_in_edit"] = train_df.apply(
            lambda x: float(x["edit_lower"].find("trump") != -1), axis=1
        )

    # i need hair
    train_df["hair_in_original"] = train_df.apply(
            lambda x: float(x["lower_case"].find("hair") != -1), axis=1
        )
    train_df["hair_in_edit"] = train_df.apply(
            lambda x: float(x["edit_lower"].find("hair") != -1), axis=1
        )
    train_df["sex_in_edit"] = train_df.apply(
            lambda x: float(x["edit_lower"] in sex_words), axis=1
        )

    return train_df

In [22]:
train_df = create_engineered_features(train_df, punctuations, sex_words)
train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,lower_case,edit_lower,nb_words,nb_caracters,nb_ponctuations,edit_position,edit_rel_position,len_edit,trump_in_original,trump_in_edit,hair_in_original,hair_in_edit,sex_in_edit
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,france is ‘ hunting down its citizens who join...,twins,15,80,2,49,0.6125,5,0.0,0.0,0.0,0.0,0.0
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,"pentagon claims 2,000 % increase in russian tr...",bowling,17,97,4,57,0.587629,7,0.0,0.0,0.0,0.0,0.0
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,iceland pm calls snap vote as pedophile furor ...,party,11,67,0,54,0.80597,5,0.0,0.0,0.0,0.0,0.0
3,76,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4,"in an apparent first , iran and israel <engage...",slap,12,70,1,39,0.557143,4,0.0,0.0,0.0,0.0,0.0
4,6164,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0,trump was told weeks ago that flynn misled <vi...,school,11,62,1,43,0.693548,6,1.0,0.0,0.0,0.0,0.0


In [23]:
# extract labels and features from training set 

columns = [
    "nb_words",
    "nb_caracters",
    "nb_ponctuations",
    "edit_position",
    "edit_rel_position",
    "len_edit",	"trump_in_original",
    "trump_in_edit",
    "hair_in_original",
    "hair_in_edit",
    "sex_in_edit"
    ]
    
features = train_df[columns].values
labels = train_df.meanGrade.values

In [24]:
# prepare the validation set

dev_df = pd.read_csv('dev.csv')
dev_df = create_engineered_features(dev_df, punctuations, sex_words)
dev_inputs = dev_df[columns].values
dev_labels = dev_df['meanGrade'].values

In [25]:
# create dataloaders

BATCH_SIZE = 32

train_inputs = torch.tensor(features, dtype=torch.float32)
train_labels = torch.tensor(labels, dtype=torch.float32)

validation_inputs = torch.tensor(dev_inputs, dtype=torch.float32)
validation_labels = torch.tensor(dev_labels, dtype=torch.float32)

train_data = torch.utils.data.TensorDataset(train_inputs, train_labels)
dev_data = torch.utils.data.TensorDataset(validation_inputs, validation_labels)

train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
validation_loader = torch.utils.data.DataLoader(dev_data, shuffle=False, batch_size=BATCH_SIZE)

print("Dataloaders created.")

Dataloaders created.


In [28]:
# design model architecture in torch

class FFNN_2d(nn.Module):

    def __init__(self):
        super(FFNN_2d, self).__init__()

        self.fc1 = torch.nn.Linear(11, 10)
        self.relu1 = nn.ReLU()
        self.fc2 = torch.nn.Linear(10, 1)

    def forward(self, x):
        
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = torch.clamp(out, min=0.0, max=3.0)
        return out

In [39]:
# set hyperparameters, model, optimizer and loss function

import torch.optim as optim

num_epochs = 2 #2
learning_rate = 1.8e-6 #2e-6
adam_eps = 1e-8

model_2d = FFNN_2d()
model_2d.to(device)
model_2d.train()
optimizer = optim.Adam(model_2d.parameters(), lr=learning_rate, eps=adam_eps)

loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

In [42]:
# train model

train(train_loader, validation_loader, model_2d, num_epochs)

Training model.
| Epoch: 01 | Train Loss: 1.22 | Train MSE: 1.22 | Train RMSE: 1.10 |         Val. Loss: 1.21 | Val. MSE: 1.21 |  Val. RMSE: 1.10 |
| Epoch: 02 | Train Loss: 1.22 | Train MSE: 1.22 | Train RMSE: 1.10 |         Val. Loss: 1.21 | Val. MSE: 1.21 |  Val. RMSE: 1.10 |


In [44]:
# make predictions
train_predictions = []
model_perf = 0
mean_perf = 0
true_labels = []
with torch.no_grad():
    for batch in train_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_ids, targets = batch
        # predict (forward pass)
        pred = model_2d(input_ids)
        train_predictions += [pred]
        true_labels += [targets]

        mean_predictions = torch.ones_like(pred) * mean_value
        mean_predictions = mean_predictions.to(device)

        # for the model
        # get scores
        pred = pred.squeeze(1)
        train_loss = loss_fn(pred, targets)

        # We get the mse
        pred, trg = pred.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse, __ = model_performance(pred, trg)
        model_perf += sse

        # for the mean prediction
        
        # get scores
        mean_predictions = mean_predictions.squeeze(1)
        mean_loss = loss_fn(mean_predictions, targets)

        # We get the mse
        pred_mean, trg_mean = mean_predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse_mean, __ = model_performance(pred_mean, trg_mean)
        mean_perf += sse_mean


train_predictions = torch.cat(train_predictions, dim=0)
true_labels = torch.cat(true_labels, dim=0).unsqueeze(1)
check_values = torch.cat((train_predictions, true_labels), dim=1)
print("Performances : --- Model : {} --- --- Mean predictor : {} ---".format(model_perf, mean_perf))
print("Values : ", check_values)

NameError: ignored

In [45]:
# make predictions

mean_value = 0.9355712114933001
validation_predictions = []
model_perf = 0
mean_perf = 0
true_labels = []
with torch.no_grad():
    for batch in validation_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_ids, targets = batch
        # predict (forward pass)
        pred = model_2d(input_ids)
        validation_predictions += [pred]
        true_labels += [targets]

        mean_predictions = torch.ones_like(pred) * mean_value
        mean_predictions = mean_predictions.to(device)

        # for the model
        # get scores
        pred = pred.squeeze(1)
        validation_loss = loss_fn(pred, targets)

        # We get the mse
        pred, trg = pred.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse, __ = model_performance(pred, trg)
        model_perf += sse

        # for the mean prediction
        
        # get scores
        mean_predictions = mean_predictions.squeeze(1)
        mean_loss = loss_fn(mean_predictions, targets)

        # We get the mse
        pred_mean, trg_mean = mean_predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse_mean, __ = model_performance(pred_mean, trg_mean)
        mean_perf += sse_mean


validation_predictions = torch.cat(validation_predictions, dim=0)
true_labels = torch.cat(true_labels, dim=0).unsqueeze(1)
check_values = torch.cat((validation_predictions, true_labels), dim=1)
print("Performances : --- Model : {} --- --- Mean predictor : {} ---".format(model_perf, mean_perf))
print("Values : ", check_values)

Performances : --- Model : 2922.9600105285645 --- --- Mean predictor : 809.26771068573 ---
Values :  tensor([[0.0000, 1.0000],
        [0.0000, 0.8000],
        [0.0000, 0.6000],
        ...,
        [0.0000, 1.4000],
        [0.0000, 1.4000],
        [0.0000, 0.6000]], device='cuda:0')


In [46]:
# get the data
test_df = pd.read_csv('test.csv')
# transforms the headlines
test_df = create_engineered_features(test_df, punctuations, sex_words)
test_input_ids = test_df[columns].values

# convert to tensor
test_inputs = torch.tensor(test_input_ids)

# create 
test_data = torch.utils.data.TensorDataset(test_inputs)
test_loader = torch.utils.data.DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

# make predictions
test_predictions = []
with torch.no_grad():
    for batch in test_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_ids = batch
        # predict (forward pass)
        pred = model_2d(input_ids)
        test_predictions += [pred]

test_predictions = torch.cat(test_predictions, dim=0)
test_predictions

FileNotFoundError: ignored