In [None]:
# import libraries

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, random_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import codecs

In [None]:
# Seed PyTorch (CPU and CUDA generators) and choose the compute device.
# NOTE(review): NumPy's global RNG and Python's `random` are not seeded in this cell.

SEED = 1

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

# `device` is read as a global by train(), eval() and the prediction cells below:
# the first GPU when CUDA is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

### Define useful functions
We had to slightly modify train and eval to adapt to the forward pass of our model.

In [None]:
# We define our training loop
def train(train_iter, dev_iter, model, number_epoch):
    """
    Fit `model` for `number_epoch` epochs and report metrics after each one.

    Every batch of `train_iter` must unpack to (token ids, engineered features,
    targets). After each epoch the model is scored on `dev_iter` through `eval`
    and a single summary line (loss, MSE and RMSE for both sets) is printed.

    Relies on the notebook-level `optimizer`, `loss_fn` and `device`.
    """
    print("Training model.")

    for epoch in range(1, number_epoch+1):

        model.train()
        running_loss, running_sse = 0, 0
        seen = 0  # Observations used for training so far

        for input_sent, input_feat, target in train_iter:

            input_sent = input_sent.to(device)
            input_feat = input_feat.to(device)
            target = target.to(device)

            n_batch = target.shape[0]
            model.batch_size = n_batch
            seen += n_batch

            predictions = model(input_sent, input_feat).squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, target)

            # Squared-error sum of this batch, measured before the weight update.
            sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())

            loss.backward()
            optimizer.step()

            # loss_fn returns a batch mean, so weight it by the batch size.
            running_loss += loss.item()*n_batch
            running_sse += sse

        valid_loss, valid_mse, __, __ = eval(dev_iter, model)

        epoch_loss = running_loss / seen
        epoch_mse = running_sse / seen
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train MSE: {epoch_mse:.3f} | Train RMSE: {epoch_mse**0.5:.3f} | \
        Val. Loss: {valid_loss:.3f} | Val. MSE: {valid_mse:.3f} |  Val. RMSE: {valid_mse**0.5:.3f} |')

In [None]:
# We evaluate performance on our dev set
def eval(data_iter, model):
    """
    Score `model` on every batch of `data_iter` with gradients disabled.

    Returns a 4-tuple: the observation-weighted mean loss, the mean squared
    error, and the predictions and targets of all batches as NumPy arrays.

    Relies on the notebook-level `loss_fn` and `device`. Note that this name
    shadows Python's built-in `eval`.
    """
    model.eval()
    total_loss, total_sse = 0, 0
    seen = 0
    collected_preds, collected_targets = [], []

    with torch.no_grad():
        for input_sent, input_feat, target in data_iter:
            input_sent = input_sent.to(device)
            input_feat = input_feat.to(device)
            target = target.to(device)

            n_batch = target.shape[0]
            model.batch_size = n_batch
            seen += n_batch

            predictions = model(input_sent, input_feat).squeeze(1)
            loss = loss_fn(predictions, target)

            # Move to NumPy to accumulate the squared error of the batch.
            pred = predictions.detach().cpu().numpy()
            trg = target.detach().cpu().numpy()
            sse, __ = model_performance(pred, trg)

            # loss_fn returns a batch mean, so weight it by the batch size.
            total_loss += loss.item()*n_batch
            total_sse += sse
            collected_preds.extend(pred)
            collected_targets.extend(trg)

    return total_loss/seen, total_sse/seen, np.array(collected_preds), np.array(collected_targets)

In [None]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Compute squared-error statistics between `output` and `target`.

    Returns the pair (SSE, MSE). When `print_output` is true, the MSE and the
    RMSE are also printed, rounded to two decimals.
    """
    squared_diff = (output - target)**2

    sse, mse = np.sum(squared_diff), np.mean(squared_diff)
    rmse = np.sqrt(mse)

    if print_output:
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

In [None]:
def create_vocab(data):
    """
    Tokenize sentences on single spaces and collect the distinct tokens.

    Args:
        data: iterable of strings, one sentence each.

    Returns:
        (vocabulary, tokenized_corpus) where `tokenized_corpus` is a list with
        one token list per sentence and `vocabulary` lists every distinct
        token once, in order of first appearance.
    """
    # Splitting on ' ' (not on arbitrary whitespace) means consecutive spaces
    # produce empty-string tokens.
    tokenized_corpus = [sentence.split(' ') for sentence in data]

    # dict keys keep insertion order, so this de-duplicates in linear time
    # while keeping first-appearance order (a list membership test per token
    # would be quadratic in the vocabulary size).
    vocabulary = list(dict.fromkeys(
        token for sentence in tokenized_corpus for token in sentence
    ))

    return vocabulary, tokenized_corpus

# Approach 2

In [None]:
# Load the training and development splits from CSV files in the working directory.

train_df = pd.read_csv('train.csv')
dev_df = pd.read_csv('dev.csv')

## a. TF-IDF code (given) and baseline

In [None]:
# TF-IDF + linear-regression baseline, evaluated on a hold-out taken from train.csv.
# Note: `dev_df` is not used in this cell; "dev" below means the 20% hold-out of train_df.
train_proportion = 0.8
# Despite its name this is the 'edit' column of the full train_df only.
train_and_dev = train_df['edit']

training_data, dev_data, training_y, dev_y = train_test_split(train_df['edit'], train_df['meanGrade'],
                                                                        test_size=(1-train_proportion),
                                                                        random_state=42)

# We train a Tf-idf model: vocabulary and IDF weights are fitted on the training split.
count_vect = CountVectorizer(stop_words='english')
train_counts = count_vect.fit_transform(training_data)
transformer = TfidfTransformer().fit(train_counts)
train_counts = transformer.transform(train_counts)
regression_model = LinearRegression().fit(train_counts, training_y)

# Train predictions
predicted_train = regression_model.predict(train_counts)

# Calculate Tf-idf using train and dev, and validate model on dev:
# NOTE(review): `transformer` is re-fitted here on counts of the whole train_df, so the
# hold-out features are weighted with different IDF values than the ones
# `regression_model` was trained on, and the hold-out rows take part in fitting the
# IDF weights. Confirm this is intended before comparing against other models.
test_and_test_counts = count_vect.transform(train_and_dev)
transformer = TfidfTransformer().fit(test_and_test_counts)

test_counts = count_vect.transform(dev_data)

test_counts = transformer.transform(test_counts)

# Dev predictions
predicted = regression_model.predict(test_counts)

# We run the evaluation (model_performance prints MSE and RMSE when its third argument is True):
print("\nTrain performance:")
sse, mse = model_performance(predicted_train, training_y, True)

print("\nDev performance:")
sse, mse = model_performance(predicted, dev_y, True)


Train performance:
| MSE: 0.13 | RMSE: 0.37 |

Dev performance:
| MSE: 0.36 | RMSE: 0.60 |


In [None]:
# Baseline for the task: predict the mean training grade for every hold-out row.
# `dev_y` and `training_y` come from the train_test_split in the TF-IDF cell.

# A torch tensor filled with the training mean, one entry per hold-out row.
pred_baseline = torch.zeros(len(dev_y)) + np.mean(training_y)
print("\nBaseline performance:")
sse, mse = model_performance(pred_baseline, dev_y, True)


Baseline performance:
| MSE: 0.34 | RMSE: 0.58 |


## b. BERT-like model
First attempt at training a BERT-like transformer model. We thought we would create a model from scratch using the given libraries, train it on a smaller but dedicated dataset, and implement it in place of BERT. We went for a RoBERTa-like model. All elements of the model are identified with the suffix "_2b"

Largely inspired by https://towardsdatascience.com/transformers-retraining-roberta-base-using-the-roberta-mlm-procedure-7422160d5764 

We ultimately ran into issues and could not achieve what we wanted. We therefore set the boolean `attempt_bert` to `False`, so these lines of code do not run, but you can still see what we tried.

In [1]:
# Switch for the RoBERTa experiment: every cell below guarded by `if attempt_bert:`
# is skipped while this is False.
attempt_bert = False
# The triple-quoted block below is a bare string expression, so it is never executed.
# It keeps the Colab commands that mounted Google Drive and created the output folder.
"""
from google.colab import drive
drive.mount("/content/drive")

!mkdir "/content/drive/My Drive/abc/aBERTc2"
"""
print(" ")

 


In [2]:
# extract txt from original csv dataset
# commented out as only needed the first time

# import pandas as pd
# abcnews = pd.read_csv('/content/drive/My Drive/abc/abcnews.csv')

# with open("/content/drive/My Drive/abc/abcheads.txt", "a") as file:
#   for line in abcnews['headline_text']:
#     line += "\n"
#     file.write(line)

In [3]:
# import premade tokenizer (byte-level BPE)
# The transformers imports live inside the guard, so the library is only needed
# when attempt_bert is True.
if attempt_bert:
    from transformers import RobertaTokenizer
    from transformers import LineByLineTextDataset
    from transformers import DataCollatorForLanguageModeling
    from transformers import RobertaConfig, RobertaForMaskedLM
    from transformers import Trainer, TrainingArguments
    from transformers import pipeline

    # Reuse the tokenizer shipped with the pretrained 'roberta-base' checkpoint.
    tokenizer_2b = RobertaTokenizer.from_pretrained('roberta-base')

In [4]:
# prepare txt dataset with tokenizer
# Reads the headlines text file (one headline per line, written by the commented-out
# extraction cell above) and tokenizes it with tokenizer_2b.
if attempt_bert:
    dataset_2b = LineByLineTextDataset(
        tokenizer=tokenizer_2b,
        file_path="/content/drive/My Drive/abc/abcheads.txt",
        block_size=128,
    )

In [5]:
# prepare the data collator for masked-language-model training
# mlm=True with mlm_probability=0.15 presumably masks about 15% of the tokens of
# each batch at collation time -- confirm against the transformers documentation.
if attempt_bert:
    data_collator_2b = DataCollatorForLanguageModeling(
        tokenizer=tokenizer_2b, mlm=True, mlm_probability=0.15
    )

In [6]:
# create Roberta model from scratch: build a config, then a randomly initialised model
# NOTE(review): vocab_size=52_000 is set by hand while the tokenizer comes from
# 'roberta-base'; confirm that it is at least as large as the tokenizer vocabulary.
if attempt_bert:
    config_2b = RobertaConfig(
        vocab_size=52_000,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )

    # Built from the config only, i.e. no pretrained weights are loaded here.
    model_2b = RobertaForMaskedLM(config=config_2b)

In [7]:
# set up training configurations
# Checkpoints are written to the Google Drive folder created in the mount cell.
if attempt_bert:
    training_args_2b = TrainingArguments(
        output_dir="/content/drive/My Drive/abc/aBERTc2",
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        save_steps=10_000,
        save_total_limit=2,
        seed=1
    )

    # Bundle model, arguments, collator and dataset; training starts in a later cell.
    trainer_2b = Trainer(
        model=model_2b,
        args=training_args_2b,
        data_collator=data_collator_2b,
        train_dataset=dataset_2b
    )

In [8]:
# verify model parameters ~ 84M
# NOTE(review): the value is not displayed, because the expression sits inside an
# `if` block and is therefore not the cell's last top-level expression.
if attempt_bert:
    model_2b.num_parameters()

In [9]:
# train and save trained model
# The model is saved to the same Drive folder that is used as the Trainer output_dir.
if attempt_bert:
    trainer_2b.train()
    trainer_2b.save_model("/content/drive/My Drive/abc/aBERTc2")

In [10]:
# sanity check on model performance: fill a masked token with the saved model
# NOTE(review): the result of fill_mask(...) is not displayed, because the call sits
# inside an `if` block; wrap it in print()/display() to see the predictions.
if attempt_bert:
    fill_mask = pipeline(
        "fill-mask",
        model="/content/drive/My Drive/abc/aBERTc2",
        tokenizer="roberta-base"
    )
    fill_mask("Send these <mask> back!")

At this point, we wanted to reuse what we did in our earlier approach by replacing the previously used pretrained BERT with this RoBERTa model, but we ran into an issue that we could not fix.

## c. Our final choice for approach 2 : FFN with word embeddings and engineered features
As a second attempt, we tried to combine a simple embedding with engineered features. We use all the words from the original headline plus the edited word, stripped of its punctuation, and pass them to an embedding layer and then to a linear layer with a ReLU activation function. We also extract engineered features from the headlines and pass them through two layers. Both results are then concatenated into a single array, which is passed through two more layers. We added dropout at every step to avoid overfitting.

In [None]:
# Reload the raw CSV files so that this section starts from unmodified frames
# (create_engineered_features later adds columns to these frames in place).

train_df = pd.read_csv('train.csv')
dev_df = pd.read_csv('dev.csv')

#### Define useful functions

In [None]:
# recreate tok_corpus, word2idx and related functions, with punctuation removal

import re
# Character class matching whitespace and a few punctuation marks. Written as a
# raw string so that `\s` is not an invalid string escape (same value as before).
# Not referenced by the helpers defined in this cell, which split on single spaces.
re_punctuation_string = r"[\s,/.'<>]"

def get_tokenized_corpus(corpus):
  """
  Split every sentence of `corpus` on single spaces.

  Returns a list holding one token list per sentence, in input order.
  Consecutive spaces yield empty-string tokens.
  """
  return [list(re.split(' ', sentence)) for sentence in corpus]


def get_word2idx(tokenized_corpus):
  """
  Map every distinct token to a positive integer index.

  Args:
    tokenized_corpus: iterable of token lists.

  Returns:
    dict from token to index. Indices start at 1 and follow the order of first
    appearance; index 0 is reserved for the '<pad>' entry.
  """
  word2idx = {}
  for sentence in tokenized_corpus:
    for token in sentence:
      # Dict membership is O(1); the next free index is the current size + 1
      # because index 0 is kept for padding.
      if token not in word2idx:
        word2idx[token] = len(word2idx) + 1

  # Set last so that padding always maps to 0, even if '<pad>' occurs as a token.
  word2idx['<pad>'] = 0

  return word2idx


def get_model_inputs(tokenized_corpus, word2idx):
  """
  Convert tokenized sentences into one zero-padded tensor of token indices.

  Args:
    tokenized_corpus: list of token lists.
    word2idx: dict from token to index; index 0 is used as padding.

  Returns:
    LongTensor with one row per sentence and as many columns as the longest
    sentence after dropping tokens missing from `word2idx`. An empty corpus
    gives a tensor of shape (0, 0).
  """
  # Tokens absent from the vocabulary are silently dropped.
  vectorized_sents = [[word2idx[tok] for tok in sent if tok in word2idx] for sent in tokenized_corpus]
  sent_lengths = [len(sent) for sent in vectorized_sents]
  # default=0 keeps an empty corpus from raising ValueError in max().
  max_len = max(sent_lengths, default=0)
  sent_tensor = torch.zeros((len(vectorized_sents), max_len), dtype=torch.long)
  for idx, (sent, sentlen) in enumerate(zip(vectorized_sents, sent_lengths)):
    if sentlen:
      sent_tensor[idx, :sentlen] = torch.tensor(sent, dtype=torch.long)
  return sent_tensor

In [None]:
# Characters counted as punctuation by create_engineered_features. Raw string so
# that the backslash before the comma is a literal backslash instead of an
# invalid escape sequence (the value is unchanged).
punctuations = r'''!()-[]{};:‘’"\,.?@#$%^&*_~'''
# sorry for the words...
# Lower-case words compared for exact equality with the lower-cased edit word.
sex_words = ["orgy", "dick", "suck", "underwear", "sex", "sexual", "massage", "flirt", "kiss", "ass", "penis"]

def create_engineered_features(input_df, punctuations, sex_words):
    """
    Add two lower-cased text columns and eleven numeric features to `input_df`.

    Args:
        input_df: DataFrame with string columns "original" and "edit".
        punctuations: string whose characters are counted as punctuation.
        sex_words: collection of lower-case words matched against the edit word.

    Returns:
        The same DataFrame object, modified in place, with these columns added:
        lower_case, edit_lower, nb_words, nb_caracters, nb_ponctuations,
        edit_position, edit_rel_position, len_edit, trump_in_original,
        trump_in_edit, hair_in_original, hair_in_edit, sex_in_edit.

    Uses vectorised string operations instead of row-wise `apply`. An empty
    headline gives a non-finite `edit_rel_position` rather than an exception.
    """
    original_lower = input_df["original"].str.lower()
    edit_lower = input_df["edit"].str.lower()
    input_df["lower_case"] = original_lower
    input_df["edit_lower"] = edit_lower

    # Size of the headline in space-separated words and in characters.
    input_df["nb_words"] = original_lower.str.split(" ").str.len()
    input_df["nb_caracters"] = original_lower.str.len()
    input_df["nb_ponctuations"] = original_lower.map(
        lambda text: sum(1 for char in text if char in punctuations)
    )

    # Index of the first "<" in the headline (-1 when there is none), absolute
    # and relative to the headline length.
    input_df["edit_position"] = original_lower.str.find("<")
    input_df["edit_rel_position"] = input_df["edit_position"] / input_df["nb_caracters"]
    input_df["len_edit"] = edit_lower.str.len()

    # 1.0 / 0.0 flags for keyword substrings in the headline and in the edit.
    for keyword in ("trump", "hair"):
        input_df[keyword + "_in_original"] = original_lower.str.contains(keyword, regex=False).astype(float)
        input_df[keyword + "_in_edit"] = edit_lower.str.contains(keyword, regex=False).astype(float)

    # Exact match of the whole edit word against the word list.
    input_df["sex_in_edit"] = edit_lower.isin(sex_words).astype(float)

    return input_df

#### Prepare the training set

##### Prepare the headline to be embedded

In [None]:
# prepare for the embedding part
# Each training text is the original headline followed by a space and the edit word.
tokenized_corpus = get_tokenized_corpus(train_df["original"].str.cat(train_df["edit"], sep=" ").tolist())
# The vocabulary is built from the training texts only; index 0 is the padding entry.
word2idx = get_word2idx(tokenized_corpus)
# Zero-padded LongTensor of token indices, one row per training example.
train_sentence = get_model_inputs(tokenized_corpus, word2idx)

In [None]:
# Names of the eleven engineered feature columns fed to the model, in input order.
# They are created by create_engineered_features; FEATURES_DIM must equal len(columns).
columns = [
    "nb_words", "nb_caracters", "nb_ponctuations", "edit_position",
    "edit_rel_position", "len_edit", "trump_in_original", "trump_in_edit",
    "hair_in_original", "hair_in_edit", "sex_in_edit"
]

##### Create and scale the engineered features

In [None]:
# prepare the engineered features
# create_engineered_features adds its columns to train_df in place and returns it.
train_df = create_engineered_features(train_df, punctuations, sex_words)
    
train_features = train_df[columns].values
train_labels = train_df.meanGrade.values

# The scaler is fitted on the training features only and reused unchanged for the
# dev and test features in later cells.
scaler = StandardScaler()
scaler.fit(train_features)
train_features = scaler.transform(train_features)

#### Prepare the validation set in the same way

In [None]:
# prepare validation dataset
# Tokens that are not in the training vocabulary (word2idx) are dropped by get_model_inputs.
dev_tokenized_corpus = get_tokenized_corpus(dev_df["original"].str.cat(dev_df["edit"], sep=" ").tolist())
dev_sentence = get_model_inputs(dev_tokenized_corpus, word2idx)

dev_df = create_engineered_features(dev_df, punctuations, sex_words)
dev_features = dev_df[columns].values
dev_labels = dev_df['meanGrade'].values

# Transform only: the scaler keeps the statistics fitted on the training features.
dev_features = scaler.transform(dev_features)

#### Create the dataloaders

In [None]:
# create dataloaders
# NOTE(review): this cell rebinds train_features / dev_features / the label arrays
# from NumPy arrays to tensors, so re-running it alone calls torch.tensor() on tensors.

BATCH_SIZE = 32

train_features = torch.tensor(train_features, dtype=torch.float32)
train_labels = torch.tensor(train_labels, dtype=torch.float32)

dev_features = torch.tensor(dev_features, dtype=torch.float32)
dev_labels = torch.tensor(dev_labels, dtype=torch.float32)

# Every dataset item is (token indices, engineered features, target grade).
train_data = torch.utils.data.TensorDataset(train_sentence, train_features, train_labels)
dev_data = torch.utils.data.TensorDataset(dev_sentence, dev_features, dev_labels)

# Only the training loader shuffles; the validation order stays fixed.
train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
validation_loader = torch.utils.data.DataLoader(dev_data, shuffle=False, batch_size=BATCH_SIZE)

print("Dataloaders created.")

Dataloaders created.


### Define the model described

In [None]:
# construct network model

class FFNN_2c(nn.Module):
    """
    Feed-forward regressor combining averaged word embeddings with engineered features.

    The token branch averages the embeddings of the non-padding tokens and sends
    them through one hidden layer. The feature branch sends the engineered
    features through two hidden layers. Both outputs are concatenated, passed
    through another hidden layer and projected to one value, capped at 3.0.

    Args:
        embedding_dim: size of each word embedding.
        hidden_dim1: width of the token-branch layer and of the joint hidden layer.
        hidden_dim2: width of the two feature-branch layers.
        vocab_size: number of embedding rows; index 0 is the padding index.
        features_dim: number of engineered features per example.
        dropout: dropout probability applied after each branch and after the
            joint hidden layer (default 0.6).
    """

    def __init__(self, embedding_dim, hidden_dim1, hidden_dim2, vocab_size, features_dim, dropout=0.6):
        super(FFNN_2c, self).__init__()

        # padding_idx argument makes sure that the 0-th token in the vocabulary
        # is used for padding purposes i.e. its embedding will be a 0-vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Token branch: averaged embedding -> hidden_dim1.
        self.fc_eh1 = nn.Linear(embedding_dim, hidden_dim1)

        # Feature branch: features -> hidden_dim2 -> hidden_dim2.
        self.fc_fh2 = nn.Linear(features_dim, hidden_dim2)
        self.fc_h2h2 = nn.Linear(hidden_dim2, hidden_dim2)

        # Joint head: concatenated branches -> hidden_dim1 -> 1.
        self.fc_hh = nn.Linear(hidden_dim1 + hidden_dim2, hidden_dim1)
        self.fc_ho = nn.Linear(hidden_dim1, 1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x_sent, x_feat):
        """
        Compute predictions of shape (batch_size, 1).

        Args:
            x_sent: integer tensor of shape (batch_size, max_sent_len), 0 = padding.
            x_feat: float tensor of shape (batch_size, features_dim).
        """
        embedded = self.embedding(x_sent)
        # Number of real tokens per row. Clamp to at least 1 so that a row made
        # only of padding (e.g. every token out of vocabulary) averages to a
        # zero vector instead of 0/0 = NaN.
        sent_lens = x_sent.ne(0).sum(1, keepdim=True).clamp(min=1)
        # Padding embeddings are zero vectors, so sum / length is the mean over real tokens.
        averaged = embedded.sum(1) / sent_lens

        # pass the embedding through a layer
        out_emb = torch.relu(self.fc_eh1(averaged))
        out_emb = self.dropout(out_emb)

        # pass the engineered features through two layers
        out_feat = torch.relu(self.fc_fh2(x_feat))
        out_feat = torch.relu(self.fc_h2h2(out_feat))
        out_feat = self.dropout(out_feat)

        # concatenate both branches along the feature dimension
        out = torch.cat((out_emb, out_feat), dim=1)
        out = torch.relu(self.fc_hh(out))
        out = self.dropout(out)

        # prepare the regression output; only the upper bound (3.0) is enforced
        out = self.fc_ho(out)
        out = torch.clamp(out, max=3.0)
        return out

### Define the hyper-parameters, the model, the optimizer and the loss

In [None]:
# Hyper-parameters, model instance, optimizer and loss for approach 2c.
# `optimizer` and `loss_fn` are notebook-level names that train() and eval() read directly.

EPOCHS = 15
LRATE = 0.0004 # 0.001
EMBEDDING_DIM = 10
# Must equal len(columns), the number of engineered features per example.
FEATURES_DIM = 11
HIDDEN_DIM1 = 40
HIDDEN_DIM2 = 6
# NOTE(review): OUTPUT_DIM is not passed to FFNN_2c, which hard-codes a single output.
OUTPUT_DIM = 1

# The vocabulary size is len(word2idx), which already includes the '<pad>' entry.
model_2c = FFNN_2c(EMBEDDING_DIM, HIDDEN_DIM1, HIDDEN_DIM2, len(word2idx), FEATURES_DIM)

model_2c.to(device)
model_2c.train()

optimizer = optim.Adam(model_2c.parameters(), lr=LRATE)


loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

#### Train the model

In [None]:
# Train model_2c for EPOCHS epochs; train() prints training and validation metrics per epoch.
train(train_loader, validation_loader, model_2c, EPOCHS)

Training model.
| Epoch: 01 | Train Loss: 0.576 | Train MSE: 0.576 | Train RMSE: 0.759 |         Val. Loss: 0.353 | Val. MSE: 0.353 |  Val. RMSE: 0.594 |
| Epoch: 02 | Train Loss: 0.434 | Train MSE: 0.434 | Train RMSE: 0.658 |         Val. Loss: 0.339 | Val. MSE: 0.339 |  Val. RMSE: 0.583 |
| Epoch: 03 | Train Loss: 0.410 | Train MSE: 0.410 | Train RMSE: 0.640 |         Val. Loss: 0.340 | Val. MSE: 0.340 |  Val. RMSE: 0.583 |
| Epoch: 04 | Train Loss: 0.396 | Train MSE: 0.396 | Train RMSE: 0.629 |         Val. Loss: 0.336 | Val. MSE: 0.336 |  Val. RMSE: 0.580 |
| Epoch: 05 | Train Loss: 0.381 | Train MSE: 0.381 | Train RMSE: 0.617 |         Val. Loss: 0.335 | Val. MSE: 0.335 |  Val. RMSE: 0.579 |
| Epoch: 06 | Train Loss: 0.374 | Train MSE: 0.374 | Train RMSE: 0.611 |         Val. Loss: 0.331 | Val. MSE: 0.331 |  Val. RMSE: 0.575 |
| Epoch: 07 | Train Loss: 0.365 | Train MSE: 0.365 | Train RMSE: 0.604 |         Val. Loss: 0.334 | Val. MSE: 0.334 |  Val. RMSE: 0.578 |
| Epoch: 08 | Trai

#### Verify performance of the final model on the training set (compared to mean baseline)

In [None]:
# Compare the trained model with a constant predictor on the training set, using
# the sum of squared errors accumulated over all batches.
# NOTE(review): presumably the mean training grade -- hard-coded; verify against
# the mean of the training labels.
mean_value = 0.9355712114933001

# make predictions
# NOTE(review): the model mode is not set here. The last call inside train() is
# eval(), which leaves the model in eval mode; calling model_2c.eval() explicitly
# would make this cell independent of that.
train_predictions = []
model_perf = 0  # running SSE of the model
mean_perf = 0   # running SSE of the constant predictor
true_labels = []
with torch.no_grad():
    for batch in train_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_sent, input_feat, targets = batch
        # predict (forward pass); pred has shape (batch_size, 1)
        pred = model_2c(input_sent, input_feat)
        train_predictions += [pred]
        true_labels += [targets]

        # Constant prediction with the same shape, dtype and device as pred,
        # so the .to(device) below does not move anything.
        mean_predictions = torch.ones_like(pred) * mean_value
        mean_predictions = mean_predictions.to(device)

        # for the model
        # get scores (train_loss is not used further in this cell)
        pred = pred.squeeze(1)
        train_loss = loss_fn(pred, targets)

        # We get the SSE of the batch
        pred, trg = pred.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse, __ = model_performance(pred, trg)
        model_perf += sse

        # for the mean prediction
        
        # get scores (mean_loss is not used further in this cell)
        mean_predictions = mean_predictions.squeeze(1)
        mean_loss = loss_fn(mean_predictions, targets)

        # We get the SSE of the batch
        pred_mean, trg_mean = mean_predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse_mean, __ = model_performance(pred_mean, trg_mean)
        mean_perf += sse_mean


# Stack predictions and labels side by side: column 0 = prediction, column 1 = label.
train_predictions = torch.cat(train_predictions, dim=0)
true_labels = torch.cat(true_labels, dim=0).unsqueeze(1)
check_values = torch.cat((train_predictions, true_labels), dim=1)
print("Performances : --- Model : {} --- --- Mean predictor : {} ---".format(model_perf, mean_perf))
print("Values : ", check_values)

Performances : --- Model : 3080.504093170166 --- --- Mean predictor : 3287.509463787079 ---
Values :  tensor([[0.9391, 1.0000],
        [0.7741, 0.2000],
        [0.9754, 1.2000],
        ...,
        [0.9690, 1.4000],
        [0.9749, 1.4000],
        [0.8969, 0.2000]], device='cuda:0')


#### Verify performance of the final model on the validation set (compared to mean baseline)

In [None]:
# make predictions
# Same comparison as for the training set, run on the validation loader: the
# model against a constant predictor, measured by the summed squared error.

# NOTE(review): presumably the mean training grade -- hard-coded; verify against
# the mean of the training labels.
mean_value = 0.9355712114933001
validation_predictions = []
model_perf = 0  # running SSE of the model
mean_perf = 0   # running SSE of the constant predictor
true_labels = []
with torch.no_grad():
    for batch in validation_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_sent, input_feat, targets = batch
        # predict (forward pass); pred has shape (batch_size, 1)
        pred = model_2c(input_sent, input_feat)
        validation_predictions += [pred]
        true_labels += [targets]

        # Constant prediction with the same shape, dtype and device as pred,
        # so the .to(device) below does not move anything.
        mean_predictions = torch.ones_like(pred) * mean_value
        mean_predictions = mean_predictions.to(device)

        # for the model
        # get scores (validation_loss is not used further in this cell)
        pred = pred.squeeze(1)
        validation_loss = loss_fn(pred, targets)

        # We get the SSE of the batch
        pred, trg = pred.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse, __ = model_performance(pred, trg)
        model_perf += sse

        # for the mean prediction
        
        # get scores (mean_loss is not used further in this cell)
        mean_predictions = mean_predictions.squeeze(1)
        mean_loss = loss_fn(mean_predictions, targets)

        # We get the SSE of the batch
        pred_mean, trg_mean = mean_predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()
        sse_mean, __ = model_performance(pred_mean, trg_mean)
        mean_perf += sse_mean


# Stack predictions and labels side by side: column 0 = prediction, column 1 = label.
validation_predictions = torch.cat(validation_predictions, dim=0)
true_labels = torch.cat(true_labels, dim=0).unsqueeze(1)
check_values = torch.cat((validation_predictions, true_labels), dim=1)
print("Performances : --- Model : {} --- --- Mean predictor : {} ---".format(model_perf, mean_perf))
print("Values : ", check_values)

Performances : --- Model : 796.3753771781921 --- --- Mean predictor : 809.26771068573 ---
Values :  tensor([[0.8796, 1.0000],
        [1.2529, 0.8000],
        [0.9884, 0.6000],
        ...,
        [0.9092, 1.4000],
        [0.8978, 1.4000],
        [1.1957, 0.6000]], device='cuda:0')


### Launch prediction on the test set

In [None]:
# get the data
test_df = pd.read_csv('test.csv')

# prepare input for embedding: original headline + " " + edit word, mapped with the
# training vocabulary (tokens missing from word2idx are dropped)
test_tokenized_corpus = get_tokenized_corpus(test_df["original"].str.cat(test_df["edit"], sep=" ").tolist())
test_sentence = get_model_inputs(test_tokenized_corpus, word2idx)

# transforms the headlines into the engineered features
test_df = create_engineered_features(test_df, punctuations, sex_words)
test_features = test_df[columns].values

# Reuse the scaler fitted on the training features.
test_features = scaler.transform(test_features)

# No labels here: every dataset item is (token indices, engineered features).
test_features = torch.tensor(test_features, dtype=torch.float32)
test_data = torch.utils.data.TensorDataset(test_sentence, test_features)
test_loader = torch.utils.data.DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)


# make predictions
# NOTE(review): the model mode is not set in this cell; it relies on the model
# still being in eval mode (dropout disabled) from the last eval() call in train().
test_predictions = []
with torch.no_grad():
    for batch in test_loader:
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # unzip
        input_sent, input_feat = batch
        # predict (forward pass)
        pred = model_2c(input_sent, input_feat)
        test_predictions += [pred]

# One tensor of shape (number of test rows, 1); the bare last expression displays it.
test_predictions = torch.cat(test_predictions, dim=0)
test_predictions

tensor([[0.8626],
        [1.0501],
        [0.8488],
        ...,
        [0.9714],
        [0.9008],
        [0.9115]], device='cuda:0')