In [1]:
# Imports
import pandas as pd
import numpy as np
import os

# Preprocessing

In [2]:
# Load data
# `split` selects which test CSV is read below. `team_id` and `split` also appear
# in the prediction file name built in the (currently commented-out) test cell.

team_id = '20' #put your team id here
split = 'test_1' # replace by 'test_2' for FINAL submission

# Training tweets; later cells read the 'text' and 'score_compound' columns.
df = pd.read_csv('dataset/tweets_train.csv')
# Test tweets for the selected split; a later cell reads its 'text' column.
df_test = pd.read_csv(f'dataset/tweets_{split}.csv')

In [3]:
# df['words_str'] = df['words'].apply(lambda words: ' '.join(eval(words)))
# df_test['words_str'] = df_test['words'].apply(lambda words: ' '.join(eval(words)))

In [4]:
# Necessary for RoBERTa-based models
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. Every token that starts with '@' and
    is longer than one character becomes '@user'; every token that starts
    with 'http' becomes 'http'. All other tokens, including empty ones
    produced by repeated spaces, are kept, so the spacing of the input is
    preserved.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The tweet with mentions and links replaced by placeholders.
    """
    def _mask(token):
        # A lone '@' is not a mention and is left untouched.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [5]:
# Mask mentions/links in the raw 'text' column and store the result in a new
# 'words_str' column; the train/validation split below reads df['words_str'].
df['words_str'] = df['text'].apply(preprocess)
df_test['words_str'] = df_test['text'].apply(preprocess)

# Regression

In [6]:
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, TrainingArguments, Trainer
from transformers import RobertaTokenizer, RobertaPreTrainedModel, RobertaModel, AutoTokenizer, AutoModel, PreTrainedModel
from transformers import TrainerCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch
import numpy as np
import torch.nn.functional as F
from torch import optim
from transformers import get_linear_schedule_with_warmup

2023-08-15 15:26:18.353168: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Run on a CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Split data into training and validation sets
# 70% train / 30% validation; random_state is fixed so the split is the same on every run.
# Inputs are the preprocessed tweets, targets are the 'score_compound' values.
train_texts, val_texts, train_labels, val_labels = train_test_split(df['words_str'], df['score_compound'], test_size=0.3, random_state=42)

In [9]:
# Load one tokenizer per candidate backbone.
# Nothing is tokenized here; the next cell picks a single tokenizer
# (`tokenizer_twitter_sentiment`) and applies it to the texts.
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer_twitter = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base')
tokenizer_twitter_sentiment = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
tokenizer_mpnet = AutoTokenizer.from_pretrained('sentence-transformers/stsb-mpnet-base-v2')
tokenizer_bert_twitter = AutoTokenizer.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')

In [10]:
# Select the tokenizer that matches the backbone trained below
# (RobertaRegressionTwitter_2 loads the same '...-sentiment-latest' checkpoint).
tokenizer = tokenizer_twitter_sentiment
# Train and validation texts are tokenized in two separate calls with padding
# and truncation enabled.
# NOTE(review): padding=True presumably pads to the longest sequence of each
# call, so the two encodings may have different padded lengths -- confirm.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

In [11]:
# Create a custom dataset
class RegressionDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with float regression targets.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a per-example
            indexable collection, as produced by calling a tokenizer on a list of texts.
        labels: pandas Series of targets; stored as a float32 NumPy array.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Series -> NumPy array -> float32, so every label tensor is float32.
        self.labels = labels.values.astype('float32')

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the target under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


class BertRegression(BertPreTrainedModel):
    """BERT encoder with a dropout + single-output linear head for scalar regression.

    The training loss is the root of the batch mean-squared error (RMSE).
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional regression targets, one per sequence.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``.
            ``logits`` always keeps the batch dimension (one value per sequence).
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Second element of the encoder output is used as the sentence representation.
        pooled_output = self.dropout(outputs[1])
        logits = self.regressor(pooled_output)
        # Drop only the trailing size-1 feature dimension. A bare squeeze() would
        # also remove the batch dimension when the batch holds a single example,
        # producing a 0-d tensor that no longer lines up with `labels` and cannot
        # be concatenated with other batches' predictions.
        logits = logits.squeeze(-1)
        loss = None
        if labels is not None:
            loss = torch.sqrt(nn.MSELoss()(logits, labels))
        return (loss, logits) if loss is not None else logits

class RobertaRegression(RobertaPreTrainedModel):
    """RoBERTa encoder with a dropout + single-output linear head for scalar regression.

    The training loss is the root of the batch mean-squared error (RMSE).
    When loaded from 'roberta-base' the load warning in this notebook reports the
    pooler weights as newly initialised, so the pooled representation is learned
    during fine-tuning together with the head.
    """

    def __init__(self, config):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional regression targets, one per sequence.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``.
            ``logits`` always keeps the batch dimension (one value per sequence).
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Second element of the encoder output is used as the sentence representation.
        pooled_output = self.dropout(outputs[1])
        logits = self.regressor(pooled_output)
        # Drop only the trailing size-1 feature dimension; a bare squeeze() would
        # also collapse the batch dimension for a batch of one example.
        logits = logits.squeeze(-1)
        loss = None
        if labels is not None:
            loss = torch.sqrt(nn.MSELoss()(logits, labels))
        return (loss, logits) if loss is not None else logits
    
class RobertaRegressionTwitter(nn.Module):
    """Twitter-RoBERTa sentiment encoder with a linear regression head (RMSE loss).

    The encoder is loaded from 'cardiffnlp/twitter-roberta-base-sentiment' via
    ``AutoModel``; the head is dropout(0.1) followed by a single-output linear layer.
    """

    def __init__(self):
        super(RobertaRegressionTwitter, self).__init__()
        self.roberta = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
        self.dropout = nn.Dropout(0.1)
        self.regressor = nn.Linear(self.roberta.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional regression targets, one per sequence.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given, otherwise
            ``{"logits": ...}``. ``logits`` always keeps the batch dimension.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Second element of the encoder output is used as the sentence representation.
        pooled_output = self.dropout(outputs[1])
        logits = self.regressor(pooled_output)
        # Drop only the trailing size-1 feature dimension; a bare squeeze() would
        # also collapse the batch dimension for a batch of one example.
        logits = logits.squeeze(-1)
        loss = None
        if labels is not None:
            loss = torch.sqrt(nn.MSELoss()(logits, labels))
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
    
class BertRegressionTwitter(nn.Module):
    """BERTweet sentiment encoder with a linear regression head (RMSE loss).

    The encoder is loaded from 'finiteautomata/bertweet-base-sentiment-analysis'
    via ``AutoModel``. The load warning in this notebook reports its pooler
    weights as newly initialised, so the pooled representation is learned during
    fine-tuning together with the head.
    """

    def __init__(self):
        super(BertRegressionTwitter, self).__init__()
        self.bert = AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')
        self.dropout = nn.Dropout(0.1)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional regression targets, one per sequence.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given, otherwise
            ``{"logits": ...}``. ``logits`` always keeps the batch dimension.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Second element of the encoder output is used as the sentence representation.
        pooled_output = self.dropout(outputs[1])
        logits = self.regressor(pooled_output)
        # Drop only the trailing size-1 feature dimension; a bare squeeze() would
        # also collapse the batch dimension for a batch of one example.
        logits = logits.squeeze(-1)
        loss = None
        if labels is not None:
            loss = torch.sqrt(nn.MSELoss()(logits, labels))
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

# Define a function to compute RMSE
def compute_rmse(eval_pred):
    """Metric function for the Trainer: root-mean-squared error.

    Args:
        eval_pred: Pair ``(predictions, labels)``.

    Returns:
        ``{'rmse': value}``; the trainer callbacks in this notebook read this
        metric back under the key 'eval_rmse'.
    """
    predicted, targets = eval_pred
    mse = mean_squared_error(targets, predicted)
    return {'rmse': np.sqrt(mse)}


class BertRegressionTwitter_2(nn.Module):
    """BERTweet encoder with a one-hidden-layer MLP head trained with Huber loss.

    Head: dropout(0.1) -> Linear(hidden, hidden // 2) -> ReLU -> LayerNorm
    -> Linear(hidden // 2, 1).

    Args:
        delta: ``delta`` of ``nn.HuberLoss`` (where the loss switches from
            quadratic to linear).
    """

    def __init__(self, delta=1.0):
        super(BertRegressionTwitter_2, self).__init__()
        self.bert = AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size

        # Additional hidden layer that halves the feature width.
        self.hidden_layer = nn.Linear(hidden_size, hidden_size//2)

        # Layer normalisation of the hidden features. Despite the attribute name
        # this is not L2 regularisation / weight decay (the name is kept so that
        # existing state_dict keys stay valid).
        self.regularization = nn.LayerNorm(hidden_size//2)

        # Final regression layer
        self.regressor = nn.Linear(hidden_size//2, 1)
        self.huber_loss = nn.HuberLoss(delta=delta)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional regression targets, one per sequence.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given, otherwise
            ``{"logits": ...}``. ``logits`` always keeps the batch dimension.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Second element of the encoder output is used as the sentence representation.
        pooled_output = self.dropout(outputs[1])

        # Hidden layer -> ReLU -> LayerNorm
        hidden_output = F.relu(self.hidden_layer(pooled_output))
        hidden_output = self.regularization(hidden_output)

        logits = self.regressor(hidden_output)
        # Drop only the trailing size-1 feature dimension; a bare squeeze() would
        # also collapse the batch dimension for a batch of one example.
        logits = logits.squeeze(-1)

        loss = None
        if labels is not None:
            loss = self.huber_loss(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
    
class BertRegressionTwitter_3(nn.Module):
    """BERTweet encoder -> 2-layer bidirectional LSTM -> dense head, Huber loss.

    Head: Linear(hidden, hidden) -> ReLU -> BatchNorm -> dropout(0.5)
    -> Linear(hidden, hidden // 2) -> ReLU -> BatchNorm -> Linear(hidden // 2, 1).

    Args:
        delta: ``delta`` of ``nn.HuberLoss`` (where the loss switches from
            quadratic to linear).
    """

    def __init__(self, delta=1.0):
        super(BertRegressionTwitter_3, self).__init__()
        self.bert = AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')
        hidden_size = self.bert.config.hidden_size

        # Bi-directional LSTM: each direction has hidden_size // 2 units, so the
        # concatenated forward/backward state matches the input width of `hidden1`
        # (for an even hidden_size).
        self.lstm = nn.LSTM(hidden_size, hidden_size // 2, num_layers=2, batch_first=True, bidirectional=True, dropout=0.2)

        # Deeper feed-forward layers
        self.hidden1 = nn.Linear(hidden_size, hidden_size)
        self.hidden2 = nn.Linear(hidden_size, hidden_size//2)
        self.regressor = nn.Linear(hidden_size//2, 1)

        # Activation and regularization
        self.dropout = nn.Dropout(0.5)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size//2)
        self.huber_loss = nn.HuberLoss(delta=delta)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder; also used to
                derive each sequence's length. Assumes real tokens come first
                and padding is on the right -- TODO confirm for the tokenizer used.
            labels: Optional regression targets, one per sequence.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given, otherwise
            ``{"logits": ...}``. ``logits`` always keeps the batch dimension.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]

        # Summarise each sequence with the LSTM's final hidden states computed
        # over the real tokens only. Indexing the padded output at position -1
        # would read a padding timestep for every sequence shorter than the
        # batch maximum, and the backward direction has processed just one
        # position at that point.
        # Lengths must live on the CPU for packing; clamp so an all-zero mask
        # row does not produce an invalid length of 0.
        lengths = attention_mask.long().sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            sequence_output, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n is ordered (layer, direction): the last two entries are the top
        # layer's forward and backward final states.
        lstm_out = torch.cat((h_n[-2], h_n[-1]), dim=-1)

        # Passing through deeper layers
        hidden_output = F.relu(self.hidden1(lstm_out))
        hidden_output = self.batchnorm1(hidden_output)
        hidden_output = self.dropout(hidden_output)

        hidden_output = F.relu(self.hidden2(hidden_output))
        hidden_output = self.batchnorm2(hidden_output)

        logits = self.regressor(hidden_output)
        # Drop only the trailing size-1 feature dimension; a bare squeeze() would
        # also collapse the batch dimension for a batch of one example.
        logits = logits.squeeze(-1)

        loss = None
        if labels is not None:
            loss = self.huber_loss(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

class BertRegressionTwitter_4(nn.Module):
    """BERTweet encoder -> single-layer unidirectional LSTM -> linear head, Huber loss.

    Args:
        delta: ``delta`` of ``nn.HuberLoss`` (where the loss switches from
            quadratic to linear).
    """

    def __init__(self, delta=1.0):
        super(BertRegressionTwitter_4, self).__init__()
        self.bert = AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size

        # LSTM over the encoder's token representations
        self.lstm = nn.LSTM(hidden_size, hidden_size // 2, batch_first=True)

        # Final regression layer
        self.regressor = nn.Linear(hidden_size//2, 1)
        self.huber_loss = nn.HuberLoss(delta=delta)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder; also used to
                locate the last attended token of every sequence.
            labels: Optional regression targets, one per sequence.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given, otherwise
            ``{"logits": ...}``. ``logits`` always keeps the batch dimension.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)

        # Use the per-token representations (first encoder output) rather than
        # the pooled one.
        sequence_output = self.dropout(outputs[0])

        # LSTM
        lstm_out, _ = self.lstm(sequence_output)

        # Take the LSTM output at the last *attended* token of each sequence.
        # Indexing position -1 would read a padding timestep for every sequence
        # shorter than the batch maximum. The LSTM is unidirectional, so its
        # output at step t depends only on steps <= t and the selected state is
        # unaffected by right padding. mask * position peaks at the last position
        # whose mask is 1 (index 0 if a row is entirely masked).
        positions = torch.arange(attention_mask.size(1), device=attention_mask.device)
        last_idx = (attention_mask.long() * positions).argmax(dim=1).to(lstm_out.device)
        batch_idx = torch.arange(lstm_out.size(0), device=lstm_out.device)
        lstm_out = lstm_out[batch_idx, last_idx]

        logits = self.regressor(lstm_out)
        # Drop only the trailing size-1 feature dimension; a bare squeeze() would
        # also collapse the batch dimension for a batch of one example.
        logits = logits.squeeze(-1)

        loss = None
        if labels is not None:
            loss = self.huber_loss(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}




class RobertaRegressionTwitter_2(nn.Module):
    """Twitter-RoBERTa (sentiment-latest) encoder with an MLP head trained with Huber loss.

    Head: dropout(0.1) -> Linear(hidden, hidden // 2) -> ReLU -> LayerNorm
    -> Linear(hidden // 2, 1).

    Args:
        delta: ``delta`` of ``nn.HuberLoss`` (where the loss switches from
            quadratic to linear).
    """

    def __init__(self, delta=1.0):
        super(RobertaRegressionTwitter_2, self).__init__()
        self.roberta = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.roberta.config.hidden_size

        # Additional hidden layer that halves the feature width.
        self.hidden_layer = nn.Linear(hidden_size, hidden_size//2)

        # Layer normalisation of the hidden features. Despite the attribute name
        # this is not L2 regularisation / weight decay (the name is kept so that
        # existing state_dict keys stay valid).
        self.regularization = nn.LayerNorm(hidden_size//2)

        # Final regression layer
        self.regressor = nn.Linear(hidden_size//2, 1)
        self.huber_loss = nn.HuberLoss(delta=delta)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional regression targets, one per sequence.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given, otherwise
            ``{"logits": ...}``. ``logits`` always keeps the batch dimension.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Second element of the encoder output is used as the sentence representation.
        pooled_output = self.dropout(outputs[1])

        # Hidden layer -> ReLU -> LayerNorm
        hidden_output = F.relu(self.hidden_layer(pooled_output))
        hidden_output = self.regularization(hidden_output)

        logits = self.regressor(hidden_output)
        # Drop only the trailing size-1 feature dimension; a bare squeeze() would
        # also collapse the batch dimension for a batch of one example.
        logits = logits.squeeze(-1)

        loss = None
        if labels is not None:
            loss = self.huber_loss(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

    
class RobertaRegressionTwitter_3(nn.Module):
    """Twitter-RoBERTa (base) encoder with an MLP head trained with RMSE loss.

    Head: dropout(0.1) -> Linear(hidden, hidden // 2) -> ReLU -> LayerNorm
    -> Linear(hidden // 2, 1).
    """

    def __init__(self):
        super(RobertaRegressionTwitter_3, self).__init__()
        self.roberta = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.roberta.config.hidden_size

        # Additional hidden layer that halves the feature width.
        self.hidden_layer = nn.Linear(hidden_size, hidden_size//2)

        # Layer normalisation of the hidden features. Despite the attribute name
        # this is not L2 regularisation / weight decay (the name is kept so that
        # existing state_dict keys stay valid).
        self.regularization = nn.LayerNorm(hidden_size//2)

        # Final regression layer
        self.regressor = nn.Linear(hidden_size//2, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional regression targets, one per sequence.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given, otherwise
            ``{"logits": ...}``. ``logits`` always keeps the batch dimension.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Second element of the encoder output is used as the sentence representation.
        pooled_output = self.dropout(outputs[1])

        # Hidden layer -> ReLU -> LayerNorm
        hidden_output = F.relu(self.hidden_layer(pooled_output))
        hidden_output = self.regularization(hidden_output)

        logits = self.regressor(hidden_output)
        # Drop only the trailing size-1 feature dimension; a bare squeeze() would
        # also collapse the batch dimension for a batch of one example.
        logits = logits.squeeze(-1)

        loss = None
        if labels is not None:
            loss = torch.sqrt(nn.MSELoss()(logits, labels))

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
    

class MpnetRegression(nn.Module):
    """MPNet (stsb-mpnet-base-v2) encoder with an MLP head trained with Huber loss.

    Head: dropout(0.1) -> Linear(hidden, hidden // 2) -> ReLU -> LayerNorm
    -> Linear(hidden // 2, 1).

    Args:
        delta: ``delta`` of ``nn.HuberLoss`` (where the loss switches from
            quadratic to linear).
    """

    def __init__(self, delta=1.0):
        super(MpnetRegression, self).__init__()
        self.mpnet = AutoModel.from_pretrained('sentence-transformers/stsb-mpnet-base-v2')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.mpnet.config.hidden_size

        # Additional hidden layer that halves the feature width.
        self.hidden_layer = nn.Linear(hidden_size, hidden_size//2)

        # Layer normalisation of the hidden features. Despite the attribute name
        # this is not L2 regularisation / weight decay (the name is kept so that
        # existing state_dict keys stay valid).
        self.regularization = nn.LayerNorm(hidden_size//2)

        # Final regression layer
        self.regressor = nn.Linear(hidden_size//2, 1)
        # Use HuberLoss with an explicit keyword, like the sibling models.
        # nn.SmoothL1Loss(delta) bound `delta` to the first positional parameter
        # (the deprecated `size_average`), so the transition point was silently
        # left at its default. For delta == 1.0 the two losses coincide.
        self.huber_loss = nn.HuberLoss(delta=delta)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional regression targets, one per sequence.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given, otherwise
            ``{"logits": ...}``. ``logits`` always keeps the batch dimension.
        """
        outputs = self.mpnet(input_ids, attention_mask=attention_mask)
        # Second element of the encoder output is used as the sentence representation.
        pooled_output = self.dropout(outputs[1])

        # Hidden layer -> ReLU -> LayerNorm
        hidden_output = F.relu(self.hidden_layer(pooled_output))
        hidden_output = self.regularization(hidden_output)

        logits = self.regressor(hidden_output)
        # Drop only the trailing size-1 feature dimension; a bare squeeze() would
        # also collapse the batch dimension for a batch of one example.
        logits = logits.squeeze(-1)

        loss = None
        if labels is not None:
            loss = self.huber_loss(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}


    
    
    
class EarlyStoppingCallback(TrainerCallback):
    """Stop training once the evaluation RMSE has stopped improving.

    Args:
        patience: Number of consecutive evaluations without a new best RMSE
            after which training is asked to stop.
    """

    def __init__(self, patience=4):
        self.patience = patience
        self.best_score = None
        self.early_stop_counter = 0

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Update the best score / patience counter after every evaluation."""
        # The key must match the metric produced by the compute_metrics function.
        current = metrics['eval_rmse']

        improved = self.best_score is None or current < self.best_score
        if improved:
            # New best (or first) score: remember it and reset the counter.
            self.best_score = current
            self.early_stop_counter = 0
            return control

        self.early_stop_counter += 1
        if self.early_stop_counter >= self.patience:
            control.should_training_stop = True
        return control
    
class ThresholdEarlyStoppingCallback(TrainerCallback):
    """Stop training as soon as the evaluation RMSE drops below a threshold.

    Args:
        threshold: Training is asked to stop after the first evaluation whose
            RMSE is strictly below this value. Defaults to 0.18.
    """

    def __init__(self, threshold=0.18):
        self.threshold = threshold

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Request a stop when the reported 'eval_rmse' is below the threshold."""
        # The key must match the metric produced by the compute_metrics function.
        rmse = metrics['eval_rmse']
        if rmse < self.threshold:
            control.should_training_stop = True
        return control


In [12]:
# Wrap the tokenizer encodings and their targets for the Trainer; labels are
# converted to float32 inside RegressionDataset.
train_dataset = RegressionDataset(train_encodings, train_labels)
val_dataset = RegressionDataset(val_encodings, val_labels)

In [13]:
# Instantiate every candidate model. All five load their pretrained weights
# here, although only `model_twitter` is moved to the device and trained in
# the next cell.
model_bert = BertRegression.from_pretrained('bert-base-uncased')
model_roberta = RobertaRegression.from_pretrained('roberta-base')
model_twitter = RobertaRegressionTwitter_2()
model_mpnet = MpnetRegression()
model_bert_twitter = BertRegressionTwitter_4()

Some weights of BertRegression were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['regressor.weight', 'regressor.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaRegression were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'regressor.weight', 'regressor.bias', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Model under training: RobertaRegressionTwitter_2, whose encoder checkpoint
# matches the tokenizer selected earlier.
model = model_twitter.to(device)

# Define training arguments and trainer
# num_train_epochs=1000 acts as an upper bound only: the run is expected to end
# through ThresholdEarlyStoppingCallback (eval RMSE < 0.18) or a manual
# interrupt. The recorded run below was interrupted at an RMSE of about 0.25.
training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    learning_rate=0.00001,
    num_train_epochs=1000,
    logging_dir='./logs',
    evaluation_strategy='steps',  # the recorded run evaluates every 100 steps, i.e. at the logging interval
    logging_steps=100,
    weight_decay=0.0001,
    lr_scheduler_type='cosine',  # Using a cosine scheduler
    warmup_steps=100  # Number of warmup steps
)





# compute_rmse reports 'rmse', which the callback reads back as 'eval_rmse'.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_rmse,
    callbacks=[ThresholdEarlyStoppingCallback()],
)


# Train the model
trainer.train()
# Final evaluation on the validation set; not reached if training is interrupted.
eval_results = trainer.evaluate()
print(eval_results)



Step,Training Loss,Validation Loss,Rmse
100,0.0734,0.041069,0.286928
200,0.0278,0.035377,0.266785
300,0.0166,0.033861,0.261238
400,0.0116,0.034527,0.263859
500,0.0089,0.032813,0.257093
600,0.0071,0.03247,0.256033
700,0.0059,0.031408,0.251728
800,0.0048,0.03077,0.249205
900,0.0042,0.03157,0.252412
1000,0.0037,0.03088,0.249546


KeyboardInterrupt: 

In [None]:
# Save the model
# NOTE(review): the directory is named 'bert-twitter-regression', but the trainer
# above wraps `model_twitter` (RobertaRegressionTwitter_2) -- confirm the intended name.
trainer.save_model('pretrained_models/bert-twitter-regression')

# NOTE(review): the example below uses BertRegression.from_pretrained; the model
# trained above is a plain nn.Module without that method, so reloading it
# presumably needs a different route -- confirm before relying on it.
# model = BertRegression.from_pretrained("./path/to/save/directory")

# Test

In [None]:
# Define a dataset without labels for testing
class RegressionTestDataset(Dataset):
    """Label-free map-style dataset over tokenizer encodings, for inference.

    Args:
        encodings: Mapping from field name to a per-example indexable collection;
            must contain an ``"input_ids"`` entry, which defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        input_ids = self.encodings["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors (no 'labels' entry)."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

In [None]:
# # Tokenize the test sentences
# sentences = list(df_test.words_str.values)
# test_encodings = tokenizer(sentences, truncation=True, padding=True)

# # Convert to a PyTorch Dataset
# test_dataset = RegressionTestDataset(test_encodings)

# # Get predictions with the neural network
# predictions = trainer.predict(test_dataset)
# y_hat_tensor = torch.tensor(predictions.predictions, dtype=torch.float32)

# # Convert the predictions back to a numpy array
# y_hat = y_hat_tensor.cpu().numpy()

# # Save the results with the specified format
# directory = 'results'
# np.save(os.path.join(directory, f'{team_id}__{split}__reg_pred.npy'), np.squeeze(y_hat))

In [None]:
# Sanity check: load 20__test_1__reg_pred.npy and display its shape.
# allow_pickle is left at NumPy's default (False): the prediction file is
# presumably a plain numeric array (the commented-out cell above saves a squeezed
# float32 array), so pickle support is unnecessary and disabling it avoids
# unpickling arbitrary objects from disk.

d = np.load('results/20__test_1__reg_pred.npy')
d.shape

(1000,)