In [15]:
# Imports
import pandas as pd
import numpy as np
import os
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, TrainingArguments, Trainer
from transformers import RobertaTokenizer, RobertaPreTrainedModel, RobertaModel, AutoTokenizer, AutoModel, PreTrainedModel
from transformers import TrainerCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch import optim
from transformers import get_linear_schedule_with_warmup

In [35]:
# Load data

team_id = '20' #put your team id here
split = 'test_2' # replace by 'test_2' for FINAL submission

df = pd.read_csv('dataset/tweets_train.csv')
df_test = pd.read_csv(f'dataset/tweets_{split}.csv')

In [36]:
# Necessary for ROberta based models
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [37]:
df['words_str'] = df['text'].apply(preprocess)
df_test['words_str'] = df_test['text'].apply(preprocess)

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [20]:
# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['words_str'], df['score_compound'], test_size=0.5, random_state=42)

In [21]:
tokenizer_twitter_sentiment = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')

In [22]:
tokenizer = tokenizer_twitter_sentiment
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [23]:
# Create a custom dataset
class RegressionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.values.astype('float32') # Convert Series to NumPy array and then to float32

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]) # This is already float32
        return item

    def __len__(self):
        return len(self.labels)
    
# Define a function to compute RMSE
def compute_rmse(eval_pred):
    predictions, labels = eval_pred
    return {'rmse': np.sqrt(mean_squared_error(labels, predictions))}


class RobertaRegressionTwitter_2(nn.Module):
    def __init__(self, delta=1.0):
        super(RobertaRegressionTwitter_2, self).__init__()
        self.roberta = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.roberta.config.hidden_size

        # Adding an additional hidden layer
        self.hidden_layer = nn.Linear(hidden_size, hidden_size//2)
        
        # Adding L2 regularization (weight decay) to the hidden layer
        self.regularization = nn.LayerNorm(hidden_size//2)
        
        # Final regression layer
        self.regressor = nn.Linear(hidden_size//2, 1)
        self.huber_loss = nn.HuberLoss(delta=delta) # Delta controls the transition point in the loss

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        
        # Passing through the hidden layer with ReLU activation
        hidden_output = self.hidden_layer(pooled_output)
        hidden_output = F.relu(hidden_output)
        
        # Applying Layer Normalization (regularization)
        hidden_output = self.regularization(hidden_output)
        
        logits = self.regressor(hidden_output)
        logits = logits.squeeze()
        
        loss = None
        if labels is not None:
            loss = self.huber_loss(logits, labels) # Using Huber Loss here
        
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
    
class ThresholdEarlyStoppingCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        rmse = metrics['eval_rmse'] # Make sure this key matches what's returned by your compute_metrics function
        if rmse < 0.17:
            control.should_training_stop = True
        return control


In [25]:
model_path = 'pretrained_models/best-reg/pytorch_model.bin'
model = RobertaRegressionTwitter_2()
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [26]:
train_dataset = RegressionDataset(train_encodings, train_labels)
val_dataset = RegressionDataset(val_encodings, val_labels)

In [27]:
model = model.to(device)

In [28]:
# Define training arguments and trainer
training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    learning_rate=0.00001,
    num_train_epochs=1000,
    logging_dir='./logs',
    evaluation_strategy='steps',
    logging_steps=100,
    weight_decay=0.0001,
    lr_scheduler_type='cosine',  # Using a cosine scheduler
    warmup_steps=100  # Number of warmup steps
)





trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_rmse,
    callbacks=[ThresholdEarlyStoppingCallback()],
)


# Train the model
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)



Step,Training Loss,Validation Loss,Rmse
100,0.0002,0.010155,0.143141


{'eval_loss': 0.010155106894671917, 'eval_rmse': 0.1431414932012558, 'eval_runtime': 5.0755, 'eval_samples_per_second': 788.097, 'eval_steps_per_second': 3.152, 'epoch': 6.25}


In [29]:
trainer.save_model('pretrained_models/best-reg-v2')

In [30]:
# Define a dataset without labels for testing
class RegressionTestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [38]:
# Tokenize the test sentences
sentences = list(df_test.words_str.values)
test_encodings = tokenizer(sentences, truncation=True, padding=True)

# Convert to a PyTorch Dataset
test_dataset = RegressionTestDataset(test_encodings)

# Get predictions with the neural network
predictions = trainer.predict(test_dataset)
y_hat_tensor = torch.tensor(predictions.predictions, dtype=torch.float32)

# Convert the predictions back to a numpy array
y_hat = y_hat_tensor.cpu().numpy()

# Save the results with the specified format
directory = 'results'
np.save(os.path.join(directory, f'{team_id}__{split}__reg_pred.npy'), np.squeeze(y_hat))

In [41]:
# Load 20__test_1__reg_pred.npy

d = np.load('results/20__test_2__reg_pred.npy', allow_pickle=True)
d.shape

(1000,)