In [1]:
# Imports
import pandas as pd
import numpy as np
import os
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, TrainingArguments, Trainer
from transformers import RobertaTokenizer, RobertaPreTrainedModel, RobertaModel, AutoTokenizer, AutoModel, PreTrainedModel
from transformers import TrainerCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch import optim
from transformers import get_linear_schedule_with_warmup
from transformers import TrainerCallback, TrainerState, TrainerControl
import torch.nn.init as init
import random
import nltk
from nltk.corpus import wordnet

2023-08-17 12:26:58.326372: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load data

team_id = '20' #put your team id here
split = 'test_2' # replace by 'test_2' for FINAL submission

df = pd.read_csv('dataset/tweets_train.csv')
df_test = pd.read_csv(f'dataset/tweets_{split}.csv')

In [3]:
# Necessary for ROberta based models
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [4]:
df['words_str'] = df['text'].apply(preprocess)
df_test['words_str'] = df_test['text'].apply(preprocess)

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['words_str'], df['score_compound'], test_size=0.4, random_state=42)

In [7]:
def synonym_replacement(text, num_replacements=1):
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word.isalpha()]))
    random.shuffle(random_word_list)
    num_replaced = 0
    
    for random_word in random_word_list:
        synonyms = []
        for syn in wordnet.synsets(random_word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        if len(synonyms) >= 1:
            synonym = random.choice(list(set(synonyms)))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= num_replacements: 
            break

    return ' '.join(new_words)

def random_swap(text, n=1):
    words = text.split()
    if len(words) < 2:
        return text
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)


In [8]:
augmented_texts = []
augmented_labels = []

for text, label in zip(train_texts, train_labels):
    augmented_texts.append(text)
    augmented_labels.append(label)
    
    # Add 4 augmented versions
    for _ in range(4):
        new_text = text
        if random.random() > 0.5:
            new_text = synonym_replacement(new_text)
        if random.random() > 0.5:
            new_text = random_swap(new_text)
        augmented_texts.append(new_text)
        augmented_labels.append(label)

train_texts = augmented_texts
train_labels = augmented_labels

In [9]:
tokenizer_twitter_sentiment = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')

In [10]:
tokenizer = tokenizer_twitter_sentiment
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [11]:
# Create a custom dataset
class RegressionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = np.array(labels).astype('float32')

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]) # This is already float32
        return item

    def __len__(self):
        return len(self.labels)

    
class StopOnZeroLossCallback(TrainerCallback):
    def on_log(self, args, state: TrainerState, control: TrainerControl, logs=None, **kwargs):
        # Check if the training loss is exactly zero
        if logs.get("loss", 1) == 0: 
            print("Training loss reached zero, stopping training!")
            control.should_training_stop = True
    
# Define a function to compute RMSE
def compute_rmse(eval_pred):
    predictions, labels = eval_pred
    return {'rmse': np.sqrt(mean_squared_error(labels, predictions))}

class ThresholdEarlyStoppingCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        rmse = metrics['eval_rmse'] # Make sure this key matches what's returned by your compute_metrics function
        if rmse < 0.2:
            control.should_training_stop = True
        return control


class BertRegression(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Define and initialize the hidden layers
        self.hidden_layer1 = nn.Linear(config.hidden_size, config.hidden_size * 2)
        init.kaiming_normal_(self.hidden_layer1.weight, mode='fan_in', nonlinearity='relu')
        self.hidden_activation1 = nn.ReLU()

        self.hidden_layer2 = nn.Linear(config.hidden_size * 2, config.hidden_size * 2)
        init.kaiming_normal_(self.hidden_layer2.weight, mode='fan_in', nonlinearity='relu')
        self.hidden_activation2 = nn.ReLU()

        self.hidden_layer3 = nn.Linear(config.hidden_size * 2, config.hidden_size * 2)
        init.kaiming_normal_(self.hidden_layer3.weight, mode='fan_in', nonlinearity='relu')
        self.hidden_activation3 = nn.ReLU()

        self.regressor = nn.Linear(config.hidden_size * 2, 1)

        self.loss_fn = nn.HuberLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)

        hidden_output1 = self.hidden_activation1(self.hidden_layer1(pooled_output))
        hidden_output2 = self.hidden_activation2(self.hidden_layer2(hidden_output1))
        hidden_output3 = self.hidden_activation3(self.hidden_layer3(hidden_output2))

        logits = self.regressor(hidden_output3)
        logits = logits.squeeze()
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return (loss, logits) if loss is not None else logits


class RobertaRegressionTwitter_2(nn.Module):
    def __init__(self, delta=1.0):
        super(RobertaRegressionTwitter_2, self).__init__()
        self.roberta = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.roberta.config.hidden_size

        # Adding an additional hidden layer
        self.hidden_layer = nn.Linear(hidden_size, hidden_size//2)
        
        # Adding L2 regularization (weight decay) to the hidden layer
        self.regularization = nn.LayerNorm(hidden_size//2)
        
        # Final regression layer
        self.regressor = nn.Linear(hidden_size//2, 1)
        self.huber_loss = nn.HuberLoss(delta=delta) # Delta controls the transition point in the loss

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        
        # Passing through the hidden layer with ReLU activation
        hidden_output = self.hidden_layer(pooled_output)
        hidden_output = F.relu(hidden_output)
        
        # Applying Layer Normalization (regularization)
        hidden_output = self.regularization(hidden_output)
        
        logits = self.regressor(hidden_output)
        logits = logits.squeeze()
        
        loss = None
        if labels is not None:
            loss = self.huber_loss(logits, labels) # Using Huber Loss here
        
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
    
class RobertaRegressionTwitter_3(nn.Module):
    def __init__(self, delta=1.0):
        super(RobertaRegressionTwitter_3, self).__init__()
        self.roberta = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
        self.dropout = nn.Dropout(0.1)

        # Getting RoBERTa's hidden size for reference
        roberta_hidden_size = self.roberta.config.hidden_size

        # Define the hidden layers
        self.hidden_layer1 = nn.Linear(roberta_hidden_size, roberta_hidden_size)  # This is essentially a pass-through layer
        self.hidden_layer2 = nn.Linear(roberta_hidden_size, roberta_hidden_size // 2)  # Half of RoBERTa's hidden size
        self.hidden_layer3 = nn.Linear(roberta_hidden_size // 2, 160)
        self.hidden_layer4 = nn.Linear(160, 80)
        self.hidden_layer5 = nn.Linear(80, 40)
        self.penultimate_layer = nn.Linear(40, 20)
        self.regressor = nn.Linear(20, 1)

        # After each hidden layer
        self.norm1 = nn.LayerNorm(roberta_hidden_size)
        self.norm2 = nn.LayerNorm(roberta_hidden_size // 2)
        self.norm3 = nn.LayerNorm(160)
        self.norm4 = nn.LayerNorm(80)
        self.norm5 = nn.LayerNorm(40)
        self.norm6 = nn.LayerNorm(20)

        self.huber_loss = nn.HuberLoss(delta=delta)  # Delta controls the transition point in the loss

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)

        # Passing pooled output through the hidden layers with ReLU and LayerNorm
        hidden_output1 = F.relu(self.hidden_layer1(pooled_output))
        hidden_output1 = self.norm1(hidden_output1)

        hidden_output2 = F.relu(self.hidden_layer2(hidden_output1))
        hidden_output2 = self.norm2(hidden_output2)

        hidden_output3 = F.relu(self.hidden_layer3(hidden_output2))
        hidden_output3 = self.norm3(hidden_output3)

        hidden_output4 = F.relu(self.hidden_layer4(hidden_output3))
        hidden_output4 = self.norm4(hidden_output4)

        hidden_output5 = F.relu(self.hidden_layer5(hidden_output4))
        hidden_output5 = self.norm5(hidden_output5)

        penultimate_output = F.relu(self.penultimate_layer(hidden_output5))
        penultimate_output = self.norm6(penultimate_output)

        logits = self.regressor(penultimate_output).squeeze()

        loss = None
        if labels is not None:
            loss = self.huber_loss(logits, labels)  # Using Huber Loss here

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

In [12]:
model = RobertaRegressionTwitter_3()

In [13]:
train_dataset = RegressionDataset(train_encodings, train_labels)
val_dataset = RegressionDataset(val_encodings, val_labels)

In [14]:
model = model.to(device)

In [15]:
# Define training arguments and trainer
training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    learning_rate=0.00001,
    num_train_epochs=2000,
    logging_dir='./logs',
    evaluation_strategy='steps',
    logging_steps=100,
    weight_decay=0.0001,
    lr_scheduler_type='cosine',  # Using a cosine scheduler
    warmup_steps=100  # Number of warmup steps
)





trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_rmse,
    callbacks=[ThresholdEarlyStoppingCallback(), StopOnZeroLossCallback()],
)


# Train the model
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)



Step,Training Loss,Validation Loss,Rmse
100,0.1176,0.050285,0.31752
200,0.0376,0.040527,0.284956
300,0.0241,0.040141,0.283994
400,0.0168,0.038156,0.277102
500,0.0125,0.035063,0.265584
600,0.0101,0.039261,0.281408
700,0.0083,0.033966,0.261443
800,0.0072,0.034963,0.265326
900,0.0059,0.033868,0.261167
1000,0.0053,0.030937,0.249478


KeyboardInterrupt: 

In [None]:
trainer.save_model('pretrained_models/best-reg_0.2')

In [None]:
# Define a dataset without labels for testing
class RegressionTestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [None]:
# Tokenize the test sentences
sentences = list(df_test.words_str.values)
test_encodings = tokenizer(sentences, truncation=True, padding=True)

# Convert to a PyTorch Dataset
test_dataset = RegressionTestDataset(test_encodings)

# Get predictions with the neural network
predictions = trainer.predict(test_dataset)
y_hat_tensor = torch.tensor(predictions.predictions, dtype=torch.float32)

# Convert the predictions back to a numpy array
y_hat = y_hat_tensor.cpu().numpy()

# Save the results with the specified format
directory = 'results'
np.save(os.path.join(directory, f'{team_id}__{split}__reg_pred.npy'), np.squeeze(y_hat))

In [None]:
# Load 20__test_2__reg_pred.npy

d = np.load('results/20__test_2__reg_pred.npy', allow_pickle=True)
d.shape

(1000,)