In [12]:
# Imports
import pandas as pd
import numpy as np
import os

# Preprocessing

In [13]:
# Load data: the labelled training tweets plus the test split to be scored.

team_id = '20'  # team identifier; becomes the prefix of the saved prediction file name
split = 'test_1'  # which test split to read and score; replace by 'test_2' for FINAL submission

df = pd.read_csv('dataset/tweets_train.csv')
# The test file name is derived from `split`, so changing it above switches the input file.
df_test = pd.read_csv(f'dataset/tweets_{split}.csv')

In [14]:
import ast  # imported here so this cell runs on its own


def _words_to_text(words):
    """Turn the string form of a word list (e.g. "['a', 'b']") into 'a b'.

    ast.literal_eval only parses Python literals, so unlike eval() it cannot
    execute arbitrary code that might be embedded in the CSV contents.
    """
    return ' '.join(ast.literal_eval(words))


# Build a plain-text column from the serialized word lists of both frames.
df['words_str'] = df['words'].apply(_words_to_text)
df_test['words_str'] = df_test['words'].apply(_words_to_text)

# Regression

In [15]:
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, TrainingArguments, Trainer
from transformers import TrainerCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch
import numpy as np

In [16]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [17]:
# Split data into training and validation sets (80% / 20%).
# A fixed random_state keeps the split identical across kernel restarts, so
# validation RMSE values from different runs are comparable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['words_str'],
    df['score_compound'],
    test_size=0.2,
    random_state=42,
)

# Tokenize the input. Each split is truncated and padded independently.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

In [18]:
# Create a custom dataset
class RegressionDataset(Dataset):
    """Pairs tokenizer encodings with float32 regression targets.

    Args:
        encodings: mapping from field name to a per-example sequence
            (anything exposing ``.items()`` whose values can be indexed by
            example position).
        labels: one target per example; a pandas Series, NumPy array or plain
            list are all accepted.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # np.asarray drops any pandas index, so lookups below are purely
        # positional, and it casts the targets to float32 in one step.
        self.labels = np.asarray(labels, dtype='float32')

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the target under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # stays float32
        return item

    def __len__(self):
        """Number of examples, taken from the label array."""
        return len(self.labels)


# Wrap each split's encodings and targets so they can be indexed example by example.
train_dataset = RegressionDataset(train_encodings, train_labels)
val_dataset = RegressionDataset(val_encodings, val_labels)

class BertRegression(BertPreTrainedModel):
    """BERT encoder followed by a single-output linear head for scalar regression.

    The encoder's pooled output goes through dropout and a
    ``hidden_size -> 1`` linear layer. When labels are supplied, the forward
    pass also returns the RMSE between predictions and labels.
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict one value per example.

        Args:
            input_ids: token ids forwarded to the BERT encoder.
            attention_mask: attention mask forwarded to the BERT encoder.
            labels: optional regression targets, one per example.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, where ``loss`` is the
            square root of the mean squared error; otherwise just ``logits``.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs[1])
        # Remove only the trailing size-1 output axis. A bare squeeze() would
        # also remove the batch axis when the batch holds a single example,
        # yielding a 0-d tensor that no longer lines up with labels of shape (1,).
        logits = self.regressor(pooled_output).squeeze(-1)
        if labels is None:
            return logits
        # RMSE loss: square root of the batch mean squared error.
        loss = torch.sqrt(nn.MSELoss()(logits, labels))
        return (loss, logits)

    

# Define a function to compute RMSE
def compute_rmse(eval_pred):
    """Return the root mean squared error of an evaluation pass.

    Args:
        eval_pred: a pair that unpacks into ``(predictions, labels)``.

    Returns:
        A one-entry dict mapping 'rmse' to the RMSE value.
    """
    predictions, labels = eval_pred
    mse = mean_squared_error(labels, predictions)
    return {'rmse': np.sqrt(mse)}
    
    
    
class EarlyStoppingCallback(TrainerCallback):
    """Request a training stop once 'eval_rmse' stops improving.

    Tracks the lowest RMSE seen so far and counts consecutive evaluations
    that fail to beat it; when the count reaches ``patience`` the control
    object is flagged so training ends.
    """

    def __init__(self, patience=4):
        self.patience = patience
        self.best_score = None
        self.early_stop_counter = 0

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Update the improvement counter from the latest evaluation metrics."""
        # The key must match what the compute_metrics function reports, with the 'eval_' prefix.
        current = metrics['eval_rmse']
        improved = self.best_score is None or current < self.best_score
        if improved:
            # New best: remember it and restart the patience window.
            self.best_score = current
            self.early_stop_counter = 0
            return control
        self.early_stop_counter += 1
        if self.early_stop_counter >= self.patience:
            control.should_training_stop = True
        return control


In [19]:
# Load the pretrained BERT weights into the regression model and move it to the
# selected device. The regressor head has no pretrained weights and starts freshly initialized.
model = BertRegression.from_pretrained('bert-base-uncased').to(device)

# Define training arguments and trainer
training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=100,  # upper bound only; the early-stopping callback below is expected to end training sooner
    logging_dir='./logs',
    evaluation_strategy='steps', # evaluate on a step schedule rather than per epoch
    logging_steps=100, # log (and therefore evaluate) every 100 steps
)


# compute_rmse supplies the 'rmse' metric that the early-stopping callback monitors
# (default patience: 4 non-improving evaluations).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_rmse,
    callbacks=[EarlyStoppingCallback()],
)


# Train the model, then run one final evaluation on the validation set and show its metrics
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

Some weights of BertRegression were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['regressor.weight', 'regressor.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Rmse
100,0.479,0.384631,0.394402
200,0.3926,0.328166,0.343053
300,0.37,0.409392,0.418833
400,0.3528,0.324491,0.348967
500,0.3296,0.304577,0.322814
600,0.3511,0.262478,0.279858
700,0.274,0.260807,0.277929
800,0.2805,0.285309,0.298421
900,0.2721,0.25389,0.271551
1000,0.2452,0.244539,0.260517


{'eval_loss': 0.25911715626716614, 'eval_rmse': 0.2809239327907562, 'eval_runtime': 1.6306, 'eval_samples_per_second': 981.251, 'eval_steps_per_second': 122.656, 'epoch': 2.62}


In [20]:
# Save the model held by the trainer to a local directory
trainer.save_model('pretrained_models/bert-base-regression')

# To reload it later, point from_pretrained at the directory used above:
# model = BertRegression.from_pretrained('pretrained_models/bert-base-regression')

# Test

In [21]:
# Define a dataset without labels for testing
class RegressionTestDataset(Dataset):
    """Inference-only dataset: serves tokenizer encodings as tensors, with no 'labels' entry."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One example per row of token ids.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

In [26]:
# Tokenize the test sentences
sentences = list(df_test.words_str.values)
test_encodings = tokenizer(sentences, truncation=True, padding=True)

# Convert to a PyTorch Dataset
test_dataset = RegressionTestDataset(test_encodings)

# Get predictions with the neural network
predictions = trainer.predict(test_dataset)
y_hat_tensor = torch.tensor(predictions.predictions, dtype=torch.float32)

# Convert the predictions back to a numpy array
y_hat = y_hat_tensor.cpu().numpy()

# Save the results with the specified format
directory = 'results'
# np.save does not create missing folders, so make sure the target exists first.
os.makedirs(directory, exist_ok=True)
# reshape(-1) always yields a 1-D array, even for a single prediction
# (np.squeeze would collapse that case to a 0-d array).
np.save(os.path.join(directory, f'{team_id}__{split}__reg_pred.npy'), y_hat.reshape(-1))

In [27]:
# Reload the prediction file for the current team_id / split and check its shape.
# The path is rebuilt from the same variables used when saving, so this check
# cannot silently read a stale file after `split` is switched to 'test_2'.
# The file holds a plain numeric array, so pickle support is not needed.
d = np.load(os.path.join('results', f'{team_id}__{split}__reg_pred.npy'))
d.shape

(1000,)