In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LongformerModel, LongformerTokenizer
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from transformers import EarlyStoppingCallback
from torch.optim import AdamW

In [None]:
# Load the dataset
df = pd.read_csv("pandora_v4.csv", encoding='latin-1')

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Every text is tokenized and padded/truncated to the same fixed length so that
# the resulting id lists can later be stacked into batches.
max_length = 64  # The max length was chosen after trying multiple values


def encode_text(text):
    """Tokenize one text into a fixed-length encoding.

    The returned encoding exposes 'input_ids' and 'attention_mask', each of
    length ``max_length``.
    """
    # Calling the tokenizer directly is the supported replacement for the
    # legacy `encode_plus` method and returns the same kind of encoding.
    return tokenizer(text, truncation=True, padding='max_length', max_length=max_length)


# After this line 'body' holds tokenizer encodings, not raw strings.
df['body'] = df['body'].apply(encode_text)

# Drop the helper columns that are not used for training (rebinding instead of
# mutating in place keeps the step easy to follow and to re-run).
# NOTE(review): 'author' is dropped before a purely row-level random split. If
# the trait labels are assigned per author, texts of one author can end up in
# both train and test and inflate the test metrics - confirm, and consider a
# group-aware split on 'author' if so.
df = df.drop(columns=['cleaned_body', 'lemmatized_body', 'author', 'tokenized', 'word_count'])

# Split into train and test first (70% / 30%)
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

# Split df_train into train and validation (80% / 20% of the training part)
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# PyTorch dataset pairing tokenized texts with their trait scores
class PersonalityDataset(Dataset):
    """Map-style dataset yielding one encoded text and its target vector.

    Args:
        tweets: indexable collection; each element must support lookup of
            'input_ids' and 'attention_mask'.
        targets: indexable collection of numeric target rows, aligned with
            ``tweets`` by position.
    """

    def __init__(self, tweets, targets):
        self.tweets = tweets
        self.targets = targets

    def __len__(self):
        # The number of samples is defined by the number of encoded texts.
        return len(self.tweets)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' (long) and 'targets' (float)."""
        encoding = self.tweets[idx]
        sample = {
            key: torch.tensor(encoding[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        sample['targets'] = torch.tensor(self.targets[idx], dtype=torch.float)
        return sample

# Define the model
class RoBERTaForPersonalityTraits(torch.nn.Module):
    """RoBERTa encoder followed by dropout and a linear regression head.

    Args:
        model_name: checkpoint passed to ``RobertaModel.from_pretrained``.
        num_traits: number of regression outputs produced per input text.
        dropout: dropout probability applied to the pooled encoder output.

    Calling the constructor without arguments builds the same architecture as
    before: 'roberta-base', dropout 0.3 and five outputs.
    """

    def __init__(self, model_name='roberta-base', num_traits=5, dropout=0.3):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so checkpoints with a different hidden width work as well.
        self.linear = torch.nn.Linear(self.roberta.config.hidden_size, num_traits)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for a batch, one row of ``num_traits`` values per input."""
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        output = self.dropout(pooled_output)
        output = self.linear(output)
        return output


In [None]:
# Prepare data loaders
batch_size = 16
# Target columns, in the order in which the model outputs are interpreted.
TRAIT_COLUMNS = ['ext', 'neu', 'agr', 'con', 'ope']

# Seed torch before anything random happens (head initialisation, dropout,
# shuffling) so runs are repeatable. GPU kernels may still introduce small
# nondeterminism.
torch.manual_seed(42)


def build_dataset(frame):
    """Wrap one data split into a PersonalityDataset (encoded texts + trait targets)."""
    return PersonalityDataset(frame['body'].tolist(), frame[TRAIT_COLUMNS].values)


train_dataset = build_dataset(df_train)
val_dataset = build_dataset(df_val)
test_dataset = build_dataset(df_test)

# Only the training data is shuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Instantiate the model and define optimizer and loss function
model = RoBERTaForPersonalityTraits()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.MSELoss()

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Training loop
epochs = 10

# Early stopping parameters
patience = 3
best_val_loss = float('inf')
patience_counter = 0
# Gradients of this many batches are summed before each optimizer step.
accumulation_steps = 16
best_model_path = 'best_model.pth'

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    optimizer.zero_grad()  # start every epoch with clean gradients

    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)

        # Log the *unscaled* batch loss so the reported training loss is on the
        # same scale as the validation loss.
        total_loss += loss.item()

        # Backward pass: only the gradient contribution is scaled, so that the
        # accumulated gradient is the average over the accumulation window.
        # (A shorter final window is still divided by accumulation_steps.)
        (loss / accumulation_steps).backward()

        # Update after a full window, and on the last batch even if the window
        # is incomplete, so no gradients are carried over or discarded.
        if (i + 1) % accumulation_steps == 0 or i + 1 == len(train_loader):
            optimizer.step()
            optimizer.zero_grad()  # reset gradients after updating

    avg_train_loss = total_loss / len(train_loader)
    print(f"Train loss at epoch {epoch + 1}: {avg_train_loss}")

    # Validation loop
    model.eval()
    val_total_loss = 0.0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            val_total_loss += loss_fn(outputs, targets).item()

    avg_val_loss = val_total_loss / len(val_loader)
    print(f"Validation loss at epoch {epoch + 1}: {avg_val_loss}")

    # Check if early stopping conditions are met
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0  # reset counter
        # save the best model
        torch.save(model.state_dict(), best_model_path)
    else:
        patience_counter += 1  # increment counter
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break  # break out of the training loop

# Restore the checkpoint with the lowest validation loss so that the following
# cells evaluate the best weights rather than those of the last epoch run.
# The guard skips this when no checkpoint was ever written.
if best_val_loss < float('inf'):
    model.load_state_dict(torch.load(best_model_path, map_location=device))



Train loss at epoch 1: 0.010789037843767021
Validation loss at epoch 1: 0.0811604765696185
Train loss at epoch 2: 0.005880175892130605
Validation loss at epoch 2: 0.06785646104386875
Train loss at epoch 3: 0.005063800858001091
Validation loss at epoch 3: 0.061626216396689415
Train loss at epoch 4: 0.004605255188154323
Validation loss at epoch 4: 0.058740147597023416
Train loss at epoch 5: 0.004179541852790862
Validation loss at epoch 5: 0.05569563601698194
Train loss at epoch 6: 0.0038841027086268048
Validation loss at epoch 6: 0.0532930982538632
Train loss at epoch 7: 0.0035797598911449314
Validation loss at epoch 7: 0.05368088386952877
Train loss at epoch 8: 0.0032950618854790392
Validation loss at epoch 8: 0.05148225219122001
Train loss at epoch 9: 0.0031168659904506056
Validation loss at epoch 9: 0.05124097208891596
Train loss at epoch 10: 0.0028605259808578663
Validation loss at epoch 10: 0.05008113009056875
For Extroversion:
MSE: 0.056397028267383575
RMSE: 0.23748058080673218
R^2

In [None]:
# Evaluation on test set
model.eval()
test_total_loss = 0.0
prediction_batches = []  # one (batch, n_traits) array per test batch
target_batches = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        test_total_loss += loss_fn(outputs, targets).item()

        # Move each batch to the host once instead of once per trait column.
        prediction_batches.append(outputs.cpu().numpy())
        target_batches.append(targets.cpu().numpy())

# Report the overall loss that was previously accumulated but never shown.
avg_test_loss = test_total_loss / len(test_loader)
print(f"Test loss: {avg_test_loss}\n")

# Trait-major layout: row i holds the values of trait i for every test sample.
all_predictions = np.concatenate(prediction_batches).T
all_targets = np.concatenate(target_batches).T

trait_names = ["Ext", "Neu", "Agre", "Con", "Ope"]

# Per-trait regression metrics on the test set.
for i, trait_name in enumerate(trait_names):
    trait_predictions = all_predictions[i]
    trait_targets = all_targets[i]

    mse = mean_squared_error(trait_targets, trait_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(trait_targets, trait_predictions)
    r2 = r2_score(trait_targets, trait_predictions)

    print(f"For {trait_name}:")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"R^2: {r2}\n")

For Ext:
MSE: 0.06094896420836449
RMSE: 0.24687844514846802
MAE: 0.17111006379127502
R^2: 0.5229165534009182

For Neu:
MSE: 0.06385631114244461
RMSE: 0.25269806385040283
MAE: 0.17515407502651215
R^2: 0.3248026116264062

For Agre:
MSE: 0.05870259925723076
RMSE: 0.24228619039058685
MAE: 0.16692174971103668
R^2: 0.3629541933878292

For Con:
MSE: 0.041853081434965134
RMSE: 0.20458026230335236
MAE: 0.13300937414169312
R^2: 0.39640634446327894

For Ope:
MSE: 0.03033624216914177
RMSE: 0.1741730272769928
MAE: 0.1065702810883522
R^2: 0.43331488975179944



In [None]:
# Pairwise correlations between the predicted trait scores.
# all_predictions is indexed by trait first, so it is transposed to get one
# row per sample and one column per trait.
df_predictions = pd.DataFrame(np.asarray(all_predictions).T, columns=trait_names)
correlation_matrix_predictions = df_predictions.corr()
print("\nCorrelation matrix for predicted values:", correlation_matrix_predictions, sep="\n")



Correlation matrix for predicted values:
           Ext       Neu      Agre       Con       Ope
Ext   1.000000 -0.761104 -0.647993 -0.126024  0.443589
Neu  -0.761104  1.000000  0.332148 -0.034599 -0.175018
Agre -0.647993  0.332148  1.000000  0.635959 -0.489487
Con  -0.126024 -0.034599  0.635959  1.000000 -0.638539
Ope   0.443589 -0.175018 -0.489487 -0.638539  1.000000


In [None]:
# Pairwise correlations between the ground-truth trait columns of the full
# dataframe (all splits together), for comparison with the predicted values.
trait_columns = ['ext', 'neu', 'agr', 'con', 'ope']
correlation_matrix = df.loc[:, trait_columns].corr()
print(correlation_matrix)

          ext       neu       agr       con       ope
ext  1.000000 -0.547147 -0.437272 -0.048405  0.242843
neu -0.547147  1.000000  0.030872 -0.157761 -0.057183
agr -0.437272  0.030872  1.000000  0.348624 -0.186010
con -0.048405 -0.157761  0.348624  1.000000 -0.419969
ope  0.242843 -0.057183 -0.186010 -0.419969  1.000000
