In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LongformerModel, LongformerTokenizer
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from transformers import EarlyStoppingCallback
from torch.optim import AdamW

In [None]:
# Load the dataset
df = pd.read_csv("author_profiling_v3.csv")

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Preprocess tweets: tokenize and pad/truncate
max_length = 64


def encode_tweet(text):
    """Tokenize one tweet, truncating or padding it to ``max_length`` tokens."""
    return tokenizer.encode_plus(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )


# Cast to str first so that non-string cells can be tokenized (note that a
# missing value becomes the literal text 'nan'). After this line every
# 'tweets' cell holds the tokenizer's encoding rather than the raw text.
df['tweets'] = df['tweets'].astype('str').apply(encode_tweet)

# Drop the label and alternative-text columns that are not needed below
unused_columns = [
    'gender',
    'age_group',
    'cleaned_tweets',
    'tweets_lemmatized',
    'tweets_withouthashtag',
]
df = df.drop(columns=unused_columns)

# Hold out 20% of the rows for testing, then split 10% of the remaining
# rows off as a validation set. Both splits use a fixed seed.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=42)


In [None]:
# Define a PyTorch dataset
class PersonalityDataset(Dataset):
    """Map-style dataset pairing pre-tokenized tweets with trait score vectors.

    Args:
        tweets: Indexable collection whose items support lookup by
            ``'input_ids'`` and ``'attention_mask'``.
        targets: Indexable collection of numeric target rows, aligned
            position-by-position with ``tweets``.
    """

    def __init__(self, tweets, targets):
        self.tweets = tweets
        self.targets = targets

    def __len__(self):
        # The sample count is taken from the encoded tweets.
        return len(self.tweets)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Keys are ``'input_ids'`` and ``'attention_mask'`` (both ``torch.long``)
        and ``'targets'`` (``torch.float``).
        """
        encoding = self.tweets[idx]
        sample = {
            key: torch.tensor(encoding[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        sample['targets'] = torch.tensor(self.targets[idx], dtype=torch.float)
        return sample

class RoBERTaForPersonalityTraits(torch.nn.Module):
    """RoBERTa encoder followed by dropout and a linear regression head.

    Args:
        model_name: Checkpoint name or path handed to
            ``RobertaModel.from_pretrained``.
        num_traits: Number of values predicted per input (width of the head).
        dropout: Dropout probability applied to the pooled encoder output.

    With the default arguments the module is built exactly as before
    (``roberta-base``, 5 outputs, dropout 0.3), and the parameter names in
    its ``state_dict`` are unchanged.
    """

    def __init__(self, model_name='roberta-base', num_traits=5, dropout=0.3):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so a checkpoint with a different hidden width still works.
        self.linear = torch.nn.Linear(self.roberta.config.hidden_size, num_traits)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``num_traits`` predictions per input sequence.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.linear(self.dropout(pooled_output))


In [None]:
# Prepare data loaders
batch_size = 16
trait_columns = ['ext', 'neu', 'agr', 'con', 'ope']  # target columns, in head output order


def _make_dataset(frame):
    """Wrap one split's encoded tweets and trait scores in a PersonalityDataset."""
    return PersonalityDataset(frame['tweets'].tolist(), frame[trait_columns].values)


train_dataset = _make_dataset(df_train)
val_dataset = _make_dataset(df_val)
test_dataset = _make_dataset(df_test)

# Only the training batches are shuffled; validation and test keep row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Instantiate the model on the GPU when one is available, then define the
# optimizer and the regression loss.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RoBERTaForPersonalityTraits().to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.MSELoss()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Training loop with gradient accumulation and early stopping on validation loss.
epochs = 10

# Early stopping parameters
patience = 3
best_val_loss = float('inf')
patience_counter = 0
# Gradients from this many consecutive batches are accumulated before each
# optimizer step.
accumulation_steps = 28

num_train_batches = len(train_loader)

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    optimizer.zero_grad()  # start every epoch with clean gradients

    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)

        # Log the *unscaled* batch loss so the reported training loss is on the
        # same scale as the validation loss (it used to be divided by
        # accumulation_steps before being summed).
        total_loss += loss.item()

        # Backward pass: divide by the number of batches that really belong to
        # the current accumulation window, so the shorter window at the end of
        # an epoch is averaged correctly instead of being under-weighted.
        window_start = (i // accumulation_steps) * accumulation_steps
        window_size = min(accumulation_steps, num_train_batches - window_start)
        (loss / window_size).backward()

        # Step at the end of each window, and on the last batch of the epoch
        # even if the window is not full.
        if (i + 1) % accumulation_steps == 0 or i + 1 == num_train_batches:
            optimizer.step()
            optimizer.zero_grad()  # reset gradients for the next window

        # No per-batch `del` / torch.cuda.empty_cache(): the batch tensors are
        # rebound on the next iteration, and emptying the allocator cache on
        # every step only slows training down.

    avg_train_loss = total_loss / num_train_batches
    print(f"Train loss at epoch {epoch + 1}: {avg_train_loss}")

    # Validation loop
    model.eval()
    val_total_loss = 0
    # Per-trait validation predictions/targets of the current epoch. They are
    # not read anywhere in this cell; kept so the notebook's global state is
    # the same as before.
    all_predictions = [[] for _ in range(5)]
    all_targets = [[] for _ in range(5)]

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, targets)

            val_total_loss += loss.item()

            # Move each batch to the host once, then split it column-wise.
            batch_predictions = outputs.cpu().numpy()
            batch_targets = targets.cpu().numpy()
            for trait_idx in range(5):
                all_predictions[trait_idx].extend(batch_predictions[:, trait_idx])
                all_targets[trait_idx].extend(batch_targets[:, trait_idx])

    avg_val_loss = val_total_loss / len(val_loader)
    print(f"Validation loss at epoch {epoch + 1}: {avg_val_loss}")

    # Check if early stopping conditions are met
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0  # reset counter
        # save the best model
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1  # increment counter
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break  # break out of the training loop

# Put the best checkpoint back into `model` so that later cells evaluate the
# weights with the lowest validation loss rather than whatever the last epoch
# produced. best_val_loss is finite exactly when a checkpoint was written
# during this run.
if best_val_loss != float('inf'):
    model.load_state_dict(torch.load('best_model.pth', map_location=device))



Train loss at epoch 1: 0.0015698678076715264
Validation loss at epoch 1: 0.028544730094092138
Train loss at epoch 2: 0.001416440096620889
Validation loss at epoch 2: 0.028648711486973545
Train loss at epoch 3: 0.0013196915678410658
Validation loss at epoch 3: 0.02798172051552683
Train loss at epoch 4: 0.0012523203113967685
Validation loss at epoch 4: 0.028014787927862595
Train loss at epoch 5: 0.0012065046729997384
Validation loss at epoch 5: 0.0275388421617787
Train loss at epoch 6: 0.0011604977192502996
Validation loss at epoch 6: 0.02686394271652468
Train loss at epoch 7: 0.0011255693604441763
Validation loss at epoch 7: 0.026304453621957113
Train loss at epoch 8: 0.0010888563557466223
Validation loss at epoch 8: 0.02558827465823428
Train loss at epoch 9: 0.0010726135460756285
Validation loss at epoch 9: 0.02536240938669917
Train loss at epoch 10: 0.0010452240475378766
Validation loss at epoch 10: 0.025414319698360156
For Extroversion:
MSE: 0.023549817502498627
RMSE: 0.1534595042467

In [None]:
# Evaluation on test set: collect per-trait predictions, then report
# regression metrics for each trait separately.
model.eval()
test_total_loss = 0
all_predictions = [[] for _ in range(5)]  # one list of predictions per trait
all_targets = [[] for _ in range(5)]  # one list of true values per trait

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'targets')
        )

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        test_total_loss += loss_fn(outputs, targets).item()

        # Bring the batch to the host once, then file each column under its trait.
        batch_predictions = outputs.cpu().numpy()
        batch_targets = targets.cpu().numpy()
        for trait_idx in range(5):
            all_predictions[trait_idx].extend(batch_predictions[:, trait_idx])
            all_targets[trait_idx].extend(batch_targets[:, trait_idx])

# Display names, in the same order as the model's five outputs.
trait_names = ["Extroversion", "Neuroticism", "Agreeableness", "Conscientiousness", "Openness"]

for i, (predictions, targets) in enumerate(zip(all_predictions, all_targets)):
    mse = mean_squared_error(targets, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(targets, predictions)
    r2 = r2_score(targets, predictions)

    print(f"For {trait_names[i]}:")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"R^2: {r2}\n")

For Extroversion:
MSE: 0.022939695045351982
RMSE: 0.15145856142044067
MAE: 0.11859782040119171
R^2: 0.12933367100562165

For Neuroticism:
MSE: 0.039613835513591766
RMSE: 0.1990322470664978
MAE: 0.16198262572288513
R^2: 0.22804763320969812

For Agreeableness:
MSE: 0.02363361231982708
RMSE: 0.1537322700023651
MAE: 0.11621047556400299
R^2: 0.018035479145879107

For Conscientiousness:
MSE: 0.02075071819126606
RMSE: 0.14405108988285065
MAE: 0.1176556795835495
R^2: 0.07265784955979493

For Openness:
MSE: 0.01933954283595085
RMSE: 0.139066681265831
MAE: 0.11536608636379242
R^2: 0.18549295347818118



In [None]:
# Lay the per-trait prediction lists out as columns (one row per sample) and
# measure how strongly the predicted traits co-vary.
prediction_columns = np.array(all_predictions).T
df_predictions = pd.DataFrame(prediction_columns, columns=trait_names)
correlation_matrix_predictions = df_predictions.corr()
print("\nCorrelation matrix for predicted values:")
print(correlation_matrix_predictions)



Correlation matrix for predicted values:
                   Extroversion  Neuroticism  Agreeableness  \
Extroversion           1.000000     0.377298       0.338042   
Neuroticism            0.377298     1.000000       0.563185   
Agreeableness          0.338042     0.563185       1.000000   
Conscientiousness      0.334508    -0.072858       0.426295   
Openness              -0.056672    -0.286838      -0.424843   

                   Conscientiousness  Openness  
Extroversion                0.334508 -0.056672  
Neuroticism                -0.072858 -0.286838  
Agreeableness               0.426295 -0.424843  
Conscientiousness           1.000000  0.086586  
Openness                    0.086586  1.000000  


In [None]:
# Correlation between the ground-truth trait scores. Note that this runs over
# the full `df` (all rows loaded from the CSV), not only the test split.
score_columns = ['ext', 'neu', 'agr', 'con', 'ope']
correlation_matrix = df.loc[:, score_columns].corr()
print(correlation_matrix)

          ext       neu       agr       con       ope
ext  1.000000  0.294476  0.145334  0.192219  0.020805
neu  0.294476  1.000000  0.325530  0.021377 -0.029465
agr  0.145334  0.325530  1.000000  0.070499 -0.004108
con  0.192219  0.021377  0.070499  1.000000  0.071473
ope  0.020805 -0.029465 -0.004108  0.071473  1.000000
