# 🧠 Personality Prediction Model Training

This notebook trains a BERT-based personality prediction model using the Big Five personality traits.

## Setup
1. **Runtime**: Go to Runtime → Change runtime type → GPU (T4, A100, or V100)
2. **Run all cells** in order
3. **Download** the trained model at the end

**Estimated time**: 15-30 minutes on GPU

In [None]:
# Install required packages
!pip install transformers torch datasets accelerate tensorboard matplotlib seaborn scikit-learn
!pip install --upgrade numpy<2  # Fix numpy compatibility

In [None]:
# Report whether PyTorch can see a CUDA device, and which one
import torch

has_gpu = torch.cuda.is_available()
print(f"CUDA available: {has_gpu}")
if not has_gpu:
    print("⚠️  No GPU detected. Training will be slower on CPU.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

In [None]:
# Import libraries
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModel, AutoConfig,
    TrainingArguments, Trainer, 
    EarlyStoppingCallback
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import warnings

# Install a blanket "ignore" filter: every warning issued through the
# `warnings` module from this point on is suppressed.
warnings.filterwarnings('ignore')
print("✅ Libraries imported successfully")

In [None]:
# Training Configuration
class TrainingConfig:
    """Hyperparameters and runtime settings for one training run.

    Every setting is a plain instance attribute, assigned in ``__init__``;
    ``to_dict`` exposes them for serialisation.
    """

    def __init__(self):
        # Probe CUDA once; both the device string and the mixed-precision
        # flag are derived from it.
        cuda_ready = torch.cuda.is_available()

        # --- Model ---
        self.model_name = "distilbert-base-uncased"  # Faster than BERT
        self.max_length = 256
        self.hidden_size = 768
        self.num_labels = 5  # Big Five traits
        self.dropout = 0.1

        # --- Optimisation ---
        self.batch_size = 32  # Larger batch for GPU
        self.learning_rate = 3e-5
        self.num_epochs = 3
        self.warmup_steps = 500
        self.weight_decay = 0.01

        # --- Dataset ---
        self.dataset_name = "essays_big5"
        self.train_split = 0.8
        self.val_split = 0.1
        self.test_split = 0.1

        # --- Evaluation / checkpointing cadence (in optimizer steps) ---
        self.eval_steps = 100
        self.logging_steps = 50
        self.save_steps = 500
        self.early_stopping_patience = 2

        # --- Hardware ---
        self.device = "cuda" if cuda_ready else "cpu"
        self.mixed_precision = cuda_ready

        # --- Reproducibility ---
        self.seed = 42

    def to_dict(self):
        """Return all attributes whose name does not start with ``_`` as a dict."""
        public = {}
        for name, value in vars(self).items():
            if name.startswith('_'):
                continue
            public[name] = value
        return public


config = TrainingConfig()
print(f"✅ Training config created. Using device: {config.device}")

In [None]:
# Load essays-big5 dataset
print("📊 Loading essays-big5 dataset from Hugging Face...")

try:
    # Only the data-access calls live inside the try. Reporting/preview code
    # is in the `else` branch so that an unrelated failure there (for example
    # `display` being undefined outside a notebook) cannot silently switch
    # the run over to synthetic data.
    ds = load_dataset("jingjietan/essays-big5")
    df = ds['train'].to_pandas()

except Exception as e:  # notebook-level boundary: any load failure falls back
    print(f"❌ Error loading dataset: {e}")
    print("🔄 Creating synthetic dataset instead...")

    # Synthetic fallback: placeholder sentences with random trait scores.
    # The scores carry no relation to the text, so this only exercises the
    # pipeline end to end; it cannot produce a meaningful model.
    num_synthetic = 2000
    trait_columns = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

    # Seeded generator so the fallback data is identical across runs.
    synthetic_rng = np.random.default_rng(config.seed)
    scores = np.clip(
        synthetic_rng.normal(0.5, 0.15, size=(num_synthetic, len(trait_columns))),
        0,
        1,
    )

    df = pd.DataFrame(scores, columns=trait_columns)
    df.insert(
        0,
        'text',
        [
            f"This is sample text {i} for personality analysis. It contains various personality indicators."
            for i in range(num_synthetic)
        ],
    )

    print(f"📈 Synthetic dataset created: {len(df)} samples")

else:
    print(f"📈 Dataset loaded: {len(df)} samples")
    print(f"📋 Columns: {df.columns.tolist()}")

    # Check the first few rows
    display(df.head())

In [None]:
# Process the dataset
print("🔧 Processing dataset...")

# Column mapping for essays-big5 dataset: single-letter columns -> trait names
column_mapping = {
    'O': 'openness',
    'C': 'conscientiousness',
    'E': 'extraversion',
    'A': 'agreeableness',
    'N': 'neuroticism'
}

# Apply column mapping (columns not present in the frame are simply ignored)
df = df.rename(columns=column_mapping)

# The five target columns, in the order the model's outputs are interpreted
big_five_traits = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

# Seeded generator so the binary -> continuous conversion below is
# reproducible from run to run (the global NumPy RNG is never seeded here).
score_rng = np.random.default_rng(config.seed)

# Process personality scores
for trait in big_five_traits:
    if trait not in df.columns:
        df[trait] = 0.5  # Default neutral value
        continue

    # Non-numeric entries become NaN instead of raising
    df[trait] = pd.to_numeric(df[trait], errors='coerce')

    # Convert binary to continuous if needed
    unique_values = df[trait].dropna().unique()
    if len(unique_values) <= 2 and set(unique_values).issubset({0, 1}):
        print(f"🔄 Converting {trait} from binary to continuous scores")
        raw = df[trait].to_numpy(dtype=float, na_value=np.nan)
        low_scores = score_rng.uniform(0.2, 0.4, size=len(raw))
        high_scores = score_rng.uniform(0.6, 0.8, size=len(raw))
        # 0 -> [0.2, 0.4), 1 -> [0.6, 0.8), anything else (incl. NaN) -> 0.5
        df[trait] = np.where(raw == 1, high_scores, np.where(raw == 0, low_scores, 0.5))

# A continuous trait column can still hold NaN where parsing failed; a NaN
# target would turn the MSE loss into NaN, so drop those rows up front.
df = df.dropna(subset=big_five_traits)

# Filter out short texts (rows with missing text compare False and are dropped too)
df = df[df['text'].str.len() > 50]

# Select final columns
df = df[['text'] + big_five_traits]

print(f"✅ Dataset processed: {len(df)} samples")
print(f"📊 Text length - Mean: {df['text'].str.len().mean():.1f}, Median: {df['text'].str.len().median():.1f}")

# Show personality trait statistics
for trait in big_five_traits:
    mean_score = df[trait].mean()
    std_score = df[trait].std()
    print(f"🎯 {trait.capitalize()}: Mean={mean_score:.3f}, Std={std_score:.3f}")

In [None]:
# Dataset class
class PersonalityDataset(Dataset):
    """Map-style dataset pairing each text with its vector of trait scores.

    Tokenisation is done per item on access: every text is truncated/padded
    to ``max_length`` and returned together with its float32 label tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        target = torch.tensor(self.labels[idx], dtype=torch.float32)

        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer returns a batch of one; squeeze drops that batch axis.
        sample = {
            field: encoded[field].squeeze()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = target
        return sample


print("✅ Dataset class defined")

In [ ]:
# Model definition
class PersonalityModel(nn.Module):
    """Transformer encoder with a linear regression head for the Big Five traits.

    Args:
        config: Either the notebook's training config (has ``model_name``,
            ``hidden_size``, ``dropout``, ``num_labels``) or any other config
            object, in which case those values are read with ``getattr`` and
            fall back to the DistilBERT defaults used below.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Handle both custom config and transformers config objects
        if hasattr(config, 'model_name'):
            model_name = config.model_name
            hidden_size = config.hidden_size
            dropout = config.dropout
            num_labels = config.num_labels
        else:
            # If it's a transformers config object, use defaults
            model_name = "distilbert-base-uncased"
            hidden_size = getattr(config, 'hidden_size', 768)
            dropout = getattr(config, 'dropout', 0.1)
            num_labels = getattr(config, 'num_labels', 5)

        # Load pre-trained model
        self.bert = AutoModel.from_pretrained(model_name)

        # Prefer the width reported by the loaded encoder so the head always
        # matches it; the configured value is only a fallback.
        encoder_config = getattr(self.bert, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', hidden_size)

        # Regression head
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and regression head.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.
            labels: Optional target tensor; when given, an MSE loss against
                the predictions is computed.

        Returns:
            dict with ``'loss'`` (``None`` when ``labels`` is ``None``) and
            ``'logits'`` (the raw head outputs, one value per trait).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # BERT-style encoders expose a pooled [CLS] vector; DistilBERT (the
        # default here) does not, so accessing `pooler_output` directly raises
        # AttributeError. Fall back to the first token's final hidden state.
        pooled_output = getattr(outputs, 'pooler_output', None)
        if pooled_output is None:
            pooled_output = outputs.last_hidden_state[:, 0]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = nn.functional.mse_loss(logits, labels)

        return {
            'loss': loss,
            'logits': logits
        }


print("✅ Model class defined")

In [None]:
# Prepare data for training
print("🔧 Preparing data for training...")

# Raw texts and their per-row lists of trait scores
texts = df['text'].tolist()
labels = df[big_five_traits].values.tolist()

# Step 1: hold out the test portion.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=config.test_split, random_state=config.seed
)

# Step 2: carve validation out of what is left. val_split is expressed as a
# fraction of the whole dataset, so rescale it to the remaining portion.
val_size = config.val_split / (1 - config.test_split)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=val_size, random_state=config.seed
)

print(f"📊 Dataset split:")
for split_name, split_texts in (
    ("Train", train_texts),
    ("Validation", val_texts),
    ("Test", test_texts),
):
    print(f"   {split_name}: {len(split_texts)} samples")

# Tokenizer matching the configured checkpoint; reuse EOS as the pad token
# when the tokenizer does not define one.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# One dataset object per split, in train / validation / test order
train_dataset, val_dataset, test_dataset = (
    PersonalityDataset(split_texts, split_labels, tokenizer, config.max_length)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

print("✅ Data preparation complete")

In [ ]:
# Initialize model
print("🤖 Initializing model...")

# Build the model from the TrainingConfig instance and place it on the
# configured device (nn.Module.to returns the module itself).
model = PersonalityModel(config).to(config.device)

parameter_count = sum(p.numel() for p in model.parameters())

print(f"✅ Model initialized and moved to {config.device}")
print(f"📊 Model parameters: {parameter_count:,}")

In [None]:
# Define metrics computation
def compute_metrics(eval_pred):
    """Compute MSE, MAE and R² per trait and over all traits pooled together.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of 2-D arrays whose
            columns are indexed in the trait order listed below.

    Returns:
        dict mapping ``'<trait>_mse' / '_mae' / '_r2'`` for each trait, plus
        ``'overall_mse' / 'overall_mae' / 'overall_r2'`` computed on the
        flattened arrays.
    """
    predictions, labels = eval_pred

    scorers = (
        ('mse', mean_squared_error),
        ('mae', mean_absolute_error),
        ('r2', r2_score),
    )
    trait_names = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

    metrics = {}

    # Per-trait scores: one column of the arrays per trait
    for column, trait in enumerate(trait_names):
        y_true = labels[:, column]
        y_pred = predictions[:, column]
        for suffix, scorer in scorers:
            metrics[f'{trait}_{suffix}'] = scorer(y_true, y_pred)

    # Overall scores: every (sample, trait) cell treated as one observation
    flat_true = labels.flatten()
    flat_pred = predictions.flatten()
    for suffix, scorer in scorers:
        metrics[f'overall_{suffix}'] = scorer(flat_true, flat_pred)

    return metrics


print("✅ Metrics function defined")

In [None]:
# Set up training
print("🚀 Setting up training...")

# NOTE(review): warmup_steps / eval_steps / save_steps are absolute step
# counts. On a small dataset the whole run may be shorter than
# config.warmup_steps or config.save_steps — confirm against the actual
# number of optimizer steps (samples / batch_size * epochs).
training_args = TrainingArguments(
    output_dir='./personality_model',
    num_train_epochs=config.num_epochs,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    warmup_steps=config.warmup_steps,
    weight_decay=config.weight_decay,
    logging_dir='./logs',
    logging_steps=config.logging_steps,
    eval_strategy="steps",
    eval_steps=config.eval_steps,
    save_steps=config.save_steps,
    # Requires checkpoints to line up with evaluations (save_steps must be a
    # multiple of eval_steps), which holds for the configured 500 / 100.
    load_best_model_at_end=True,
    metric_for_best_model="eval_overall_r2",
    greater_is_better=True,
    save_total_limit=3,
    fp16=config.mixed_precision,
    learning_rate=config.learning_rate,
    # Passing None does NOT disable integrations in transformers 4.x (it is
    # treated as "all installed", which includes wandb). Name TensorBoard
    # explicitly: it is installed above and logging_dir is set for it.
    report_to="tensorboard",
    dataloader_pin_memory=False,  # Helps with Colab
    seed=config.seed,  # keep Trainer seeding in sync with the data split seed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop when eval_overall_r2 has not improved for `patience` evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=config.early_stopping_patience)]
)

print("✅ Trainer initialized")

In [None]:
# Start training
print("🎯 Starting training...")
print("⏱️  This may take 15-30 minutes depending on your GPU")

try:
    trainer.train()
except Exception as e:
    # Surface a short message in the cell output, then let the original
    # exception propagate so the notebook run stops here.
    print(f"❌ Training failed: {e}")
    raise
else:
    print("\n🎉 Training completed successfully!")

In [None]:
# Evaluate on test set
print("📊 Evaluating on test set...")

test_results = trainer.evaluate(test_dataset)

# Only the regression metrics are printed; other entries in the result
# dict are skipped.
metric_tags = ('r2', 'mse', 'mae')

print("\n📈 Test Results:")
for key, value in test_results.items():
    if not any(tag in key for tag in metric_tags):
        continue
    print(f"   {key}: {value:.4f}")

In [None]:
# Save the trained model
print("💾 Saving trained model...")

output_dir = './personality_model_final'
os.makedirs(output_dir, exist_ok=True)

# PersonalityModel is a plain nn.Module and has no save_pretrained(), so the
# weights (encoder + regression head) are written as a state dict. Tensors
# are moved to CPU first so the file loads on a machine without a GPU.
cpu_state = {name: tensor.detach().cpu() for name, tensor in model.state_dict().items()}
torch.save(cpu_state, os.path.join(output_dir, 'pytorch_model.bin'))

# Encoder architecture config, needed to rebuild the encoder before loading
# the state dict
model.bert.config.save_pretrained(output_dir)

# Tokenizer files
tokenizer.save_pretrained(output_dir)

# Save configuration
with open('./personality_model_final/training_config.json', 'w', encoding='utf-8') as f:
    json.dump(config.to_dict(), f, indent=2)

print("✅ Model saved to './personality_model_final'")

# Create a zip file for easy download
import shutil
shutil.make_archive('personality_model_trained', 'zip', output_dir)
print("📦 Model packaged as 'personality_model_trained.zip'")

print("\n🎯 Next steps:")
print("1. Download the 'personality_model_trained.zip' file")
print("2. Extract it to your local models/ directory")
print("3. Update your Flask backend to use the trained model")
print("4. Set use_mock_model = False in model_loader.py")

In [None]:
# Test the trained model
print("🧪 Testing trained model...")

# Test with a sample text
test_text = "I love meeting new people and exploring creative projects. I'm always organized and plan everything in advance."

# Tokenize
encoded = tokenizer(test_text, return_tensors="pt", truncation=True, padding=True, max_length=config.max_length)

# Forward only the tensors the model's forward() accepts. Some tokenizers
# (e.g. BERT's) also return token_type_ids, which would raise a TypeError
# if the whole encoding were unpacked into the call.
inputs = {name: encoded[name].to(config.device) for name in ("input_ids", "attention_mask")}

# Predict
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs['logits'].cpu().numpy()[0]

# Display results
trait_names = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']
print(f"\n🎯 Personality Analysis for: '{test_text}'")
print("━" * 60)
for trait, score in zip(trait_names, predictions):
    # Bucket the raw score: > 0.6 High, > 0.4 Medium, otherwise Low
    if score > 0.6:
        level = 'High'
    elif score > 0.4:
        level = 'Medium'
    else:
        level = 'Low'
    print(f"📊 {trait:15}: {score:.3f} ({level})")

print("\n✅ Model test completed successfully!")