# LLM Embeddings + XGBoost Engagement Prediction Model

This notebook trains an XGBoost regression model to predict post engagement metrics using:
- **LLM embeddings** (Google Gemini) for post text semantic understanding
- **Persona metadata** (job role, affiliation, account age)
- **Context metadata** (audience size, baseline engagement, time window)

**Target Variables:**
- % positive reactions
- % negative reactions  
- Comment sentiment distribution
- Engagement velocity (early vs late)

In [8]:
pip install python-dotenv



In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import glob
import time
from typing import List, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from xgboost import XGBRegressor, XGBClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.compose import ColumnTransformer

# Google AI for embeddings
try:
    import google.generativeai as genai
    GEMINI_AVAILABLE = True
except ImportError:
    print("Warning: google-generativeai not installed. Install with: pip install google-generativeai")
    GEMINI_AVAILABLE = False

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)

# Set random seed for reproducibility
np.random.seed(42)


In [10]:
def clean_linkedin_post_data(posts: List[Dict]) -> List[Dict]:
    """
    Clean and validate LinkedIn post data.
    
    Args:
        posts: List of post dictionaries from JSON files
        
    Returns:
        List of cleaned post dictionaries
    """
    cleaned_posts = []
    seen_urns = set()
    
    issues = {
        'missing_text': 0,
        'empty_text': 0,
        'missing_stats': 0,
        'missing_author': 0,
        'missing_timestamp': 0,
        'invalid_stats': 0,
        'missing_source_company': 0,
        'duplicate_urns': 0
    }
    
    for post in posts:
        # Check for duplicates by URN
        urn = post.get('activity_urn') or post.get('full_urn')
        if urn and urn in seen_urns:
            issues['duplicate_urns'] += 1
            continue
        if urn:
            seen_urns.add(urn)
        
        # Validate required fields
        if 'text' not in post or not post['text']:
            issues['missing_text'] += 1
            continue
        
        if not post['text'].strip():
            issues['empty_text'] += 1
            continue
        
        if 'stats' not in post or not post['stats']:
            issues['missing_stats'] += 1
            continue
        
        if 'author' not in post or not post['author']:
            issues['missing_author'] += 1
            continue
        
        if 'posted_at' not in post or not post['posted_at'] or not post['posted_at'].get('timestamp'):
            issues['missing_timestamp'] += 1
            continue
        
        # Validate and normalize stats
        stats = post['stats']
        if not isinstance(stats, dict):
            issues['invalid_stats'] += 1
            continue
        
        # Ensure all stat fields exist with defaults
        required_stats = ['total_reactions', 'like', 'love', 'celebrate', 'support', 'insight', 'comments', 'reposts']
        for stat in required_stats:
            if stat not in stats:
                stats[stat] = 0
        
        # Ensure source_company exists
        if 'source_company' not in post or not post['source_company']:
            issues['missing_source_company'] += 1
            # Try to infer from author name
            if post.get('author', {}).get('name'):
                post['source_company'] = post['author']['name'].lower()
            else:
                post['source_company'] = 'unknown'
        
        # Clean the post
        cleaned_post = {
            'activity_urn': post.get('activity_urn'),
            'full_urn': post.get('full_urn'),
            'post_url': post.get('post_url'),
            'text': post['text'].strip(),  # Trim whitespace
            'posted_at': post['posted_at'],
            'post_language_code': post.get('post_language_code', 'en'),
            'post_type': post.get('post_type', 'regular'),
            'author': post['author'],
            'stats': stats,
            'media': post.get('media'),
            'source_company': post['source_company']
        }
        
        # Only include document field if it's not null
        if post.get('document') is not None:
            cleaned_post['document'] = post['document']
        
        cleaned_posts.append(cleaned_post)
    
    # Print cleaning summary
    if any(count > 0 for count in issues.values()):
        print("Data cleaning issues found:")
        for issue, count in issues.items():
            if count > 0:
                print(f"  - {issue}: {count}")
    
    print(f"✓ Cleaned {len(cleaned_posts)}/{len(posts)} posts")
    return cleaned_posts

## Configuration

Set up API keys and model parameters

In [None]:
# Configuration
# Load API key from .env file (loaded via dotenv in imports cell)
GEMINI_API_KEY = ""
print(f"API Key loaded: {'Yes' if GEMINI_API_KEY else 'No'} (length: {len(GEMINI_API_KEY)})")
EMBEDDING_MODEL = 'models/gemini-embedding-001'
DATA_PATH = 'data/'  # Path to your training data directory, or None to simulate

# Configure Gemini API if available
if GEMINI_AVAILABLE and GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    print("✓ Gemini API configured")
else:
    print("⚠ Gemini API not configured. Embeddings will be simulated.")
    if not GEMINI_API_KEY:
        print("   Tip: Make sure GEMINI_API_KEY is set in your .env file")


API Key loaded: Yes (length: 39)
✓ Gemini API configured


## Data Loading Functions

Functions to load and transform LinkedIn post data from JSON files.

In [12]:
import glob
from datetime import datetime

def load_linkedin_json_data(data_folder: str) -> pd.DataFrame:
    """
    Load and transform LinkedIn post JSON files from the data folder.
    
    Args:
        data_folder: Path to folder containing JSON files
        
    Returns:
        DataFrame with transformed LinkedIn post data matching expected format
    """
    # Find all company post JSON files
    json_pattern = os.path.join(data_folder, '*company-posts*.json')
    json_files = glob.glob(json_pattern)
    
    if not json_files:
        raise ValueError(f"No company post JSON files found in {data_folder}")
    
    print(f"Found {len(json_files)} JSON file(s) to load...")
    
    all_posts = []
    
    for json_file in json_files:
        print(f"Loading {os.path.basename(json_file)}...")
        with open(json_file, 'r', encoding='utf-8') as f:
            posts = json.load(f)
        
        if not isinstance(posts, list):
            posts = [posts]
        
        print(f"  Found {len(posts)} posts")
        
        # Clean the posts before adding to all_posts
        cleaned_posts = clean_linkedin_post_data(posts)
        all_posts.extend(cleaned_posts)
    
    print(f"Total posts loaded and cleaned: {len(all_posts)}")
    print("Transforming data...")
    
    transformed_data = []
    
    for post in all_posts:
        # Skip posts without required fields
        if not post.get('text') or not post.get('stats'):
            continue
        
        # Extract basic fields
        post_text = post.get('text', '')
        stats = post.get('stats', {})
        author = post.get('author', {})
        posted_at = post.get('posted_at', {})
        
        # Calculate engagement metrics
        total_reactions = stats.get('total_reactions', 0)
        like_count = stats.get('like', 0)
        love_count = stats.get('love', 0)
        celebrate_count = stats.get('celebrate', 0)
        support_count = stats.get('support', 0)
        insight_count = stats.get('insight', 0)
        comments_count = stats.get('comments', 0)
        reposts_count = stats.get('reposts', 0)
        
        # Calculate pct_positive (positive reactions / total reactions * 100)
        positive_reactions = like_count + love_count + celebrate_count + support_count
        if total_reactions > 0:
            pct_positive = (positive_reactions / total_reactions) * 100
            pct_negative = (insight_count / total_reactions) * 100
        else:
            pct_positive = 0.0
            pct_negative = 0.0
        
        # Audience size from follower count
        audience_size = author.get('follower_count', 0)
        if audience_size == 0:
            audience_size = 1000  # Default minimum
        
        # Calculate baseline engagement
        total_engagement = total_reactions + comments_count + reposts_count
        baseline_engagement = total_engagement / max(audience_size, 1)
        
        # Calculate comment sentiment distribution (normalized -1 to 1)
        if total_reactions > 0:
            comment_sentiment_dist = (pct_positive - pct_negative) / 100
            comment_sentiment_dist = np.clip(comment_sentiment_dist, -1, 1)
        else:
            comment_sentiment_dist = 0.0
        
        # Extract time window from timestamp
        timestamp = posted_at.get('timestamp', 0)
        if timestamp > 0:
            # Convert timestamp (milliseconds) to datetime
            post_datetime = datetime.fromtimestamp(timestamp / 1000)
            hour = post_datetime.hour
            weekday = post_datetime.weekday()  # 0 = Monday, 6 = Sunday
            
            if weekday >= 5:  # Saturday or Sunday
                time_window = 'weekend'
            elif hour < 12:
                time_window = 'morning'
            elif hour < 17:
                time_window = 'afternoon'
            else:
                time_window = 'evening'
        else:
            time_window = 'afternoon'  # Default
        
        # Extract affiliation from source_company
        affiliation = post.get('source_company', 'Unknown')
        if affiliation and isinstance(affiliation, str):
            affiliation = affiliation.capitalize()
        else:
            affiliation = 'Unknown'
        
        # Infer job_role from company name or use default
        # For company posts, we'll use a default role based on the company
        company_name = affiliation.lower()
        if 'google' in company_name or 'microsoft' in company_name or 'apple' in company_name:
            job_role = 'Software Engineer'
        elif 'meta' in company_name or 'facebook' in company_name:
            job_role = 'Product Manager'
        else:
            job_role = 'Product Manager'  # Default for company posts
        
        # Account age - use default since we don't have account creation date
        # For company accounts, use a reasonable default (e.g., 5 years)
        account_age = 1825  # 5 years in days
        
        # Calculate engagement velocity
        # Estimate based on post age and engagement rate
        # Newer posts with high engagement = high velocity
        if timestamp > 0:
            # Calculate post age in days
            current_time = datetime.now().timestamp() * 1000
            post_age_days = (current_time - timestamp) / (1000 * 60 * 60 * 24)
            
            # High engagement rate + recent post = high velocity
            engagement_rate = baseline_engagement
            if post_age_days < 1:  # Less than 1 day old
                velocity_factor = 0.9
            elif post_age_days < 7:  # Less than 1 week old
                velocity_factor = 0.7
            else:
                velocity_factor = 0.5
            
            engagement_velocity = min(engagement_rate * 10 * velocity_factor, 1.0)
            engagement_velocity = max(engagement_velocity, 0.0)
        else:
            engagement_velocity = 0.5  # Default
        
        transformed_data.append({
            'post_text': post_text,
            'job_role': job_role,
            'affiliation': affiliation,
            'account_age': account_age,
            'audience_size': audience_size,
            'baseline_engagement': baseline_engagement,
            'time_window': time_window,
            'pct_positive': pct_positive,
            'pct_negative': pct_negative,
            'comment_sentiment_dist': comment_sentiment_dist,
            'engagement_velocity': engagement_velocity
        })
    
    df = pd.DataFrame(transformed_data)
    print(f"✓ Transformed {len(df)} posts")
    return df

In [13]:
# Convert all JSON files to CSV
import glob
from pathlib import Path

data_folder = 'data'
json_pattern = os.path.join(data_folder, '*.json')
json_files = glob.glob(json_pattern)

print(f"Found {len(json_files)} JSON file(s) to convert...\n")

for json_file in sorted(json_files):
    try:
        # Load and transform using existing function
        print(f"Processing {os.path.basename(json_file)}...")
        
        # Load JSON
        with open(json_file, 'r', encoding='utf-8') as f:
            posts = json.load(f)
        
        if not isinstance(posts, list):
            posts = [posts]
        
        print(f"  Loaded {len(posts)} posts")
        
        # Clean posts
        cleaned_posts = clean_linkedin_post_data(posts)
        print(f"  Cleaned: {len(cleaned_posts)} posts")
        
        # Transform to DataFrame using existing function
        # We'll use a simplified version that processes one file
        transformed_data = []
        
        for post in cleaned_posts:
            if not post.get('text') or not post.get('stats'):
                continue
            
            post_text = post.get('text', '')
            stats = post.get('stats', {})
            author = post.get('author', {})
            posted_at = post.get('posted_at', {})
            
            total_reactions = stats.get('total_reactions', 0)
            like_count = stats.get('like', 0)
            love_count = stats.get('love', 0)
            celebrate_count = stats.get('celebrate', 0)
            support_count = stats.get('support', 0)
            insight_count = stats.get('insight', 0)
            comments_count = stats.get('comments', 0)
            reposts_count = stats.get('reposts', 0)
            
            positive_reactions = like_count + love_count + celebrate_count + support_count
            if total_reactions > 0:
                pct_positive = (positive_reactions / total_reactions) * 100
                pct_negative = (insight_count / total_reactions) * 100
            else:
                pct_positive = 0.0
                pct_negative = 0.0
            
            audience_size = author.get('follower_count', 0)
            if audience_size == 0:
                audience_size = 1000
            
            total_engagement = total_reactions + comments_count + reposts_count
            baseline_engagement = total_engagement / max(audience_size, 1)
            
            if total_reactions > 0:
                comment_sentiment_dist = (pct_positive - pct_negative) / 100
                comment_sentiment_dist = np.clip(comment_sentiment_dist, -1, 1)
            else:
                comment_sentiment_dist = 0.0
            
            timestamp = posted_at.get('timestamp', 0)
            if timestamp > 0:
                from datetime import datetime
                post_datetime = datetime.fromtimestamp(timestamp / 1000)
                hour = post_datetime.hour
                weekday = post_datetime.weekday()
                
                if weekday >= 5:
                    time_window = 'weekend'
                elif hour < 12:
                    time_window = 'morning'
                elif hour < 17:
                    time_window = 'afternoon'
                else:
                    time_window = 'evening'
            else:
                time_window = 'afternoon'
            
            affiliation = post.get('source_company', 'Unknown')
            if affiliation and isinstance(affiliation, str):
                affiliation = affiliation.capitalize()
            else:
                affiliation = 'Unknown'
            
            company_name = affiliation.lower()
            if 'google' in company_name or 'microsoft' in company_name or 'apple' in company_name:
                job_role = 'Software Engineer'
            elif 'meta' in company_name or 'facebook' in company_name:
                job_role = 'Product Manager'
            else:
                job_role = 'Product Manager'
            
            account_age = 1825
            
            if timestamp > 0:
                from datetime import datetime
                current_time = datetime.now().timestamp() * 1000
                post_age_days = (current_time - timestamp) / (1000 * 60 * 60 * 24)
                engagement_rate = baseline_engagement
                if post_age_days < 1:
                    velocity_factor = 0.9
                elif post_age_days < 7:
                    velocity_factor = 0.7
                else:
                    velocity_factor = 0.5
                engagement_velocity = min(engagement_rate * 10 * velocity_factor, 1.0)
                engagement_velocity = max(engagement_velocity, 0.0)
            else:
                engagement_velocity = 0.5
            
            transformed_data.append({
                'post_text': post_text,
                'job_role': job_role,
                'affiliation': affiliation,
                'account_age': account_age,
                'audience_size': audience_size,
                'baseline_engagement': baseline_engagement,
                'time_window': time_window,
                'pct_positive': pct_positive,
                'pct_negative': pct_negative,
                'comment_sentiment_dist': comment_sentiment_dist,
                'engagement_velocity': engagement_velocity
            })
        
        df = pd.DataFrame(transformed_data)
        
        # Save to CSV
        csv_file = json_file.replace('.json', '.csv')
        df.to_csv(csv_file, index=False, encoding='utf-8')
        print(f"  ✓ Saved to {os.path.basename(csv_file)} ({len(df)} rows, {len(df.columns)} columns)\n")
        
    except Exception as e:
        print(f"  ✗ Error: {e}\n")

print("✓ Conversion complete!")


Found 0 JSON file(s) to convert...

✓ Conversion complete!


In [17]:
def load_data(data_path: Optional[str] = None) -> pd.DataFrame:
    """
    Load data from CSV/JSON file or directory of JSON files.
    
    Expected columns:
    - post_text: string
    - job_role: string (persona metadata)
    - affiliation: string (persona metadata)
    - account_age: int (days, persona metadata)
    - audience_size: int (context metadata)
    - baseline_engagement: float (context metadata)
    - time_window: string (context metadata)
    - pct_positive: float (target, 0-100)
    - pct_negative: float (target, 0-100)
    - comment_sentiment_dist: float (target, -1 to 1)
    - engagement_velocity: float (target, 0-1)
    """
    if not data_path:
        raise ValueError("data_path is required. Please provide a path to your data directory or file.")
    
    if not os.path.exists(data_path):
        raise ValueError(f"Data path does not exist: {data_path}")
    
    # Check if it's a directory (for loading multiple JSON files)
    if os.path.isdir(data_path):
        print(f"Loading data from directory: {data_path}")
        df = load_linkedin_json_data(data_path)
        print(f"✓ Loaded {len(df)} rows from {data_path}")
        return df
    
    # Otherwise, treat as a single file
    print(f"Loading data from {data_path}...")
    if data_path.endswith('.csv'):
        df = pd.read_csv(data_path)
    elif data_path.endswith('.json'):
        df = pd.read_json(data_path)
    else:
        raise ValueError(f"Unsupported file format: {data_path}")
    print(f"✓ Loaded {len(df)} rows from {data_path}")
    return df


In [18]:
# Load data from JSON files
df = load_linkedin_json_data(DATA_PATH)
print(f"\nDataset shape: {df.shape}")
print(f"\nFirst few rows:")
df.head()

ValueError: No company post JSON files found in data/

### Load Data

Load data from the `data/` folder containing JSON files.


## Data Cleaning and Preprocessing

In [19]:
# Data cleaning and exploration
print("Dataset Info:")
print(df.info())
print("\nMissing values:")
print(df.isnull().sum())
print("\nTarget variable statistics:")
print(df[['pct_positive', 'pct_negative', 'comment_sentiment_dist', 'engagement_velocity']].describe())

# Handle missing values (if any)
df = df.dropna()

# Basic text normalization (optional - embeddings handle this well)
df['post_text_clean'] = df['post_text'].str.lower().str.strip()

print(f"\n✓ Cleaned dataset: {len(df)} rows")

Dataset Info:


NameError: name 'df' is not defined

## LLM Embedding Generation

Generate text embeddings using Google Gemini API.

In [20]:
def generate_embeddings(texts: List[str], model_name: str = EMBEDDING_MODEL, batch_size: int = 10) -> np.ndarray:
    """
    Generate embeddings for a list of texts using Google Gemini API.
    Includes batch processing, error handling, and fallback to simulated embeddings.
    """
    embeddings = []
    
    if not GEMINI_AVAILABLE or not GEMINI_API_KEY:
        print("⚠ Using simulated embeddings (random vectors)")
        # Generate random embeddings with same dimension as Gemini (768 for text-embedding-004, 768/1536/3072 for gemini-embedding-001)
        embedding_dim = 768
        for text in texts:
            # Create deterministic "embeddings" based on text hash for reproducibility
            np.random.seed(hash(text) % 2**32)
            emb = np.random.randn(embedding_dim)
            emb = emb / np.linalg.norm(emb)  # Normalize
            embeddings.append(emb)
        return np.array(embeddings)
    
    print(f"Generating embeddings for {len(texts)} texts using {model_name}...")
    
    # Try different API formats (support both old and new Google AI SDK)
    try:
        # Try new API format (google-genai package)
        try:
            from google import genai as genai_new
            client = genai_new.Client(api_key=GEMINI_API_KEY)
            use_new_api = True
        except:
            use_new_api = False
    except:
        use_new_api = False
    
    # Process in batches to handle rate limits
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        batch_embeddings = []
        
        for text in batch:
            try:
                if use_new_api:
                    # New API format
                    response = client.models.embed_content(
                        model=model_name,
                        contents=text
                    )
                    embedding = response.embeddings[0].values if hasattr(response.embeddings[0], 'values') else response.embeddings[0]
                else:
                    # Old API format (google-generativeai)
                    result = genai.embed_content(
                        model=model_name,
                        content=text,
                        task_type="RETRIEVAL_DOCUMENT"
                    )
                    # Handle different response formats
                    if isinstance(result, dict):
                        embedding = result.get('embedding', result.get('values', []))
                    else:
                        embedding = result.embedding if hasattr(result, 'embedding') else result
                
                batch_embeddings.append(embedding)
                
                # Small delay to avoid rate limits
                time.sleep(0.1)
            except Exception as e:
                print(f"Error embedding text {i}: {str(e)}")
                # Fallback: use random embedding
                embedding_dim = 768
                np.random.seed(hash(text) % 2**32)
                emb = np.random.randn(embedding_dim)
                emb = emb / np.linalg.norm(emb)
                batch_embeddings.append(emb)
        
        embeddings.extend(batch_embeddings)
        
        if (i + batch_size) % 50 == 0:
            print(f"  Processed {min(i + batch_size, len(texts))}/{len(texts)} texts...")
    
    print(f"✓ Generated {len(embeddings)} embeddings")
    return np.array(embeddings)

# Generate embeddings for all post texts
print("Generating text embeddings...")
text_embeddings = generate_embeddings(df['post_text_clean'].tolist())
print(f"Embedding shape: {text_embeddings.shape}")
print(f"Embedding dimension: {text_embeddings.shape[1]}")

Generating text embeddings...


NameError: name 'df' is not defined

In [None]:
def generate_embeddings(texts: List[str], model_name: str = EMBEDDING_MODEL, batch_size: int = 10) -> np.ndarray:
    """
    Generate embeddings for a list of texts using Google Gemini API.
    Includes batch processing, error handling, and fallback to simulated embeddings.
    """
    embeddings = []
    
    if not GEMINI_AVAILABLE or not GEMINI_API_KEY:
        print("⚠ Using simulated embeddings (random vectors)")
        # Generate random embeddings with same dimension as Gemini (768 for text-embedding-004, 768/1536/3072 for gemini-embedding-001)
        embedding_dim = 768
        for text in texts:
            # Create deterministic "embeddings" based on text hash for reproducibility
            np.random.seed(hash(text) % 2**32)
            emb = np.random.randn(embedding_dim)
            emb = emb / np.linalg.norm(emb)  # Normalize
            embeddings.append(emb)
        return np.array(embeddings)
    
    print(f"Generating embeddings for {len(texts)} texts using {model_name}...")
    
    # Try different API formats (support both old and new Google AI SDK)
    try:
        # Try new API format (google-genai package)
        try:
            from google import genai as genai_new
            client = genai_new.Client(api_key=GEMINI_API_KEY)
            use_new_api = True
        except:
            use_new_api = False
    except:
        use_new_api = False
    
    # Process in batches to handle rate limits
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        batch_embeddings = []
        
        for text in batch:
            try:
                if use_new_api:
                    # New API format
                    response = client.models.embed_content(
                        model=model_name,
                        contents=text
                    )
                    embedding = response.embeddings[0].values if hasattr(response.embeddings[0], 'values') else response.embeddings[0]
                else:
                    # Old API format (google-generativeai)
                    result = genai.embed_content(
                        model=model_name,
                        content=text,
                        task_type="RETRIEVAL_DOCUMENT"
                    )
                    # Handle different response formats
                    if isinstance(result, dict):
                        embedding = result.get('embedding', result.get('values', []))
                    else:
                        embedding = result.embedding if hasattr(result, 'embedding') else result
                
                batch_embeddings.append(embedding)
                
                # Small delay to avoid rate limits
                time.sleep(0.1)
            except Exception as e:
                print(f"Error embedding text {i}: {str(e)}")
                # Fallback: use random embedding
                embedding_dim = 768
                np.random.seed(hash(text) % 2**32)
                emb = np.random.randn(embedding_dim)
                emb = emb / np.linalg.norm(emb)
                batch_embeddings.append(emb)
        
        embeddings.extend(batch_embeddings)
        
        if (i + batch_size) % 50 == 0:
            print(f"  Processed {min(i + batch_size, len(texts))}/{len(texts)} texts...")
    
    print(f"✓ Generated {len(embeddings)} embeddings")
    return np.array(embeddings)

# Generate embeddings for all post texts
print("Generating text embeddings...")
text_embeddings = generate_embeddings(df['post_text_clean'].tolist())
print(f"Embedding shape: {text_embeddings.shape}")
print(f"Embedding dimension: {text_embeddings.shape[1]}")

Generating text embeddings...
⚠ Using simulated embeddings (random vectors)
Embedding shape: (500, 768)
Embedding dimension: 768


## Feature Engineering

Combine text embeddings with persona and context metadata

In [None]:
def prepare_features(df: pd.DataFrame, text_embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """
    Prepare feature matrix by combining embeddings with metadata.
    Returns: (X, y, feature_info)
    """
    # Encode categorical features
    le_job = LabelEncoder()
    le_affiliation = LabelEncoder()
    le_time = LabelEncoder()
    
    job_encoded = le_job.fit_transform(df['job_role'])
    affiliation_encoded = le_affiliation.fit_transform(df['affiliation'])
    time_encoded = le_time.fit_transform(df['time_window'])
    
    # One-hot encode categorical features for better representation
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    categorical_features = ohe.fit_transform(
        df[['job_role', 'affiliation', 'time_window']]
    )
    
    # Normalize numerical features
    scaler = StandardScaler()
    numerical_features = scaler.fit_transform(
        df[['account_age', 'audience_size', 'baseline_engagement']]
    )
    
    # Combine all features: embeddings + categorical + numerical
    X = np.hstack([
        text_embeddings,  # Text embeddings (e.g., 768 dims)
        categorical_features,  # One-hot encoded categoricals
        numerical_features  # Normalized numerical features
    ])
    
    # Extract target variables
    y = df[['pct_positive', 'pct_negative', 'comment_sentiment_dist', 'engagement_velocity']].values
    
    feature_info = {
        'embedding_dim': text_embeddings.shape[1],
        'categorical_dim': categorical_features.shape[1],
        'numerical_dim': numerical_features.shape[1],
        'total_dim': X.shape[1],
        'label_encoders': {
            'job_role': le_job,
            'affiliation': le_affiliation,
            'time_window': le_time
        },
        'one_hot_encoder': ohe,
        'scaler': scaler
    }
    
    return X, y, feature_info

# Prepare features
X, y, feature_info = prepare_features(df, text_embeddings)

print(f"Feature matrix shape: {X.shape}")
print(f"Target matrix shape: {y.shape}")
print(f"\nFeature breakdown:")
print(f"  - Text embeddings: {feature_info['embedding_dim']} dimensions")
print(f"  - Categorical features: {feature_info['categorical_dim']} dimensions")
print(f"  - Numerical features: {feature_info['numerical_dim']} dimensions")
print(f"  - Total features: {feature_info['total_dim']} dimensions")
print(f"\nTarget variables: pct_positive, pct_negative, comment_sentiment_dist, engagement_velocity")

Feature matrix shape: (500, 788)
Target matrix shape: (500, 4)

Feature breakdown:
  - Text embeddings: 768 dimensions
  - Categorical features: 17 dimensions
  - Numerical features: 3 dimensions
  - Total features: 788 dimensions

Target variables: pct_positive, pct_negative, comment_sentiment_dist, engagement_velocity


In [None]:
def prepare_features(df: pd.DataFrame, text_embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """
    Prepare feature matrix by combining embeddings with metadata.
    Returns: (X, y, feature_info)
    """
    # Encode categorical features
    le_job = LabelEncoder()
    le_affiliation = LabelEncoder()
    le_time = LabelEncoder()
    
    job_encoded = le_job.fit_transform(df['job_role'])
    affiliation_encoded = le_affiliation.fit_transform(df['affiliation'])
    time_encoded = le_time.fit_transform(df['time_window'])
    
    # One-hot encode categorical features for better representation
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    categorical_features = ohe.fit_transform(
        df[['job_role', 'affiliation', 'time_window']]
    )
    
    # Normalize numerical features
    scaler = StandardScaler()
    numerical_features = scaler.fit_transform(
        df[['account_age', 'audience_size', 'baseline_engagement']]
    )
    
    # Combine all features: embeddings + categorical + numerical
    X = np.hstack([
        text_embeddings,  # Text embeddings (e.g., 768 dims)
        categorical_features,  # One-hot encoded categoricals
        numerical_features  # Normalized numerical features
    ])
    
    # Extract target variables
    y = df[['pct_positive', 'pct_negative', 'comment_sentiment_dist', 'engagement_velocity']].values
    
    feature_info = {
        'embedding_dim': text_embeddings.shape[1],
        'categorical_dim': categorical_features.shape[1],
        'numerical_dim': numerical_features.shape[1],
        'total_dim': X.shape[1],
        'label_encoders': {
            'job_role': le_job,
            'affiliation': le_affiliation,
            'time_window': le_time
        },
        'one_hot_encoder': ohe,
        'scaler': scaler
    }
    
    return X, y, feature_info

# Prepare features
X, y, feature_info = prepare_features(df, text_embeddings)

print(f"Feature matrix shape: {X.shape}")
print(f"Target matrix shape: {y.shape}")
print(f"\nFeature breakdown:")
print(f"  - Text embeddings: {feature_info['embedding_dim']} dimensions")
print(f"  - Categorical features: {feature_info['categorical_dim']} dimensions")
print(f"  - Numerical features: {feature_info['numerical_dim']} dimensions")
print(f"  - Total features: {feature_info['total_dim']} dimensions")
print(f"\nTarget variables: pct_positive, pct_negative, comment_sentiment_dist, engagement_velocity")

Feature matrix shape: (500, 788)
Target matrix shape: (500, 4)

Feature breakdown:
  - Text embeddings: 768 dimensions
  - Categorical features: 17 dimensions
  - Numerical features: 3 dimensions
  - Total features: 788 dimensions

Target variables: pct_positive, pct_negative, comment_sentiment_dist, engagement_velocity


## Model Training

Train XGBoost multi-output regression model

In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Create XGBoost model with MultiOutputRegressor wrapper
# This trains one XGBoost model per target variable
base_model = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

model = MultiOutputRegressor(base_model)

print("\nTraining model...")
model.fit(X_train, y_train)
print("✓ Model training complete")

# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print(f"\nPredictions shape: {y_test_pred.shape}")

Training set: 400 samples
Test set: 100 samples

Training model...
✓ Model training complete

Predictions shape: (100, 4)


## Model Evaluation

Calculate metrics and analyze performance

In [None]:
def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray, target_names: List[str]) -> pd.DataFrame:
    """
    Calculate evaluation metrics for each target variable.
    """
    results = []
    
    for i, target_name in enumerate(target_names):
        y_true_i = y_true[:, i]
        y_pred_i = y_pred[:, i]
        
        mae = mean_absolute_error(y_true_i, y_pred_i)
        rmse = np.sqrt(mean_squared_error(y_true_i, y_pred_i))
        r2 = r2_score(y_true_i, y_pred_i)
        
        # Calculate correlation
        correlation = np.corrcoef(y_true_i, y_pred_i)[0, 1]
        
        results.append({
            'target': target_name,
            'MAE': mae,
            'RMSE': rmse,
            'R²': r2,
            'Correlation': correlation
        })
    
    return pd.DataFrame(results)

# Evaluate on train and test sets
target_names = ['pct_positive', 'pct_negative', 'comment_sentiment_dist', 'engagement_velocity']

train_metrics = evaluate_model(y_train, y_train_pred, target_names)
test_metrics = evaluate_model(y_test, y_test_pred, target_names)

print("Training Set Metrics:")
print(train_metrics.to_string(index=False))
print("\nTest Set Metrics:")
print(test_metrics.to_string(index=False))

Training Set Metrics:
                target      MAE     RMSE       R²  Correlation
          pct_positive 1.025742 1.336643 0.986629     0.995914
          pct_negative 0.399778 0.526746 0.986503     0.996368
comment_sentiment_dist 0.009412 0.012375 0.989928     0.996738
   engagement_velocity 0.009688 0.013141 0.986181     0.996036

Test Set Metrics:
                target      MAE      RMSE        R²  Correlation
          pct_positive 9.738217 12.188792  0.015307     0.281487
          pct_negative 3.924400  4.847199 -0.437728    -0.059668
comment_sentiment_dist 0.109485  0.135864 -0.064070     0.205078
   engagement_velocity 0.122085  0.144606 -0.050543     0.159297


In [None]:
def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray, target_names: List[str]) -> pd.DataFrame:
    """
    Calculate evaluation metrics for each target variable.
    """
    results = []
    
    for i, target_name in enumerate(target_names):
        y_true_i = y_true[:, i]
        y_pred_i = y_pred[:, i]
        
        mae = mean_absolute_error(y_true_i, y_pred_i)
        rmse = np.sqrt(mean_squared_error(y_true_i, y_pred_i))
        r2 = r2_score(y_true_i, y_pred_i)
        
        # Calculate correlation
        correlation = np.corrcoef(y_true_i, y_pred_i)[0, 1]
        
        results.append({
            'target': target_name,
            'MAE': mae,
            'RMSE': rmse,
            'R²': r2,
            'Correlation': correlation
        })
    
    return pd.DataFrame(results)

# Evaluate on train and test sets
target_names = ['pct_positive', 'pct_negative', 'comment_sentiment_dist', 'engagement_velocity']

train_metrics = evaluate_model(y_train, y_train_pred, target_names)
test_metrics = evaluate_model(y_test, y_test_pred, target_names)

print("Training Set Metrics:")
print(train_metrics.to_string(index=False))
print("\nTest Set Metrics:")
print(test_metrics.to_string(index=False))

Training Set Metrics:
                target      MAE     RMSE       R²  Correlation
          pct_positive 1.025742 1.336643 0.986629     0.995914
          pct_negative 0.399778 0.526746 0.986503     0.996368
comment_sentiment_dist 0.009412 0.012375 0.989928     0.996738
   engagement_velocity 0.009688 0.013141 0.986181     0.996036

Test Set Metrics:
                target      MAE      RMSE        R²  Correlation
          pct_positive 9.738217 12.188792  0.015307     0.281487
          pct_negative 3.924400  4.847199 -0.437728    -0.059668
comment_sentiment_dist 0.109485  0.135864 -0.064070     0.205078
   engagement_velocity 0.122085  0.144606 -0.050543     0.159297


## Uncertainty Quantification

Estimate prediction uncertainty and identify failure modes

In [None]:
def calculate_uncertainty(y_true: np.ndarray, y_pred: np.ndarray, target_names: List[str]) -> pd.DataFrame:
    """
    Calculate prediction errors and uncertainty metrics.
    """
    uncertainty_data = []
    
    for i, target_name in enumerate(target_names):
        y_true_i = y_true[:, i]
        y_pred_i = y_pred[:, i]
        
        # Absolute error
        abs_error = np.abs(y_true_i - y_pred_i)
        
        # Relative error (percentage)
        # Avoid division by zero
        with np.errstate(divide='ignore', invalid='ignore'):
            rel_error = np.where(
                y_true_i != 0,
                abs_error / np.abs(y_true_i) * 100,
                abs_error
            )
        
        uncertainty_data.append({
            'target': target_name,
            'mean_abs_error': np.mean(abs_error),
            'std_abs_error': np.std(abs_error),
            'max_abs_error': np.max(abs_error),
            'mean_rel_error_pct': np.mean(rel_error[rel_error != np.inf]),
            'high_uncertainty_threshold': np.percentile(abs_error, 90)  # 90th percentile
        })
    
    return pd.DataFrame(uncertainty_data)

# Calculate uncertainty metrics
test_uncertainty = calculate_uncertainty(y_test, y_test_pred, target_names)
print("Uncertainty Metrics (Test Set):")
print(test_uncertainty.to_string(index=False))

# Identify high-uncertainty predictions (potential failure modes)
print("\nIdentifying high-uncertainty predictions...")
abs_errors = np.abs(y_test - y_test_pred)
high_uncertainty_thresholds = test_uncertainty['high_uncertainty_threshold'].values

high_uncertainty_mask = np.any(
    abs_errors > high_uncertainty_thresholds.reshape(1, -1),
    axis=1
)

print(f"High-uncertainty predictions: {np.sum(high_uncertainty_mask)} / {len(y_test)} ({np.sum(high_uncertainty_mask)/len(y_test)*100:.1f}%)")
print("\nThese are potential failure modes where the model is less confident.")

Uncertainty Metrics (Test Set):
                target  mean_abs_error  std_abs_error  max_abs_error  mean_rel_error_pct  high_uncertainty_threshold
          pct_positive        9.738217       7.330333      37.036650           29.637312                   20.164888
          pct_negative        3.924400       2.845069      13.191705           95.163070                    8.104736
comment_sentiment_dist        0.109485       0.080448       0.443569           64.028137                    0.217105
   engagement_velocity        0.122085       0.077499       0.326937           32.904771                    0.239766

Identifying high-uncertainty predictions...
High-uncertainty predictions: 27 / 100 (27.0%)

These are potential failure modes where the model is less confident.
