Overall goal: Compare the prediction of the 500 posts picked out.
- General prediction (find those from csvs/gpt_outputs.csv)
- User personalized prediction (we need to generate)

To get general prediction
- find from csvs/gpt_outputs.csv based on same id

To get user personalized prediction
- For every tweet in csvs/tweet_samples_500.csv
- Find one (potentially the first) user id who viewed them from aggregation_data/user_view_tweet.csv 
- Write a csv that has just the tweet id and the user id of who viewed it. 

- Get the profile of the user from experiment_data/pre_study.csv
- Call the function in utils to predict emotion impact
- Store the result 

Final result
- A csv file that has all 500 posts vertically, horizontally, it would have tweet id, user_id (of who viewed the tweet), and each rows have predicted_{emotion}_general or predicted_{emotion}_personalized, and the explanation_personalized. 

In [2]:
import pandas as pd
import numpy as np


In [1]:
import pandas as pd
import numpy as np

# Step 1: Get foundational file
# For every tweet in csvs/tweet_samples_500.csv
# Find one (potentially the first) user id who viewed them from aggregation_data/user_view_tweet.csv 
# Write a csv that has just the tweet id and the user id of who viewed it. 

print("Loading tweet samples...")
# Load the 500 tweet samples
tweet_samples = pd.read_csv('../csvs/tweet_samples_500.csv', header=None, names=['tweet_id', 'tweet_text'])
print(f"Loaded {len(tweet_samples)} tweet samples")

print("Loading user view data...")
# Load user view tweet data
user_view_data = pd.read_csv('../aggregation_data/user_view_tweet.csv')
print(f"Loaded {len(user_view_data)} user view records")

print("Matching tweets with users...")
# For each tweet, find the first user who viewed it
tweet_user_mapping = []

for idx, row in tweet_samples.iterrows():
    tweet_id = str(row['tweet_id'])
    # Remove 'tweet-' prefix if it exists for matching
    tweet_id_clean = tweet_id.replace('tweet-', '') if tweet_id.startswith('tweet-') else tweet_id
    
    try:
        # Find users who viewed this tweet
        viewers = user_view_data[user_view_data['tweet_id'] == int(tweet_id_clean)]
        
        if len(viewers) > 0:
            # Take the first user who viewed this tweet
            first_viewer = viewers.iloc[0]['user_id']
            tweet_user_mapping.append({
                'tweet_id': tweet_id,
                'user_id': first_viewer,
                'tweet_text': str(row['tweet_text'])
            })
            if (len(tweet_user_mapping)) % 50 == 0:
                print(f"Processed {len(tweet_user_mapping)} tweets, found {len(tweet_user_mapping)} with viewers")
        else:
            print(f"No viewer found for tweet: {tweet_id}")
    except (ValueError, TypeError) as e:
        print(f"Error processing tweet {tweet_id}: {e}")

# Create DataFrame and save
tweet_user_df = pd.DataFrame(tweet_user_mapping)
print(f"Final mapping: {len(tweet_user_df)} tweets with user viewers")

# Save the mapping
tweet_user_df.to_csv('../csvs/tweet_user_mapping.csv', index=False)
print("Saved tweet-user mapping to csvs/tweet_user_mapping.csv")
print(f"Sample mapping:\n{tweet_user_df.head()}")


Loading tweet samples...
Loaded 501 tweet samples
Loading user view data...
Loaded 1798807 user view records
Matching tweets with users...
No viewer found for tweet: tweet-1810372630475846080
Error processing tweet home-conversation-1810527114531057824: invalid literal for int() with base 10: 'home-conversation-1810527114531057824'
No viewer found for tweet: tweet-1811380108029988868
No viewer found for tweet: tweet-1811868132262158442
Error processing tweet home-conversation-1812210180358074564: invalid literal for int() with base 10: 'home-conversation-1812210180358074564'
No viewer found for tweet: tweet-1812140543884874048
No viewer found for tweet: tweet-1812263297565229138
No viewer found for tweet: tweet-1812554636072460436
No viewer found for tweet: tweet-1812582971708256730
No viewer found for tweet: tweet-1812645495862346067
Error processing tweet home-conversation-1812851434347524363: invalid literal for int() with base 10: 'home-conversation-1812851434347524363'
No viewer f

In [2]:
# Step 2: Get general predictions
# Find from aggregation_data/gpt_outputs.csv based on same id

print("Loading general GPT predictions...")
# Load general predictions
gpt_outputs = pd.read_csv('../aggregation_data/gpt_outputs.csv')
print(f"Loaded {len(gpt_outputs)} general predictions")

# Load our tweet-user mapping
tweet_user_df = pd.read_csv('../csvs/tweet_user_mapping.csv')
print(f"Loaded {len(tweet_user_df)} tweet-user mappings")

print("Matching general predictions with our tweets...")
# Merge with general predictions
results_df = tweet_user_df.copy()

# Create a mapping for general predictions
general_predictions = {}
for _, row in gpt_outputs.iterrows():
    tweet_id_with_prefix = f"tweet-{row['tweet_id']}"
    general_predictions[tweet_id_with_prefix] = {
        'general_nervous': row['predicted_nervous'],
        'general_sad': row['predicted_sad'], 
        'general_happy': row['predicted_happy'],
        'general_calm': row['predicted_calm'],
        'general_excited': row['predicted_excited'],
        'general_aroused': row['predicted_aroused'],
        'general_angry': row['predicted_angry'],
        'general_relaxed': row['predicted_relaxed'],
        'general_fearful': row['predicted_fearful'],
        'general_enthusiastic': row['predicted_enthusiastic'],
        'general_still': row['predicted_still'],
        'general_satisfied': row['predicted_satisfied'],
        'general_bored': row['predicted_bored'],
        'general_lonely': row['predicted_lonely']
    }

# Add general prediction columns
emotion_columns = ['nervous', 'sad', 'happy', 'calm', 'excited', 'aroused', 'angry', 
                  'relaxed', 'fearful', 'enthusiastic', 'still', 'satisfied', 'bored', 'lonely']

for emotion in emotion_columns:
    results_df[f'general_{emotion}'] = None

# Fill in general predictions
matched_general = 0
for idx, row in results_df.iterrows():
    tweet_id = row['tweet_id']
    if tweet_id in general_predictions:
        for emotion in emotion_columns:
            results_df.at[idx, f'general_{emotion}'] = general_predictions[tweet_id][f'general_{emotion}']
        matched_general += 1

print(f"Matched {matched_general} tweets with general predictions out of {len(results_df)} total")

# Save intermediate results
results_df.to_csv('../csvs/comparison_results_intermediate.csv', index=False)
print("Saved intermediate results to csvs/comparison_results_intermediate.csv")
print(f"Sample results:\n{results_df[['tweet_id', 'user_id', 'general_happy', 'general_sad']].head()}")

Loading general GPT predictions...
Loaded 711528 general predictions
Loaded 317 tweet-user mappings
Matching general predictions with our tweets...
Matched 292 tweets with general predictions out of 317 total
Saved intermediate results to csvs/comparison_results_intermediate.csv
Sample results:
                    tweet_id                           user_id general_happy  \
0  tweet-1810453227416019028  94F9CDBD19AB4C17B1708F90734737F5             1   
1  tweet-1810399777970122824  B0AB5FCE66DD464DADDF35067B4C1744             3   
2  tweet-1810359058240541026  76EFC9FB74444AB2ACBAEFAF5D3C1E9B             5   
3  tweet-1810523058295226604  C3842FA04E0440E8BC54DC2BA2D471D8             2   
4  tweet-1810759363424047594  218D74FFB32A4C838A88C6019A69DE91             4   

  general_sad  
0           2  
1           1  
2           1  
3           1  
4           1  


In [3]:
# Load the comparison intermediate CSV and print the number of posts (rows)
df = pd.read_csv('../csvs/comparison_results_intermediate.csv')
print(f"Number of posts in comparison_results_intermediate.csv: {len(df)}")

Number of posts in comparison_results_intermediate.csv: 317


In [5]:
# Check if all user ids in the comparison_results_intermediate.csv are in the pre_study.csv

import pandas as pd

# Load the comparison results
comparison_results = pd.read_csv('../csvs/comparison_results_intermediate.csv')

# Load the pre-study data
pre_study = pd.read_csv('../aggregated_data2/pre_study.csv')

# Check if all user ids in comparison results are in pre-study data
comparison_user_ids = set(comparison_results['user_id'].unique())
pre_study_user_ids = set(pre_study['participant_id'].unique())

# Find missing user IDs
missing_user_ids = comparison_user_ids - pre_study_user_ids

print(f"Total unique users in comparison results: {len(comparison_user_ids)}")
print(f"Total unique users in pre-study data: {len(pre_study_user_ids)}")
print(f"Users in comparison results but missing from pre-study: {len(missing_user_ids)}")

if missing_user_ids:
    print(f"Missing user IDs: {list(missing_user_ids)[:10]}")  # Show first 10 missing IDs
    print("WARNING: Some users in comparison results are not in pre-study data!")
else:
    print("✓ All users in comparison results are present in pre-study data")

# Check overlap
overlap = comparison_user_ids & pre_study_user_ids
print(f"Users present in both datasets: {len(overlap)}")


FileNotFoundError: [Errno 2] No such file or directory: '../aggregated_data2/pre_study.csv'

In [None]:
# Step 3: Generate personalized predictions with resumable capability
# This cell sets up the personalized prediction generation but doesn't run it
# It can resume from where it left off by checking for empty rows

import sys
# No need to append path since we're importing from current directory
from emotion_detector import gpt_detect_emotion
import time

# Set this to True when you want to actually run the predictions
PROCESS_PREDICTIONS = True

print("=== PERSONALIZED PREDICTION SETUP ===")
print("This cell is ready to generate personalized predictions but is set to SAFE MODE")
print("To run predictions, set PROCESS_PREDICTIONS = True")
print("")

# Load intermediate results to check status
try:
    results_df = pd.read_csv('../csvs/comparison_results_intermediate.csv')
    print(f"✓ Found intermediate results with {len(results_df)} tweets")
    
    # Check if personalized columns exist
    personalized_cols = [col for col in results_df.columns if col.startswith('personalized_')]
    if personalized_cols:
        print(f"✓ Found existing personalized columns: {len(personalized_cols)}")
        # Count completed rows
        if 'personalized_explanation' in results_df.columns:
            completed = results_df['personalized_explanation'].notna().sum()
            remaining = len(results_df) - completed
            print(f"✓ Progress: {completed} completed, {remaining} remaining")
        else:
            print("✓ Personalized columns exist but no predictions yet")
    else:
        print("! No personalized columns found - will create them")
        
    # Load user mapping
    try:
        user_surveys = pd.read_csv('../aggregation_data/user_surveys.csv')
        user_mapping_count = len(user_surveys)
        print(f"✓ User ID mapping available for {user_mapping_count} users")
    except Exception as e:
        print(f"! Warning: Could not load user mapping - {e}")
    
    print("")
    print("RESUMABLE PREDICTION CODE:")
    print("The code below will automatically resume from incomplete predictions")
    print("It saves progress every 10 predictions to avoid losing work")
    
except FileNotFoundError:
    print("✗ No intermediate results found. Please run Step 2 first!")

if PROCESS_PREDICTIONS:
    print("\n=== STARTING PERSONALIZED PREDICTION GENERATION ===")
    
    # Load data
    results_df = pd.read_csv('../csvs/comparison_results_intermediate.csv')
    
    # Add personalized columns if they don't exist
    emotion_columns = ['nervous', 'sad', 'happy', 'calm', 'excited', 'aroused', 'angry', 
                      'relaxed', 'fearful', 'enthusiastic', 'still', 'satisfied', 'bored', 'lonely']
    
    for emotion in emotion_columns:
        if f'personalized_{emotion}' not in results_df.columns:
            results_df[f'personalized_{emotion}'] = None
    
    if 'personalized_explanation' not in results_df.columns:
        results_df['personalized_explanation'] = None
    
    # Load user mapping - check what user IDs are actually available
    try:
        user_surveys = pd.read_csv('../aggregation_data/user_surveys.csv')
        print(f"Total users in survey: {len(user_surveys)}")
        
        # Check the actual format of user IDs in the data
        sample_user_ids = user_surveys['user_id'].head(10).tolist()
        print(f"Sample user IDs: {sample_user_ids}")
        
        # For now, use all user IDs as they are (no filtering by format)
        user_id_mapping = dict(zip(user_surveys['user_id'], user_surveys['user_id']))
        print(f"Loaded {len(user_id_mapping)} users total")
        
    except Exception as e:
        print(f"Error loading user mapping: {e}")
        user_id_mapping = {}
    
    # Find rows that need processing (resume capability)
    incomplete_mask = results_df['personalized_explanation'].isna()
    rows_to_process = results_df[incomplete_mask]
    
    # Filter to only process tweets with users who have valid user IDs in our mapping
    if user_id_mapping:
        has_valid_user_id = rows_to_process['user_id'].isin(list(user_id_mapping.keys()))
        rows_to_process = rows_to_process[has_valid_user_id]
        print(f"Found {len(rows_to_process)} tweets with valid user IDs to process")
        
        if len(rows_to_process) < len(results_df[incomplete_mask]):
            skipped = len(results_df[incomplete_mask]) - len(rows_to_process)
            print(f"Skipping {skipped} tweets with user IDs not found in user_surveys.csv")
    else:
        print(f"Found {len(rows_to_process)} tweets needing personalized predictions (no user mapping available)")
    
    if len(rows_to_process) == 0:
        print("All personalized predictions are complete!")
    else:
        processed_count = 0
        error_count = 0
        
        for idx, row in rows_to_process.iterrows():
            try:
                tweet_id = str(row['tweet_id'])
                user_id = str(row['user_id'])
                tweet_text = str(row['tweet_text'])
                
                # Use the user_id directly as participant_id (since they should match the pre_study.csv)
                participant_id = user_id_mapping.get(user_id, user_id)
                
                print(f"Processing {processed_count + 1}/{len(rows_to_process)}: {tweet_id[:20]}...")
                
                # Generate personalized prediction
                result = gpt_detect_emotion(tweet_text, participant_id=participant_id, user_csv_path="../experiment_data/pre_study.csv")
                
                if 'error' not in result:
                    # Store predictions
                    for emotion in emotion_columns:
                        emotion_key = emotion.capitalize()
                        if emotion_key in result:
                            results_df.at[idx, f'personalized_{emotion}'] = result[emotion_key]
                    
                    results_df.at[idx, 'personalized_explanation'] = result.get('explanation', '')
                    processed_count += 1
                    
                    # Save progress every 10 predictions
                    if processed_count % 10 == 0:
                        results_df.to_csv('../csvs/comparison_results_progress.csv', index=False)
                        print(f"Saved progress: {processed_count} completed")
                    
                    time.sleep(1)  # Rate limiting
                    
                else:
                    print(f"Error: {result.get('error', 'Unknown error')}")
                    error_count += 1
                    
            except Exception as e:
                print(f"Exception: {e}")
                error_count += 1
                continue
        
        # Final save
        results_df.to_csv('../csvs/comparison_results_final.csv', index=False)
        print(f"\nCompleted! {processed_count} processed, {error_count} errors")
        print("Final results saved to comparison_results_final.csv")

else:
    print("\n" + "="*50)
    print("SAFE MODE: Set PROCESS_PREDICTIONS = True to start")
    print("="*50)


=== PERSONALIZED PREDICTION SETUP ===
This cell is ready to generate personalized predictions but is set to SAFE MODE
To run predictions, set PROCESS_PREDICTIONS = True

✓ Found intermediate results with 317 tweets
! No personalized columns found - will create them
✓ User ID mapping available for 16340 users

RESUMABLE PREDICTION CODE:
The code below will automatically resume from incomplete predictions
It saves progress every 10 predictions to avoid losing work

=== STARTING PERSONALIZED PREDICTION GENERATION ===
Using user_id directly as participant_id: 'participantId'
Found 317 tweets needing personalized predictions
Processing 1/317: tweet-18104532274160...
Error: CSV file 'experiment_data/pre_study.csv' not found
Generated prompt: 
    Definitions of emotions:

    Nervous: restless tension, emotion characterized by trembling, feelings of apprehensiveness, or other signs of anxiety or fear.

    Sad: the response to the loss of an object or person to which you are very attached. T

KeyboardInterrupt: 

In [None]:
# Step 4: Analysis and Final Results Summary
# This cell analyzes the differences between general and personalized predictions

print("=== FINAL ANALYSIS ===")

# Try to load the final results
try:
    final_df = pd.read_csv('../csvs/comparison_results_final.csv')
    print(f"✓ Loaded final results with {len(final_df)} tweets")
    
    # Check completion status
    general_complete = final_df['general_happy'].notna().sum()
    personalized_complete = final_df['personalized_explanation'].notna().sum()
    
    print(f"✓ General predictions: {general_complete}/{len(final_df)} complete")
    print(f"✓ Personalized predictions: {personalized_complete}/{len(final_df)} complete")
    
    if general_complete > 0 and personalized_complete > 0:
        print("\n=== COMPARISON ANALYSIS ===")
        
        # Emotion columns for analysis
        emotions = ['happy', 'sad', 'angry', 'fearful', 'excited']
        
        # Calculate differences for each emotion
        differences = {}
        for emotion in emotions:
            if f'general_{emotion}' in final_df.columns and f'personalized_{emotion}' in final_df.columns:
                # Only compare rows where both predictions exist
                mask = (final_df[f'general_{emotion}'].notna() & 
                       final_df[f'personalized_{emotion}'].notna())
                
                if mask.sum() > 0:
                    general_vals = final_df.loc[mask, f'general_{emotion}']
                    personalized_vals = final_df.loc[mask, f'personalized_{emotion}']
                    
                    # Calculate mean difference (personalized - general)
                    diff = personalized_vals - general_vals
                    differences[emotion] = {
                        'mean_diff': diff.mean(),
                        'std_diff': diff.std(),
                        'n_compared': mask.sum()
                    }
                    
                    print(f"{emotion.capitalize()}: Mean diff = {diff.mean():.2f} ± {diff.std():.2f} (n={mask.sum()})")
        
        # Find tweets with largest differences
        print(f"\n=== LARGEST DIFFERENCES ===")
        for emotion in emotions:
            if emotion in differences and differences[emotion]['n_compared'] > 0:
                mask = (final_df[f'general_{emotion}'].notna() & 
                       final_df[f'personalized_{emotion}'].notna())
                
                if mask.sum() > 0:
                    diff_col = final_df.loc[mask, f'personalized_{emotion}'] - final_df.loc[mask, f'general_{emotion}']
                    
                    # Find max positive and negative differences
                    max_idx = diff_col.idxmax()
                    min_idx = diff_col.idxmin()
                    
                    if abs(diff_col.loc[max_idx]) > 1 or abs(diff_col.loc[min_idx]) > 1:
                        print(f"\n{emotion.capitalize()} - Largest increases:")
                        if diff_col.loc[max_idx] > 1:
                            tweet_text = final_df.loc[max_idx, 'tweet_text']
                            print(f"  +{diff_col.loc[max_idx]:.1f}: {tweet_text[:80]}...")
                            
                        print(f"{emotion.capitalize()} - Largest decreases:")
                        if diff_col.loc[min_idx] < -1:
                            tweet_text = final_df.loc[min_idx, 'tweet_text']
                            print(f"  {diff_col.loc[min_idx]:.1f}: {tweet_text[:80]}...")
        
        # Save analysis summary
        summary_stats = pd.DataFrame(differences).T
        summary_stats.to_csv('../csvs/comparison_analysis_summary.csv')
        print(f"\n✓ Analysis summary saved to comparison_analysis_summary.csv")
        
    else:
        print("! Not enough complete predictions for comparison analysis")
        
except FileNotFoundError:
    print("! Final results file not found. Complete steps 1-3 first.")
    
    # Check for intermediate or progress files
    intermediate_files = [
        '../csvs/comparison_results_intermediate.csv',
        '../csvs/comparison_results_progress.csv'
    ]
    
    for file_path in intermediate_files:
        try:
            df = pd.read_csv(file_path)
            print(f"✓ Found {file_path.split('/')[-1]} with {len(df)} tweets")
            
            # Check status
            if 'general_happy' in df.columns:
                general_count = df['general_happy'].notna().sum()
                print(f"  - General predictions: {general_count}")
            
            if 'personalized_explanation' in df.columns:
                personalized_count = df['personalized_explanation'].notna().sum()
                print(f"  - Personalized predictions: {personalized_count}")
                
        except FileNotFoundError:
            continue

print(f"\n{'='*50}")
print("SUMMARY OF WORKFLOW:")
print("1. ✓ Step 1: Create tweet-user mapping")
print("2. ✓ Step 2: Add general predictions") 
print("3. ⚠ Step 3: Generate personalized predictions (set PROCESS_PREDICTIONS=True)")
print("4. ✓ Step 4: Analyze differences")
print("='*50")