In [None]:
import pandas as pd
import numpy as np
import os
import glob
import time
import logging
import re
import torch
from datetime import datetime
from tqdm import tqdm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import BertTokenizer, BertForSequenceClassification
from concurrent.futures import ThreadPoolExecutor

# Set up logging: every record at INFO level or above is written both to a
# timestamped log file (relative path, so it lands in the current working
# directory) and to a console stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # One log file per run; the name embeds the start time of the run
        logging.FileHandler(f'reddit_sentiment_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Constants
MAX_TEXT_LENGTH = 60000  # Max characters analyzed per text; match the same limit as in the Gemini code
BATCH_SIZE = 5  # Number of rows handed to a single process_batch call
SAVE_FREQUENCY = 20  # Intermediate results are saved every this many submitted batches
MAX_WORKERS = 4  # Thread pool size for parallel processing

# Initialize VADER and FinBERT only once
def initialize_models():
    """
    Build the VADER analyzer and the FinBERT tokenizer/model pair.

    The FinBERT model is moved to CUDA when torch reports it as available,
    otherwise it stays on the CPU, and is switched to evaluation mode.

    Returns:
        Tuple of (vader_analyzer, finbert_tokenizer, finbert_model, device).

    Raises:
        Exception: any error raised while loading is logged and re-raised.
    """
    logger.info("Initializing sentiment models...")
    try:
        analyzer = SentimentIntensityAnalyzer()

        # Pick the device first so the log shows where inference will run
        target = 'cuda' if torch.cuda.is_available() else 'cpu'
        device = torch.device(target)
        logger.info(f"Using device: {device}")

        tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
        model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
        model = model.to(device)
        model.eval()  # inference only

        logger.info("Models initialized successfully!")
    except Exception as e:
        logger.error(f"Error initializing models: {e}")
        raise
    return analyzer, tokenizer, model, device

# Helper function to standardize date format
def standardize_date_format(date_str):
    """
    Standardize a date value to a YYYY-MM-DD string for consistent matching.

    Args:
        date_str: A date as a string (Reddit export style such as
            "Sun Jan 01 2023 05:02:07 GMT-0600", or anything
            ``pd.to_datetime`` understands) or a datetime-like object.

    Returns:
        The date as 'YYYY-MM-DD'; None for missing values (None, NaN, '');
        the original value unchanged when it cannot be parsed (a warning is
        logged in that case).
    """
    if date_str is None or pd.isna(date_str) or date_str == '':
        return None

    month_dict = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
    }

    try:
        # Reddit export format: "<weekday> <Mon> <day> <year> <time> GMT<offset>".
        # Only the calendar date as written is kept (time and offset ignored).
        # The isinstance guard keeps Timestamp/datetime inputs from failing on
        # the substring test; they go straight to the pandas fallback below.
        if isinstance(date_str, str) and 'GMT' in date_str:
            parts = date_str.split()
            if len(parts) >= 4:
                month_num = month_dict.get(parts[1])
                day, year = parts[2], parts[3]
                looks_valid = (
                    month_num is not None
                    and day.isdigit() and len(day) <= 2
                    and year.isdigit() and len(year) == 4
                )
                if looks_valid:
                    return f"{year}-{month_num}-{day.zfill(2)}"
            # Anything that does not fit the expected layout (including an
            # unknown month token) falls through to pandas instead of being
            # silently coerced to January.

        # Generic fallback: pandas handles many formats and datetime objects
        dt = pd.to_datetime(date_str)
        return dt.strftime('%Y-%m-%d')
    except Exception as e:
        logger.warning(f"Could not standardize date: {date_str}. Error: {e}")
        return date_str

# Function to check if content is removed or deleted
def is_removed_content(text):
    """
    Check whether a title/content value is a removed/deleted placeholder.

    A value counts as removed when it is missing (None/NaN), when the whole
    text (case-insensitive, surrounding whitespace ignored) is one of the
    known placeholder markers, or when it is a single bracketed notice such
    as "[ Removed by moderator ]". Ordinary text that merely mentions words
    like "removed" or "deleted" is NOT treated as removed.

    Args:
        text: The raw title or content value.

    Returns:
        True if the value is missing or a placeholder, False otherwise.
    """
    if text is None or pd.isna(text):
        return True

    normalized = str(text).lower().strip()

    # The entire text must be a marker; substring matching would discard
    # real posts such as "I removed my stop loss".
    whole_text_markers = {
        '[removed]', '[deleted]', '[not available]', '[unavailable]',
        'deleted', 'removed', 'n/a', 'unavailable'
    }
    if normalized in whole_text_markers:
        return True

    # Single bracketed notice, e.g. "[ removed by reddit ... ]"
    if len(normalized) >= 2 and normalized.startswith('[') and normalized.endswith(']'):
        inner = normalized[1:-1]
        if '[' not in inner and ']' not in inner:
            keywords = ('removed', 'deleted', 'not available', 'unavailable', 'n/a')
            return any(keyword in inner for keyword in keywords)

    return False

# Function to prepare reddit text for sentiment analysis
def prepare_reddit_text(row):
    """
    Build the text to analyze from a row's 'title' and 'content' fields.

    Missing (None/NaN) fields are treated as empty strings. Whichever of the
    two fields is flagged by is_removed_content is left out; when both are
    flagged the function returns None. Otherwise the title and content are
    joined with a blank line between them.
    """
    def _blank_if_missing(value):
        # pd.isna covers both None and NaN
        return '' if pd.isna(value) or value is None else value

    title = _blank_if_missing(row.get('title', ''))
    content = _blank_if_missing(row.get('content', ''))

    content_gone = is_removed_content(content)
    title_gone = is_removed_content(title)

    if content_gone:
        if title_gone:
            logger.warning("Both title and content are removed")
            return None
        logger.info("Content removed, using title only")
        return title

    if title_gone:
        logger.info("Title removed, using content only")
        return content

    return f"{title}\n\n{content}"

# VADER sentiment analysis function
def analyze_with_vader(text, vader_analyzer):
    """
    Score a text with VADER.

    Text longer than MAX_TEXT_LENGTH characters is truncated first. Empty or
    non-string input, and any exception raised during scoring, yield a dict
    whose four values are NaN.

    Returns:
        Dict with keys 'vader_compound', 'vader_pos', 'vader_neg',
        'vader_neu'.
    """
    # (output column, key in VADER's score dict) in output order
    column_map = (
        ('vader_compound', 'compound'),
        ('vader_pos', 'pos'),
        ('vader_neg', 'neg'),
        ('vader_neu', 'neu'),
    )

    def _all_nan():
        return {column: np.nan for column, _ in column_map}

    try:
        # VADER copes with long input; the cap keeps it consistent with FinBERT
        if text and len(text) > MAX_TEXT_LENGTH:
            logger.warning(f"Text truncated for VADER from {len(text)} to {MAX_TEXT_LENGTH} chars")
            text = text[:MAX_TEXT_LENGTH]

        if not (text and isinstance(text, str)):
            logger.warning("Empty or invalid text for VADER analysis")
            return _all_nan()

        polarity = vader_analyzer.polarity_scores(text)
        return {column: polarity[source] for column, source in column_map}
    except Exception as e:
        logger.error(f"Error in VADER analysis: {e}")
        return _all_nan()

# FinBERT chunking and sentiment analysis function
def analyze_with_finbert(text, tokenizer, model, device):
    """
    Analyze sentiment using FinBERT, chunking texts longer than 512 tokens.

    The text is capped at MAX_TEXT_LENGTH characters, tokenized once without
    special tokens, and the token ids are split into windows of at most 510
    ids. Each window is wrapped in [CLS]/[SEP], padded within its batch, and
    scored; the class probabilities are averaged (unweighted) over all
    windows.

    Chunking is done by hand because the slow ``BertTokenizer`` does not turn
    ``return_overflowing_tokens=True`` into extra batch rows (the overflow is
    returned in a separate field), which would leave everything past the
    first 512 tokens unscored.

    Args:
        text: Text to score.
        tokenizer: FinBERT tokenizer exposing ``encode`` and the
            cls/sep/pad token ids.
        model: FinBERT sequence-classification model (already on ``device``).
        device: torch device the input tensors are moved to.

    Returns:
        Dict with 'finbert_positive', 'finbert_negative', 'finbert_neutral'
        and 'finbert_sentiment' (positive minus negative). All values are NaN
        for empty/non-string input, for text that yields no tokens, or when
        an error occurs.
    """
    def _nan_result():
        return {
            'finbert_positive': np.nan,
            'finbert_negative': np.nan,
            'finbert_neutral': np.nan,
            'finbert_sentiment': np.nan  # Overall sentiment score (pos - neg)
        }

    try:
        if not text or not isinstance(text, str):
            logger.warning("Empty or invalid text for FinBERT analysis")
            return _nan_result()

        # Truncate text if too long (for memory considerations)
        if len(text) > MAX_TEXT_LENGTH:
            logger.warning(f"Text truncated for FinBERT from {len(text)} to {MAX_TEXT_LENGTH} chars")
            text = text[:MAX_TEXT_LENGTH]

        max_length = 512
        window = max_length - 2  # leave room for [CLS] and [SEP]

        # Tokenize the whole text once, then slice the ids into windows
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        chunks = [token_ids[start:start + window] for start in range(0, len(token_ids), window)]

        num_chunks = len(chunks)
        logger.info(f"Processing {num_chunks} chunks for FinBERT")

        if num_chunks == 0:
            logger.warning("No chunks were created by the tokenizer")
            return _nan_result()

        cls_id = tokenizer.cls_token_id
        sep_id = tokenizer.sep_token_id
        pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

        all_probs = []
        batch_size = 8  # Process chunks in batches of 8 if there are many

        for start in range(0, num_chunks, batch_size):
            batch_chunks = chunks[start:start + batch_size]
            width = max(len(chunk) for chunk in batch_chunks) + 2

            # Pad every sequence in this batch to the batch's longest sequence
            input_ids = torch.full((len(batch_chunks), width), pad_id, dtype=torch.long)
            attention_mask = torch.zeros((len(batch_chunks), width), dtype=torch.long)
            for row, chunk in enumerate(batch_chunks):
                ids = [cls_id] + chunk + [sep_id]
                input_ids[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
                attention_mask[row, :len(ids)] = 1

            with torch.no_grad():
                outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
                all_probs.append(torch.softmax(outputs.logits, dim=1))

        # Average probabilities across all chunks
        avg_probs = torch.cat(all_probs, dim=0).mean(dim=0).cpu().numpy()

        # Default class order (positive, negative, neutral); prefer the
        # order declared by the model config when it names all three labels.
        label_index = {'positive': 0, 'negative': 1, 'neutral': 2}
        id2label = getattr(getattr(model, 'config', None), 'id2label', None)
        if id2label:
            declared = {str(name).lower(): int(idx) for idx, name in id2label.items()}
            if set(label_index) <= set(declared):
                label_index = {name: declared[name] for name in label_index}

        positive = float(avg_probs[label_index['positive']])
        negative = float(avg_probs[label_index['negative']])
        neutral = float(avg_probs[label_index['neutral']])

        return {
            'finbert_positive': positive,
            'finbert_negative': negative,
            'finbert_neutral': neutral,
            'finbert_sentiment': positive - negative  # pos - neg as overall score
        }
    except Exception as e:
        logger.error(f"Error in FinBERT analysis: {e}")
        import traceback
        logger.error(f"Traceback: {traceback.format_exc()}")
        return _nan_result()

# Process a batch of rows
def process_batch(batch_df, vader_analyzer, finbert_tokenizer, finbert_model, device):
    """
    Run both sentiment models over every row of a batch.

    Returns:
        List of (row_index, vader_result, finbert_result) tuples, one per
        row. Both results are None for rows without usable text and for rows
        whose processing raised an exception.
    """
    outcomes = []

    for idx, row in batch_df.iterrows():
        vader_scores = None
        finbert_scores = None
        try:
            # Title and content combined, or None/empty when nothing is usable
            reddit_text = prepare_reddit_text(row)

            if reddit_text:
                vader_scores = analyze_with_vader(reddit_text, vader_analyzer)
                finbert_scores = analyze_with_finbert(
                    reddit_text, finbert_tokenizer, finbert_model, device
                )
            else:
                logger.warning(f"No valid text for row {idx}, skipping")
        except Exception as e:
            logger.error(f"Error processing row {idx}: {e}")
            vader_scores = None
            finbert_scores = None
        else:
            if reddit_text:
                logger.info(f"Successfully analyzed sentiment for row {idx}")

        outcomes.append((idx, vader_scores, finbert_scores))

    return outcomes

# Find Reddit data file path
def find_reddit_file_path(year):
    """
    Locate the Reddit CSV for the given year.

    Lookup order:
      1. data/reddit_data/reddit_<year>.csv (relative to the working dir)
      2. each alternative relative path, first in the working directory and
         then in its parent directory
      3. the first hit of a recursive glob for CSV files whose name contains
         "reddit" followed by the year

    Returns:
        The first path found, or None when nothing matches.
    """
    primary = f"data/reddit_data/reddit_{year}.csv"
    if os.path.exists(primary):
        return primary

    parent_dir = os.path.dirname(os.getcwd())
    fallback_names = (
        f"reddit_data/reddit_{year}.csv",
        f"data/reddit_{year}.csv",
        f"reddit_{year}.csv",
    )
    for relative in fallback_names:
        # Working directory first, then the same relative path one level up
        for candidate in (relative, os.path.join(parent_dir, relative)):
            if os.path.exists(candidate):
                return candidate

    # Last resort: recursive search below the working directory
    globbed = [
        hit
        for pattern in (f"**/reddit*{year}*.csv", f"**/*reddit*{year}*.csv")
        for hit in glob.glob(pattern, recursive=True)
    ]
    if not globbed:
        return None

    logger.info(f"Found possible Reddit files: {globbed}")
    return globbed[0]  # first match wins

# Main function to process the CSV files
def _dates_are_standardized(series):
    """Return True when every value in *series* is a 'YYYY-MM-DD' string."""
    return bool(series.apply(
        lambda x: isinstance(x, str) and bool(re.match(r'^\d{4}-\d{2}-\d{2}$', x))
    ).all())


def _apply_sentiment_results(frame, batch_results):
    """Write (index, vader_dict, finbert_dict) tuples into *frame* in place."""
    for idx, vader_result, finbert_result in batch_results:
        for scores in (vader_result, finbert_result):
            if scores:
                for key, value in scores.items():
                    frame.at[idx, key] = value


def enhance_sentiment_analysis(input_file, output_file, reddit_data_file):
    """
    Add VADER and FinBERT sentiment scores to the input file.

    The rows of ``input_file`` (or, when it does not exist, a frame built
    from the Reddit data) are matched with the Reddit data to obtain the post
    content, scored in batches on a thread pool, and written to
    ``output_file`` without the content column. Checkpoint files named
    ``<output stem>_temp_<batch number><ext>`` are written along the way.

    The function is interactive: it asks before overwriting existing
    sentiment columns and before deleting the checkpoint files.

    Args:
        input_file: CSV with previously produced results (may not exist).
        output_file: Destination CSV; its directory is created if needed.
        reddit_data_file: CSV with the raw Reddit posts; must contain
            'title' and 'content' columns.

    Returns:
        The output DataFrame, or None when processing was skipped or failed
        (errors are logged, not raised).
    """
    try:
        logger.info(f"Loading input file: {input_file}")

        # Check if input file exists, if not, create from Reddit data
        if not os.path.exists(input_file):
            logger.warning(f"Input file {input_file} not found. Will create from Reddit data.")
            results_df = None
        else:
            results_df = pd.read_csv(input_file)
            logger.info(f"Loaded {len(results_df)} rows from {input_file}")

        # Load Reddit data
        logger.info(f"Loading Reddit data file: {reddit_data_file}")
        reddit_df = pd.read_csv(reddit_data_file)
        logger.info(f"Loaded {len(reddit_df)} rows from Reddit data file")

        # If results file doesn't exist, create it from Reddit data
        if results_df is None:
            logger.info("Creating new results dataframe from Reddit data")

            keep_cols = ['subreddit', 'username', 'date', 'title', 'reddit_link']
            if not all(col in reddit_df.columns for col in keep_cols):
                logger.error(f"Required columns {keep_cols} not found in Reddit data")
                return None
            results_df = reddit_df[keep_cols].copy()

        # Standardize date format in both dataframes so the merge keys agree
        if 'date' in results_df.columns and not _dates_are_standardized(results_df['date']):
            logger.info("Standardizing dates in results dataframe")
            results_df['date'] = results_df['date'].apply(standardize_date_format)

        if 'date' in reddit_df.columns and not _dates_are_standardized(reddit_df['date']):
            logger.info("Standardizing dates in Reddit dataframe")
            reddit_df['date'] = reddit_df['date'].apply(standardize_date_format)

        # Ensure Reddit data has the necessary columns for processing
        if 'title' not in reddit_df.columns or 'content' not in reddit_df.columns:
            logger.error("Reddit data missing required columns: title and/or content")
            return None

        # Check if VADER and FinBERT columns already exist
        vader_cols = [col for col in results_df.columns if 'vader' in col.lower()]
        finbert_cols = [col for col in results_df.columns if 'finbert' in col.lower()]

        if vader_cols and finbert_cols:
            logger.warning(f"VADER and FinBERT columns already exist in {input_file}")
            logger.info(f"VADER columns: {vader_cols}")
            logger.info(f"FinBERT columns: {finbert_cols}")
            overwrite = input("Overwrite existing sentiment columns? (y/n): ").lower() == 'y'
            if not overwrite:
                logger.info("Skipping file")
                return None

        # Initialize models
        vader_analyzer, finbert_tokenizer, finbert_model, device = initialize_models()

        # Merge Reddit data with results dataframe if they're separate
        if 'content' not in results_df.columns:
            logger.info("Adding content column to results dataframe from Reddit data")

            mapping_keys = ['subreddit', 'username', 'date', 'title', 'reddit_link']
            available_keys = [k for k in mapping_keys if k in results_df.columns and k in reddit_df.columns]

            if not available_keys:
                logger.error("No common columns found for matching Reddit content with results")
                return None

            # One content value per key combination: duplicate keys on the
            # Reddit side would otherwise multiply rows in the left merge.
            content_lookup = reddit_df[available_keys + ['content']].drop_duplicates(subset=available_keys)
            results_df = results_df.merge(content_lookup, on=available_keys, how='left')

            logger.info(f"Added content column to results dataframe. Now has {len(results_df)} rows")

        # Resolve output locations up front; a bare file name has an empty
        # dirname, which os.makedirs / os.listdir cannot handle.
        output_dir = os.path.dirname(output_file) or '.'
        os.makedirs(output_dir, exist_ok=True)
        output_root, output_ext = os.path.splitext(output_file)
        temp_prefix = os.path.basename(output_root) + '_temp_'

        # Frame that accumulates the scores. The content column and any stale
        # sentiment columns are excluded so the output matches the original
        # format; workers read from results_df, which keeps the content.
        excluded = list(dict.fromkeys(['content'] + vader_cols + finbert_cols))
        output_df = results_df.drop(columns=excluded, errors='ignore').copy()

        # Process in batches
        batch_indices = [results_df.index[i:i+BATCH_SIZE] for i in range(0, len(results_df), BATCH_SIZE)]
        total_batches = len(batch_indices)

        logger.info(f"Processing {total_batches} batches")

        # Use ThreadPoolExecutor for parallel processing
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            pending = []

            for i, batch_idx in enumerate(batch_indices):
                batch_df = results_df.loc[batch_idx].copy()
                pending.append(executor.submit(
                    process_batch, batch_df, vader_analyzer, finbert_tokenizer, finbert_model, device
                ))

                # Harvest finished work and checkpoint periodically
                if (i + 1) % SAVE_FREQUENCY == 0 or i == total_batches - 1:
                    logger.info(f"Waiting for batch {i+1}/{total_batches} to complete")

                    # Test done() exactly once per future: checking it twice
                    # (collect, then filter) can drop a future that finishes
                    # in between, silently losing its results.
                    new_results = []
                    still_pending = []
                    for future in pending:
                        if not future.done():
                            still_pending.append(future)
                            continue
                        try:
                            new_results.extend(future.result())
                        except Exception as e:
                            logger.error(f"Error getting future result: {e}")
                    pending = still_pending

                    # Only the newly finished rows are applied, so each
                    # checkpoint costs work proportional to the new results.
                    if new_results:
                        _apply_sentiment_results(output_df, new_results)
                        temp_output = f"{output_root}_temp_{i+1}{output_ext}"
                        output_df.to_csv(temp_output, index=False)
                        logger.info(f"Saved intermediate results to {temp_output}")

            # Collect remaining results (blocks until each future finishes)
            for future in pending:
                try:
                    batch_results = future.result()
                except Exception as e:
                    logger.error(f"Error getting future result: {e}")
                else:
                    _apply_sentiment_results(output_df, batch_results)

        # Save final results
        output_df.to_csv(output_file, index=False)
        logger.info(f"Saved final results to {output_file}")

        # Clean up temporary files
        temp_files = [f for f in os.listdir(output_dir) if f.startswith(temp_prefix)]
        if temp_files and input("Remove temporary files? (y/n): ").lower() == 'y':
            for temp_file in temp_files:
                temp_path = os.path.join(output_dir, temp_file)
                try:
                    os.remove(temp_path)
                    logger.info(f"Removed {temp_file}")
                except Exception as e:
                    logger.error(f"Could not remove {temp_file}: {e}")

        return output_df

    except Exception as e:
        logger.error(f"Error processing file: {e}")
        import traceback
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None

def main():
    """
    Interactive entry point: ask which years to process and enhance each one.

    For every requested year the Reddit data file is located (or requested
    from the user), the user confirms, and enhance_sentiment_analysis is
    run. The outcome message reflects whether an output was actually
    produced.
    """
    print("\n=== ENHANCING REDDIT SENTIMENT ANALYSIS WITH VADER AND FINBERT ===")

    # Ask user for years to process
    years_input = input("Which years would you like to process? (Enter comma-separated values, e.g., '2023,2024' or 'all' for all years): ")

    if years_input.strip().lower() == 'all':
        years = ['2023', '2024']  # Default years
    else:
        # Drop blank entries such as the one produced by a trailing comma
        years = [year.strip() for year in years_input.split(',') if year.strip()]

    if not years:
        print("No years provided. Nothing to process.")
        return

    # Process each year
    for year in years:
        # Define file paths
        input_file = f"sentiment_results/reddit/reddit_sentiment_{year}.csv"
        output_file = f"sentiment_results/reddit/reddit_sentiment_{year}_v3.csv"
        reddit_data_file = f"data/reddit_data/reddit_{year}.csv"

        print(f"\n=== PROCESSING YEAR: {year} ===")
        print(f"Input file: {input_file}")
        print(f"Output file: {output_file}")
        print(f"Reddit data file: {reddit_data_file}")

        # Check if Reddit data file exists
        reddit_file_path = find_reddit_file_path(year)
        if reddit_file_path:
            reddit_data_file = reddit_file_path
            print(f"Found Reddit data file: {reddit_data_file}")
        else:
            print(f"Error: Could not find Reddit data file for year {year}")
            user_path = input(f"Please provide the path to the Reddit data file for {year} (or press Enter to skip): ").strip()
            if not user_path:
                print(f"Skipping year {year}")
                continue
            if os.path.exists(user_path):
                reddit_data_file = user_path
                print(f"Using provided Reddit data file: {reddit_data_file}")
            else:
                print(f"Provided path does not exist: {user_path}")
                if input("Continue anyway? (y/n): ").lower() != 'y':
                    print(f"Skipping year {year}")
                    continue

        # Ask user to confirm
        confirm = input(f"Process {year}? (y/n): ")
        if confirm.lower() != 'y':
            print(f"Skipping year {year}")
            continue

        # Enhance sentiment analysis; None means it was skipped or failed
        result = enhance_sentiment_analysis(input_file, output_file, reddit_data_file)

        if result is None:
            print(f"Enhancement for year {year} did not produce an output; see the log for details")
        else:
            print(f"Completed enhancement for year {year}")

if __name__ == "__main__":
    # Make sure the VADER lexicon is available. NLTK resources are looked up
    # by their path inside the data directory; the lexicon is stored as
    # 'sentiment/vader_lexicon.zip', so a bare 'vader_lexicon' lookup would
    # always fail and report the lexicon as missing.
    try:
        import nltk
        nltk.data.find('sentiment/vader_lexicon.zip')
    except (ImportError, LookupError):
        print("NLTK VADER lexicon not found. Installing...")
        import nltk
        nltk.download('vader_lexicon')

    # Dependency checks with a chance to abort before any work starts
    try:
        import transformers
    except ImportError:
        print("Transformers library not found. Please install it with:")
        print("pip install transformers")
        if input("Continue anyway? (y/n): ").lower() != 'y':
            import sys
            sys.exit(1)

    try:
        import torch
    except ImportError:
        print("PyTorch not found. Please install it with:")
        print("pip install torch")
        if input("Continue anyway? (y/n): ").lower() != 'y':
            import sys
            sys.exit(1)

    # Run main function
    main()

  from .autonotebook import tqdm as notebook_tqdm
2025-04-13 16:39:48.188245: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-13 16:39:48.201199: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744580388.213193   45598 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744580388.216893   45598 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744580388.228936   45598 computation_placer.cc:177] computation placer already r

NLTK VADER lexicon not found. Installing...

=== ENHANCING REDDIT SENTIMENT ANALYSIS WITH VADER AND FINBERT ===


Which years would you like to process? (Enter comma-separated values, e.g., '2023,2024' or 'all' for all years):  2024



=== PROCESSING YEAR: 2024 ===
Input file: sentiment_results/reddit/reddit_sentiment_2024.csv
Output file: sentiment_results/reddit/reddit_sentiment_2024_v3.csv
Reddit data file: data/reddit_data/reddit_2024.csv
Found Reddit data file: data/reddit_data/reddit_2024.csv


Process 2024? (y/n):  y


2025-04-13 16:40:07,404 - INFO - Loading input file: sentiment_results/reddit/reddit_sentiment_2024.csv
2025-04-13 16:40:07,533 - INFO - Loaded 33543 rows from sentiment_results/reddit/reddit_sentiment_2024.csv
2025-04-13 16:40:07,534 - INFO - Loading Reddit data file: data/reddit_data/reddit_2024.csv
2025-04-13 16:40:07,713 - INFO - Loaded 38880 rows from Reddit data file
2025-04-13 16:40:07,754 - INFO - Initializing sentiment models...
2025-04-13 16:40:07,789 - INFO - Using device: cuda
2025-04-13 16:40:08,536 - INFO - Models initialized successfully!
2025-04-13 16:40:08,537 - INFO - Adding content column to results dataframe from Reddit data
2025-04-13 16:40:08,575 - INFO - Added content column to results dataframe. Now has 33543 rows
2025-04-13 16:40:08,582 - INFO - Processing 6709 batches
2025-04-13 16:40:08,585 - INFO - Processing 1 chunks for FinBERT
2025-04-13 16:40:08,588 - INFO - Processing 1 chunks for FinBERT
2025-04-13 16:40:08,591 - INFO - Processing 1 chunks for FinBERT
