In [8]:
import pandas as pd
import random
from typing import Optional
import logging
from dotenv import load_dotenv
import os
load_dotenv()

# Logging setup: INFO-and-above records go to a log file on disk and to the
# console stream, both using the same timestamped format.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("aae_to_sae_translation.log"),
        logging.StreamHandler(),
    ],
    level=logging.INFO,
)

# Logger used by the cells below; named after the executing module.
logger = logging.getLogger(__name__)

In [18]:
def load_sae_dataset(filepath: str, num_samples: Optional[int] = None) -> pd.DataFrame:
    """
    Load the SAE texts from a CSV file.

    The file is parsed with ``pandas.read_csv``; only the ``sae_text`` column
    is kept and the result is returned with a fresh 0..n-1 index.

    Args:
        filepath: Path to the CSV file. It must contain a ``sae_text`` column.
        num_samples: Number of rows to keep. ``None`` (the default) keeps every
            row, as does any value greater than or equal to the number of rows
            in the file. A smaller value draws a reproducible random subset
            (fixed seed 42); ``0`` yields an empty frame.

    Returns:
        DataFrame with a single ``sae_text`` column.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ValueError: If the ``sae_text`` column is missing or ``num_samples``
            is negative.
    """
    try:
        # Fail early with a clear message instead of pandas' own error
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"File not found: {filepath}")

        if num_samples is not None and num_samples < 0:
            raise ValueError(f"num_samples must be non-negative, got {num_samples}")

        # Let pandas handle quoting/escaping of the CSV fields
        full_df = pd.read_csv(filepath)

        # Validate the required column exists
        if 'sae_text' not in full_df.columns:
            raise ValueError("Required column 'sae_text' not found in CSV file")

        # Extract only the sae_text column
        lines = full_df['sae_text'].tolist()

        # Sample if requested. A private Random instance keeps the draw
        # reproducible without reseeding the global `random` generator.
        if num_samples is not None and num_samples < len(lines):
            lines = random.Random(42).sample(lines, num_samples)

        logger.info(f"Loaded {len(lines)} SAE texts from column 'sae_text'")

        # Rebuild a single-column frame so the index restarts at 0
        df = pd.DataFrame({'sae_text': lines})

        logger.info(f"Loaded dataset as text: {len(df)} records")

        return df
    except Exception as e:
        # Record the failure in the log, then let the caller see the original error
        logger.error(f"Error loading dataset: {str(e)}")
        raise

In [32]:
# Load the `sae_text` column from the initial results CSV; `num_samples` is
# left at its default of None.
dataset = load_sae_dataset('../output_datasets/aae_to_sae_initial_5000_results.csv')

2025-05-16 13:49:26,943 - __main__ - INFO - Loaded 4896 SAE texts from column 'sae_text'
2025-05-16 13:49:26,943 - __main__ - INFO - Loaded dataset as text: 4896 records


In [33]:
# Preview the last five rows of the loaded frame.
dataset.tail(5)

Unnamed: 0,sae_text
4891,@Saviour_So_Heat you came here and I picked yo...
4892,"@LadyLove_LOUD Girl, where are you? We're at H..."
4893,"""Men are more emotional than ever these days. ..."
4894,"""All I do is sit back and laugh at these women..."
4895,"@AllHailQuan He isn’t a rapist, stop messing w..."


In [34]:
# Read a results CSV with all of its columns and preview the last five rows.
# NOTE(review): this path is 'sae_to_aae_initial_5000_results.csv', while the
# load_sae_dataset call earlier in the notebook reads
# 'aae_to_sae_initial_5000_results.csv' — confirm which file is intended.
full_df = pd.read_csv('../output_datasets/sae_to_aae_initial_5000_results.csv')
full_df.tail(5)

Unnamed: 0,sae_text,aae_text
4891,@Saviour_So_Heat you came here and I picked yo...,@Saviour_So_Heat you came through and I picked...
4892,"@LadyLove_LOUD Girl, where are you? We're at H...","@LadyLove_LOUD Girl, where you at? We at Henry..."
4893,"""Men are more emotional than ever these days. ...",Men be more emotional than ever these days. It...
4894,"""All I do is sit back and laugh at these women...",All I do is sit back an’ laugh at these women....
4895,"@AllHailQuan He isn’t a rapist, stop messing w...","@AllHailQuan He ain’t no rapist, quit messin’ ..."


In [48]:
# Read the sentiment CSV and preview its first rows.
# NOTE(review): the variable is named `claude` but the file name says `phi4` —
# confirm which model produced these labels and rename accordingly.
claude = pd.read_csv('../sentiment_datasets/phi4_aae_from_sae_sentiment.csv')
claude.head()

Unnamed: 0,aae_text,sentiment
0,She can't get nothin' from me but some bubble ...,negative
1,"@islandboi_B Yeah, that’s dope. Ain’t nothin’ ...",positive
2,"Mixed, huh!? Dem real dark knees an’ elbows go...",neutral
3,@larrympaige @zionsays #FIGHT. Folks always as...,positive
4,Mike James from @mavs ain’t nobody and he real...,negative


In [49]:
# Rename the text column to `aae_from_sae_text`. Rebinding the name (instead
# of inplace=True) leaves the previously displayed frame object untouched.
claude = claude.rename(columns={"aae_text": "aae_from_sae_text"})

In [50]:
# Re-display the first rows to check the column rename.
claude.head()

Unnamed: 0,aae_from_sae_text,sentiment
0,She can't get nothin' from me but some bubble ...,negative
1,"@islandboi_B Yeah, that’s dope. Ain’t nothin’ ...",positive
2,"Mixed, huh!? Dem real dark knees an’ elbows go...",neutral
3,@larrympaige @zionsays #FIGHT. Folks always as...,positive
4,Mike James from @mavs ain’t nobody and he real...,negative


In [51]:
# Write only the renamed text column and the sentiment column to disk, without
# the index.
# NOTE: this is the same path the frame was read from earlier in the notebook,
# so the source file (with its `aae_text` header) is overwritten.
claude[["aae_from_sae_text", "sentiment"]].to_csv('../sentiment_datasets/phi4_aae_from_sae_sentiment.csv', index=False)