In [31]:
import pandas as pd
import re
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from collections import Counter
import os
from openai import OpenAI
import anthropic
from groq import Groq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
import torch

In [2]:
cache_dir = "./cache/"
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ['OPENAI_API_KEY'] = 'openai-api-key-here'
os.environ['ANTHROPIC_API_KEY'] = 'anthropic-api-key-here'

In [57]:
from typing import List, Optional
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

# Define the schema for sentiment analysis
# Define the schema for sentiment analysis
class SentimentAnalysisResponse(BaseModel):
    sentiment: str = Field(description="The sentiment of the sentence (Positive, Negative, or Neutral)")

class Data(BaseModel):
    """Extracted data about sentences."""
    sentiments: List[SentimentAnalysisResponse]

# Define the prompt template
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. 
            The sentiment should be classified as Positive, Negative, or Neutral only. No other words are allowed for the sentiment."""
        ),
        ("user", "{sentences}")
    ]
)



# GPT 3-5 SAE_FROM_AAE sentiments

In [58]:
# Define the language model
#model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
    max_retries=2, temperature=0)

# Create the runnable chain
runnable = chat_template | model.with_structured_output(schema=Data)
runnable_single = chat_template | model.with_structured_output(schema=SentimentAnalysisResponse)

In [74]:
runnable_single.invoke({"sentences" : "I keep falling in and out of love with you. Sometimes I love you, sometimes you make me sad."})

SentimentAnalysisResponse(sentiment='Neutral')

## Read the AAE from SAE dataset

In [33]:
dataset = pd.read_csv("./labeled/anthropic_Haiku-AAE_from_SAE_updated.csv")
dataset.head()

Unnamed: 0,index,standard_american_english,aae_from_sae
0,0,I'm not giving that woman anything but bubble ...,"Nah, I ain't givin' that woman nothin' but som..."
1,1,"Yes, that's what's up. There's nothing like ge...","Yeah, that's what's up. Ain't nothin' like get..."
2,2,"Mixed, huh? Those dark elbows and knees will g...","Mixed, huh? Them dark elbows and knees gon' gi..."
3,3,The player Mike James from the Dallas Maverick...,That player Mike James from the Dallas Maveric...
4,4,It took a complete stranger to tell me they're...,It took a complete stranger to tell me they pr...


In [34]:
dataset = dataset['aae_from_sae']

In [35]:
# Initialize lists to store the sentiments and their corresponding indices
all_sentiments = []
processed_indices = []
failed_indices = []


for i in tqdm(range(0, len(dataset), 5)):
    batch = dataset[i:i+5].to_list()
    batch_indices = list(range(i, min(i+5, len(dataset))))
    
    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.sentiments) == len(batch):
            all_sentiments.extend([response.sentiment for response in result.sentiments])
            processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as failed
            failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+4}: {str(e)}")
        failed_indices.extend(batch_indices)

# Create a new dataframe with successfully processed sentences and sentiments
labeled_df = pd.DataFrame({
    'index': processed_indices,
    'aae_from_sae': dataset.iloc[processed_indices],
    'sentiment': all_sentiments
})

# Sort the dataframe by the original index
labeled_df = labeled_df.sort_values('index').reset_index(drop=True)

# Save the labeled dataset to a CSV file
output_path = './labeled/anthropic_Haiku-AAE_from_SAE_updated_sentiment.csv'
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

# Save the failed indices to a separate file
failed_indices_path = './labeled/failed_indices.csv'
pd.DataFrame({'failed_index': failed_indices}).to_csv(failed_indices_path, index=False)

print(f"Failed indices saved to {failed_indices_path}")
print(f"Number of successfully processed sentences: {len(processed_indices)}")
print(f"Number of failed sentences: {len(failed_indices)}")

100%|███████████████████████████████████████████████████████████████████████| 400/400 [06:54<00:00,  1.04s/it]

Labeled dataset saved to ./labeled/anthropic_Haiku-AAE_from_SAE_updated_sentiment.csv
Failed indices saved to ./labeled/failed_indices.csv
Number of successfully processed sentences: 1970
Number of failed sentences: 30





## Handling failed indices

In [36]:
# Load the previously processed data
labeled_df = pd.read_csv('./labeled/anthropic_Haiku-AAE_from_SAE_updated_sentiment.csv')
failed_indices = pd.read_csv('./labeled/failed_indices.csv')['failed_index'].tolist()

# Load the original dataset
original_dataset = pd.read_csv("./labeled/anthropic_Haiku-AAE_from_SAE_updated.csv")['aae_from_sae']

# Initialize lists to store the new sentiments and their corresponding indices
new_sentiments = []
new_processed_indices = []
still_failed_indices = []

# Process the failed sentences
for i in tqdm(range(0, len(failed_indices), 2)):
    batch_indices = failed_indices[i:i+2]
    batch = original_dataset.iloc[batch_indices].tolist()
    
    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.sentiments) == len(batch):
            new_sentiments.extend([response.sentiment for response in result.sentiments])
            new_processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as still failed
            still_failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing index {i}-{i+1}: {str(e)}")
        still_failed_indices.extend(batch_indices)

# Create a new dataframe with newly processed sentences and sentiments
new_labeled_df = pd.DataFrame({
    'index': new_processed_indices,
    'aae_from_sae': original_dataset.iloc[new_processed_indices],
    'sentiment': new_sentiments
})

# Combine the previously processed data with the newly processed data
combined_df = pd.concat([labeled_df, new_labeled_df], ignore_index=True)

# Sort the dataframe by the original index and reset the index
combined_df = combined_df.sort_values('index').reset_index(drop=True)

# If there are still failed indices, add them to the combined dataframe with NaN sentiment
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'aae_from_sae': original_dataset.iloc[still_failed_indices],
        'sentiment': pd.NA
    })
    combined_df = pd.concat([combined_df, failed_df], ignore_index=True)
    combined_df = combined_df.sort_values('index').reset_index(drop=True)

# Save the complete labeled dataset to a CSV file
output_path = './labeled/complete-2000-anthropic_Haiku-AAE_from_SAE_updated.csv'
combined_df.to_csv(output_path, index=False)

print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df)}")
print(f"Successfully labeled sentences: {combined_df['sentiment'].notna().sum()}")
print(f"Failed sentences: {combined_df['sentiment'].isna().sum()}")

100%|█████████████████████████████████████████████████████████████████████████| 15/15 [00:15<00:00,  1.01s/it]

Complete labeled dataset saved to ./labeled/complete-2000-anthropic_Haiku-AAE_from_SAE_updated.csv
Total processed sentences: 2000
Successfully labeled sentences: 1992
Failed sentences: 8





In [37]:
combined_df = pd.read_csv('./labeled/complete-2000-anthropic_Haiku-AAE_from_SAE_updated.csv')

In [38]:
# Function to replace null sentiments
def replace_null_sentiments(df, api_function):
    # Identify rows where sentiment is null
    null_sentiments = df[df['sentiment'].isna()]
    
    # Loop through these rows
    for index, row in null_sentiments.iterrows():
        text = row['aae_from_sae']
        # Call the API or function to get the sentiment
        result = api_function.invoke({"sentences": text}).sentiment
        # Update the DataFrame with the returned sentiment
        df.at[index, 'sentiment'] = result
    
    return df

# Call the function
new_combined_df = replace_null_sentiments(combined_df, runnable_single)

# Display the updated DataFrame
print(new_combined_df['sentiment'].unique())

['Negative' 'Positive' 'Neutral' 'Mixed' '<UNKNOWN>']


In [68]:
new_combined_df[new_combined_df['sentiment'] == '<UNKNOWN>']

Unnamed: 0,index,aae_from_sae,sentiment
1434,1434,"Nah, I ain't got no link fo' dat info. I'm 'bo...",<UNKNOWN>


In [72]:
print(new_combined_df['sentiment'].unique())

['Negative' 'Positive' 'Neutral']


In [69]:
runnable_single.invoke({"sentences": new_combined_df.iloc[1434].aae_from_sae}).sentiment

'Positive'

In [70]:
new_combined_df.at[1434, 'sentiment'] = runnable_single.invoke({"sentences": new_combined_df.iloc[1434].aae_from_sae}).sentiment

In [71]:
print(f"Total processed sentences: {len(new_combined_df)}")
print(f"Successfully labeled sentences: {new_combined_df['sentiment'].notna().sum()}")
print(f"Failed sentences: {new_combined_df['sentiment'].isna().sum()}")

Total processed sentences: 2000
Successfully labeled sentences: 2000
Failed sentences: 0


In [73]:
new_combined_df.to_csv('./labeled/complete-2000-anthropic_Haiku-AAE_from_SAE_updated.csv')