In [5]:
import pandas as pd
import re
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from collections import Counter
import os
from openai import OpenAI
import anthropic
from groq import Groq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_anthropic import ChatAnthropic

In [6]:
cache_dir = "../cache/"
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ['ANTHROPIC_API_KEY'] = 'anthropic-api-key-here'

## The below is API script to get labels from Claude Haiku

We send multiple sentences at once to save token usage cost.

In [7]:
from typing import List, Optional
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

# Define the schema for sentiment analysis
class SentimentAnalysisResponse(BaseModel):
    sentiment: str = Field(description="The sentiment of the sentence (Positive, Negative, or Neutral)")

class Data(BaseModel):
    """Extracted data about sentences."""
    sentiments: List[SentimentAnalysisResponse]

# Define the prompt template
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. 
            The sentiment should be classified as Positive, Negative, or Neutral for each sentence."""
        ),
        ("user", "{sentences}")
    ]
)
model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
    max_retries=2, temperature=0)

# Create the runnable chain
runnable = chat_template | model.with_structured_output(schema=Data)
runnable_single = chat_template | model.with_structured_output(schema=SentimentAnalysisResponse)

dataset = pd.read_csv('2000-5000_sentences.csv')["text"]

## Testing on first 10 sentences
result = runnable.invoke({"sentences" : "\n".join(dataset[:10].to_list())})

print(f"First 10 sentences sentiment result: {result}")

First 10 sentences sentiment result: sentiments=[SentimentAnalysisResponse(sentiment='Negative'), SentimentAnalysisResponse(sentiment='Positive'), SentimentAnalysisResponse(sentiment='Positive'), SentimentAnalysisResponse(sentiment='Neutral'), SentimentAnalysisResponse(sentiment='Positive'), SentimentAnalysisResponse(sentiment='Neutral'), SentimentAnalysisResponse(sentiment='Neutral'), SentimentAnalysisResponse(sentiment='Negative'), SentimentAnalysisResponse(sentiment='Neutral'), SentimentAnalysisResponse(sentiment='Neutral')]


In [8]:
# Initialize lists to store the sentiments and their corresponding indices
all_sentiments = []
processed_indices = []
failed_indices = []

# Process the dataset in batches of 5
for i in tqdm(range(0, len(dataset), 5)):
    batch = dataset[i:i+5].to_list()
    batch_indices = list(range(i, min(i+5, len(dataset))))
    
    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.sentiments) == len(batch):
            all_sentiments.extend([response.sentiment for response in result.sentiments])
            processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as failed
            failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+4}: {str(e)}")
        failed_indices.extend(batch_indices)

# Create a new dataframe with successfully processed sentences and sentiments
labeled_df = pd.DataFrame({
    'index': processed_indices,
    'text': dataset.iloc[processed_indices],
    'sentiment': all_sentiments
})

# Sort the dataframe by the original index
labeled_df = labeled_df.sort_values('index').reset_index(drop=True)

# Save the labeled dataset to a CSV file
output_path = './labeled/Claude-Haiku-Labels.csv'
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

# Save the failed indices to a separate file
failed_indices_path = './labeled/failed_indices=Claude-Haiku.csv'
pd.DataFrame({'failed_index': failed_indices}).to_csv(failed_indices_path, index=False)

print(f"Failed indices saved to {failed_indices_path}")
print(f"Number of successfully processed sentences: {len(processed_indices)}")
print(f"Number of failed sentences: {len(failed_indices)}")

 13%|███████████████████████████▏                                                                                                                                                                                        | 77/600 [01:26<08:39,  1.01it/s]

Error processing batch 380-384: 1 validation error for Data
sentiments
  value is not a valid list (type=type_error.list)


 23%|████████████████████████████████████████████████▌                                                                                                                                                                  | 138/600 [02:34<07:00,  1.10it/s]

Error processing batch 685-689: 1 validation error for Data
sentiments
  value is not a valid list (type=type_error.list)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [10:01<00:00,  1.00s/it]

Labeled dataset saved to ./labeled/Claude-Haiku-Labels.csv
Failed indices saved to ./labeled/failed_indices=Claude-Haiku.csv
Number of successfully processed sentences: 2750
Number of failed sentences: 250





## Processing indcies that were not parsed correctly in first iteration of the API call

In [9]:
# Load the previously processed data
labeled_df = pd.read_csv('./labeled/Claude-Haiku-Labels.csv')
failed_indices = pd.read_csv('./labeled/failed_indices=Claude-Haiku.csv')['failed_index'].tolist()

# Load the original dataset
original_dataset = pd.read_csv('2000-5000_sentences.csv')["text"]

# Initialize lists to store the new sentiments and their corresponding indices
new_sentiments = []
new_processed_indices = []
still_failed_indices = []

# Process the failed sentences
for i in tqdm(range(0, len(failed_indices), 3)):
    batch_indices = failed_indices[i:i+3]
    batch = original_dataset.iloc[batch_indices].tolist()
    
    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.sentiments) == len(batch):
            new_sentiments.extend([response.sentiment for response in result.sentiments])
            new_processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as still failed
            still_failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+2}: {str(e)}")
        still_failed_indices.extend(batch_indices)

# Create a new dataframe with newly processed sentences and sentiments
new_labeled_df = pd.DataFrame({
    'index': new_processed_indices,
    'text': original_dataset.iloc[new_processed_indices],
    'sentiment': new_sentiments
})

# Combine the previously processed data with the newly processed data
combined_df = pd.concat([labeled_df, new_labeled_df], ignore_index=True)

# Sort the dataframe by the original index and reset the index
combined_df = combined_df.sort_values('index').reset_index(drop=True)

# If there are still failed indices, add them to the combined dataframe with NaN sentiment
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'text': original_dataset.iloc[still_failed_indices],
        'sentiment': pd.NA
    })
    combined_df = pd.concat([combined_df, failed_df], ignore_index=True)
    combined_df = combined_df.sort_values('index').reset_index(drop=True)

# Save the complete labeled dataset to a CSV file
output_path = './labeled/complete-2000-5000-Claude-Haiku-Labels.csv'
combined_df[["text", "sentiment"]].to_csv(output_path, index=False)

print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df)}")
print(f"Successfully labeled sentences: {combined_df['sentiment'].notna().sum()}")
print(f"Failed sentences: {combined_df['sentiment'].isna().sum()}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [01:16<00:00,  1.10it/s]

Complete labeled dataset saved to ./labeled/complete-2000-5000-Claude-Haiku-Labels.csv
Total processed sentences: 3000
Successfully labeled sentences: 2990
Failed sentences: 10





### Individualy calling the API for the 10 falied indices

In [10]:
import pandas as pd

# Load the complete labeled dataset
combined_df = pd.read_csv('./labeled/complete-2000-5000-Claude-Haiku-Labels.csv')

for idx in tqdm(combined_df[combined_df['sentiment'].isna()].index, desc="Processing sentences"):
    row = combined_df.loc[idx]
    try:
        # Invoke the API and update the sentiment
        combined_df.at[idx, 'sentiment'] = runnable_single.invoke({"sentences": row['text']}).sentiment
    except Exception as e:
        print(f"Error processing sentence at index {idx}: {str(e)}")

# Save the updated complete labeled dataset
output_path = './labeled/ccomplete-2000-5000-Claude-Haiku-Labels-final.csv'
combined_df.to_csv(output_path, index=False)

# Display the summary of the updated dataset
updated_total = len(combined_df)
updated_successful = combined_df['sentiment'].notna().sum()
updated_failed = combined_df['sentiment'].isna().sum()

print(f"Updated complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {updated_total}")
print(f"Successfully labeled sentences: {updated_successful}")
print(f"Failed sentences: {updated_failed}")


Processing sentences: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:10<00:00,  1.09s/it]

Updated complete labeled dataset saved to ./labeled/ccomplete-2000-5000-Claude-Haiku-Labels-final.csv
Total processed sentences: 3000
Successfully labeled sentences: 3000
Failed sentences: 0





In [11]:
combined_df[["text", "sentiment"]].to_csv(output_path, index=False)

______________

## Now we will translate the AAE sentences to SAE

In [45]:
class StandardEnglish(BaseModel):
    standard_english: str = Field(description="The tweet converted into Standard American English.")

class Data(BaseModel):
    """Convert the list of tweets provided to standard american english."""
    standard_english_tweets: List[StandardEnglish] = Field(description="The list of converted tweets by order of the sentences given.")

# Define the prompt template
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You will be given a list of tweets extracted from twitter accounts belonging to African American individuals. Your task is to convert the given tweet to Standard American English."""
        ),
        ("user", "{sentences}")
    ]
)

model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
    max_retries=2, temperature=0)

# Create the runnable chain
runnable = chat_template | model.with_structured_output(schema=Data)

chat_template_single = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You will be given a tweet extracted from a twitter account belonging to an African American individual. Your task is to convert the given tweet to Standard American English."""
        ),
        ("user", "{sentence}")
    ]
)

runnable_single = chat_template_single | model.with_structured_output(schema=StandardEnglish)

In [13]:
aae_dataset = pd.read_csv('2000-5000_sentences.csv')["text"]

In [14]:
all_sae_sentence = []
processed_indices = []
failed_indices = []

In [15]:
# Process the dataset in batches of 5
for i in tqdm(range(0, len(aae_dataset), 5)):
    batch = aae_dataset[i:i+5].to_list()
    batch_indices = list(range(i, min(i+5, len(aae_dataset))))
    
    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.standard_english_tweets) == len(batch):
            all_sae_sentence.extend([response.standard_english for response in result.standard_english_tweets])
            processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as failed
            failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+4}: {str(e)}")
        failed_indices.extend(batch_indices)

# Create a new dataframe with successfully processed sentences and sentiments
labeled_df = pd.DataFrame({
    'index': processed_indices,
    'african_american_english': aae_dataset.iloc[processed_indices],
    'standard_american_english': all_sae_sentence
})

# Sort the dataframe by the original index
labeled_df = labeled_df.sort_values('index').reset_index(drop=True)

# Save the labeled dataset to a CSV file
output_path = './labeled/Claude-Haiku-SAE.csv'
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

# Save the failed indices to a separate file
failed_indices_path = './labeled/failed_indices_Claude-Haiku-SAE.csv'
pd.DataFrame({'failed_index': failed_indices}).to_csv(failed_indices_path, index=False)

print(f"Failed indices saved to {failed_indices_path}")
print(f"Number of successfully processed sentences: {len(processed_indices)}")
print(f"Number of failed sentences: {len(failed_indices)}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [15:43<00:00,  1.57s/it]

Labeled dataset saved to ./labeled/Claude-Haiku-SAE.csv
Failed indices saved to ./labeled/failed_indices_Claude-Haiku-SAE.csv
Number of successfully processed sentences: 2470
Number of failed sentences: 530





## Processing the failed 530 sentences with lower batch size

In [16]:
# Load the previously processed data
labeled_df = pd.read_csv('./labeled/Claude-Haiku-SAE.csv')
failed_indices = pd.read_csv('./labeled/failed_indices_Claude-Haiku-SAE.csv')['failed_index'].tolist()

new_all_sae_sentence = []
new_processed_indices = []
still_failed_indices = []

# Process the failed sentences
for i in tqdm(range(0, len(failed_indices), 3)):
    batch_indices = failed_indices[i:i+3]
    batch = aae_dataset.iloc[batch_indices].tolist()
    
    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})
        
        
        if len(result.standard_english_tweets) == len(batch):
            new_all_sae_sentence.extend([response.standard_english for response in result.standard_english_tweets])
            new_processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as still failed
            still_failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+2}: {str(e)}")
        still_failed_indices.extend(batch_indices)

# Create a new dataframe with newly processed sentences and sentiments
new_labeled_df = pd.DataFrame({
    'index': new_processed_indices,
    'african_american_english': aae_dataset.iloc[new_processed_indices],
    'standard_american_english': new_all_sae_sentence
})

# Combine the previously processed data with the newly processed data
combined_df = pd.concat([labeled_df, new_labeled_df], ignore_index=True)

# Sort the dataframe by the original index and reset the index
combined_df = combined_df.sort_values('index').reset_index(drop=True)

# If there are still failed indices, add them to the combined dataframe with NaN sentiment
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'african_american_english': aae_dataset.iloc[still_failed_indices],
        'standard_american_english': pd.NA
    })
    combined_df = pd.concat([combined_df, failed_df], ignore_index=True)
    combined_df = combined_df.sort_values('index').reset_index(drop=True)

output_path = './labeled/complete-3000-Claude-Haiku-SAE.csv'
combined_df.to_csv(output_path, index=False)

print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df)}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [03:31<00:00,  1.19s/it]

Complete labeled dataset saved to ./labeled/complete-3000-Claude-Haiku-SAE.csv
Total processed sentences: 3000





In [37]:
print(f"failed indices: {combined_df['standard_american_english'].isna().sum()}")

failed indices: 218


## Individually calling the API for the failed 218 sentences

In [44]:
import pandas as pd

# Load the complete labeled dataset
combined_df = pd.read_csv('./labeled/complete-3000-Claude-Haiku-SAE.csv')

# Apply the lambda function to process the indices where sentiment is NaN
combined_df['standard_american_english'] = combined_df.apply(
    lambda row: runnable_single.invoke({"sentence": row['african_american_english']}).standard_english
    if pd.isna(row['standard_american_english']) else row['standard_american_english'], 
    axis=1
)

# Save the updated complete labeled dataset
output_path = './labeled/complete-3000-Claude-Haiku-SAE-FINAL.csv'
combined_df.to_csv(output_path, index=False)

# Display the summary of the updated dataset
updated_total = len(combined_df)
updated_successful = combined_df['standard_american_english'].notna().sum()
updated_failed = combined_df['standard_american_english'].isna().sum()

print(f"Updated complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {updated_total}")
print(f"Successfully labeled sentences: {updated_successful}")
print(f"Failed sentences: {updated_failed}")

BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Output blocked by content filtering policy'}}

Claude blocking some output due to inappropriate words. Let's allow the model to mask these inapproaiate words so we may have a SAE translation.

*You will be given a tweet extracted from a twitter account belonging to an African American individual. Your task is to convert the given tweet to Standard American English. **In rare circumstances, you are allowed to mask any inappropriate word but try keeping your translation as accurate as possible.***

In [51]:
chat_template_single = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You will be given a tweet extracted from a twitter account belonging to an African American individual. Your task is to convert the given tweet to Standard American English. In rare circumstances, you are allowed to sensor out any inappropriate word in your translation."""
        ),
        ("user", "{sentence}")
    ]
)

runnable_single = chat_template_single | model.with_structured_output(schema=StandardEnglish)

In [52]:
import pandas as pd

# Load the complete labeled dataset
combined_df = pd.read_csv('./labeled/complete-3000-Claude-Haiku-SAE.csv')

# Apply the lambda function to process the indices where sentiment is NaN
combined_df['standard_american_english'] = combined_df.apply(
    lambda row: runnable_single.invoke({"sentence": row['african_american_english']}).standard_english
    if pd.isna(row['standard_american_english']) else row['standard_american_english'], 
    axis=1
)

# Save the updated complete labeled dataset
output_path = './labeled/complete-3000-Claude-Haiku-SAE-FINAL.csv'
combined_df.to_csv(output_path, index=False)

# Display the summary of the updated dataset
updated_total = len(combined_df)
updated_successful = combined_df['standard_american_english'].notna().sum()
updated_failed = combined_df['standard_american_english'].isna().sum()

print(f"Updated complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {updated_total}")
print(f"Successfully labeled sentences: {updated_successful}")
print(f"Failed sentences: {updated_failed}")

Updated complete labeled dataset saved to ./labeled/complete-3000-Claude-Haiku-SAE-FINAL.csv
Total processed sentences: 3000
Successfully labeled sentences: 3000
Failed sentences: 0


______________

## Getting sentiment labels for SAE

In [54]:
class SentimentAnalysisResponse(BaseModel):
    sentiment: str = Field(description="The sentiment of the sentence (Positive, Negative, or Neutral)")

class Data(BaseModel):
    """Extracted data about sentences."""
    sentiments: List[SentimentAnalysisResponse]

# Define the prompt template
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentences written in Standard American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral for each sentence."""
        ),
        ("user", "{sentences}")
    ]
)

model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
    max_retries=2, temperature=0)

# Create the runnable chain
sae_labels_runnable = chat_template | model.with_structured_output(schema=Data)

class SentimentAnalysisResponse(BaseModel):
    sentiment: str = Field(description="The sentiment of the sentence (Positive, Negative, or Neutral)")

chat_template_single = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentence written in Standard American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral."""
        ),
        ("user", "{sentences}")
    ]
)

sae_labels_runnable_single = chat_template_single | model.with_structured_output(schema=SentimentAnalysisResponse)

In [55]:
all_sae_sentiment = []
processed_indices = []
failed_indices = []
sae_dataset = pd.read_csv('./labeled/Claude-Haiku-SAE.csv')['standard_american_english']

In [56]:
sae_dataset[0]

'If I do not get this job tomorrow, I do not know what I will do. I am at the end of my rope.'

In [57]:
for i in tqdm(range(0, len(sae_dataset), 5)):
    batch = sae_dataset[i:i+5].to_list()
    batch_indices = list(range(i, min(i+5, len(sae_dataset))))
    
    try:
        result = sae_labels_runnable.invoke({"sentences": "\n".join(batch)})
        
        if len(result.sentiments) == len(batch):
            all_sae_sentiment.extend([response.sentiment for response in result.sentiments])
            processed_indices.extend(batch_indices)
        else:
            failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+4}: {str(e)}")
        failed_indices.extend(batch_indices)

labeled_df_sae_labels = pd.DataFrame({
    'index': processed_indices,
    'standard_american_english': sae_dataset.iloc[processed_indices],
    'sae_labels': all_sae_sentiment
})

# Sort the dataframe by the original index
labeled_df_sae_labels = labeled_df_sae_labels.sort_values('index').reset_index(drop=True)

output_path = './labeled/Claude-Haiku-sae-labels.csv'
labeled_df_sae_labels.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

# Save the failed indices to a separate file
failed_indices_path = './labeled/failed_indices_Claude-Haiku-sae-labels.csv'
pd.DataFrame({'failed_index': failed_indices}).to_csv(failed_indices_path, index=False)

print(f"Failed indices saved to {failed_indices_path}")
print(f"Number of successfully processed sentences: {len(processed_indices)}")
print(f"Number of failed sentences: {len(failed_indices)}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [10:14<00:00,  1.02s/it]

Labeled dataset saved to ./labeled/Claude-Haiku-sae-labels.csv
Failed indices saved to ./labeled/failed_indices_Claude-Haiku-sae-labels.csv
Number of successfully processed sentences: 2940
Number of failed sentences: 60





## Processing the failed 60 sentences

In [59]:
labeled_df = pd.read_csv('./labeled/Claude-Haiku-sae-labels.csv')
failed_indices = pd.read_csv('./labeled/failed_indices_Claude-Haiku-sae-labels.csv')['failed_index'].tolist()


new_all_sae_sentiment = []
new_processed_indices = []
still_failed_indices = []

for i in tqdm(range(0, len(failed_indices), 2)):
    batch_indices = failed_indices[i:i+2]
    batch = sae_dataset.iloc[batch_indices].tolist()
    try:
        result = sae_labels_runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.sentiments) == len(batch):
            new_all_sae_sentiment.extend([response.sentiment for response in result.sentiments])
            new_processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as still failed
            still_failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+2}: {str(e)}")
        still_failed_indices.extend(batch_indices)

# Create a new dataframe with newly processed sentences and sentiments
new_labeled_df = pd.DataFrame({
    'index': new_processed_indices,
    'standard_american_english': sae_dataset.iloc[new_processed_indices],
    'sae_labels': new_all_sae_sentiment
})

# Combine the previously processed data with the newly processed data
combined_df = pd.concat([labeled_df, new_labeled_df], ignore_index=True)

# Sort the dataframe by the original index and reset the index
combined_df = combined_df.sort_values('index').reset_index(drop=True)

# If there are still failed indices, add them to the combined dataframe with NaN sentiment
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'standard_american_english': sae_dataset.iloc[still_failed_indices],
        'sae_labels': pd.NA
    })
    combined_df = pd.concat([combined_df, failed_df], ignore_index=True)
    combined_df = combined_df.sort_values('index').reset_index(drop=True)

output_path = './labeled/complete-3000-Claude-Haiku-sae-labels.csv'
combined_df.to_csv(output_path, index=False)

print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df)}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:26<00:00,  1.11it/s]

Complete labeled dataset saved to ./labeled/complete-3000-Claude-Haiku-sae-labels.csv
Total processed sentences: 3000





______________

### Pipiline uptil now:
AAE -> AAE Sentiment -> Translate to SAE -> SAE Sentiment

## Now we go back to AAE using SAE, and finish off by obtaining sentiment on that.

AAE -> AAE Sentiment -> Translate to SAE -> SAE Sentiment -> **AAE_from_SAE -> AAE_from_SAE Sentiment**

In [63]:
class StandardEnglish(BaseModel):
    african_american_english: str = Field(description="The tweet converted into African American English.")

class Data(BaseModel):
    """Convert the list of SAE tweets provided to convert to African American English."""
    aae_from_sae_tweets: List[StandardEnglish] = Field(description="The list of converted tweets by order of the sentences given.")


aae_from_sae_chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You will be given a list of tweets in Standard American English. Your task is to convert the given tweets to African American English."""
        ),
        ("user", "{sentences}")
    ]
)

model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
    max_retries=2, temperature=0)


aae_from_sae_runnable = aae_from_sae_chat_template | model.with_structured_output(schema=Data)

chat_template_single = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You will be given a tweet in Standard American English. Your task is to convert the given tweet to African American English."""
        ),
        ("user", "{sentences}")
    ]
)

aae_from_sae_runnable_single = chat_template_single | model.with_structured_output(schema=StandardEnglish)

In [66]:
sae_dataset = pd.read_csv('./labeled/Claude-Haiku-sae-labels.csv')["standard_american_english"]

In [67]:
all_aae_from_sae_sentence = []
processed_indices = []
failed_indices = []

In [68]:
for i in tqdm(range(0, len(sae_dataset), 5)):
    batch = sae_dataset[i:i+5].to_list()
    batch_indices = list(range(i, min(i+5, len(sae_dataset))))
    
    try:
        result = aae_from_sae_runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.aae_from_sae_tweets) == len(batch):
            all_aae_from_sae_sentence.extend([response.african_american_english for response in result.aae_from_sae_tweets])
            processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as failed
            failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+4}: {str(e)}")
        failed_indices.extend(batch_indices)

# Create a new dataframe with successfully processed sentences and sentiments
labeled_df = pd.DataFrame({
    'index': processed_indices,
    'standard_american_english' : sae_dataset.iloc[processed_indices],
    'aae_from_sae': all_aae_from_sae_sentence
})

# Sort the dataframe by the original index
labeled_df = labeled_df.sort_values('index').reset_index(drop=True)

# Save the labeled dataset to a CSV file
output_path = './labeled/Claude-Haiku-AAE_from_SAE.csv'
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

# Save the failed indices to a separate file
failed_indices_path = './labeled/failed_indices_Claude-Haiku-AAE_from_SAE.csv'
pd.DataFrame({'failed_index': failed_indices}).to_csv(failed_indices_path, index=False)

print(f"Failed indices saved to {failed_indices_path}")
print(f"Number of successfully processed sentences: {len(processed_indices)}")
print(f"Number of failed sentences: {len(failed_indices)}")

  1%|█▍                                                                                                                                                                                                               | 4/600 [00:08<20:09,  2.03s/it]

Error processing batch 15-19: 5 validation errors for Data
aae_from_sae_tweets -> 0
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 1
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 2
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 3
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 4
  value is not a valid dict (type=type_error.dict)


  1%|██▍                                                                                                                                                                                                              | 7/600 [00:13<17:58,  1.82s/it]

Error processing batch 30-34: 5 validation errors for Data
aae_from_sae_tweets -> 0
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 1
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 2
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 3
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 4
  value is not a valid dict (type=type_error.dict)


 46%|████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 279/600 [09:23<10:01,  1.87s/it]

Error processing batch 1390-1394: 5 validation errors for Data
aae_from_sae_tweets -> 0
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 1
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 2
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 3
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 4
  value is not a valid dict (type=type_error.dict)


 48%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 285/600 [09:35<09:40,  1.84s/it]

Error processing batch 1420-1424: 5 validation errors for Data
aae_from_sae_tweets -> 0
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 1
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 2
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 3
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 4
  value is not a valid dict (type=type_error.dict)


 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 450/600 [15:09<04:35,  1.83s/it]

Error processing batch 2245-2249: 5 validation errors for Data
aae_from_sae_tweets -> 0
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 1
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 2
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 3
  value is not a valid dict (type=type_error.dict)
aae_from_sae_tweets -> 4
  value is not a valid dict (type=type_error.dict)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [20:18<00:00,  2.03s/it]

Labeled dataset saved to ./labeled/Claude-Haiku-AAE_from_SAE.csv
Failed indices saved to ./labeled/failed_indices_Claude-Haiku-AAE_from_SAE.csv
Number of successfully processed sentences: 2770
Number of failed sentences: 230





In [69]:
labeled_df = pd.read_csv('./labeled/Claude-Haiku-AAE_from_SAE.csv')
failed_indices = pd.read_csv('./labeled/failed_indices_Claude-Haiku-AAE_from_SAE.csv')['failed_index'].tolist()


new_aae_from_sae_sentence = []
new_processed_indices = []
still_failed_indices = []

for i in tqdm(range(0, len(failed_indices), 2)):
    batch_indices = failed_indices[i:i+2]
    batch = sae_dataset.iloc[batch_indices].tolist()
    try:
        result = aae_from_sae_runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.aae_from_sae_tweets) == len(batch):
            new_aae_from_sae_sentence.extend([response.african_american_english for response in result.aae_from_sae_tweets])
            new_processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as failed
            still_failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+2}: {str(e)}")
        still_failed_indices.extend(batch_indices)

# Create a new dataframe with newly processed sentences and sentiments
new_labeled_df = pd.DataFrame({
    'index': new_processed_indices,
    'standard_american_english': sae_dataset.iloc[new_processed_indices],
    'aae_from_sae': new_aae_from_sae_sentence
})

# Combine the previously processed data with the newly processed data
combined_df = pd.concat([labeled_df, new_labeled_df], ignore_index=True)

# Sort the dataframe by the original index and reset the index
combined_df = combined_df.sort_values('index').reset_index(drop=True)

# If there are still failed indices, add them to the combined dataframe with NaN sentiment
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'standard_american_english': sae_dataset.iloc[still_failed_indices],
        'aae_from_sae': pd.NA
    })
    combined_df = pd.concat([combined_df, failed_df], ignore_index=True)
    combined_df = combined_df.sort_values('index').reset_index(drop=True)

output_path = './labeled/complete-3000-Claude-Haiku-AAE_from_SAE.csv'
combined_df.to_csv(output_path, index=False)

print(f"Complete Claude-Haiku-AAE_from_SAE dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df)}")
print(f"Still Failed sentences: {len(still_failed_indices)}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [02:39<00:00,  1.39s/it]

Complete Claude-Haiku-AAE_from_SAE dataset saved to ./labeled/complete-3000-Claude-Haiku-AAE_from_SAE.csv
Total processed sentences: 3000
Still Failed sentences: 82





In [70]:
import pandas as pd

combined_df = pd.read_csv('./labeled/complete-3000-Claude-Haiku-AAE_from_SAE.csv')

# Apply the lambda function to process the indices where sentiment is NaN
combined_df['aae_from_sae'] = combined_df.apply(
    lambda row: aae_from_sae_runnable_single.invoke({"sentences": row['standard_american_english']}).african_american_english
    if pd.isna(row['aae_from_sae']) else row['aae_from_sae'], 
    axis=1
)

# Save the updated complete labeled dataset
output_path = './labeled/Claude-Haiku-AAE_from_SAE-FINAL.csv'
combined_df[["aae_from_sae", "standard_american_english"]].to_csv(output_path, index=False)

# Display the summary of the updated dataset
updated_total = len(combined_df)
updated_successful = combined_df['aae_from_sae'].notna().sum()
updated_failed = combined_df['aae_from_sae'].isna().sum()

print(f"Updated complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {updated_total}")
print(f"Successfully labeled sentences: {updated_successful}")
print(f"Failed sentences: {updated_failed}")

Updated complete labeled dataset saved to ./labeled/Claude-Haiku-AAE_from_SAE-FINAL.csv
Total processed sentences: 3000
Successfully labeled sentences: 3000
Failed sentences: 0


## Getting the sentiment for AAE_from_SAE

In [77]:
# Define the schema for sentiment analysis
class SentimentAnalysisResponse(BaseModel):
    sentiment: str = Field(description="The sentiment of the sentence (Positive, Negative, or Neutral)")

class Data(BaseModel):
    """Extracted data about sentences."""
    sentiments: List[SentimentAnalysisResponse]

# Define the prompt template
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. 
            The sentiment should be classified as Positive, Negative, or Neutral for each sentence."""
        ),
        ("user", "{sentences}")
    ]
)

model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
    max_retries=2, temperature=0)

# Create the runnable chain
runnable = chat_template | model.with_structured_output(schema=Data)

single_chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentence written in African American English and identify the sentiment expressed by the author. 
            The sentiment should be classified as Positive, Negative, or Neutral for each sentence."""
        ),
        ("user", "{sentences}")
    ]
)

runnable_single = single_chat_template | model.with_structured_output(schema=SentimentAnalysisResponse)

In [74]:
dataset = pd.read_csv("./labeled/Claude-Haiku-AAE_from_SAE.csv")["aae_from_sae"]

In [75]:
# Initialize lists to store the sentiments and their corresponding indices
all_sentiments = []
processed_indices = []
failed_indices = []

# Process the dataset in batches of 5
for i in tqdm(range(0, len(dataset), 5)):
    batch = dataset[i:i+5].to_list()
    batch_indices = list(range(i, min(i+5, len(dataset))))
    
    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.sentiments) == len(batch):
            all_sentiments.extend([response.sentiment for response in result.sentiments])
            processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as failed
            failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+4}: {str(e)}")
        failed_indices.extend(batch_indices)

# Create a new dataframe with successfully processed sentences and sentiments
labeled_df = pd.DataFrame({
    'index': processed_indices,
    'aae_from_sae': dataset.iloc[processed_indices],
    'sentiment': all_sentiments
})

# Sort the dataframe by the original index
labeled_df = labeled_df.sort_values('index').reset_index(drop=True)

# Save the labeled dataset to a CSV file
output_path = './labeled/Claude-Haiku-AAE_from_SAE_labels.csv'
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

# Save the failed indices to a separate file
failed_indices_path = './labeled/Claude-Haiku-AAE_from_SAE_labels_failed_indices.csv'
pd.DataFrame({'failed_index': failed_indices}).to_csv(failed_indices_path, index=False)

print(f"Failed indices saved to {failed_indices_path}")
print(f"Number of successfully processed sentences: {len(processed_indices)}")
print(f"Number of failed sentences: {len(failed_indices)}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [10:33<00:00,  1.06s/it]

Labeled dataset saved to ./labeled/Claude-Haiku-AAE_from_SAE_labels.csv
Failed indices saved to ./labeled/Claude-Haiku-AAE_from_SAE_labels_failed_indices.csv
Number of successfully processed sentences: 2910
Number of failed sentences: 90





In [76]:
# Load the previously processed data
labeled_df = pd.read_csv('./labeled/Claude-Haiku-AAE_from_SAE_labels.csv')
failed_indices = pd.read_csv('./labeled/Claude-Haiku-AAE_from_SAE_labels_failed_indices.csv')['failed_index'].tolist()


# Initialize lists to store the new sentiments and their corresponding indices
new_sentiments = []
new_processed_indices = []
still_failed_indices = []

# Process the failed sentences
for i in tqdm(range(0, len(failed_indices), 2)):
    batch_indices = failed_indices[i:i+2]
    batch = dataset.iloc[batch_indices].tolist()
    
    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.sentiments) == len(batch):
            new_sentiments.extend([response.sentiment for response in result.sentiments])
            new_processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as still failed
            still_failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+1}: {str(e)}")
        still_failed_indices.extend(batch_indices)

# Create a new dataframe with newly processed sentences and sentiments
new_labeled_df = pd.DataFrame({
    'index': new_processed_indices,
    'aae_from_sae': dataset.iloc[new_processed_indices],
    'sentiment': new_sentiments
})

# Combine the previously processed data with the newly processed data
combined_df = pd.concat([labeled_df, new_labeled_df], ignore_index=True)

# Sort the dataframe by the original index and reset the index
combined_df = combined_df.sort_values('index').reset_index(drop=True)

# If there are still failed indices, add them to the combined dataframe with NaN sentiment
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'aae_from_sae': dataset.iloc[still_failed_indices],
        'sentiment': pd.NA
    })
    combined_df = pd.concat([combined_df, failed_df], ignore_index=True)
    combined_df = combined_df.sort_values('index').reset_index(drop=True)

# Save the complete labeled dataset to a CSV file
output_path = './labeled/complete-3000-Claude-Haiku-AAE_from_SAE_labels.csv'
combined_df.to_csv(output_path, index=False)

print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df)}")
print(f"Successfully labeled sentences: {combined_df['sentiment'].notna().sum()}")
print(f"Failed sentences: {combined_df['sentiment'].isna().sum()}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:48<00:00,  1.07s/it]

Complete labeled dataset saved to ./labeled/complete-3000-Claude-Haiku-AAE_from_SAE_labels.csv
Total processed sentences: 3000
Successfully labeled sentences: 2956
Failed sentences: 44





In [78]:
import pandas as pd

combined_df = pd.read_csv('./labeled/complete-3000-Claude-Haiku-AAE_from_SAE_labels.csv')

# Apply the lambda function to process the indices where sentiment is NaN
combined_df['sentiment'] = combined_df.apply(
    lambda row: runnable_single.invoke({"sentences": row['aae_from_sae']}).sentiment 
    if pd.isna(row['sentiment']) else row['sentiment'], 
    axis=1
)

# Save the updated complete labeled dataset
output_path = './labeled/complete-3000-Claude-Haiku-AAE_from_SAE_labels-final.csv'
combined_df.to_csv(output_path, index=False)

# Display the summary of the updated dataset
updated_total = len(combined_df)
updated_successful = combined_df['sentiment'].notna().sum()
updated_failed = combined_df['sentiment'].isna().sum()

print(f"Updated complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {updated_total}")
print(f"Successfully labeled sentences: {updated_successful}")
print(f"Failed sentences: {updated_failed}")

Updated complete labeled dataset saved to ./labeled/complete-3000-Claude-Haiku-AAE_from_SAE_labels-final.csv
Total processed sentences: 3000
Successfully labeled sentences: 3000
Failed sentences: 0


In [79]:
result = combined_df[["aae_from_sae", "sentiment"]]

In [80]:
result.to_csv(output_path, index=False)