In [3]:
# Short tag for the model being evaluated; it is interpolated into the path of
# the SAE input CSV (./labeled/{model_name}-sae.csv) when the data is loaded.
model_name = 'gpt-3.5'

In [4]:
import pandas as pd
import re
from collections import Counter
import os
from openai import OpenAI
import anthropic
from groq import Groq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
import torch

In [5]:
import getpass

# Local cache directory and tokenizer settings, exported as environment variables.
cache_dir = "./cache/"
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['TOKENIZERS_PARALLELISM'] = "false"

# API credentials: never hard-code keys in the notebook, and never overwrite a
# key that is already exported in the environment. If a key is missing, ask for
# it interactively (the typed value is not echoed and is not stored in the file).
for _key_name in ('OPENAI_API_KEY', 'ANTHROPIC_API_KEY'):
    if not os.environ.get(_key_name):
        _entered_key = getpass.getpass(f"Enter {_key_name} (leave blank to skip): ")
        if _entered_key:
            os.environ[_key_name] = _entered_key

In [82]:
from typing import List, Optional
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

# Structured-output schema for a single converted tweet: one string field.
# Documented with `#` comments on purpose — NOTE(review): a class docstring may be
# forwarded to the model as part of the schema description, so confirm before adding one.
class AAE(BaseModel):
    # The Field description is the text attached to this field in the schema.
    aae_english: str = Field(description="The tweet converted into African American English.")

class Data(BaseModel):
    """Convert the list of Standard American English tweets provided to African American English."""
    # The docstring above is used as the schema description when this class is
    # passed to `with_structured_output`, so it must state the same direction
    # (SAE -> AAE) as the system prompt and the field description below.
    # One AAE entry is expected per input tweet, in input order.
    african_american_english_tweets: List[AAE] = Field(description="The list of tweets converted to African American English ordered by the Standard American English sentences input.")

# Prompt for the batched SAE -> AAE conversion: a fixed system instruction plus a
# user message whose "{sentences}" placeholder is filled at invoke time.
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You will be given a list of tweets in Standard American English. Your task is to convert the given tweets to African American English."""
        ),
        ("user", "{sentences}")
    ]
)

# Define the language model
# (the commented-out lines are the Anthropic alternative; only one `model` is active)
# model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
#     max_retries=2, temperature=0)
model = ChatOpenAI(model="gpt-3.5-turbo-0125", timeout=None,
    max_retries=2, temperature=0)

# Create the runnable chain
# The prompt is piped into the model, which is asked to return output matching the `Data` schema.
runnable = chat_template | model.with_structured_output(schema=Data)

Here we load the dataset produced by the AAE -> SAE step and use those SAE sentences to convert back to AAE.

In [74]:
# Read the model-specific CSV and keep only its "sae_sentence" column (a pandas Series).
sae_dataset = pd.read_csv(f'./labeled/{model_name}-sae.csv')["sae_sentence"]

Keep track of the indices for which we successfully obtained an AAE conversion. Any indices in `failed_indices` will be retried afterwards.

In [75]:
# Accumulators filled by the batch loop:
all_aae_sentence = []    # converted AAE sentences, in the order they were processed
processed_indices = []   # dataset positions whose conversion succeeded
failed_indices = []      # dataset positions whose batch failed and must be retried

We send 5 inputs per request to reduce the number of API calls and therefore the API cost.

In [76]:
# Process the dataset in batches of 5 SAE sentences per request.
# The input is `sae_dataset`: the prompt asks the model to convert Standard
# American English tweets, and the saved frame pairs each result with
# `sae_dataset.iloc[...]`. (The previous `aae_dataset` name is never defined in
# this notebook.)
batch_size = 5

for i in tqdm(range(0, len(sae_dataset), batch_size)):
    batch = sae_dataset.iloc[i:i+batch_size].to_list()
    batch_indices = list(range(i, min(i+batch_size, len(sae_dataset))))

    try:
        # Sentences are newline-joined into a single user message.
        result = runnable.invoke({"sentences": "\n".join(batch)})

        # Accept the batch only when exactly one conversion came back per input;
        # otherwise outputs cannot be matched to inputs reliably.
        if len(result.african_american_english_tweets) == len(batch):
            all_aae_sentence.extend([response.aae_english for response in result.african_american_english_tweets])
            processed_indices.extend(batch_indices)
        else:
            # Count mismatch: mark the whole batch as failed so it is retried later
            failed_indices.extend(batch_indices)
    except Exception as e:
        # Best effort: log the failure, remember the indices, and keep going.
        print(f"Error processing batch {batch_indices[0]}-{batch_indices[-1]}: {str(e)}")
        failed_indices.extend(batch_indices)

# Create a new dataframe with the successfully converted sentences
labeled_df = pd.DataFrame({
    'index': processed_indices,
    'standard_american_english': sae_dataset.iloc[processed_indices],
    'aae_from_sae': all_aae_sentence
})

# Sort the dataframe by the original index
labeled_df = labeled_df.sort_values('index').reset_index(drop=True)

# Save the converted dataset to a CSV file
output_path = './labeled/gpt-3.5-AAE_from_SAE.csv'
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

# Save the failed indices to a separate file so a later cell can retry them
failed_indices_path = './labeled/failed_indices_gpt-3.5-AAE_From_SAE.csv'
pd.DataFrame({'failed_index': failed_indices}).to_csv(failed_indices_path, index=False)

print(f"Failed indices saved to {failed_indices_path}")
print(f"Number of successfully processed sentences: {len(processed_indices)}")
print(f"Number of failed sentences: {len(failed_indices)}")

100%|███████████████████████████████████████████████| 400/400 [15:51<00:00,  2.38s/it]

Labeled dataset saved to ./labeled/gpt-3.5-AAE_from_SAE.csv
Failed indices saved to ./labeled/failed_indices_gpt-3.5-AAE_From_SAE.csv
Number of successfully processed sentences: 1900
Number of failed sentences: 100





Retrying the failed indices, this time in smaller batches of 3.

In [83]:
# Load the previously processed data and the indices that failed in the first pass
labeled_df = pd.read_csv('./labeled/gpt-3.5-AAE_from_SAE.csv')
failed_indices = pd.read_csv('./labeled/failed_indices_gpt-3.5-AAE_From_SAE.csv')['failed_index'].tolist()

# Initialize lists to store the new conversions and their corresponding indices
new_sentiments = []          # newly converted AAE sentences (name kept from the original cell)
new_processed_indices = []
still_failed_indices = []

# Retry the failed sentences in smaller batches of 3.
# The input is `sae_dataset`: these are the SAE sentences being converted, and
# the frames below pair results with `sae_dataset.iloc[...]`. (The previous
# `aae_dataset` name is never defined in this notebook.)
for i in tqdm(range(0, len(failed_indices), 3)):
    batch_indices = failed_indices[i:i+3]
    batch = sae_dataset.iloc[batch_indices].tolist()

    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})

        # Accept the batch only when exactly one conversion came back per input
        if len(result.african_american_english_tweets) == len(batch):
            new_sentiments.extend([response.aae_english for response in result.african_american_english_tweets])
            new_processed_indices.extend(batch_indices)
        else:
            # Count mismatch: mark the whole batch as still failed
            still_failed_indices.extend(batch_indices)
    except Exception as e:
        # The range printed here is the position within `failed_indices`,
        # not the dataset index.
        print(f"Error processing batch {i}-{i+len(batch_indices)-1}: {str(e)}")
        still_failed_indices.extend(batch_indices)

# Create a new dataframe with the newly converted sentences
new_labeled_df = pd.DataFrame({
    'index': new_processed_indices,
    'standard_american_english': sae_dataset.iloc[new_processed_indices],
    'aae_from_sae': new_sentiments
})

# Combine the previously processed data with the newly processed data
combined_df = pd.concat([labeled_df, new_labeled_df], ignore_index=True)

# Sort the dataframe by the original index and reset the index
combined_df = combined_df.sort_values('index').reset_index(drop=True)

# If there are still failed indices, add them with a missing (NA) conversion so
# the output keeps one row per input sentence
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'standard_american_english': sae_dataset.iloc[still_failed_indices],
        'aae_from_sae': pd.NA
    })
    combined_df = pd.concat([combined_df, failed_df], ignore_index=True)
    combined_df = combined_df.sort_values('index').reset_index(drop=True)

# Save the complete dataset to a CSV file
output_path = './labeled/complete-2000-with-gpt-3.5-AAE_from_SAE.csv'
combined_df.to_csv(output_path, index=False)

print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df)}")
print(f"Successfully labeled sentences: {combined_df['aae_from_sae'].notna().sum()}")
print(f"Failed sentences: {combined_df['aae_from_sae'].isna().sum()}")

 32%|███████████▉                         | 11/34 [00:52<04:51, 12.69s/it]

Error processing batch 30-32: Function Data arguments:

{"african_american_english_tweets":[{"aae_english":"So @taaylor5 decided to go to the shop without me, shaking my head. Ready to get my hair and nails done today... :)","aae_english":"So @taaylor5 decided to go to the shop without me, shaking my head. Ready to get my hair and nails done today... :)","aae_english":"So @taaylor5 decided to go to the shop without me, shaking my head. Ready to get my hair and nails done today... :)","aae_english":"So @taaylor5 decided to go to the shop without me, shaking my head. Ready to get my hair and nails done today... :)","aae_english":"So @taaylor5 decided to go to the shop without me, shaking my head. Ready to get my hair and nails done today... :)","aae_english":"So @taaylor5 decided to go to the shop without me, shaking my head. Ready to get my hair and nails done today... :)","aae_english":"So @taaylor5 decided to go to the shop without me, shaking my head. Ready to get my hair and nails d

100%|█████████████████████████████████████| 34/34 [01:28<00:00,  2.59s/it]

Complete labeled dataset saved to ./labeled/complete-2000-with-gpt-3.5-AAE_from_SAE.csv
Total processed sentences: 2000
Successfully labeled sentences: 1976
Failed sentences: 24





We still have **24** failed sentences, so now we will process them one by one.

In [84]:
# Structured-output schema for the one-tweet-at-a-time chain: a single string field.
# This re-declares the `AAE` class defined earlier in the notebook with the same field.
# NOTE(review): kept as `#` comments because a class docstring may be forwarded to
# the model as the schema description — confirm before adding one.
class AAE(BaseModel):
    aae_english: str = Field(description="The tweet converted into African American English.")

# Prompt for converting ONE tweet per request; the user message is filled from
# the "{sentence}" placeholder (singular, unlike the batched prompt's "{sentences}").
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You will be given a tweet in Standard American English. Your task is to convert the given tweet to African American English."""
        ),
        ("user", "{sentence}")
    ]
)

# Define the language model
# (the commented-out lines are the Anthropic alternative; only one `model` is active)
# model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
#     max_retries=2, temperature=0)

model = ChatOpenAI(model="gpt-3.5-turbo-0125", timeout=None,
    max_retries=2, temperature=0)

# Create the runnable chain
# This rebinds `runnable`: from here on it expects {"sentence": ...} and returns an `AAE` object.
runnable = chat_template | model.with_structured_output(schema=AAE)

In [85]:
# Reload the merged results written by the batch-retry step.
complete_df = pd.read_csv('./labeled/complete-2000-with-gpt-3.5-AAE_from_SAE.csv')

# Work on a copy of the rows that still have no AAE conversion.
missing_mask = complete_df['aae_from_sae'].isna()
failed_df = complete_df[missing_mask].copy()

# Convert each remaining sentence in its own request; a failure is logged and
# the row simply stays empty.
pending_sentences = failed_df['standard_american_english'].items()
for row_label, sae_text in tqdm(pending_sentences, total=len(failed_df)):
    try:
        converted = runnable.invoke({"sentence": sae_text})
        failed_df.at[row_label, 'aae_from_sae'] = converted.aae_english
    except Exception as exc:
        print(f"Error processing sentence at index {row_label}: {str(exc)}")

# Copy the freshly filled values back into the full frame (aligned on the row index).
complete_df.update(failed_df)

# Write the updated dataset to its own file.
output_path = './labeled/complete-2000-with-gpt-3.5-AAE_from_SAE_updated.csv'
complete_df.to_csv(output_path, index=False)

# Summary of what is now labeled versus still missing.
labeled_count = complete_df['aae_from_sae'].notna().sum()
missing_count = complete_df['aae_from_sae'].isna().sum()
print(f"Updated complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(complete_df)}")
print(f"Successfully labeled sentences: {labeled_count}")
print(f"Failed sentences: {missing_count}")

100%|█████████████████████████████████████| 24/24 [00:16<00:00,  1.46it/s]

Updated complete labeled dataset saved to ./labeled/complete-2000-with-gpt-3.5-AAE_from_SAE_updated.csv
Total processed sentences: 2000
Successfully labeled sentences: 2000
Failed sentences: 0





_______________

 -------------- ISSUE DETECTED - SOME SAE SENTENCES WERE <UNKNOWN>, hence their sentiment is also <UNKNOWN> ---------------------

In [39]:
# Structured-output schema for the AAE -> SAE direction: a single string field.
# NOTE(review): kept as `#` comments because a class docstring may be forwarded to
# the model as the schema description — confirm before adding one.
class SAE(BaseModel):
    sae_english: str = Field(description="The tweet converted into Standard American English.")

# Prompt for the reverse direction (AAE -> SAE), one tweet per request.
# Note: this rebinds the names `chat_template` and `model`; `runnable` itself is
# not reassigned in this cell.
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You will be given a tweet in African American English. Your task is to convert the given tweet to Standard American English."""
        ),
        ("user", "{sentence}")
    ]
)

# Define the language model
model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
    max_retries=2, temperature=0)

# Create the runnable chain
# Stored under a separate name; it expects {"sentence": ...} and returns an `SAE` object.
runnable_toSAE = chat_template | model.with_structured_output(schema=SAE)

Get SAE from AAE

In [46]:
# Spot-check the AAE -> SAE chain on one tweet; the returned SAE object is the cell output.
runnable_toSAE.invoke({"sentence" : "@LoveGamesOxygen who made amy dress for the finale the red one at the end please tell me"})

SAE(sae_english='who made amy dress for the finale the red one at the end please tell me')

Get AAE from SAE

In [63]:
# Spot-check the SAE -> AAE chain on one sentence; the returned AAE object is the cell output.
runnable.invoke({"sentence" : "How did you make the strawberries come up on the bananas? Is that an app or something?"})

AAE(aae_english='How did you make the strawberries come up on the bananas? Is that an app or something?')

In [30]:
def manual_input_sae(index):
    """Interactively replace the Standard American English sentence of one row.

    Prints the current row of the module-level ``complete_df``, prompts for a
    corrected sentence on standard input, and stores it in the
    ``standard_american_english`` column of that row (in place).

    Args:
        index: Row label in ``complete_df`` to display and overwrite.

    Returns:
        The sentence that was typed in.

    Raises:
        KeyError: If ``index`` is not a row label of ``complete_df``.
    """
    print(f"Current entry at index {index}:")
    # Look the row up by label (.loc) so the row shown is the very row that the
    # label-based `.at` assignment below modifies. A positional `.iloc` lookup
    # only matches when the frame has a default 0..n-1 index; a missing label
    # now fails here instead of letting `.at` silently append a new row.
    print(complete_df.loc[index])
    new_sentence = input("Enter the correct Standard American English sentence: ")
    complete_df.at[index, 'standard_american_english'] = new_sentence
    return new_sentence

In [43]:
def manual_input_aae(index):
    """Interactively replace the African American English sentence of one row.

    Prints the current row of the module-level ``complete_df``, prompts for a
    corrected sentence on standard input, and stores it in the
    ``aae_from_sae`` column of that row (in place).

    Args:
        index: Row label in ``complete_df`` to display and overwrite.

    Returns:
        The sentence that was typed in.

    Raises:
        KeyError: If ``index`` is not a row label of ``complete_df``.
    """
    print(f"Current entry at index {index}:")
    # Look the row up by label (.loc) so the row shown is the very row that the
    # label-based `.at` assignment below modifies. A positional `.iloc` lookup
    # only matches when the frame has a default 0..n-1 index; a missing label
    # now fails here instead of letting `.at` silently append a new row.
    print(complete_df.loc[index])
    new_sentence = input("Enter the correct African American English sentence: ")
    complete_df.at[index, 'aae_from_sae'] = new_sentence
    return new_sentence

In [28]:
# Usage
new_sentence = manual_input_sae(288)

Current entry at index 288:
index                              288
standard_american_english    <UNKNOWN>
aae_from_sae                 <UNKNOWN>
Name: 288, dtype: object


Enter the correct Standard American English sentence:  Soulja Boy, my friend, I went so hard to give the sun to more than just the USA. Ha, what? Yes, the world.


In [33]:
def process_single_sentence(sentence):
    """Convert one sentence with the module-level ``runnable`` chain.

    Args:
        sentence: Text passed to the chain under the ``"sentence"`` key.

    Returns:
        The ``aae_english`` attribute of the chain's result, or ``None`` when
        the call (or reading the attribute) raises; the error is printed.
    """
    try:
        converted = runnable.invoke({"sentence": sentence}).aae_english
    except Exception as exc:
        # Best effort: report the problem and signal failure with None.
        print(f"Error processing sentence: {str(exc)}")
        return None
    return converted

In [41]:
# Convert the manually entered sentence and show it next to the stored SAE text for row 288.
# NOTE(review): `aae_sentence` is only printed in this cell; it is not written
# back to complete_df here.
aae_sentence = process_single_sentence(new_sentence)
print(f"sae: {complete_df.iloc[288].standard_american_english}\naae: {aae_sentence}")

sae: Soulja Boy, my friend, I went so hard to give the sun to more than just the USA. Ha, what? Yes, the world.
aae: Soulja Boy, my friend, I went so hard to give the sun to more than just the USA. Ha, what? Yes, the world.


In [64]:
# Interactively enter the AAE sentence for row 1927; the typed text is stored in
# the 'aae_from_sae' column and also kept in `new_sentence`.
new_sentence = manual_input_aae(1927)

Current entry at index 1927:
index                                                                     1927
standard_american_english    How did you make the strawberries come up on t...
aae_from_sae                                                         <UNKNOWN>
Name: 1927, dtype: object


Enter the correct African American English sentence:  How did you make the strawberries come up on the bananas? Is that an app or something?


In [61]:
# Display the stored SAE sentence of the row at position 1927.
complete_df.iloc[1927].standard_american_english

'How did you make the strawberries come up on the bananas? Is that an app or something?'

In [65]:
# Select rows whose AAE conversion is still missing (NaN) or is the literal
# placeholder '<UNKNOWN>', then display their 'index' column; an empty Series
# means nothing is left to fix.
failed_df = complete_df[(complete_df['aae_from_sae'].isna()) | (complete_df['aae_from_sae'] == '<UNKNOWN>')].copy()
failed_df['index']

Series([], Name: index, dtype: int64)

_______________

In [66]:
# (Disabled) automatic re-processing of rows that are NaN or '<UNKNOWN>'.
# Kept for reference; the remaining rows were fixed manually in the cells above.
# failed_df = complete_df[(complete_df['aae_from_sae'].isna()) | (complete_df['aae_from_sae'] == '<UNKNOWN>')].copy()

# # Process failed sentences
# for index, row in tqdm(failed_df.iterrows(), total=len(failed_df)):
#     try:
#         result = runnable.invoke({"sentence": row['standard_american_english']})
#         failed_df.at[index, 'aae_from_sae'] = result.aae_english
#     except Exception as e:
#         print(f"Error processing sentence at index {index}: {str(e)}")

# # Update the complete dataset with newly processed sentences
# complete_df.update(failed_df)

# Save the updated complete dataset
output_path = './labeled/complete-2000-with-anthropic_Haiku-AAE_from_SAE_updated.csv'
complete_df.to_csv(output_path, index=False)

# The placeholder stored in the data is '<UNKNOWN>' (upper case). Compare
# case-insensitively so this count cannot report 0 just because of letter case.
unknown_count = complete_df['aae_from_sae'].astype(str).str.upper().eq('<UNKNOWN>').sum()

print(f"Updated complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(complete_df)}")
print(f"Successfully labeled sentences: {complete_df['aae_from_sae'].notna().sum()}")
print(f"Failed sentences: {complete_df['aae_from_sae'].isna().sum()}")
print(f"'<Unknown>' sentences remaining: {unknown_count}")

Updated complete labeled dataset saved to ./labeled/complete-2000-with-anthropic_Haiku-AAE_from_SAE_updated.csv
Total processed sentences: 2000
Successfully labeled sentences: 2000
Failed sentences: 0
'<Unknown>' sentences remaining: 0
