In [1]:
import pandas as pd
import re
from collections import Counter
import os
from openai import OpenAI
import anthropic
from groq import Groq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
import torch

In [2]:
cache_dir = "./cache/"
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ['OPENAI_API_KEY'] = 'openai-api-key-here'
os.environ['ANTHROPIC_API_KEY'] = 'anthropic-api-key-here'

In [5]:
from typing import List, Optional
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

# Define the schema for sentiment analysis
# Define the schema for sentiment analysis
class StandardEnglish(BaseModel):
    standard_english: str = Field(description="The tweet converted into Standard American English.")

class Data(BaseModel):
    """Convert the list of tweets provided to standard american english."""
    standard_english_tweets: List[StandardEnglish] = Field(description="The list of converted tweets by order of the sentences given.")

# Define the prompt template
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You will be given a list of tweets extracted from twitter accounts belonging to African American individuals. Your task is to convert the given tweet to Standard American English."""
        ),
        ("user", "{sentences}")
    ]
)

# Define the language model
# model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
#     max_retries=2, temperature=0)
model = ChatOpenAI(model="gpt-3.5-turbo-0125", timeout=None,
    max_retries=2, temperature=0)

# Create the runnable chain
runnable = chat_template | model.with_structured_output(schema=Data)

In [6]:
#runnable.invoke({"sentences":"I wanna scream and shout and let it all outtt and scream and shout and let it out we sayin' ohhhweeee oh we oh we ohhhhh!"})

In [14]:
aae_dataset = pd.read_csv('initial_2000_sentences.csv')["text"]

In [15]:
all_sae_sentence = []
processed_indices = []
failed_indices = []

In [18]:
# Process the dataset in batches of 10
# for i in tqdm(range(0, len(aae_dataset), 5)):
#     batch = aae_dataset[i:i+5].to_list()
#     batch_indices = list(range(i, min(i+5, len(aae_dataset))))
    
#     try:
#         result = runnable.invoke({"sentences": "\n".join(batch)})
        
#         # Check if the number of returned sentiments matches the batch size
#         if len(result.standard_english_tweets) == len(batch):
#             all_sae_sentence.extend([response.standard_english for response in result.standard_english_tweets])
#             processed_indices.extend(batch_indices)
#         else:
#             # If the number of sentiments doesn't match, mark all as failed
#             failed_indices.extend(batch_indices)
#     except Exception as e:
#         print(f"Error processing batch {i}-{i+4}: {str(e)}")
#         failed_indices.extend(batch_indices)

# Create a new dataframe with successfully processed sentences and sentiments
labeled_df = pd.DataFrame({
    'index': processed_indices,
    'text': aae_dataset.iloc[processed_indices],
    'standard_american_english': all_sae_sentence
})

# Sort the dataframe by the original index
labeled_df = labeled_df.sort_values('index').reset_index(drop=True)

# Save the labeled dataset to a CSV file
output_path = './labeled/anthropic_Haiku-SAE.csv'
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

# Save the failed indices to a separate file
failed_indices_path = './labeled/failed_indices_anthropic_Haiku-SAE.csv'
pd.DataFrame({'failed_index': failed_indices}).to_csv(failed_indices_path, index=False)

print(f"Failed indices saved to {failed_indices_path}")
print(f"Number of successfully processed sentences: {len(processed_indices)}")
print(f"Number of failed sentences: {len(failed_indices)}")

Labeled dataset saved to ./labeled/anthropic_Haiku-SAE.csv
Failed indices saved to ./labeled/failed_indices_anthropic_Haiku-SAE.csv
Number of successfully processed sentences: 1620
Number of failed sentences: 380


In [17]:
len(all_sae_sentence)

1620

In [20]:
# Load the previously processed data
labeled_df = pd.read_csv('./labeled/anthropic_Haiku-SAE.csv')
failed_indices = pd.read_csv('./labeled/failed_indices_anthropic_Haiku-SAE.csv')['failed_index'].tolist()

# Initialize lists to store the new sentiments and their corresponding indices
new_sentiments = []
new_processed_indices = []
still_failed_indices = []

# Process the failed sentences
for i in tqdm(range(0, len(failed_indices), 3)):
    batch_indices = failed_indices[i:i+3]
    batch = aae_dataset.iloc[batch_indices].tolist()
    
    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.standard_english_tweets) == len(batch):
            new_sentiments.extend([response.standard_english for response in result.standard_english_tweets])
            new_processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as still failed
            still_failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+2}: {str(e)}")
        still_failed_indices.extend(batch_indices)

# Create a new dataframe with newly processed sentences and sentiments
new_labeled_df = pd.DataFrame({
    'index': new_processed_indices,
    'text': original_dataset.iloc[new_processed_indices],
    'standard_american_english': new_sentiments
})

# Combine the previously processed data with the newly processed data
combined_df = pd.concat([labeled_df, new_labeled_df], ignore_index=True)

# Sort the dataframe by the original index and reset the index
combined_df = combined_df.sort_values('index').reset_index(drop=True)

# If there are still failed indices, add them to the combined dataframe with NaN sentiment
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'text': original_dataset.iloc[still_failed_indices],
        'sentiment': pd.NA
    })
    combined_df = pd.concat([combined_df, failed_df], ignore_index=True)
    combined_df = combined_df.sort_values('index').reset_index(drop=True)

# Save the complete labeled dataset to a CSV file
output_path = './labeled/complete-2000-Haiku-SAE-Convert.csv'
combined_df.to_csv(output_path, index=False)

print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df)}")
print(f"Successfully labeled sentences: {combined_df['sentiment'].notna().sum()}")
print(f"Failed sentences: {combined_df['sentiment'].isna().sum()}")

100%|███████████████████████████████████| 127/127 [03:19<00:00,  1.57s/it]


NameError: name 'original_dataset' is not defined

In [22]:
# Create a new dataframe with newly processed sentences and sentiments
new_labeled_df = pd.DataFrame({
    'index': new_processed_indices,
    'text': aae_dataset.iloc[new_processed_indices],
    'standard_american_english': new_sentiments
})

# Combine the previously processed data with the newly processed data
combined_df = pd.concat([labeled_df, new_labeled_df], ignore_index=True)

# Sort the dataframe by the original index and reset the index
combined_df = combined_df.sort_values('index').reset_index(drop=True)

# If there are still failed indices, add them to the combined dataframe with NaN sentiment
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'text': aae_dataset.iloc[still_failed_indices],
        'sentiment': pd.NA
    })
    combined_df = pd.concat([combined_df, failed_df], ignore_index=True)
    combined_df = combined_df.sort_values('index').reset_index(drop=True)

# Save the complete labeled dataset to a CSV file
output_path = './labeled/complete-2000-Haiku-SAE-Convert.csv'
combined_df.to_csv(output_path, index=False)

print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df)}")
print(f"Successfully labeled sentences: {combined_df['sentiment'].notna().sum()}")
print(f"Failed sentences: {combined_df['sentiment'].isna().sum()}")

Complete labeled dataset saved to ./labeled/complete-2000-Haiku-SAE-Convert.csv
Total processed sentences: 2000
Successfully labeled sentences: 0
Failed sentences: 2000


In [28]:
final_haiku_sae = pd.read_csv(output_path)
final_haiku_sae['standard_american_english'] = final_haiku_sae.apply(
    lambda row: runnable_single.invoke({"sentences": row['text']}).standard_english_tweets[0].standard_english if pd.isna(row['standard_american_english']) or row['standard_american_english'] == '' else row['standard_american_english'],
    axis=1
)

# Rename the 'text' column to 'aae'
final_haiku_sae.rename(columns={'text': 'aae'}, inplace=True)

# Keep only the 'aae' and 'standard_american_english' columns
final_haiku_sae = final_haiku_sae[['aae', 'standard_american_english']]

# Save the transformed dataset to a new CSV file
output_path = './labeled/complete-2000-Haiku-SAE-Convert-FINAL.csv'
final_haiku_sae.to_csv(output_path, index=False)

## Getting labels for SAE

In [125]:
# Define the schema for sentiment analysis
# Define the schema for sentiment analysis
class SentimentAnalysisResponse(BaseModel):
    sentiment: str = Field(description="The sentiment of the sentence (Positive, Negative, or Neutral)")

class Data(BaseModel):
    """Extracted data about sentences."""
    sentiments: List[SentimentAnalysisResponse]

# Define the prompt template
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentences written in Standard American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral for each sentence."""
        ),
        ("user", "{sentences}")
    ]
)

# Define the language model
model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

# Create the runnable chain
sae_labels_runnable = chat_template | model.with_structured_output(schema=Data)

In [126]:
all_sae_sentiment = []
processed_indices = []
failed_indices = []
sae_dataset = pd.read_csv('./labeled/gpt-3.5-sae.csv')['sae_sentence']

In [127]:
sae_dataset[0]

"Bitch can't get anything from me but bubble gum and hard dick from me. I told the bitch I'm trying to make a profit. I'm shooting dice with her rent money!"

In [128]:

#Process the dataset in batches of 10
for i in tqdm(range(0, len(sae_dataset), 10)):
    batch = sae_dataset[i:i+10].to_list()
    batch_indices = list(range(i, min(i+10, len(sae_dataset))))
    
    try:
        result = sae_labels_runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.sentiments) == len(batch):
            all_sae_sentiment.extend([response.sentiment for response in result.sentiments])
            processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as failed
            failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+9}: {str(e)}")
        failed_indices.extend(batch_indices)

# Create a new dataframe with successfully processed sentences and sentiments
labeled_df_sae_labels = pd.DataFrame({
    'index': processed_indices,
    'standard_american_english': sae_dataset.iloc[processed_indices],
    'sae_labels': all_sae_sentiment
})

# Sort the dataframe by the original index
labeled_df_sae_labels = labeled_df_sae_labels.sort_values('index').reset_index(drop=True)

# Save the labeled dataset to a CSV file
output_path = './labeled/gpt-3.5-sae-labels.csv'
labeled_df_sae_labels.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

# Save the failed indices to a separate file
failed_indices_path = './labeled/failed_indices_gpt-3.5-sae-labels.csv'
pd.DataFrame({'failed_index': failed_indices}).to_csv(failed_indices_path, index=False)

print(f"Failed indices saved to {failed_indices_path}")
print(f"Number of successfully processed sentences: {len(processed_indices)}")
print(f"Number of failed sentences: {len(failed_indices)}")

  6%|█▉                                  | 11/200 [00:11<03:03,  1.03it/s]

Error processing batch 110-119: sequence item 0: expected str instance, float found


 74%|█████████████████████████▋         | 147/200 [02:38<01:01,  1.16s/it]

Error processing batch 1470-1479: sequence item 0: expected str instance, float found


 98%|██████████████████████████████████▏| 195/200 [03:31<00:05,  1.11s/it]

Error processing batch 1950-1959: sequence item 0: expected str instance, float found


100%|███████████████████████████████████| 200/200 [03:35<00:00,  1.08s/it]

Labeled dataset saved to ./labeled/gpt-3.5-sae-labels.csv
Failed indices saved to ./labeled/failed_indices_gpt-3.5-sae-labels.csv
Number of successfully processed sentences: 1530
Number of failed sentences: 470





In [134]:
# Load the dataset
combined_df_sae_labels = pd.read_csv('./labeled/gpt-3.5-sae-labels.csv')

# Find the indices of the failed sentences
failed_indices = combined_df_sae_labels[combined_df_sae_labels['sae_labels'].isna()].index

# Iterate over the failed sentences and get their labels
for index in failed_indices:
    failed_sentence = combined_df_sae_labels.loc[index, 'standard_american_english']
    # Invoke the SAE labels runnable to get the label
    label = sae_labels_runnable.invoke({"sentences": failed_sentence}).sentiments[0].sentiment
    # Update the DataFrame with the new label
    combined_df_sae_labels.loc[index, 'sae_labels'] = label

# Save the updated dataset
output_path = './labeled/gpt-3.5-sae-labels-updated.csv'
combined_df_sae_labels.to_csv(output_path, index=False)

# Print summary
print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df_sae_labels)}")
print(f"Successfully labeled sentences: {combined_df_sae_labels['sae_labels'].notna().sum()}")
print(f"Failed sentences: {combined_df_sae_labels['sae_labels'].isna().sum()}")

Complete labeled dataset saved to ./labeled/gpt-3.5-sae-labels-updated.csv
Total processed sentences: 2000
Successfully labeled sentences: 2000
Failed sentences: 0


In [135]:
# Save the updated dataset
output_path = './labeled/gpt-3.5-sae-labels-updated.csv'
combined_df_sae_labels[["standard_american_english", "sae_labels"]].to_csv(output_path, index=False)

In [133]:
# Load the previously processed data
labeled_df = pd.read_csv('./labeled/gpt-3.5-sae-labels.csv')
failed_indices = pd.read_csv('./labeled/failed_indices_gpt-3.5-sae-labels.csv')['failed_index'].tolist()


# Initialize lists to store the new sentiments and their corresponding indices
new_sentiments = []
new_processed_indices = []
still_failed_indices = []

# Process the failed sentences
for i in tqdm(range(0, len(failed_indices), 3)):
    batch_indices = failed_indices[i:i+3]
    batch = sae_dataset.iloc[batch_indices].tolist()
    
    try:
        result = sae_labels_runnable.invoke({"sentences": "\n".join(batch)})
        
        # Check if the number of returned sentiments matches the batch size
        if len(result.sentiments) == len(batch):
            new_sentiments.extend([response.sentiment for response in result.sentiments])
            new_processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as still failed
            still_failed_indices.extend(batch_indices)
    except Exception as e:
        print(f"Error processing batch {i}-{i+2}: {str(e)}")
        still_failed_indices.extend(batch_indices)

# Create a new dataframe with newly processed sentences and sentiments
new_labeled_df_sae_labels = pd.DataFrame({
    'index': new_processed_indices,
    'standard_american_english': sae_dataset.iloc[new_processed_indices],
    'sae_labels': new_sentiments
})

# Combine the previously processed data with the newly processed data
combined_df_sae_labels = pd.concat([labeled_df_sae_labels, new_labeled_df_sae_labels], ignore_index=True)

# Sort the dataframe by the original index and reset the index
combined_df_sae_labels = combined_df_sae_labels.sort_values('index').reset_index(drop=True)

# If there are still failed indices, add them to the combined dataframe with NaN sentiment
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'standard_american_english': sae_dataset.iloc[still_failed_indices],
        'sae_labels': pd.NA
    })
    combined_df_sae_labels = pd.concat([combined_df_sae_labels, failed_df], ignore_index=True)
    combined_df_sae_labels = combined_df_sae_labels.sort_values('index').reset_index(drop=True)

# Save the complete labeled dataset to a CSV file
output_path = './labeled/gpt-3.5-sae-labels.csv'
combined_df_sae_labels.to_csv(output_path, index=False)

print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df_sae_labels)}")
print(f"Successfully labeled sentences: {combined_df_sae_labels['sae_labels'].notna().sum()}")
print(f"Failed sentences: {combined_df_sae_labels['sae_labels'].isna().sum()}")

  6%|██▎                                 | 10/157 [00:06<01:41,  1.44it/s]

Error processing batch 30-32: sequence item 0: expected str instance, float found


 76%|██████████████████████████▊        | 120/157 [01:14<00:21,  1.71it/s]

Error processing batch 360-362: sequence item 0: expected str instance, float found


 93%|████████████████████████████████▌  | 146/157 [01:31<00:07,  1.49it/s]

Error processing batch 438-440: sequence item 2: expected str instance, float found


100%|███████████████████████████████████| 157/157 [01:37<00:00,  1.61it/s]

Complete labeled dataset saved to ./labeled/gpt-3.5-sae-labels.csv
Total processed sentences: 2000
Successfully labeled sentences: 1934
Failed sentences: 66





In [None]:
combined_df_sae_labels.

In [109]:
print(f"Successfully labeled sentences: {combined_df_sae_labels['sae_labels'].notna().sum()}")
print(f"Failed sentences: {combined_df_sae_labels['sae_labels'].isna().sum()}")

Successfully labeled sentences: 2000
Failed sentences: 0


In [107]:
aab = combined_df_sae_labels[combined_df_sae_labels['sae_labels'].isna()].index

In [111]:
output_path = './labeled/complete-2000-Haiku-SAE-FINAL-Labels.csv'
combined_df_sae_labels[["standard_american_english", "sae_labels"]].to_csv(output_path, index=False)

In [106]:
combined_df_sae_labels.loc[1850, 'sae_labels'] = sae_labels_runnable.invoke({"sentences": combined_df_sae_labels.loc[1850, 'standard_american_english']}).sentiment

In [56]:
aae_sae = pd.read_csv('./labeled/complete-2000-Haiku-SAE-Convert-FINAL.csv')


In [60]:
aae_sae.iloc[968]

aae                          @LoveGamesOxygen who made amy dress for the fi...
standard_american_english    Who made Amy's dress for the finale, the red o...
Name: 968, dtype: object

In [59]:
aae_sae.loc[968, 'standard_american_english'] = runnable.invoke({"sentences": aae_sae.loc[968, 'aae']}).standard_english_tweets[0].standard_english

In [61]:
aae_sae.to_csv('./labeled/complete-2000-Haiku-SAE-Convert-FINAL.csv')