In [1]:
import pandas as pd
from typing import List, Optional
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_anthropic import ChatAnthropic
import os


os.environ['OPENAI_API_KEY'] = 'openai-api-key-here'
os.environ['ANTHROPIC_API_KEY'] = 'anthropic-api-key-here'

In [61]:
class SentimentAnalysisResponse(BaseModel):
    sentiment: str = Field(description="The sentiment of the sentence (Positive, Negative, or Neutral)")

# Define the prompt template
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentence written in African American English and identify the sentiment expressed by the author. 
            The sentiment should be classified as Positive, Negative, or Neutral ONLY. Do not make use of Mixed as a sentiment."""
        ),
        ("user", "{sentences}")
    ]
)
model = ChatAnthropic(model="claude-3-haiku-20240307", timeout=None,
    max_retries=2, temperature=0.0)
runnable_single = chat_template | model.with_structured_output(schema=SentimentAnalysisResponse)

In [48]:
# Function to process a single row
def process_row_sentiment(row):
    result = runnable_single.invoke({"sentences": row['text']})
    return result.sentiment

## Claude Haiku

1. AAE

In [4]:
aae = pd.read_csv("./labeled/Claude-Haiku-AAE-Labels.csv")
aae["sentiment"].unique()

array(['Negative', 'Positive', 'Neutral', 'Mixed', ': <UNKNOWN>',
       '<UNKNOWN>', ': Positive'], dtype=object)

In [5]:
# Process rows with non-standard sentiments
mask = ~aae["sentiment"].isin(["Positive", "Negative", "Neutral"])
aae.loc[mask, "sentiment"] = aae[mask].apply(process_row_sentiment, axis=1)

In [6]:
aae["sentiment"].unique()

array(['Negative', 'Positive', 'Neutral', 'Mixed'], dtype=object)

In [12]:
mixed_sentiment_rows = aae[aae["sentiment"] == "Mixed"]
mixed_sentiment_rows

Unnamed: 0,text,sentiment
487,@itsneshaaaa I'm still mad at u for leaving me...,Mixed


In [14]:
print(mixed_sentiment_rows['text'])

487    @itsneshaaaa I'm still mad at u for leaving me...
Name: text, dtype: object


In [49]:
aae.loc[487, "sentiment"] = runnable_single.invoke({"sentences": aae.loc[487, "text"]}).sentiment

In [52]:
aae.to_csv("./labeled/Claude-Haiku-AAE-Labels.csv")

2. SAE Labels

In [54]:
sae = pd.read_csv("./labeled/Claude-Haiku-sae-labels.csv")
sae["sae_labels"].unique()

array(['Negative', 'Positive', 'Neutral', nan, 'Mixed', '<UNKNOWN>'],
      dtype=object)

In [55]:
# Process rows with non-standard sentiments
mask = ~sae["sae_labels"].isin(["Positive", "Negative", "Neutral"])
sae.loc[mask, "sae_labels"] = aae[mask].apply(process_row_sentiment, axis=1)

In [56]:
sae["sae_labels"].unique()

array(['Negative', 'Positive', 'Neutral', 'Mixed'], dtype=object)

In [57]:
sae.to_csv("./labeled/Claude-Haiku-sae-labels.csv")

3. AAE from SAE labels

In [58]:
aae_from_sae = pd.read_csv("./labeled/Claude-Haiku-AAE_from_SAE_labels-final.csv")

In [62]:
aae_from_sae["sentiment"].unique()

array(['Negative', 'Positive', 'Neutral', 'Mixed', 'Sarcastic',
       '<UNKNOWN>'], dtype=object)

In [63]:
# Process rows with non-standard sentiments
mask = ~aae_from_sae["sentiment"].isin(["Positive", "Negative", "Neutral"])
aae_from_sae.loc[mask, "sentiment"] = aae[mask].apply(process_row_sentiment, axis=1)

In [64]:
aae_from_sae["sentiment"].unique()

array(['Negative', 'Positive', 'Neutral', 'Mixed'], dtype=object)

## GPT

In [66]:
os.environ['OPENAI_API_KEY'] = 'openai-api-key-here'

class SentimentAnalysisResponse(BaseModel):
    sentiment: str = Field(description="The sentiment of the sentence (Positive, Negative, or Neutral)")

# Define the prompt template
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentence written in African American English and identify the sentiment expressed by the author. 
            The sentiment should be classified as Positive, Negative, or Neutral ONLY. Do not make use of Mixed as a sentiment."""
        ),
        ("user", "{sentences}")
    ]
)
# Define the language model
model = ChatOpenAI(model="gpt-4o-mini", timeout=None,
    max_retries=2, temperature=0.0)
runnable_single = chat_template | model.with_structured_output(schema=SentimentAnalysisResponse)

# Function to process a single row
def process_row_sentiment(row):
    result = runnable_single.invoke({"sentences": row['text']})
    return result.sentiment

1. AAE labels

In [67]:
aae = pd.read_csv("./labeled/GPT-4o-mini-AAE-Labels.csv")
aae["sentiment"].unique()

array(['Negative', 'Neutral', 'Positive'], dtype=object)

2. SAE Labels

In [69]:
sae = pd.read_csv("./labeled/GPT-4o-mini-sae-labels.csv")
sae["sae_labels"].unique()

array(['Negative', 'Neutral', 'Positive'], dtype=object)

3. AAE from SAE labels

In [70]:
aae_from_sae = pd.read_csv("./labeled/GPT-4o-mini-AAE_from_SAE_labels-final.csv")
aae_from_sae["sentiment"].unique()

array(['Negative', 'Neutral', 'Positive'], dtype=object)

## Phi-3

1. AAE Labels

In [71]:
aae = pd.read_csv("./labeled/Phi-3-AAE-Labels.csv")
aae["sentiment"].unique()

array(['Negative', 'Neutral', 'Positive'], dtype=object)

2. SAE labels

In [72]:
sae = pd.read_csv("./labeled/Phi-3-SAE-Labels.csv")
sae["sae_labels"].unique()

array(['Negative', 'Neutral', 'Positive'], dtype=object)

3. AAE from SAE

In [74]:
aae_from_sae = pd.read_csv("./labeled/Phi-3-AAE_from_SAE_labels.csv")
aae_from_sae["AAE_from_SAE sentiment"].unique()

array(['Negative', 'Neutral', 'Positive', 'Unknown'], dtype=object)

## Fixing the "Unknowns"

In [75]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd
from tqdm import tqdm
import os
cache_dir = "../cache"
os.environ['TRANSFORMERS_CACHE'] = cache_dir

# Load the Phi-3 model and tokenizer
model_id = "microsoft/Phi-3-medium-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto", 
    torch_dtype="auto",
    trust_remote_code=True,
    cache_dir=cache_dir
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/3.15k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

In [76]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=10,
)

In [86]:
# Function to extract sentiment from model output
def extract_sentiment(output):
    response = output.split("<|assistant|>")[-1].strip()
    if "Positive" in response:
        return "Positive"
    elif "Negative" in response:
        return "Negative"
    elif "Neutral" in response:
        return "Neutral"
    else:
        return "Unknown"
    
    
def generate_sentiment(sentence):
    outputs = pipe(
        f"""<|user|>
Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. The sentiment should be must be classified as Positive, Negative, or Neutral. Reply with just the sentiment.\n
"{sentence}"<|end|>
<|assistant|>""",
        max_new_tokens=50,
        do_sample=False,
        num_return_sequences=1
    )
    return extract_sentiment(outputs[0]["generated_text"])

In [87]:
# Function to process a single row
def process_row_sentiment(row):
    return generate_sentiment(row["text"])

In [88]:
# Process rows with non-standard sentiments
mask = ~aae_from_sae["AAE_from_SAE sentiment"].isin(["Positive", "Negative", "Neutral"])
aae_from_sae.loc[mask, "AAE_from_SAE sentiment"] = aae[mask].apply(process_row_sentiment, axis=1)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [89]:
aae_from_sae["AAE_from_SAE sentiment"].unique()

array(['Negative', 'Neutral', 'Positive'], dtype=object)

In [90]:
aae_from_sae.rename(columns={"AAE_from_SAE sentiment" : "sentiment"}, inplace=True)

In [91]:
aae_from_sae.head()

Unnamed: 0,AAE_from_SAE,sentiment
0,"If I don't get dis job tomorrow, I don't know ...",Negative
1,"Ah'm gon' try double workouts, y'all. I be aim...",Neutral
2,"Ah still gon' ha' his baby, 'cause he a good d...",Positive
3,I used someone as a reference without givin' n...,Neutral
4,@HumanistExec Try ginger for da sickness. Have...,Neutral


In [92]:
aae_from_sae.to_csv("./labeled/Phi-3-AAE_from_SAE_labels.csv")