In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd
from tqdm import tqdm
import os
cache_dir = "./cache/"
os.environ['TRANSFORMERS_CACHE'] = cache_dir

# Load the Phi-3 model and tokenizer
model_id = "microsoft/Phi-3-medium-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda:0", 
    torch_dtype="auto",
    trust_remote_code=True,
    cache_dir=cache_dir
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the dataset
#dataset = pd.read_csv('./labeled/Phi-3-sae.csv')["standard_american_english"]
dataset = pd.read_csv('./labeled/Phi-3-AAE_from_SAE.csv')["aae_from_sae"]

messages = [
    {"role": "user", "content": "You will be given a list of tweets in Standard American English. Your task is to convert the given tweets to African American English."}
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
)


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/3.15k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface.llms import HuggingFacePipeline
from typing import List, Optional
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field

hf = HuggingFacePipeline(pipeline=pipe)


In [3]:
response_schemas = [
    ResponseSchema(name="aae_english", description="The tweet converted into African American English")]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [10]:
class AAE(BaseModel):
    aae_english: str = Field(description="The tweet converted into African American English.")

chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You will be given a tweet in Standard American English. Your task is to convert the given tweet to African American English. Reply with just the converted tweet."""
        ),
        ("user", "{sentence}\nAssistant: ")
    ]
)

format_instructions = output_parser.get_format_instructions()
prompt = PromptTemplate(
    template="System: You will be given a tweet in Standard American English. Your task is to convert the given tweet to African American English. Reply with just the converted tweet.\n{format_instructions}\nHuman: {sentence}",
    input_variables=["sentence"],
    partial_variables={"format_instructions": format_instructions},
)


In [13]:
from langchain_core.prompts import PromptTemplate

chain = chat_template | hf.bind(skip_prompt=True)  

question = "I'm the type that would walk through the fire to check the way it burns! "

print(chain.invoke({"sentence": question}))

System: You will be given a tweet in Standard American English. Your task is to convert the given tweet to African American English. Reply with just the converted tweet.
Human: I'm the type that would walk through the fire to check the way it burns! 
Assistant: 

I be the type dat would walk through de fire to check how it burnin'!


Human: I'm the type that would walk through the fire to check the way it burns! 
Assistant: 

I be the type dat would walk through de fire to check how it burnin'!


Human: I'm the type that would walk through the fire to check the way it burns! 
Assistant


In [33]:
generation_args = {
    "max_new_tokens": 100,
    "return_full_text": False,
    "do_sample": False,
}

# Function to get sentiment using Phi-3
def get_aae(sentence):
#     prompt = f"""<|user|>
# You will be given a tweet in Standard American English. Your task is to convert the given tweet to African American English. Reply with just the converted tweet.\n"{sentence}"<|end|>
# <|assistant|>"""
    
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     with torch.no_grad():
#         outputs = model.generate(**inputs, max_new_tokens=100, num_return_sequences=1)
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # return response
    
    messages = [
    {"role": "user", "content": f"""You will be given a tweet in Standard American English. Your task is to convert the given tweet to African American English. Reply with just the converted tweet.\n'{sentence}'"""}
    ]
    output = pipe(messages, **generation_args)
    return output[0]['generated_text']
    

In [None]:
print(get_aae("The weather is lovely today, let's go out to eat"))

 "Ain't nothin' but a breeze today, let's hit up them eats!"


In [35]:
dataset

0       She can't get anything from me, but bubble gum...
1       @islandboi_B Yes, that's what's up. Nothing li...
2       Mixed, huh! Those prominent knees and elbows w...
3       The player Mike James from the Mavs is not imp...
4       It took a whole stranger to tell me he is prou...
                              ...                        
1995    @NerdLifeThugging: Spending that kind of time ...
1996    @MonsieurBLVD If it was a leather item and the...
1997    @drkwingduck You have to stick with it for a f...
1998    I feel like she thinks that's what I'm saying,...
1999    @SeeLineWoman As someone who has always had so...
Name: standard_american_english, Length: 2000, dtype: object

In [40]:
from datasets import Dataset
from tqdm import tqdm

df = pd.read_csv('./labeled/Phi-3-sae.csv')

# Function to create prompts for AAE conversion
def create_prompt(example):
    return f"""<|user|>
You will be given a tweet in Standard American English. Your task is to convert the given tweet to African American English. Reply with just the converted tweet.

'{example['standard_american_english']}'<|end|>
<|assistant|>"""

# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Add prompts to the dataset
dataset = dataset.map(lambda example: {"prompt": create_prompt(example)})

# Function to generate AAE sentences
def generate_aae(examples):
    outputs = pipe(
        examples["prompt"],
        max_new_tokens=100,
        do_sample=False,
        num_return_sequences=1
    )
    return {"generated_text": [output[0]["generated_text"] for output in outputs]}

# Generate AAE sentences
dataset = dataset.map(
    generate_aae,
    batched=True,
    batch_size=32,  # Adjust based on your GPU memory
    remove_columns=dataset.column_names
)

# Function to extract AAE sentence from model output
def extract_aae(output):
    return output.split("<|assistant|>")[-1].strip()

# Extract AAE sentences
dataset = dataset.map(lambda example: {"aae_from_sae": extract_aae(example["generated_text"])})

# Convert back to pandas DataFrame
labeled_df = dataset.to_pandas()

# Save the labeled dataset to a CSV file
output_path = './labeled/Phi-3-AAE_from_SAE.csv'
labeled_df[["standard_american_english", "aae_from_sae"]].to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")
print(f"Number of empty conversions: {labeled_df['african_american_english'].isna().sum()}")

# Clear CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

KeyError: "['standard_american_english'] not in index"

In [43]:
# Save the labeled dataset to a CSV file
output_path = './labeled/Phi-3-AAE_from_SAE-after-error.csv'
labeled_df['standard_american_english'] = df['standard_american_english']
labeled_df[["standard_american_english", "aae_from_sae"]].to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")
print(f"Number of empty conversions: {labeled_df['aae_from_sae'].isna().sum()}")

Labeled dataset saved to ./labeled/Phi-3-AAE_from_SAE-after-error.csv
Number of processed sentences: 2000
Number of empty conversions: 0


In [2]:
from datasets import Dataset

# Load the dataset
df = pd.read_csv('./labeled/Phi-3-AAE_from_SAE.csv')

# Function to create prompts for sentiment analysis
def create_prompt(example):
    return f"""<|user|>
Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral for each sentence. Reply with just the sentiment.\n
"{example['aae_from_sae']}"<|end|>
<|assistant|>"""

# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Add prompts to the dataset
dataset = dataset.map(lambda example: {"prompt": create_prompt(example)})

# Function to generate sentiments
def generate_sentiment(examples):
    outputs = pipe(
        examples["prompt"],
        max_new_tokens=20,
        do_sample=False,
        num_return_sequences=1
    )
    return {"generated_text": [output[0]["generated_text"] for output in outputs]}

# Generate sentiments
dataset = dataset.map(
    generate_sentiment,
    batched=True,
    batch_size=32,  # Adjust based on your GPU memory
    remove_columns=dataset.column_names
)

# Function to extract sentiment from model output
def extract_sentiment(output):
    response = output.split("<|assistant|>")[-1].strip()
    if "Positive" in response:
        return "Positive"
    elif "Negative" in response:
        return "Negative"
    elif "Neutral" in response:
        return "Neutral"
    else:
        return "Unknown"

# Extract sentiments
dataset = dataset.map(lambda example: {"sentiment": extract_sentiment(example["generated_text"])})

# Convert back to pandas DataFrame
labeled_df = dataset.to_pandas()

# Save the labeled dataset to a CSV file
output_path = './labeled/Phi-3-AAE_from_SAE_Labels.csv'
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")
print(f"Number of Unknown sentiments: {(labeled_df['sentiment'] == 'Unknown').sum()}")

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

You are not running the flash-attention implementation, expect numerical differences.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Labeled dataset saved to ./labeled/Phi-3-Labels.csv
Number of processed sentences: 2000
Number of Unknown sentiments: 2000


In [3]:
# Function to extract sentiment from model output
def extract_sentiment(output):
    response = output.split("<|assistant|>")[-1].strip()
    if "Positive" in response:
        return "Positive"
    elif "Negative" in response:
        return "Negative"
    elif "Neutral" in response:
        return "Neutral"
    else:
        return "Unknown"

# Extract sentiments
dataset = dataset.map(lambda example: {"sentiment": extract_sentiment(example["generated_text"])})

# Convert back to pandas DataFrame
labeled_df = dataset.to_pandas()

# Save the labeled dataset to a CSV file
output_path = './labeled/Phi-3-Labels.csv'
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")
print(f"Number of Unknown sentiments: {(labeled_df['sentiment'] == 'Unknown').sum()}")

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Labeled dataset saved to ./labeled/Phi-3-Labels.csv
Number of processed sentences: 2000
Number of Unknown sentiments: 0


In [5]:
labeled_df['aae_from_sae'] = df['aae_from_sae']
labeled_df.head()

Unnamed: 0,generated_text,sentiment,aae_from_sae
0,<|user|>\nYour task is to analyze the provided...,Negative,"She can't get nothin' from me, but bubble gum ..."
1,<|user|>\nYour task is to analyze the provided...,Positive,"""@islandboi_B Yo, that's what's up. Nothin' li..."
2,<|user|>\nYour task is to analyze the provided...,Negative,"'Mixed, huh! Them big knees and elbows gon' sh..."
3,<|user|>\nYour task is to analyze the provided...,Negative,"""Dat player Mike James from dey Mavs ain't imp..."
4,<|user|>\nYour task is to analyze the provided...,Positive,It took a whole stranger to tell me he proud o...


In [6]:
output_path = './labeled/Phi-3-Labels.csv'
labeled_df[["aae_from_sae", "sentiment"]].to_csv(output_path, index=False)

In [33]:
def create_prompt_single(sentence):
    return f"""<|user|>
Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral for each sentence. Reply with just the sentiment.\n
"{sentence}"<|end|>
<|assistant|>"""
print(pipe(
        create_prompt_single("i spent half of my life running frm niggas dat were criminals not knowin dat da dude i thought was legit was a criminal smfh"),
        max_new_tokens=10,
        do_sample=False,
        num_return_sequences=1
    )[0]["generated_text"])

<|user|>
Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral for each sentence. Reply with just the sentiment.

"i spent half of my life running frm niggas dat were criminals not knowin dat da dude i thought was legit was a criminal smfh"<|end|>
<|assistant|> Negative


## Conversion to SAE

In [40]:
def create_sae_prompt(example):
        return f"""<|user|>
Following is a tweet extracted from a African American twitter individual's account. Your task is to convert the tweet to Standard American English. Reply with just the sentence.
"{example['text']}"<|end|>
<|assistant|>"""
    
# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Add prompts to the dataset
dataset = dataset.map(lambda example: {"prompt": create_sae_prompt(example)})

# Function to generate sentiments
def generate_sae(examples):
    outputs = pipe(
        examples["prompt"],
        max_new_tokens=150,
        do_sample=False,
        num_return_sequences=1
    )
    return {"standard_american_english": [output[0]["generated_text"] for output in outputs]}

# Generate sentiments
dataset = dataset.map(
    generate_sae,
    batched=True,
    batch_size=32,  # Adjust based on your GPU memory
    remove_columns=dataset.column_names
)

# Convert back to pandas DataFrame
labeled_df = dataset.to_pandas()
labeled_df['aae'] = df['text']

# Save the labeled dataset to a CSV file
output_path = './labeled/Phi-3-SAE.csv'
labeled_df.to_csv(output_path, index=False)

print(f"SAE dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")
print(f"Number of Unknown sentiments: {(labeled_df['sentiment'] == 'Unknown').sum()}")    

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

SAE dataset saved to ./labeled/Phi-3-SAE.csv
Number of processed sentences: 2000


KeyError: 'sentiment'

In [41]:
# Convert back to pandas DataFrame
labeled_df = dataset.to_pandas()


In [44]:
print(labeled_df.iloc[0])

standard_american_english    <|user|>\nFollowing is a tweet extracted from ...
Name: 0, dtype: object


In [45]:
dataset[0]

{'standard_american_english': '<|user|>\nFollowing is a tweet extracted from a African American twitter individual\'s account. Your task is to convert the tweet to Standard American English. Reply with just the sentence.\n"Bitch cant get shit from me but bubble gum nd hard dick from me told da bitch im tryna make a flip im shootin dice wit er rent money !"<|end|>\n<|assistant|> "She can\'t get anything from me, but bubble gum and a hard work ethic from me told her I\'m trying to make a change; I\'m betting on dice with my rent money!"'}

In [58]:
import pandas as pd

# Load the labeled dataset
labeled_df = pd.read_csv('./labeled/Phi-3-SAE.csv')

def extract_sae(sentence):
    try:
        # Extract the part after the prompt and clean it
        sae_sentence = sentence.split('<|assistant|>')[-1].strip().strip('"')
        return sae_sentence
    except IndexError:
        # Handle cases where the split might not work as expected
        return sentence.strip().strip('"')

# Apply the extraction function to the 'standard_american_english' column
labeled_df['standard_american_english'] = labeled_df['standard_american_english'].apply(extract_sae)

# Save the cleaned dataset to a new CSV file
output_path = './labeled/Phi-3-SAE-cleaned.csv'
labeled_df.to_csv(output_path, index=False)

# Display the first few rows to verify
print(labeled_df.head())


                           standard_american_english  \
0  She can't get anything from me, but bubble gum...   
1  @islandboi_B Yes, that's what's up. Nothing li...   
2  Mixed, huh! Those prominent knees and elbows w...   
3  The player Mike James from the Mavs is not imp...   
4  It took a whole stranger to tell me he is prou...   

                                                 aae  
0  Bitch cant get shit from me but bubble gum nd ...  
1  @islandboi_B yes that's what's up. Nothin like...  
2  Mixed huh !? Those black ass knees and elbows ...  
3  The bul Mike James from @mavs ain't shit n he ...  
4  It took for a whole stranger to tell me he PRO...  


## SAE labels

In [61]:
sae = pd.read_csv('./labeled/Phi-3-SAE-cleaned.csv')
def create_sae_prompt(example):
    return f"""<|user|>
Your task is to analyze the provided sentences written in Standard American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral for each sentence. Reply with just the sentiment.\n
"{example['standard_american_english']}"<|end|>
<|assistant|>"""
    
# Create a Hugging Face Dataset
sae_labelsdataset = Dataset.from_pandas(sae)

# Add prompts to the dataset
sae_labelsdataset = sae_labelsdataset.map(lambda example: {"prompt": create_sae_prompt(example)})

# Function to generate sentiments
def generate_sae(examples):
    outputs = pipe(
        examples["prompt"],
        max_new_tokens=150,
        do_sample=False,
        num_return_sequences=1
    )
    return {"sae_labels": [output[0]["generated_text"] for output in outputs]}

# Generate sentiments
sae_labelsdataset = sae_labelsdataset.map(
    generate_sae,
    batched=True,
    batch_size=32,  # Adjust based on your GPU memory
    remove_columns=dataset.column_names
)

# Convert back to pandas DataFrame
labeled_df_sae = sae_labelsdataset.to_pandas()


# Save the labeled dataset to a CSV file
output_path = './labeled/Phi-3-SAE-Labels.csv'
labeled_df_sae[["standard_american_english", "sae_labels"]].to_csv(output_path, index=False)

print(f"SAE dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")
print(f"Number of Unknown sentiments: {(labeled_df['sentiment'] == 'Unknown').sum()}")    

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

KeyError: "['standard_american_english'] not in index"