In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd
from tqdm import tqdm
import os
cache_dir = "../cache"
os.environ['TRANSFORMERS_CACHE'] = cache_dir

# Load the Phi-3 model and tokenizer
model_id = "microsoft/Phi-3-medium-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto", 
    torch_dtype="auto",
    trust_remote_code=True,
    cache_dir=cache_dir
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=10,
)


`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [11]:
import datasets
from datasets import Dataset

datasets.logging.set_verbosity_info()

# Load the dataset
df = pd.read_csv('./2000-5000_sentences.csv')

# Function to create prompts for sentiment analysis
def create_prompt(example):
    return f"""<|user|>
Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral. Reply with just the sentiment.\n
"{example['text']}"<|end|>
<|assistant|>"""

# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Add prompts to the dataset
dataset = dataset.map(lambda example: {"prompt": create_prompt(example)})

print("successfully created prompt column")

# Function to generate sentiments
def generate_sentiment(examples):
    outputs = pipe(
        examples["prompt"],
        max_new_tokens=20,
        do_sample=False,
        num_return_sequences=1
    )
    return {"generated_text": [output[0]["generated_text"] for output in outputs]}

# Generate sentiments
dataset = dataset.map(
    generate_sentiment,
    batched=True,
    batch_size=32,  # Adjust based on your GPU memory
    remove_columns=dataset.column_names
)

print("successfully generated models answers")

# Function to extract sentiment from model output
def extract_sentiment(output):
    response = output.split("<|assistant|>")[-1].strip()
    if "Positive" in response:
        return "Positive"
    elif "Negative" in response:
        return "Negative"
    elif "Neutral" in response:
        return "Neutral"
    else:
        return "Unknown"

# Extract sentiments
dataset = dataset.map(lambda example: {"sentiment": extract_sentiment(example["generated_text"])})

print("successfully parsed sentiment out of each answer")

# Convert back to pandas DataFrame
labeled_df = dataset.to_pandas()

# Save the labeled dataset to a CSV file
output_path = './labeled/Phi-3-Labels.csv'
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")
print(f"Number of Unknown sentiments: {(labeled_df['sentiment'] == 'Unknown').sum()}")

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

successfully created prompt column




Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


successfully generated models answers


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

successfully parsed sentiment out of each answer
Labeled dataset saved to ./labeled/Phi-3-Labels.csv
Number of processed sentences: 3000
Number of Unknown sentiments: 0


In [12]:
text_and_sentiment = pd.DataFrame({"text" : df['text'], "sentiment" : dataset['sentiment']})
text_and_sentiment.head()

Unnamed: 0,text,sentiment
0,@SeeLineWoman If I don't get this job tomorrow...,Negative
1,"I'm going to try two-a-days, y'all. I want to ...",Neutral
2,"I still intend to have his child, because he's...",Positive
3,Used someone as a reference with no warning. I...,Neutral
4,@HumanistExec Try ginger for the nausea. Have ...,Neutral


In [13]:
output_path = './labeled/Phi-3-Labels.csv'
text_and_sentiment.to_csv(output_path, index=False)

In [15]:
df.head()

Unnamed: 0,text
0,@SeeLineWoman If I don't get this job tomorrow...
1,"I'm going to try two-a-days, y'all. I want to ..."
2,"I still intend to have his child, because he's..."
3,Used someone as a reference with no warning. I...
4,@HumanistExec Try ginger for the nausea. Have ...


## Conversion to SAE

In [17]:
def create_sae_prompt(example):
        return f"""<|user|>
Following is a tweet extracted from a African American twitter individual's account. Your task is to convert the tweet to Standard American English. Reply with just the sentence.
"{example['text']}"<|end|>
<|assistant|>"""
    
# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Add prompts to the dataset
dataset = dataset.map(lambda example: {"prompt": create_sae_prompt(example)})

print("successfully created prompt column")

# Function to generate sentiments
def generate_sae(examples):
    outputs = pipe(
        examples["prompt"],
        max_new_tokens=150,
        do_sample=False,
        num_return_sequences=1
    )
    return {"standard_american_english": [output[0]["generated_text"] for output in outputs]}

# Generate sentiments
dataset = dataset.map(
    generate_sae,
    batched=True,
    batch_size=32
)

print("successfully generated models answers")

# Convert back to pandas DataFrame
labeled_df = dataset.to_pandas()

# Save the labeled dataset to a CSV file
output_path = './labeled/Phi-3-SAE.csv'
labeled_df[["text", "standard_american_english"]].rename(columns={"text", "african_american_english"}).to_csv(output_path, index=False)

print(f"SAE dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Parameter 'function'=<function generate_sae at 0x7f4b9805a5e0> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead.


successfully created prompt column


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

successfully generated models answers


TypeError: 'set' object is not callable

Suppose to be .rename(columns={"text" : "african_american_english"}) not .rename(columns={"text" , "african_american_english"})!

In [18]:
# Save the labeled dataset to a CSV file
output_path = './labeled/Phi-3-SAE.csv'
labeled_df[["text", "standard_american_english"]].rename(columns={"text" : "african_american_english"}).to_csv(output_path, index=False)

print(f"SAE dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")

SAE dataset saved to ./labeled/Phi-3-SAE.csv
Number of processed sentences: 3000


In [19]:
# Convert back to pandas DataFrame
labeled_df = dataset.to_pandas()


In [20]:
print(labeled_df.iloc[0])

text                         @SeeLineWoman If I don't get this job tomorrow...
prompt                       <|user|>\nFollowing is a tweet extracted from ...
standard_american_english    <|user|>\nFollowing is a tweet extracted from ...
Name: 0, dtype: object


In [21]:
dataset[0]

{'text': "@SeeLineWoman If I don't get this job tomorrow, I don't know what I'm going to do. I'm at the end of my rope.",
 'prompt': '<|user|>\nFollowing is a tweet extracted from a African American twitter individual\'s account. Your task is to convert the tweet to Standard American English. Reply with just the sentence.\n"@SeeLineWoman If I don\'t get this job tomorrow, I don\'t know what I\'m going to do. I\'m at the end of my rope."<|end|>\n<|assistant|>',
 'standard_american_english': '<|user|>\nFollowing is a tweet extracted from a African American twitter individual\'s account. Your task is to convert the tweet to Standard American English. Reply with just the sentence.\n"@SeeLineWoman If I don\'t get this job tomorrow, I don\'t know what I\'m going to do. I\'m at the end of my rope."<|end|>\n<|assistant|> "If I don\'t get this job tomorrow, I don\'t know what I\'m going to do. I\'m at the end of my rope."'}

In [22]:
import pandas as pd

# Load the labeled dataset
labeled_df = pd.read_csv('./labeled/Phi-3-SAE.csv')

def extract_sae(sentence):
    try:
        # Extract the part after the prompt and clean it
        sae_sentence = sentence.split('<|assistant|>')[-1].strip().strip('"')
        return sae_sentence
    except IndexError:
        # Handle cases where the split might not work as expected
        return sentence.strip().strip('"')

# Apply the extraction function to the 'standard_american_english' column
labeled_df['standard_american_english'] = labeled_df['standard_american_english'].apply(extract_sae)

# Save the cleaned dataset to a new CSV file
output_path = './labeled/Phi-3-SAE-cleaned.csv'
labeled_df.to_csv(output_path, index=False)

# Display the first few rows to verify
print(labeled_df.head())


                            african_american_english  \
0  @SeeLineWoman If I don't get this job tomorrow...   
1  I'm going to try two-a-days, y'all. I want to ...   
2  I still intend to have his child, because he's...   
4  @HumanistExec Try ginger for the nausea. Have ...   

                           standard_american_english  
0  If I don't get this job tomorrow, I don't know...  
1  I'm going to attempt double workouts, everyone...  
2  I still plan to have his child, because he's a...  
3  I used someone as a reference without giving a...  
4  @HumanistExec Try ginger for the nausea. Have ...  


## SAE labels

In [23]:
sae = pd.read_csv('./labeled/Phi-3-SAE-cleaned.csv')
def create_sae_prompt(example):
    return f"""<|user|>
Your task is to analyze the provided sentences written in Standard American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral for each sentence. Reply with just the sentiment.\n
"{example['standard_american_english']}"<|end|>
<|assistant|>"""
    
# Create a Hugging Face Dataset
sae_labelsdataset = Dataset.from_pandas(sae)

# Add prompts to the dataset
sae_labelsdataset = sae_labelsdataset.map(lambda example: {"prompt": create_sae_prompt(example)})

# Function to extract sentiment from model output
def extract_sentiment(output):
    response = output.split("<|assistant|>")[-1].strip()
    if "Positive" in response:
        return "Positive"
    elif "Negative" in response:
        return "Negative"
    elif "Neutral" in response:
        return "Neutral"
    else:
        return "Unknown"
    
    
# Function to generate sentiments
def generate_sae(examples):
    outputs = pipe(
        examples["prompt"],
        max_new_tokens=20,
        do_sample=False,
        num_return_sequences=1
    )
    return {"sae_labels": [extract_sentiment(output[0]["generated_text"]) for output in outputs]}

# Generate sentiments
sae_labelsdataset = sae_labelsdataset.map(
    generate_sae,
    batched=True,
    batch_size=32
)


# Convert back to pandas DataFrame
labeled_df_sae = sae_labelsdataset.to_pandas()


# Save the labeled dataset to a CSV file
output_path = './labeled/Phi-3-SAE-Labels.csv'
labeled_df_sae[["standard_american_english", "sae_labels"]].to_csv(output_path, index=False)

print(f"SAE dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df_sae)}")
print(f"Number of Unknown sentiments: {(labeled_df_sae['sae_labels'] == 'Unknown').sum()}")    

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Parameter 'function'=<function generate_sae at 0x7f4ba436e160> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead.


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

SAE dataset saved to ./labeled/Phi-3-SAE-Labels.csv
Number of processed sentences: 3000
Number of Unknown sentiments: 0


________________

### Pipiline uptil now:
AAE -> AAE Sentiment -> Translate to SAE -> SAE Sentiment

## Now we go back to AAE using SAE, and finish off by obtaining sentiment on that.

AAE -> AAE Sentiment -> Translate to SAE -> SAE Sentiment -> **AAE_from_SAE -> AAE_from_SAE Sentiment**

In [6]:
df = pd.read_csv('./labeled/Phi-3-SAE-Labels.csv')

In [7]:
import datasets
from datasets import Dataset

datasets.logging.set_verbosity_info()

def create_aae_from_sae_prompt(example):
        return f"""<|user|>
You will be given a tweet in Standard American English. Your task is to convert the given tweet to African American English. Reply with just the translated sentence.
"{example['standard_american_english']}"<|end|>
<|assistant|>"""
    
# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Add prompts to the dataset
dataset = dataset.map(lambda example: {"prompt": create_aae_from_sae_prompt(example)})

print("successfully created prompt column")

def extract_aae_from_sae(sentence):
    try:
        sae_sentence = sentence.split('<|assistant|>')[-1].strip().strip('"')
        return sae_sentence
    except IndexError:
        return sentence.strip().strip('"')

# Function to generate sentiments
def generate_aae_from_sae(examples):
    outputs = pipe(
        examples["prompt"],
        max_new_tokens=150,
        do_sample=False,
        num_return_sequences=1
    )
    return {"AAE_from_SAE": [extract_aae_from_sae(output[0]["generated_text"]) for output in outputs]}

# Generate AAE from SAE sentences
dataset = dataset.map(
    generate_aae_from_sae,
    batched=True,
    batch_size=32
)

print("successfully generated models answers")

# Convert back to pandas DataFrame
labeled_df = dataset.to_pandas()


output_path = './labeled/Phi-3-AAE_from_SAE.csv'
labeled_df[["AAE_from_SAE", "standard_american_english"]].to_csv(output_path, index=False)

print(f"AAE_from_SAE dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

successfully created prompt column


Parameter 'function'=<function generate_aae_from_sae at 0x7f382c693160> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead.


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


successfully generated models answers
AAE_from_SAE dataset saved to ./labeled/Phi-3-AAE_from_SAE.csv
Number of processed sentences: 3000


## Getting the sentiment for AAE_from_SAE

In [10]:
import datasets
from datasets import Dataset

datasets.logging.set_verbosity_info()

df = pd.read_csv('./labeled/Phi-3-AAE_from_SAE.csv')

def create_aae_from_sae_prompt(example):
        return f"""<|user|>
Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral.
"{example['AAE_from_SAE']}"<|end|>
<|assistant|>"""
    
# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Add prompts to the dataset
dataset = dataset.map(lambda example: {"prompt": create_aae_from_sae_prompt(example)})

print("successfully created prompt column")

# Function to extract sentiment from model output
def extract_sentiment(output):
    response = output.split("<|assistant|>")[-1].strip()
    if "Positive" in response:
        return "Positive"
    elif "Negative" in response:
        return "Negative"
    elif "Neutral" in response:
        return "Neutral"
    else:
        return "Unknown"
    

# Function to generate sentiments
def generate_aae_from_sae_sentiment(examples):
    outputs = pipe(
        examples["prompt"],
        max_new_tokens=20,
        do_sample=False,
        num_return_sequences=1
    )
    return {"AAE_from_SAE sentiment": [extract_sentiment(output[0]["generated_text"]) for output in outputs]}

# Generate AAE from SAE sentences
dataset = dataset.map(
    generate_aae_from_sae_sentiment,
    batched=True,
    batch_size=32
)

print("successfully generated models answers")

# Convert back to pandas DataFrame
labeled_df = dataset.to_pandas()


output_path = './labeled/Phi-3-AAE_from_SAE_labels.csv'
labeled_df[["AAE_from_SAE", "AAE_from_SAE sentiment"]].to_csv(output_path, index=False)

print(f"AAE_from_SAE dataset saved to {output_path}")
print(f"Number of processed sentences: {len(labeled_df)}")

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Parameter 'function'=<function generate_aae_from_sae_sentiment at 0x7f384807c280> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead.


successfully created prompt column


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

successfully generated models answers
AAE_from_SAE dataset saved to ./labeled/Phi-3-AAE_from_SAE_labels.csv
Number of processed sentences: 3000
