In [1]:
import pandas as pd
import re
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from collections import Counter
import os
from openai import OpenAI
import anthropic
from groq import Groq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
import torch

In [2]:
# Runtime configuration: Hugging Face cache folder, tokenizer threading and API credentials.
cache_dir = "./cache/"

# NOTE(review): `transformers` is imported in the first cell, before these variables are
# set — confirm the library still honours TRANSFORMERS_CACHE when it is assigned this late.
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['TOKENIZERS_PARALLELISM'] = "false"

# Credentials: a key that is already exported in the environment wins; the placeholder is
# only installed when nothing is set, so a real key is never overwritten by this cell.
os.environ.setdefault('OPENAI_API_KEY', 'openai-api-key-here')
os.environ.setdefault('ANTHROPIC_API_KEY', 'anthropic-api-key-here')

In [3]:
# Function to filter out problematic sentences
# The patterns are compiled once here instead of on every call.
_NON_ASCII_OR_CONTROL = re.compile(r'[^\x00-\x7F]+|[\x00-\x1F\x7F]')  # Non-ASCII and control characters
_BACKSLASH = re.compile(r'[\\]')  # Backslashes


def is_valid_sentence(sentence):
    """Return True when ``sentence`` is a string free of problematic characters.

    A sentence is rejected when it contains any non-ASCII character, any ASCII
    control character (including tab and newline), or a backslash.

    Args:
        sentence: The value to check. Anything that is not a ``str`` (for
            example the float NaN pandas uses for missing cells, or ``None``)
            is treated as invalid instead of raising ``TypeError``.

    Returns:
        bool: True if the sentence is a clean string, False otherwise. The
        empty string contains no rejected character and is therefore valid.
    """
    if not isinstance(sentence, str):
        return False

    # Return True only if the sentence matches neither pattern
    return not (_NON_ASCII_OR_CONTROL.search(sentence) or _BACKSLASH.search(sentence))

# Load the cleaned corpus from disk
dataset = pd.read_csv('cleaned_dataset.csv')

# Blank out every text that fails validation, then discard rows with any missing value
dataset['text'] = [text if is_valid_sentence(text) else None for text in dataset['text']]
dataset = dataset.dropna()
dataset = dataset.reset_index(drop=True)
dataset.head()

Unnamed: 0,text
0,Bitch cant get shit from me but bubble gum nd ...
1,@islandboi_B yes that's what's up. Nothin like...
2,Mixed huh !? Those black ass knees and elbows ...
3,The bul Mike James from @mavs ain't shit n he ...
4,It took for a whole stranger to tell me he PRO...


In [4]:
# Persist the first 2000 rows as the working sample (no `index` argument is passed,
# so the row index is written out as an extra leading column).
dataset.head(2000).to_csv('initial_2000_sentences.csv')

## API script: the cells below label the sentences by calling hosted LLM APIs

In [52]:
from typing import List, Optional
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

# Define the schema for sentiment analysis
# One record per sentence. `sentiment` is typed as a plain string, so the three labels
# named in the field description are not enforced by validation.
# NOTE(review): no class docstring is added on purpose — docstrings and Field descriptions
# on these models may be forwarded to the LLM as schema text; confirm before editing them.
class SentimentAnalysisResponse(BaseModel):
    sentiment: str = Field(description="The sentiment of the sentence (Positive, Negative, or Neutral)")

# Container schema for a batched request: a list of per-sentence results.
# The list length is not constrained by the schema; the batch loops further down compare
# it with the number of sentences sent and treat a mismatch as a failure.
# NOTE(review): the docstring below may be sent to the LLM as the schema description —
# confirm before rewording it.
class Data(BaseModel):
    """Extracted data about sentences."""
    sentiments: List[SentimentAnalysisResponse]

# Define the prompt template
# The system message carries the task instructions; the user message is filled from the
# "sentences" variable supplied at invoke time (the batch loops pass newline-joined sentences).
# NOTE: the second line of the triple-quoted prompt keeps its source indentation, so those
# leading spaces are part of the text sent to the model.
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. 
            The sentiment should be classified as Positive, Negative, or Neutral for each sentence."""
        ),
        ("user", "{sentences}")
    ]
)

# Define the language model
# temperature=0 is requested, presumably to keep the labels as repeatable as possible.
model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

# Create the runnable chain
# `runnable` feeds the prompt into the model and parses the reply into `Data`
# (a list of sentiments, used for batches).
# `runnable_single` parses the reply into one `SentimentAnalysisResponse`; it is not
# used by any of the cells shown in this notebook.
runnable = chat_template | model.with_structured_output(schema=Data)
runnable_single = chat_template | model.with_structured_output(schema=SentimentAnalysisResponse)

# Keep only the text column of the 2000-sentence sample; from here on `dataset` is a
# Series (not a DataFrame) and the batch loop slices it by position.
dataset = pd.read_csv('initial_2000_sentences.csv')["text"]

# Three sample tweets used to smoke-test the chain
tweets = [
    "Bitch cant get shit from me but bubble gum nd hard dick from me told da bitch im tryna make a flip im shootin dice wit er rent money !",
    "@islandboi_B yes that's what's up. Nothin like getting dressed up and getting some culture man.",
    "Mixed huh !? Those black ass knees and elbows will give you away every time lol"
]

# Run the batched chain on the sample tweets. The invoke call used to be commented out,
# which left `result` undefined on a fresh kernel and made the print below fail.
result = runnable.invoke({"sentences": "\n".join(tweets)})

print(f"results are: {result}")

results are: sentiments=[SentimentAnalysisResponse(sentiment='Positive'), SentimentAnalysisResponse(sentiment='Neutral'), SentimentAnalysisResponse(sentiment='Negative')]


In [23]:
# Number of sentences sent to the model in one request
BATCH_SIZE = 5

# Output locations. The folder is created up front so that a missing directory cannot
# make the final writes fail after the whole API loop has already run.
output_path = './labeled/GPT-3.5-Labels.csv'
failed_indices_path = './labeled/failed_indices.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Initialize lists to store the sentiments and their corresponding indices
all_sentiments = []
processed_indices = []
failed_indices = []

# Process the dataset in batches of BATCH_SIZE
for i in tqdm(range(0, len(dataset), BATCH_SIZE)):
    batch = dataset[i:i+BATCH_SIZE].to_list()
    # Positional indices of the sentences in this batch (the last batch may be shorter)
    batch_indices = list(range(i, i + len(batch)))

    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})

        # Labels are matched to sentences purely by position, so a reply is only
        # accepted when it holds exactly one sentiment per sentence sent.
        if len(result.sentiments) == len(batch):
            all_sentiments.extend([response.sentiment for response in result.sentiments])
            processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as failed
            failed_indices.extend(batch_indices)
    except Exception as e:
        # Best effort: log the failure, remember the batch for a later retry, and continue
        print(f"Error processing batch {batch_indices[0]}-{batch_indices[-1]}: {str(e)}")
        failed_indices.extend(batch_indices)

# Create a new dataframe with successfully processed sentences and sentiments
# (all three columns are plain lists in the same order, so rows line up by position)
labeled_df = pd.DataFrame({
    'index': processed_indices,
    'text': dataset.iloc[processed_indices].to_list(),
    'sentiment': all_sentiments
})

# Sort the dataframe by the original index
labeled_df = labeled_df.sort_values('index').reset_index(drop=True)

# Save the labeled dataset to a CSV file
labeled_df.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

# Save the failed indices to a separate file
pd.DataFrame({'failed_index': failed_indices}).to_csv(failed_indices_path, index=False)

print(f"Failed indices saved to {failed_indices_path}")
print(f"Number of successfully processed sentences: {len(processed_indices)}")
print(f"Number of failed sentences: {len(failed_indices)}")

100%|███████████████████████████████████████████████| 400/400 [04:51<00:00,  1.37it/s]

Labeled dataset saved to ./labeled/GPT-3.5-Labels.csv
Failed indices saved to ./labeled/failed_indices.csv
Number of successfully processed sentences: 1800
Number of failed sentences: 200





In [24]:
# Number of previously failed sentences retried per request (smaller than the first pass)
RETRY_BATCH_SIZE = 3

# Load the previously processed data
labeled_df = pd.read_csv('./labeled/GPT-3.5-Labels.csv')
failed_indices = pd.read_csv('./labeled/failed_indices.csv')['failed_index'].tolist()

# Load the original dataset
original_dataset = pd.read_csv('initial_2000_sentences.csv')["text"]

# Initialize lists to store the new sentiments and their corresponding indices
new_sentiments = []
new_processed_indices = []
still_failed_indices = []

# Process the failed sentences
for i in tqdm(range(0, len(failed_indices), RETRY_BATCH_SIZE)):
    # `i` is an offset into `failed_indices`; `batch_indices` are the sentence positions
    batch_indices = failed_indices[i:i+RETRY_BATCH_SIZE]
    batch = original_dataset.iloc[batch_indices].tolist()

    try:
        result = runnable.invoke({"sentences": "\n".join(batch)})

        # Check if the number of returned sentiments matches the batch size
        if len(result.sentiments) == len(batch):
            new_sentiments.extend([response.sentiment for response in result.sentiments])
            new_processed_indices.extend(batch_indices)
        else:
            # If the number of sentiments doesn't match, mark all as still failed
            still_failed_indices.extend(batch_indices)
    except Exception as e:
        # Report the sentence indices themselves so the failing rows can be located
        print(f"Error processing sentences {batch_indices}: {str(e)}")
        still_failed_indices.extend(batch_indices)

# Create a new dataframe with newly processed sentences and sentiments
new_labeled_df = pd.DataFrame({
    'index': new_processed_indices,
    'text': original_dataset.iloc[new_processed_indices].tolist(),
    'sentiment': new_sentiments
})

# Pieces of the final table: first-pass labels, retry labels and, if any remain,
# the sentences that failed twice (kept with a missing sentiment)
frames = [labeled_df, new_labeled_df]
if still_failed_indices:
    failed_df = pd.DataFrame({
        'index': still_failed_indices,
        'text': original_dataset.iloc[still_failed_indices].tolist(),
        'sentiment': pd.NA
    })
    frames.append(failed_df)

# Combine everything once, then sort by the original index and reset the index
combined_df = pd.concat(frames, ignore_index=True)
combined_df = combined_df.sort_values('index').reset_index(drop=True)

# Save the complete labeled dataset to a CSV file
output_path = './labeled/complete-2000-GPT-3.5.csv'
combined_df.to_csv(output_path, index=False)

print(f"Complete labeled dataset saved to {output_path}")
print(f"Total processed sentences: {len(combined_df)}")
print(f"Successfully labeled sentences: {combined_df['sentiment'].notna().sum()}")
print(f"Failed sentences: {combined_df['sentiment'].isna().sum()}")

100%|█████████████████████████████████████████████████| 67/67 [00:40<00:00,  1.64it/s]

Complete labeled dataset saved to ./labeled/complete-2000-GPT-3.5.csv
Total processed sentences: 2000
Successfully labeled sentences: 1974
Failed sentences: 26





In [39]:
def get_GPT_label(sentence):
    """Ask GPT-3.5 for the sentiment of one tweet and return the raw reply text.

    The request consists of a fixed system instruction plus ``sentence`` as the
    user message, sent with temperature 0 through the global ``openai_client``.
    That client is not created here and must exist before the function is called.

    Args:
        sentence: The tweet text to classify.

    Returns:
        The message content of the first choice, exactly as returned by the API.
    """
    system_message = {
        "role": "system",
        "content": "Your task is to analyze the provided tweet written in African American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral. Reply with just the sentiment.",
    }
    user_message = {"role": "user", "content": sentence}

    completion = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[system_message, user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [40]:
def get_anthropic_label(sentence):
    """Ask Claude 3 Haiku for the sentiment of one tweet and return the reply text.

    The request goes through the global ``anthropic_client`` (not created here;
    it must exist before the call) with temperature 0 and at most 10 output tokens.

    Args:
        sentence: The tweet text to classify.

    Returns:
        The ``text`` of the first content block in the response.
    """
    # Single user turn holding the tweet as one text block
    user_turn = {
        "role": "user",
        "content": [{"type": "text", "text": sentence}],
    }

    message = anthropic_client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=10,
        temperature=0,
        system="Your task is to analyze the provided tweet written in African American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral. Reply with just the sentiment.",
        messages=[user_turn],
    )

    first_block = message.content[0]
    return first_block.text

In [4]:
def get_groq_label(sentences):
    """Ask Llama 3 70B (via Groq) for a sentiment label and return the raw reply text.

    The request goes through the global ``groq_client`` (not created here; it
    must exist before the call) with temperature 0 and streaming disabled.

    Args:
        sentences: The text to classify, sent verbatim as the user message.
            The labeling pipeline in this notebook passes a single tweet.

    Returns:
        The message content of the first choice, exactly as returned by the API.
    """
    chat_completion = groq_client.chat.completions.create(

        messages=[
            {
                "role": "system",
                # NOTE(review): the prompt asks for JSON and its example has unbalanced
                # quotes around 'Sentence 2'; the text is left unchanged here because
                # editing it would change what the model is asked.
                "content": """Your task is to analyze the provided tweet written in African American English and identify the sentiment expressed by the author. The sentiment should be classified as Positive, Negative, or Neutral. Reply with just the sentiment in JSON. For example,
                {'Sentence 1': 'Positive', 'Sentence 2 : Negative', 'Sentence 3' : 'Neutral' }."""
            },
            {
                "role": "user",
                # Use the function's own parameter; the body previously read an
                # undefined name `sentence`.
                "content": sentences,
            }
        ],

        # The language model which will generate the completion.
        model="llama3-70b-8192",
        temperature=0,

        # If set, partial message deltas will be sent.
        stream=False,
    )

    return chat_completion.choices[0].message.content

In [42]:
def sentiment_labeling_pipeline(texts):
    """Label every text with all three models.

    Args:
        texts: Iterable of tweet strings.

    Returns:
        A list with one ``[text, gpt_label, anthropic_label, groq_label]`` row
        per input text, in input order. For each text the models are queried
        in that same order (GPT, then Anthropic, then Groq).
    """
    return [
        [
            tweet,
            get_GPT_label(tweet),
            get_anthropic_label(tweet),
            get_groq_label(tweet),
        ]
        for tweet in texts
    ]

# Example usage
# Three sample tweets (the same texts as the smoke-test list defined earlier in the notebook)
tweets = [
    "Bitch cant get shit from me but bubble gum nd hard dick from me told da bitch im tryna make a flip im shootin dice wit er rent money !",
    "@islandboi_B yes that's what's up. Nothin like getting dressed up and getting some culture man.",
    "Mixed huh !? Those black ass knees and elbows will give you away every time lol"
]

# Get labeled tweets
# Each row is [text, GPT label, Anthropic label, Groq label]; every tweet goes through
# all three label functions via sentiment_labeling_pipeline.
labeled_tweets = sentiment_labeling_pipeline(tweets)

# Create a DataFrame to display the results
# Column order matches the order of the values in each row above.
columns = ["Tweet", "GPT-3.5", "Anthropic", "Groq"]
df = pd.DataFrame(labeled_tweets, columns=columns)


In [43]:
# Preview the labels from the three models side by side
df.head()

Unnamed: 0,Tweet,GPT-3.5,Anthropic,Groq
0,Bitch cant get shit from me but bubble gum nd ...,Negative,Negative,Negative
1,@islandboi_B yes that's what's up. Nothin like...,Positive,Positive,Positive
2,Mixed huh !? Those black ass knees and elbows ...,Negative,Negative,Positive
