In [1]:
import pandas as pd
import re
from collections import Counter
import os
from openai import OpenAI
import anthropic
from groq import Groq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_groq import ChatGroq

In [2]:
cache_dir = "./cache/"
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['TOKENIZERS_PARALLELISM'] = "false"

In [14]:
from typing import List, Optional

from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

# Define the schema for sentiment analysis
class SentimentAnalysisResponse(BaseModel):
    sentiment: str = Field(description="The sentiment of the sentence (Positive, Negative, or Neutral)")

class Data(BaseModel):
    """Extracted data about sentences."""
    sentences: List[SentimentAnalysisResponse]

# Define the prompt template
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Your task is to analyze the provided sentences written in African American English and identify the sentiment expressed by the author. 
            The sentiment should be classified as Positive, Negative, or Neutral. Reply back with just the sentiment."""
        ),
        ("user", "{text}")
    ]
)

anthropic_key = os.getenv("ANTHROPIC_API_KEY")
anthropic_model = ChatAnthropic(
    model="claude-3-haiku-20240307",
    temperature=0,
    timeout=None,
    max_retries=2,
    api_key=anthropic_key
)


# Create the runnable chain
runnable = chat_template | anthropic_model.with_structured_output(schema=SentimentAnalysisResponse, method="json_mode")

def get_groq_label(sentences: str):

    # Get the sentiment analysis from the model
    response = runnable.invoke({"text": sentences})
    #print(response.sentiment)
    
    return response.sentiment

# Function to filter out problematic sentences
def is_valid_sentence(sentence):
    # Define a regex pattern to match problematic characters or structures
    pattern = re.compile(r'[^\x00-\x7F]+|[\x00-\x1F\x7F]')
    return not pattern.search(sentence)


# Read the dataset
dataset = pd.read_csv('initial_2000_sentences.csv')

# Filter out problematic sentences
dataset['text'] = dataset['text'].apply(lambda x: x if is_valid_sentence(x) else None)
dataset = dataset.dropna().reset_index(drop=True)

batch_size = 5
results = []

for sentence in dataset['text'].tolist():
    results.append(get_groq_label(sentence))

# Create a new DataFrame with the results
#results_df = pd.DataFrame(results, columns=['sentence', 'sentiment'])
results_df.rename(columns={'sentence': 'text', 'sentiment': 'anthropic_Haiku_provided_sentiment'}, inplace=True)
#
# Save the results to a new CSV file
#results_df.to_csv('anthropic_Haiku_labels.csv', index=False)

#print("Sentiment analysis completed and saved to gpt3.5_labels.csv")

NameError: name 'results_df' is not defined

In [15]:
len(results)

2000

In [17]:
results_df = pd.DataFrame({'sentences': dataset['text'], 'sentiment': results})
results_df.head()

Unnamed: 0,sentences,sentiment
0,Bitch cant get shit from me but bubble gum nd ...,Negative
1,@islandboi_B yes that's what's up. Nothin like...,Positive
2,Mixed huh !? Those black ass knees and elbows ...,Negative
3,The bul Mike James from @mavs ain't shit n he ...,Negative
4,It took for a whole stranger to tell me he PRO...,Positive


In [18]:
results_df.sentiment.unique()

array(['Negative', 'Positive', 'Neutral', 'Mixed'], dtype=object)

In [20]:
results_df.to_csv('labeled/anthropic_Haiku.csv', index=None)

## Conversion to SAE