In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found there.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))


In [None]:
!pip install anthropic -q

In [None]:
# Read the API key from Kaggle's user-secrets store instead of hard-coding it in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# "claude_key" is the name under which the secret is looked up; the value is used as the
# `api_key` of the Anthropic client created further down.
claude_key = user_secrets.get_secret("claude_key")

In [None]:
import os
from tqdm import tqdm
import pandas as pd
import json
from sklearn.metrics import accuracy_score
import anthropic
import time

# Single Anthropic client shared by every request in this notebook, authenticated with `claude_key`.
client = anthropic.Anthropic(api_key=claude_key)

In [None]:
# Running total of the estimated API cost (presumably USD - confirm against the per-token
# rates used in get_sentiment_analysis, which adds to this value after every call).
total_cost = 0

In [None]:
def get_sentiment_analysis(data, system, prefill=""):
    """Send one request to Claude and return the raw text of its reply.

    Args:
        data: Content of the user turn (the samples to classify).
        system: System prompt (the task instructions).
        prefill: Optional text that starts the assistant turn. It is only sent
            when non-empty, and it is NOT included in the returned string.

    Returns:
        The text of the first content block of the model's response.

    Raises:
        anthropic.RateLimitError: If the request is still rate limited on the
            final attempt. Any other error raised by the client propagates
            immediately without a retry.

    Side effects:
        Adds the estimated cost of the call to the module-level ``total_cost``.
    """
    global total_cost

    max_retries = 3
    retry_delay = 60  # seconds to wait after a rate-limit error
    # Per-token rates used for the running cost estimate.
    input_token_price = 0.000003
    output_token_price = 0.000015

    messages = [{"role": "user", "content": data}]
    if prefill:
        # Only add an assistant turn when there is something to prefill;
        # an empty assistant message carries no information for the model.
        messages.append({"role": "assistant", "content": prefill})

    for attempt in range(max_retries):
        try:
            message = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=2000,
                temperature=0.0,
                system=system,
                messages=messages,
            )
        except anthropic.RateLimitError:
            if attempt == max_retries - 1:
                # Out of attempts: let the caller decide what to do.
                raise
            print(f"Rate limit hit. Waiting {retry_delay} seconds before retry...")
            time.sleep(retry_delay)
            continue

        usage = message.usage
        input_cost = usage.input_tokens * input_token_price
        output_cost = usage.output_tokens * output_token_price
        total_cost += input_cost + output_cost

        return message.content[0].text

### Bangla prompt

In [None]:
# Task instructions for plain Bengali text: role, steps and the two allowed labels.
# NOTE: `base_prompt` and `output_prompt` are reassigned by the code-mixed prompt cell,
# so only `bangla_prompt` keeps the Bengali wording after that cell runs.
base_prompt = '''
You are an expert in natural language processing and hate speech detection. Your task is to analyze Bengali text and determine whether it contains hate speech.

Follow these steps:
1. Read the input text carefully.
2. Identify if the text contains hate speech.
3. Classify the text into one of the following categories:
   - "Hate": The text contains hateful, offensive, or harmful language.
   - "Non-Hate": The text does not contain hateful or offensive language.


You will receive an array of samples, each containing "id" and "text".
'''

# Output-format instructions: a JSON object rooted at "samples" with "id" and "prediction" keys.
output_prompt = ' Use JSON format with the keys as \"id\", \"prediction\". Return only the JSON object with the root named "samples". Do not change the value of the "id".'

# Complete instruction prompt for Bengali input.
bangla_prompt = base_prompt + output_prompt


### Code-mixed prompt

In [None]:
# Task instructions for code-mixed Bengali text: role, steps and the two allowed labels.
# NOTE: this rebinds `base_prompt` and `output_prompt`, replacing any earlier values.
base_prompt = '''
You are an expert in natural language processing and hate speech detection. Your task is to analyze codemixed Bengali text and determine whether it contains hate speech.

Follow these steps:
1. Read the input text carefully.
2. Identify if the text contains hate speech.
3. Classify the text into one of the following categories:
   - "Hate": The text contains hateful, offensive, or harmful language.
   - "Non-Hate": The text does not contain hateful or offensive language.

You will receive an array of samples, each containing "id" and codemixed "text".
'''

# Output-format instructions: a JSON object rooted at "samples" with "id" and "prediction" keys.
output_prompt = ' Use JSON format with the keys as \"id\", \"prediction\". Return only the JSON object with the root named "samples". Do not change the value of the "id".'

# Complete instruction prompt for code-mixed input; this is the prompt the inference loop uses.
codemixed_prompt = base_prompt + output_prompt


In [None]:
# Load the hate-speech dataset and keep a reproducible random sample of 500 rows.
df = (
    pd.read_csv('/kaggle/input/acl-codemixing-dataset/hate/hate_dataset.csv')
    .sample(n=500, random_state=42)
)



In [None]:
# Preview the sampled rows; the index still holds the row labels of the full dataset here.
df

In [None]:
# Renumber the sampled rows from 0 and add a 1-based `id` column; the id is what the model
# sees for each sample and what the predictions are joined back on.
df = df.reset_index(drop=True).assign(id=lambda frame: frame.index + 1)

In [None]:
# Display the frame again to check the reset index and the new `id` column.
df

In [None]:
# List the available columns; one of them is selected as `column` further down.
df.columns

In [None]:
# (rows, columns) of the sampled frame.
df.shape

## choose val or test dataset

In [None]:
# Number of samples sent to the model in a single request.
chunk_size = 10
# Consecutive row slices of `df`; the last slice may hold fewer than `chunk_size` rows.
chunks = [
    df.iloc[start:start + chunk_size]
    for start in range(0, len(df), chunk_size)
]

# Make sure the output folder exists.
output_dir = 'output/claude'
os.makedirs(output_dir, exist_ok=True)

## Choose the text column to classify

In [None]:
# Name of the DataFrame column whose text is sent to the model; it is also embedded in the
# name of the CSV written at the end of the notebook.
column = 'perturbed_text_salient'

In [None]:
def process_text(text):
    """Put a sample on a single line so it fits the one-sample-per-line prompt.

    Args:
        text: The raw sample. Expected to be a string; missing values
            (``None``, NaN, ``pd.NA``) and other scalars are tolerated.

    Returns:
        ``text`` with every newline replaced by a space. Missing values become
        an empty string; other non-string scalars are converted with ``str()``.
    """
    if not isinstance(text, str):
        # Cells read from a CSV can be NaN (a float), which has no .replace().
        if text is None or pd.isna(text):
            return ""
        text = str(text)
    return text.replace('\n', ' ')

In [None]:
# Build one prompt per chunk: every sample becomes an "<id>: <text>" line.
chunk_list = []
for chunk in chunks:
    # The chunk name is derived from the index labels (label + 1) of its first and last row.
    first_id = chunk.index[0] + 1
    last_id = chunk.index[-1] + 1

    prompt_lines = [
        f"{sample_id}: {process_text(sample_text)}"
        for sample_id, sample_text in zip(chunk['id'], chunk[column])
    ]

    chunk_list.append({
        'chunk_name': f"{first_id}-{last_id}",
        'user_prompt': "\n".join(prompt_lines),
    })

In [None]:
# chunk_list = create_chunks(df, column, chunk_size=10)

In [None]:
# Spot-check one generated prompt (index 40 requires at least 41 chunks to exist).
chunk_list[40]

In [None]:
# chunk_list[221]

In [None]:
# Classify every chunk and collect the per-chunk predictions.
# Chunks that fail are kept in `failed_chunks` so they can be inspected or retried.
prediction_frames = []
failed_chunks = []
for chunk in tqdm(chunk_list, desc="Classifying chunks"):
    try:
        # Keyword arguments keep the samples in the user turn and the task
        # instructions in the system prompt.
        response = get_sentiment_analysis(
            data=chunk['user_prompt'],
            system=codemixed_prompt,
        )
        parsed_data = json.loads(response)
        prediction_frames.append(pd.DataFrame(parsed_data['samples']))
    except Exception as e:
        # Best effort: report the failure and move on so that one bad response
        # does not discard the predictions already collected.
        print(f"An error occurred for chunk {chunk['chunk_name']}: {e}")
        failed_chunks.append(chunk)
        continue

    # Add a small delay between chunks to avoid rate limits
    time.sleep(2)  # 2 second delay between chunks

# Concatenate once at the end instead of growing a DataFrame inside the loop.
final_df = (
    pd.concat(prediction_frames, ignore_index=True)
    if prediction_frames
    else pd.DataFrame()
)

# final_df[id] = final_df.index + 1

In [None]:
# Show the estimated API cost accumulated in `total_cost` across all requests made so far.
print("total cost incurred: ", total_cost)

In [None]:
# final_df

In [None]:
# (rows, columns) of the collected predictions, before de-duplication.
final_df.shape

In [None]:
# final_df.rename(columns = {'text': column}, inplace = True)
# Column names come from the keys of the JSON objects under "samples" in the model's replies.
final_df.columns

In [None]:
# Collect every row whose id occurs more than once in the model output, then count those rows.
duplicates = final_df.loc[final_df['id'].duplicated(keep=False)]
len(duplicates)

In [None]:
# Keep only the first prediction returned for each id, then show the resulting size.
final_df = final_df.drop_duplicates(subset='id', keep='first')
final_df.shape

In [None]:
# Inspect the de-duplicated predictions.
final_df

In [None]:
# Inspect the input frame that the predictions are about to be joined onto.
df

In [None]:
# Join the predictions back onto the input rows by id.
# The model may return ids as strings ("12") rather than integers; merging an int64 key with
# an object key raises a ValueError, so the ids are normalised to int64 first.
predictions = final_df.copy()
predictions['id'] = pd.to_numeric(predictions['id'], errors='coerce')
predictions = (
    predictions
    .dropna(subset=['id'])                       # ids that are not numbers cannot be matched
    .astype({'id': 'int64'})
    .drop_duplicates(subset='id', keep='first')  # "12" and 12 are the same id once normalised
)

# Inner join: input rows without a prediction (e.g. from failed chunks) are left out.
save_df = pd.merge(df, predictions, on='id', how='inner')
print(f"{len(df) - len(save_df)} of {len(df)} input rows have no prediction")
save_df

In [None]:
# Write the merged inputs and predictions to a CSV named after the evaluated column.
# NOTE: the file is written to the current working directory; `output_dir` is not used here.
# final_output_file = os.path.join(output_dir, f"claude_sentnob_{column}.csv")
final_output_file = f"claude_hate_{column}.csv"
# final_df.to_csv(f"final_df_{final_output_file}", index=False)
save_df.to_csv(final_output_file, index=False)

print(f"Final DataFrame saved to {final_output_file}")