# In [None]:
import os
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Initialize the OpenAI client. The API key is taken from the OPENAI_API_KEY
# environment variable so that no secret is hard-coded (and accidentally
# committed) in this file. If the variable is unset, None is passed through and
# the client library applies its own default key lookup / error reporting.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def classify_review(review):
    """
    Ask GPT-4o-mini whether a movie review is Positive or Negative.

    The review is embedded in a prompt that requests a one-word answer. The
    reply is stripped and lowercased, then searched for a label.

    Returns:
        "positive" if the reply contains "positive" (checked first),
        "negative" if it contains "negative", and "unknown" if it contains
        neither or if the API call raises any exception (the error is printed).
    """
    instructions = (
        "Classify the sentiment of the following movie review as either Positive or Negative. "
        "Only respond with a single word: 'Positive' or 'Negative'.\n\n"
    )
    prompt = instructions + f"Review: {review}\n\nSentiment:"

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",  # cost-effective model
            messages=[{"role": "user", "content": prompt}],
            temperature=0,  # keep the output as deterministic as possible
        )
        reply = completion.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: a failed request is reported and scored as "unknown".
        print(f"Error during API call: {e}")
        return "unknown"

    # Case-insensitive substring match; order matters ("positive" wins ties).
    lowered = reply.lower()
    for label in ("positive", "negative"):
        if label in lowered:
            return label
    return "unknown"

# Load the CSV file from the same folder; it is expected to contain headers "review" and "sentiment".
df_full = pd.read_csv("imdb_reviews.csv")

# Randomly sample up to 1000 rows for evaluation (random_state fixed for
# reproducibility). Capping n at the dataset size avoids the ValueError that
# DataFrame.sample raises when n exceeds the number of available rows.
df_sample = df_full.sample(n=min(1000, len(df_full)), random_state=42)

# Classify each sampled review using GPT-4o-mini, with a progress bar.
predictions = [
    classify_review(review_text)
    for review_text in tqdm(df_sample['review'], total=len(df_sample))
]

# Add the predictions to the DataFrame.
df_sample['prediction'] = predictions

# Prepare true and predicted labels in lowercase.
y_true = df_sample['sentiment'].apply(lambda s: s.strip().lower())
y_pred = df_sample['prediction'].apply(lambda s: s.strip().lower())

# Score only the classes that occur in the ground truth. classify_review can
# also return "unknown"; without an explicit label list that value would be
# treated as an extra class (with precision/recall/F1 of 0) and drag every
# macro average down. Unknown predictions still count against accuracy and
# against the recall of their row's true class.
eval_labels = sorted(y_true.unique())
unknown_count = int((y_pred == "unknown").sum())

# Compute the evaluation metrics.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, labels=eval_labels, average='macro', zero_division=0)
recall = recall_score(y_true, y_pred, labels=eval_labels, average='macro', zero_division=0)
f1 = f1_score(y_true, y_pred, labels=eval_labels, average='macro', zero_division=0)

# Print the computed metrics.
print("Evaluation Metrics:")
print("Accuracy: {:.2%}".format(accuracy))
print("Precision (macro): {:.2%}".format(precision))
print("Recall (macro): {:.2%}".format(recall))
print("F1 Score (macro): {:.2%}".format(f1))
print("Unknown predictions (API error or unparseable reply): {}".format(unknown_count))

# Optionally, print a detailed classification report (same label set as above).
report = classification_report(y_true, y_pred, labels=eval_labels, zero_division=0)
print("\nDetailed Classification Report:\n", report)
