# In [None]:
import os
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Initialize the OpenAI client. The key is read from the OPENAI_API_KEY
# environment variable so that no credential is hard-coded in the source.
# NOTE(review): when the variable is unset the OpenAI constructor is expected
# to fail right here instead of on every request - confirm against the
# installed openai version.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def _match_category(answer, categories):
    """
    Map a raw model reply onto the canonical category name it refers to.

    The reply is compared case-insensitively against ``categories``: first
    verbatim, then with wrapping quotes/backticks removed, then with trailing
    quotes and sentence punctuation removed as well. When nothing matches,
    the whitespace-stripped reply is returned unchanged so callers still see
    what the model actually said.
    """
    cleaned = answer.strip()
    lookup = {name.strip().casefold(): name for name in categories}

    unquoted = cleaned.strip("\"'`").strip()
    relaxed = unquoted.rstrip("\"'`.,;:! \t")
    # Try the strictest form first so a category that legitimately ends in
    # punctuation is still matched exactly.
    for candidate in (cleaned, unquoted, relaxed):
        match = lookup.get(candidate.casefold())
        if match is not None:
            return match
    return cleaned


def classify_topic(article, categories):
    """
    Uses GPT-4o-mini to classify the topic of a given news article among the provided categories.
    The prompt instructs the model to respond with exactly one of the category names as it appears in the list.

    Args:
        article: Text of the news article to classify.
        categories: Iterable of category name strings offered to the model.

    Returns:
        The category name from ``categories`` that the reply matches
        (ignoring case, wrapping quotes and trailing punctuation); the
        stripped raw reply if it matches no category; or "unknown" if the
        request fails or the reply carries no text.
    """
    # Materialise once: the names are needed both for the prompt and for
    # matching the reply, and a one-shot iterable would be exhausted by join().
    categories = list(categories)

    # Build a comma-separated list of available categories.
    categories_list = ", ".join(categories)

    prompt = (
        "Classify the topic of the following news article among these categories:\n"
        f"{categories_list}\n\n"
        "Only respond with one category name (exactly as it appears in the list).\n\n"
        f"News Article:\n{article}\n\n"
        "Topic:"
    )

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # using GPT-4o-mini as requested
            messages=[{"role": "user", "content": prompt}],
            temperature=0  # deterministic output
        )
        raw_answer = response.choices[0].message.content
    except Exception as e:
        # Deliberately broad: one failed request must not abort a long
        # evaluation run, so the article is simply labelled "unknown".
        print(f"Error during API call: {e}")
        return "unknown"

    if raw_answer is None:
        # The message can come back without text content; report that as
        # what it is instead of as an API failure.
        print("Model returned no text content.")
        return "unknown"

    return _match_category(raw_answer, categories)

# Fetch the complete 20 Newsgroups dataset.
newsgroups_data = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
df = pd.DataFrame({
    "article": newsgroups_data.data,
    "topic": [newsgroups_data.target_names[i] for i in newsgroups_data.target]
})

# List of all target categories.
categories = newsgroups_data.target_names

# Randomly sample 1000 articles for evaluation (fixed seed for repeatability).
df_sample = df.sample(n=1000, random_state=42)

# Classify each sampled article using GPT-4o-mini. Only the article text is
# needed, so iterate that column directly instead of building a Series per
# row with iterrows().
predictions = [
    classify_topic(article_text, categories)
    for article_text in tqdm(df_sample['article'], total=len(df_sample))
]

# Add predictions to the DataFrame.
df_sample['prediction'] = predictions

# Prepare true labels and predicted labels in lower-case for evaluation.
y_true = df_sample['topic'].str.strip().str.lower()
y_pred = df_sample['prediction'].str.strip().str.lower()

# Restrict per-class metrics to the real categories. Without this, every
# off-list prediction ("unknown", or any reply that is not a category name)
# becomes an extra class with zero precision/recall and drags the macro
# averages down. Such predictions still count as errors for accuracy and for
# the recall of the true class.
label_names = [name.strip().lower() for name in categories]
off_list_count = int((~y_pred.isin(label_names)).sum())

# Compute evaluation metrics.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, labels=label_names, average='macro', zero_division=0)
recall = recall_score(y_true, y_pred, labels=label_names, average='macro', zero_division=0)
f1 = f1_score(y_true, y_pred, labels=label_names, average='macro', zero_division=0)

# Print the computed metrics.
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.2%}")
print(f"Precision (macro): {precision:.2%}")
print(f"Recall (macro): {recall:.2%}")
print(f"F1 Score (macro): {f1:.2%}")
print(f"Predictions outside the category list: {off_list_count}")

# Optionally, also print a detailed classification report.
report = classification_report(y_true, y_pred, labels=label_names, zero_division=0)
print("\nDetailed Classification Report:\n", report)
