In [35]:
!pip install pandas scikit-learn tqdm requests



In [36]:
import os
import pandas as pd
import json
import requests
from tqdm import tqdm
from sklearn.metrics import accuracy_score

In [37]:
df = pd.read_csv("yelp.csv")

# sample for efficiency
df = df.sample(200, random_state=42)

# keep required columns
df = df[["text", "stars"]]

df.head()


Unnamed: 0,text,stars
6252,We got here around midnight last Friday... the...,4
4684,Brought a friend from Louisiana here. She say...,5
1731,"Every friday, my dad and I eat here. We order ...",3
4742,"My husband and I were really, really disappoin...",1
4521,Love this place! Was in phoenix 3 weeks for w...,5


In [38]:
import os
os.environ["OPENROUTER_API_KEY"] = "PASTE_YOUR_NEW_KEY_HERE"


In [39]:
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

if OPENROUTER_API_KEY is None:
    raise ValueError(
        "OPENROUTER_API_KEY not found. "
        "Please set it as an environment variable."
    )

In [40]:
PROMPT_V1 = """
You are a system that MUST output valid JSON only.

Output format (NO extra text):
{"predicted_stars": 1, "explanation": "reason"}

Review:
"""

In [41]:
PROMPT_V2 = """
Classify the Yelp review sentiment strictly.

Respond ONLY in this JSON format:
{"predicted_stars": 1-5, "explanation": "max 15 words"}

No markdown. No commentary.

Review:
"""

In [42]:
PROMPT_V3 = """
You must output ONLY valid JSON.

Examples:
{"predicted_stars": 1, "explanation": "Very negative"}
{"predicted_stars": 3, "explanation": "Neutral experience"}
{"predicted_stars": 5, "explanation": "Highly positive"}

Now classify the review below.

Review:
"""

In [43]:
def get_prediction(review, prompt):
    final_prompt = prompt + f'"{review}"\n\nReturn ONLY valid JSON.'

    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json"
        },
        json={
            "model": "mistralai/mistral-7b-instruct",
            "messages": [
                {"role": "user", "content": final_prompt}
            ],
            "temperature": 0
        }
    )

    return response.json()["choices"][0]["message"]["content"]

In [44]:
def safe_parse(resp):
    try:
        resp = resp.strip()
        if resp.startswith("```"):
            resp = resp.split("```")[1]
        return json.loads(resp)
    except:
        return None

In [45]:
def evaluate_prompt(prompt):
    y_true, y_pred = [], []
    valid_json = 0

    for _, row in tqdm(df.iterrows(), total=len(df)):
        raw = get_prediction(row["text"], prompt)
        parsed = safe_parse(raw)

        if parsed and "predicted_stars" in parsed:
            valid_json += 1
            y_true.append(int(row["stars"]))
            y_pred.append(int(parsed["predicted_stars"]))

    accuracy = accuracy_score(y_true, y_pred)
    json_validity = valid_json / len(df)

    return accuracy, json_validity

In [46]:
results = {}

results["Zero-shot"] = evaluate_prompt(PROMPT_V1)
results["Rule-based"] = evaluate_prompt(PROMPT_V2)
results["Few-shot"] = evaluate_prompt(PROMPT_V3)

results

100%|██████████| 200/200 [06:53<00:00,  2.07s/it]
100%|██████████| 200/200 [03:42<00:00,  1.11s/it]
100%|██████████| 200/200 [03:10<00:00,  1.05it/s]


{'Zero-shot': (0.5333333333333333, 0.075),
 'Rule-based': (0.5794871794871795, 0.975),
 'Few-shot': (0.625, 0.8)}

In [47]:
comparison = pd.DataFrame.from_dict(
    results,
    orient="index",
    columns=["Accuracy", "JSON Validity"]
)

comparison

Unnamed: 0,Accuracy,JSON Validity
Zero-shot,0.533333,0.075
Rule-based,0.579487,0.975
Few-shot,0.625,0.8
