In [1]:
import json
import random
import time
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def load_yelp_data(csv_path: str, sample_size: int = 200, seed: int = 42) -> pd.DataFrame:
    """
    Load the Yelp reviews CSV and return a reproducible random subset.

    Args:
        csv_path: Location of the CSV file (anything ``pd.read_csv`` accepts).
        sample_size: Maximum number of rows to sample. When the file holds
            fewer rows than this, every row is returned (shuffled) instead of
            raising.
        seed: Random state passed to ``DataFrame.sample`` so the same rows are
            drawn on every run.

    Returns:
        A DataFrame with at most ``sample_size`` rows and a fresh 0..n-1 index.

    Raises:
        ValueError: If the 'text' or 'stars' column is missing.
    """
    df = pd.read_csv(csv_path)

    required_columns = {"text", "stars"}
    if not required_columns.issubset(df.columns):
        raise ValueError("Dataset must contain 'text' and 'stars' columns")

    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so never request more than the file provides.
    n_rows = min(sample_size, len(df))
    return df.sample(n=n_rows, random_state=seed).reset_index(drop=True)


In [9]:
# Read the first 200 rows of the Yelp reviews CSV into `df` and preview them.
# NOTE(review): the path below is an absolute, machine-specific Windows path,
# so this cell only runs on the author's machine — consider a configurable
# data directory.
# NOTE(review): nrows=200 keeps the first 200 rows in file order, not a random
# sample; the load_yelp_data() helper defined earlier is not used here.
df = pd.read_csv(
    r"C:\Users\deepe\Downloads\fynd_ai_intern_assignment\yelp.csv",
    nrows=200
)
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


LLM Client Wrapper (Gemini / OpenRouter / Any LLM)

In [10]:
def call_llm(prompt: str) -> str:
    """
    Return the raw text reply of the LLM for ``prompt``.

    Placeholder implementation: ``prompt`` is ignored and the same hard-coded
    JSON snippet is returned on every call. Replace the body with a real
    Gemini / OpenRouter request.
    """
    # --- MOCK EXAMPLE ---
    # Canned reply standing in for a real API response.
    canned_reply = """
    {
      "predicted_stars": 4,
      "explanation": "The review expresses overall satisfaction with minor complaints."
    }
    """
    return canned_reply

Prompt V1 — Simple Zero-Shot

In [11]:
def prompt_v1(review_text: str) -> str:
    """
    Build the simple zero-shot prompt for one review.

    The review is embedded in double quotes and the model is asked to answer
    with a JSON object holding ``predicted_stars`` and ``explanation``.
    """
    prompt_lines = [
        "",  # the prompt starts with a newline
        "You are a sentiment analysis assistant.",
        "",
        "Classify the following Yelp review into a star rating from 1 to 5.",
        "",
        "Review:",
        '"{}"'.format(review_text),
        "",
        "Return JSON in the following format:",
        "{",
        '  "predicted_stars": number,',
        '  "explanation": "short reasoning"',
        "}",
        "",  # ...and ends with one
    ]
    return "\n".join(prompt_lines)

Prompt V2 — Rubric-Based Classification

In [12]:
def prompt_v2(review_text: str) -> str:
    """
    Build the rubric-based prompt for one review.

    The prompt spells out what each star level from 1 to 5 means, embeds the
    review in double quotes, and asks for a JSON-only answer.
    """
    prompt_lines = [
        "",  # the prompt starts with a newline
        "You are an expert reviewer classifier.",
        "",
        "Use the following rubric:",
        "1 star: extremely negative experience",
        "2 stars: mostly negative",
        "3 stars: mixed or neutral",
        "4 stars: mostly positive",
        "5 stars: extremely positive and enthusiastic",
        "",
        "Review:",
        '"{}"'.format(review_text),
        "",
        "Return ONLY valid JSON:",
        "{",
        '  "predicted_stars": number,',
        '  "explanation": "short reasoning"',
        "}",
        "",  # ...and ends with one
    ]
    return "\n".join(prompt_lines)

Prompt V3 — Structured Reasoning (Hidden Chain-of-Thought)

In [13]:
def prompt_v3(review_text: str) -> str:
    """
    Build the structured-reasoning prompt for one review.

    The model is told to work through polarity, strength, and star mapping
    internally and to return only the final JSON object.
    """
    prompt_lines = [
        "",  # the prompt starts with a newline
        "You are a professional review analyst.",
        "",
        "Internally:",
        "1. Determine sentiment polarity",
        "2. Determine sentiment strength",
        "3. Map sentiment to a star rating",
        "",
        "Review:",
        '"{}"'.format(review_text),
        "",
        "Return ONLY the final JSON:",
        "{",
        '  "predicted_stars": number,',
        '  "explanation": "concise justification"',
        "}",
        "",  # ...and ends with one
    ]
    return "\n".join(prompt_lines)

JSON Parsing & Validation

In [14]:
def parse_llm_response(response_text: str) -> Optional[Dict]:
    """
    Parse and validate the JSON reply of the LLM.

    The reply must be a JSON object whose ``predicted_stars`` value is a whole
    number from 1 to 5. Integers, integral floats (``4.0``) and numeric strings
    (``"4"``) are accepted and normalised to ``int``.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A copy of the parsed object with ``predicted_stars`` as an ``int``, or
        ``None`` when the text is not valid JSON, is not a JSON object, lacks
        ``predicted_stars``, or carries a rating that is not a whole number in
        the 1-5 range.
    """
    if not isinstance(response_text, str):
        return None

    try:
        data = json.loads(response_text.strip())
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None

    # A bare JSON string or list can satisfy an `in` check by accident, so
    # insist on an object before looking for the key.
    if not isinstance(data, dict) or "predicted_stars" not in data:
        return None

    stars = data["predicted_stars"]
    if isinstance(stars, bool):
        # bool is an int subclass; true/false is not a rating.
        return None
    if isinstance(stars, float):
        # Rejects 3.5 as well as NaN / Infinity, which json.loads accepts.
        if not stars.is_integer():
            return None
        stars = int(stars)
    elif isinstance(stars, str):
        try:
            stars = int(stars.strip())
        except ValueError:
            return None
    elif not isinstance(stars, int):
        return None

    if not 1 <= stars <= 5:
        return None

    return {**data, "predicted_stars": stars}


Single Prompt Evaluation

In [15]:
def evaluate_prompt(
    df: pd.DataFrame,
    prompt_fn,
    max_retries: int = 2,
    request_delay: float = 0.2
) -> pd.DataFrame:
    """
    Run one prompt variant over every review in ``df``.

    For each row the prompt is built with ``prompt_fn``, sent through
    ``call_llm`` and parsed with ``parse_llm_response``. A reply that cannot be
    parsed, or whose ``predicted_stars`` cannot be converted to ``int``, counts
    as a failed attempt and is retried.

    Args:
        df: Reviews with a 'text' column and a 'stars' column.
        prompt_fn: Callable taking the review text and returning the prompt.
        max_retries: Total number of attempts made per review.
        request_delay: Seconds to pause after each review (rate limiting);
            zero or a negative value disables the pause.

    Returns:
        A DataFrame with columns 'actual_stars', 'predicted_stars' (``None``
        when every attempt failed) and 'json_valid'. The columns are present
        even when ``df`` is empty.
    """
    result_columns = ["actual_stars", "predicted_stars", "json_valid"]
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        review = row["text"]
        actual = int(row["stars"])

        predicted = None
        for _attempt in range(max_retries):
            parsed = parse_llm_response(call_llm(prompt_fn(review)))
            if not parsed:
                continue
            try:
                # Convert inside the retry loop so one malformed rating
                # triggers a retry instead of aborting the whole run.
                predicted = int(parsed["predicted_stars"])
            except (KeyError, TypeError, ValueError, OverflowError):
                continue
            break

        results.append({
            "actual_stars": actual,
            "predicted_stars": predicted,
            "json_valid": predicted is not None
        })

        if request_delay > 0:
            time.sleep(request_delay)  # rate limiting

    # Explicit columns keep the schema stable when there are no rows.
    return pd.DataFrame(results, columns=result_columns)


Metric Computation

In [16]:
def compute_metrics(results_df: pd.DataFrame) -> Dict:
    """
    Summarise an evaluation run as percentages.

    Args:
        results_df: Frame with 'actual_stars', 'predicted_stars' and
            'json_valid' columns, one row per evaluated review.

    Returns:
        A dict with:
          - 'accuracy': percentage of rows with a valid reply whose predicted
            rating equals the actual rating (rows without a valid reply are
            excluded from the denominator); 0.0 when no reply was valid.
          - 'json_valid_rate': percentage of all rows with a valid reply;
            0.0 when ``results_df`` has no rows.
        Both values are plain floats rounded to two decimals.
    """
    if len(results_df) == 0:
        # The mean of an empty column is NaN; report zeros instead.
        return {"accuracy": 0.0, "json_valid_rate": 0.0}

    valid_mask = results_df["json_valid"].astype(bool)
    valid_df = results_df[valid_mask]

    if len(valid_df) > 0:
        matches = valid_df["actual_stars"] == valid_df["predicted_stars"]
        accuracy = float(matches.mean())
    else:
        accuracy = 0.0

    json_valid_rate = float(valid_mask.mean())

    return {
        "accuracy": round(accuracy * 100, 2),
        "json_valid_rate": round(json_valid_rate * 100, 2)
    }

Run Evaluation for All Prompts

In [17]:
# Evaluate every prompt variant on the same reviews, in V1 -> V2 -> V3 order.
results_v1, results_v2, results_v3 = [
    evaluate_prompt(df, build_prompt)
    for build_prompt in (prompt_v1, prompt_v2, prompt_v3)
]

# Reduce each run to its accuracy / JSON-validity percentages.
metrics_v1, metrics_v2, metrics_v3 = map(
    compute_metrics, (results_v1, results_v2, results_v3)
)

100%|██████████| 200/200 [00:40<00:00,  4.93it/s]
100%|██████████| 200/200 [00:40<00:00,  4.93it/s]
100%|██████████| 200/200 [00:40<00:00,  4.94it/s]


Comparison Table

In [18]:
# One row per prompt variant: its label followed by that run's metrics.
comparison_df = pd.DataFrame([
    {"Prompt": label, **metrics}
    for label, metrics in (
        ("V1 - Zero Shot", metrics_v1),
        ("V2 - Rubric Based", metrics_v2),
        ("V3 - Structured Reasoning", metrics_v3),
    )
])

comparison_df

Unnamed: 0,Prompt,accuracy,json_valid_rate
0,V1 - Zero Shot,35.5,100.0
1,V2 - Rubric Based,35.5,100.0
2,V3 - Structured Reasoning,35.5,100.0
