In [59]:
import pandas as pd
import json
import re
import time
from tqdm import tqdm
from openai import OpenAI

In [1]:
!pip install openai dotenv

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: dotenv
Successfully installed dotenv-0.9.9



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


#### API configuration

In [7]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import pandas as pd

# Load .env file (adjust path based on your notebook location)
# Since notebook is in task1/ and .env is in backend/
load_dotenv(dotenv_path="../backend/.env")

# Alternative: load from task1/.env
# load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY not found in .env file. Please check the path.")

# Configuration
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=GROQ_API_KEY
)

MODEL_NAME = "llama-3.1-8b-instant"
DATA_PATH = "yelp.csv"
SAMPLE_SIZE = 100

print("✓ API Key loaded successfully!")
print(f"✓ Model: {MODEL_NAME}")
print(f"✓ Data path: {DATA_PATH}")

✓ API Key loaded successfully!
✓ Model: llama-3.1-8b-instant
✓ Data path: yelp.csv


#### Exploratory Data Analysis (EDA) & Sampling

In [61]:
def run_eda_and_sample(path, size):
    df = pd.read_csv(path)
    
    print("--- EDA Overview ---")
    print(f"Total Rows: {len(df)}")
    print("Star Distribution:\n", df['stars'].value_counts(normalize=True).sort_index())
    
    # Sampling 200 rows (Stratified sample ensures we have all star types)
    sample_df = df.groupby('stars', group_keys=False).apply(lambda x: x.sample(int(size/5), random_state=42))
    
    print(f"\nSampled {len(sample_df)} rows for evaluation.")
    return sample_df.reset_index(drop=True)

# Execute EDA
test_df = run_eda_and_sample(DATA_PATH, SAMPLE_SIZE)

--- EDA Overview ---
Total Rows: 10000
Star Distribution:
 stars
1    0.0749
2    0.0927
3    0.1461
4    0.3526
5    0.3337
Name: proportion, dtype: float64

Sampled 100 rows for evaluation.


  sample_df = df.groupby('stars', group_keys=False).apply(lambda x: x.sample(int(size/5), random_state=42))


#### Prompt designs

Prompt 1- V1 (Baseline): Tests raw understanding.

In [62]:
def prompt_v1_zeroshot(text):
    """Approach 1: Basic Zero-Shot."""
    return f"""Predict the star rating (1-5) for this Yelp review.
Output MUST be valid JSON: {{"predicted_stars": <int>, "explanation": "<string>"}}

Review: {text}"""

Prompt 2- V2 (Improved Context): Adds examples to calibrate "star" thresholds.

In [63]:
def prompt_v2_fewshot(text):
    """Approach 2: Few-Shot (Calibration). 
    Improved by providing 'anchor' points for high, medium, and low quality."""
    return f"""Classify Yelp reviews into 1-5 stars. 
Output JSON: {{"predicted_stars": <int>, "explanation": "<string>"}}

Examples:
Review: "Terrible service, food was cold." -> {{"predicted_stars": 1, "explanation": "Negative experience with service and product."}}
Review: "It was okay, nothing special." -> {{"predicted_stars": 3, "explanation": "Neutral/Average sentiment."}}
Review: "Absolutely amazing! Loved the atmosphere." -> {{"predicted_stars": 5, "explanation": "High praise and positive emotion."}}

Review: {text}"""

Prompt 3- V3 (Advanced Reasoning): Uses Chain-of-Thought to improve accuracy on nuanced reviews.

In [64]:
def prompt_v3_cot(text):
    """Approach 3: Chain-of-Thought.
    Improved by forcing the model to analyze 'Service' and 'Quality' before deciding."""
    return f"""Analyze the Service, Food Quality, and Value of this Yelp review.
Based on your internal analysis, provide a 1-5 star rating.

Output strictly as JSON: {{"predicted_stars": <int>, "explanation": "<string>"}}

Review: {text}"""

#### LLM Inference Engine

In [65]:
def get_llm_prediction(prompt):
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=MODEL_NAME,
            temperature=0.1, # Low temperature for consistency
            response_format={"type": "json_object"}
        )
        raw_content = chat_completion.choices[0].message.content
        
        # Regex to extract JSON if the LLM includes extra text
        json_match = re.search(r'\{.*\}', raw_content, re.DOTALL)
        if json_match:
            return json.loads(json_match.group())
        return None
    except Exception as e:
        print(f"Error: {e}")
        return None

#### Evaluation Logic

In [66]:
def evaluate_strategy(df, prompt_func):
    correct = 0
    valid_json_count = 0
    total = len(df)
    
    print(f"\nEvaluating strategy: {prompt_func.__name__}")
    
    for _, row in tqdm(df.iterrows(), total=total):
        prompt = prompt_func(row['text'])
        result = get_llm_prediction(prompt)
        
        if result:
            valid_json_count += 1
            if int(result.get('predicted_stars', 0)) == int(row['stars']):
                correct += 1
        
        # Small delay to avoid rate limits
        time.sleep(0.5)
        
    accuracy = (correct / total) * 100
    json_validity = (valid_json_count / total) * 100
    
    return {
        "Accuracy": f"{accuracy:.2f}%",
        "JSON_Validity": f"{json_validity:.2f}%",
        "Reliability": "High" if accuracy > 75 else "Moderate"
    }

#### Final Execution and Comparison Table

In [67]:
# Run Evaluations
results = {}
strategies = [prompt_v1_zeroshot, prompt_v2_fewshot, prompt_v3_cot]

for strategy in strategies:
    metrics = evaluate_strategy(test_df, strategy)
    results[strategy.__name__] = metrics

# Convert to Table for Report
report_df = pd.DataFrame(results).T
print("\n--- Final Comparison Table ---")
print(report_df)

# Discussion of results (For your Short Report)
print("\nQuick Discussion:")
print("1. Zero-Shot is fast but often misses nuance in sarcastic reviews.")
print("2. Few-Shot helps calibrate what a '3-star' vs '4-star' review looks like.")
print("3. CoT has the highest accuracy but consumes more tokens/time.")


Evaluating strategy: prompt_v1_zeroshot


100%|██████████| 100/100 [04:41<00:00,  2.82s/it]



Evaluating strategy: prompt_v2_fewshot


 33%|███▎      | 33/100 [03:09<04:58,  4.46s/it]

Error: Extra data: line 5 column 2 (char 884)


 60%|██████    | 60/100 [05:16<02:49,  4.24s/it]

Error: Extra data: line 5 column 2 (char 1103)


 61%|██████    | 61/100 [05:21<02:56,  4.53s/it]

Error: Extra data: line 5 column 2 (char 875)


 67%|██████▋   | 67/100 [05:48<02:16,  4.14s/it]

Error: Extra data: line 5 column 2 (char 263)


 80%|████████  | 80/100 [06:43<01:25,  4.26s/it]

Error: Extra data: line 5 column 2 (char 538)


100%|██████████| 100/100 [08:02<00:00,  4.83s/it]



Evaluating strategy: prompt_v3_cot


100%|██████████| 100/100 [05:54<00:00,  3.55s/it]


--- Final Comparison Table ---
                   Accuracy JSON_Validity Reliability
prompt_v1_zeroshot   70.00%       100.00%    Moderate
prompt_v2_fewshot    28.00%        95.00%    Moderate
prompt_v3_cot        75.00%       100.00%    Moderate

Quick Discussion:
1. Zero-Shot is fast but often misses nuance in sarcastic reviews.
2. Few-Shot helps calibrate what a '3-star' vs '4-star' review looks like.
3. CoT has the highest accuracy but consumes more tokens/time.



