## Section 1: Setup and Dependencies

In [None]:
# Install the notebook's third-party dependencies into the running kernel's
# environment, one pip invocation per package.
import subprocess
import sys

packages = ['groq', 'pandas', 'numpy', 'matplotlib', 'seaborn']
for package in packages:
    # check=True raises CalledProcessError on a non-zero pip exit status.
    subprocess.run([sys.executable, '-m', 'pip', 'install', package, '-q'], check=True)

print('âœ“ All packages installed successfully')

In [None]:
import os
import json
import pandas as pd
import numpy as np
import re
from groq import Groq
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Notebook-wide plotting defaults: seaborn grid theme and a wide default figure.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('âœ“ All imports successful')

## Section 2: Load and Explore Dataset

In [None]:
# The Yelp reviews CSV is expected next to this notebook as yelp_reviews.csv.
# With the kaggle CLI configured (~/.kaggle/kaggle.json) it can be fetched with:
# os.system('kaggle datasets download -d omkarsabnis/yelp-reviews-dataset')

# Read the local copy; `df` is set to None when the file is missing so that the
# following cells can skip their work instead of failing.
try:
    df = pd.read_csv('yelp_reviews.csv')
except FileNotFoundError:
    print('âš  yelp_reviews.csv not found. Please download from Kaggle.')
    print('Download from: https://www.kaggle.com/datasets/omkarsabnis/yelp-reviews-dataset')
    df = None
else:
    print(f'âœ“ Loaded {len(df)} reviews from local file')

In [None]:
# Quick structural overview of the loaded frame (skipped when nothing was loaded).
if df is not None:
    print('Dataset Shape:', df.shape)
    # sep='\n' prints the heading and the value on separate lines.
    print('\nColumn Names:', df.columns.tolist(), sep='\n')
    print('\nFirst Row:', df.iloc[0], sep='\n')
    print('\nData Types:', df.dtypes, sep='\n')

    if 'stars' in df.columns:
        rating_summary = df['stars'].value_counts().sort_index()
    else:
        rating_summary = 'No stars column'
    print('\nRating Distribution:', rating_summary, sep='\n')

In [None]:
# Sample up to 200 rows for evaluation.
# `df_sample` is bound unconditionally: later cells test `df_sample is None`,
# which would raise NameError if the name only existed when a dataset was loaded.
df_sample = None
if df is not None:
    np.random.seed(42)
    sample_size = min(200, len(df))
    # random_state pins the draw; reset_index gives the sample a clean 0..n-1 index,
    # which the evaluation loop uses for its progress counter.
    df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

    print(f'âœ“ Sampled {len(df_sample)} reviews for evaluation')
    print(f'\nSample Rating Distribution:')
    if 'stars' in df_sample.columns:
        print(df_sample['stars'].value_counts().sort_index())
    elif 'rating' in df_sample.columns:
        print(df_sample['rating'].value_counts().sort_index())

## Section 3: Initialize Groq Client

In [None]:
# Build the Groq API client from the GROQ_API_KEY environment variable.
# A missing or empty key stops the notebook here rather than at the first API call.

api_key = os.environ.get('GROQ_API_KEY')
if api_key is None or api_key == '':
    raise ValueError('GROQ_API_KEY environment variable not set. Please set it before running this notebook.')

client = Groq(api_key=api_key)
print('âœ“ Groq client initialized successfully')

## Section 4: Define 3 Prompting Approaches

### Prompt Version 1: Direct Classification
- Simple, direct instruction
- Minimal context
- Fastest inference
- Baseline approach

### Prompt Version 2: Chain-of-Thought Reasoning
- Asks model to reason step-by-step
- More explicit thought process
- Potentially more accurate
- May be slower but more consistent

### Prompt Version 3: Few-Shot Examples
- Provides example reviews with ratings
- Demonstrates expected format and reasoning
- Reduces ambiguity
- Can improve accuracy through in-context learning

In [None]:
# All three templates expose a single `{review}` placeholder for str.format();
# literal JSON braces are therefore doubled ({{ }}) inside the template text.

# Prompt Version 1: Direct Classification (baseline, no extra guidance)
PROMPT_V1_TEMPLATE = """Classify the following Yelp review into a rating between 1 and 5 stars.

Review: {review}

Return ONLY valid JSON with no additional text:
{{
  "predicted_stars": <integer 1-5>,
  "explanation": "<brief reasoning>"
}}"""

# Prompt Version 2: Chain-of-Thought Reasoning (explicit analysis steps)
PROMPT_V2_TEMPLATE = """Analyze the following Yelp review step by step, then assign a rating.

Review: {review}

Step 1: Identify sentiment indicators (positive/negative words, tone)
Step 2: Assess overall satisfaction level (very unhappy to very happy)
Step 3: Consider specificity and detail in feedback
Step 4: Assign rating 1-5 based on overall impression

Return ONLY valid JSON:
{{
  "predicted_stars": <integer 1-5>,
  "explanation": "<brief reasoning based on analysis>"
}}"""

# Prompt Version 3: Few-Shot Examples (worked examples precede the target review)
PROMPT_V3_TEMPLATE = """Classify Yelp reviews into 1-5 star ratings based on examples:

EXAMPLES:

Review: "Absolutely horrible service! Food was cold and arrived 2 hours late. Worst experience ever."
Rating: {{"predicted_stars": 1, "explanation": "Multiple critical failures: poor service, cold food, excessive wait time."}}

Review: "Average restaurant. Nothing special but decent food. Service was slow."
Rating: {{"predicted_stars": 2, "explanation": "Below average experience with notable service delays, despite acceptable food."}}

Review: "Good food and friendly staff. A bit pricey but worth the visit."
Rating: {{"predicted_stars": 4, "explanation": "Positive experience with good food and service, minor concern about pricing."}}

Review: "Outstanding! Best meal I've had all year. Highly recommend!"
Rating: {{"predicted_stars": 5, "explanation": "Exceptional experience exceeding expectations, enthusiastic recommendation."}}

Now classify this review using the same format:

Review: {review}

Return ONLY valid JSON:
{{
  "predicted_stars": <integer 1-5>,
  "explanation": "<brief reasoning>"
}}"""

print('âœ“ Prompt templates defined')
# Report each template's size; only the first line is preceded by a blank line.
for position, (label, template) in enumerate((
    ('V1', PROMPT_V1_TEMPLATE),
    ('V2', PROMPT_V2_TEMPLATE),
    ('V3', PROMPT_V3_TEMPLATE),
)):
    lead = '\n' if position == 0 else ''
    print(f'{lead}Prompt {label} template length: {len(template)} chars')

## Section 5: Classification Functions

In [None]:
def classify_review(review_text, prompt_template, model='mixtral-8x7b-32768'):
    """
    Classify a single review using the Groq chat-completions API.

    Args:
        review_text: The review to classify
        prompt_template: Template with a {review} placeholder (filled via str.format)
        model: Groq model to use
            NOTE(review): confirm this default model id is still served by Groq.

    Returns:
        On success: the dict produced by parse_json_response (predicted_stars,
        explanation, json_valid) plus 'raw_response' with the model's text.
        On any exception (formatting, network, API, unexpected response shape):
        a dict with predicted_stars=None, explanation=None, json_valid=False
        and 'error' holding the exception message. The function never raises.
    """
    try:
        prompt = prompt_template.format(review=review_text)

        completion = client.chat.completions.create(
            messages=[{'role': 'user', 'content': prompt}],
            model=model,
            temperature=0.3,  # Lower temperature for consistency
            max_tokens=200
        )

        # Groq returns OpenAI-style chat completions: the generated text is at
        # choices[0].message.content (not `.content[0].text`). The content may
        # be None, so fall back to an empty string before stripping.
        response_text = (completion.choices[0].message.content or '').strip()

        # Extract JSON from response
        result = parse_json_response(response_text)
        result['raw_response'] = response_text
        result['json_valid'] = result.get('json_valid', False)

        return result

    except Exception as e:
        # Deliberate best-effort: one failed call must not abort a long
        # evaluation run, so the failure is recorded on the row instead.
        return {
            'predicted_stars': None,
            'explanation': None,
            'error': str(e),
            'json_valid': False
        }


def parse_json_response(response_text):
    """
    Extract and validate the rating JSON object from a model response.

    The response is scanned for '{' characters and a JSON value is decoded
    starting at each one, so an object embedded in surrounding prose (or
    followed by other braces) is still found. The first decoded object that
    has both 'predicted_stars' and 'explanation' keys, with 'predicted_stars'
    an integer in 1..5, wins.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        dict with keys 'predicted_stars' (int or None), 'explanation'
        (str or None) and 'json_valid' (bool). A fresh dict is returned on
        every call, so callers may mutate it.
    """
    if isinstance(response_text, str):
        decoder = json.JSONDecoder()
        for brace in re.finditer(r'\{', response_text):
            try:
                data, _ = decoder.raw_decode(response_text, brace.start())
            except json.JSONDecodeError:
                # Not a JSON object at this brace; try the next one.
                continue

            # Validate structure
            if not isinstance(data, dict):
                continue
            if 'predicted_stars' not in data or 'explanation' not in data:
                continue

            stars = data['predicted_stars']
            # bool is a subclass of int in Python; JSON true/false is not a rating.
            if isinstance(stars, bool) or not isinstance(stars, int):
                continue
            if 1 <= stars <= 5:
                return {
                    'predicted_stars': stars,
                    'explanation': str(data['explanation']),
                    'json_valid': True
                }

    return {
        'predicted_stars': None,
        'explanation': None,
        'json_valid': False
    }

print('âœ“ Classification functions defined')

## Section 6: Run Evaluation on Sample Data

In [None]:
# Locate the review-text and star-rating columns in the evaluation sample.
# `df_sample` is looked up defensively and both result names are always bound,
# so this cell and the next one degrade to a message instead of a NameError
# when no dataset was loaded.
df_sample = globals().get('df_sample')
review_col = None
rating_col = None


def pick_column(columns, exact_names, keywords):
    """
    Choose a column by name.

    An exact (case-insensitive) match against `exact_names` wins, in the order
    given. Otherwise the last column whose lowercased name contains any of
    `keywords` is returned, skipping names ending in '_id' (identifier columns
    such as 'review_id' hold keys, not review text). Returns None if nothing
    matches.
    """
    lowered = [(col, str(col).lower()) for col in columns]
    for wanted in exact_names:
        for col, low in lowered:
            if low == wanted:
                return col
    chosen = None
    for col, low in lowered:
        if low.endswith('_id'):
            continue
        if any(keyword in low for keyword in keywords):
            chosen = col
    return chosen


if df_sample is None:
    print('âš  No sample data available. Please load dataset first.')
else:
    review_col = pick_column(df_sample.columns, ('text', 'review', 'review_text'), ('text', 'review'))
    rating_col = pick_column(df_sample.columns, ('stars', 'rating', 'star_rating'), ('star', 'rating'))

    print(f'Review column: {review_col}')
    print(f'Rating column: {rating_col}')

    if review_col and rating_col:
        print(f'\nâœ“ Ready to evaluate on {len(df_sample)} reviews')
    else:
        print('\nâš  Could not identify review/rating columns')
        print('Available columns:', df_sample.columns.tolist())

In [None]:
# Classify every sampled review with each of the three prompts (this may take a
# few minutes: three API calls per review).
if df_sample is not None and review_col and rating_col:
    print('Starting evaluation... This may take several minutes.\n')

    results_v1, results_v2, results_v3 = [], [], []
    # Each template is paired with the list that collects its results.
    prompt_runs = (
        (PROMPT_V1_TEMPLATE, results_v1),
        (PROMPT_V2_TEMPLATE, results_v2),
        (PROMPT_V3_TEMPLATE, results_v3),
    )
    total_reviews = len(df_sample)

    for idx, row in df_sample.iterrows():
        completed = idx + 1
        if completed % 25 == 0:
            print(f'Progress: {completed}/{total_reviews}')

        review_text = str(row[review_col])[:500]  # Limit to 500 chars
        actual_rating = int(row[rating_col])

        # Run all three classifications first, then record them together, so a
        # row is either present in every result list or in none.
        outcomes = [classify_review(review_text, template) for template, _ in prompt_runs]
        for outcome, (_, bucket) in zip(outcomes, prompt_runs):
            outcome['actual_stars'] = actual_rating
            bucket.append(outcome)

    print(f'\nâœ“ Evaluation complete for all {len(df_sample)} reviews')

## Section 7: Evaluation Metrics

In [None]:
def calculate_metrics(results):
    """
    Calculate evaluation metrics for one prompt's results.

    Args:
        results: list of per-review dicts, each with 'json_valid',
            'predicted_stars' and 'actual_stars' keys.

    Returns:
        dict with:
          - json_valid_count: number of rows with json_valid == True
          - json_valid_rate: that count as a percentage of all rows
          - accuracy: % exact matches among valid rows (0 if there are none)
          - mae: mean absolute star error among valid rows (NaN if none)
          - consistency: mean, over actual ratings with at least two valid
            predictions, of the sample std dev of the predicted stars
            (NaN if no rating qualifies)
          - total_samples: number of rows
        An empty `results` list yields zero counts/rates and NaN for mae and
        consistency instead of raising.
    """
    df_results = pd.DataFrame(results)
    total_samples = len(df_results)

    # No rows means no columns either; report an all-empty summary.
    if total_samples == 0:
        return {
            'json_valid_count': 0,
            'json_valid_rate': 0.0,
            'accuracy': 0,
            'mae': float('nan'),
            'consistency': float('nan'),
            'total_samples': 0
        }

    # JSON Validity
    valid_mask = df_results['json_valid'] == True
    json_valid_count = int(valid_mask.sum())
    json_valid_rate = (json_valid_count / total_samples) * 100

    # Restrict to valid predictions and force numeric dtype so the arithmetic
    # below does not depend on how pandas inferred columns containing None.
    valid_rows = df_results[valid_mask]
    predicted = pd.to_numeric(valid_rows['predicted_stars'])
    actual = pd.to_numeric(valid_rows['actual_stars'])

    if json_valid_count > 0:
        # Accuracy and Mean Absolute Error (only for valid predictions)
        accuracy = ((predicted == actual).sum() / json_valid_count) * 100
        mae = np.abs(predicted - actual).mean()
    else:
        accuracy = 0
        mae = float('nan')

    # Consistency (std dev of predictions for same actual rating)
    consistency_scores = []
    for rating in actual.unique():
        same_rating = predicted[actual == rating]
        # A single prediction has no defined sample std dev; skip it.
        if len(same_rating) > 1:
            consistency_scores.append(same_rating.std())

    mean_consistency = np.nanmean(consistency_scores) if consistency_scores else float('nan')

    return {
        'json_valid_count': json_valid_count,
        'json_valid_rate': json_valid_rate,
        'accuracy': accuracy,
        'mae': mae,
        'consistency': mean_consistency,
        'total_samples': total_samples
    }


# Summarise each prompt's results; skipped when the evaluation cell never ran.
if 'results_v1' in locals():
    metrics_v1, metrics_v2, metrics_v3 = [
        calculate_metrics(prompt_results)
        for prompt_results in (results_v1, results_v2, results_v3)
    ]

    print('âœ“ Metrics calculated')

In [None]:
# Build a side-by-side table: one row per metric, one column per prompt version.
if 'metrics_v1' in locals():
    table_columns = {
        'Metric': ['Total Samples', 'JSON Valid Count', 'JSON Valid Rate (%)', 'Accuracy (%)', 'Mean Absolute Error', 'Consistency (Lower is Better)'],
    }
    # Counts are shown as-is; the remaining metrics are rendered with two decimals.
    two_decimal_keys = ('json_valid_rate', 'accuracy', 'mae', 'consistency')
    for header, prompt_metrics in (
        ('Prompt V1 (Direct)', metrics_v1),
        ('Prompt V2 (Chain-of-Thought)', metrics_v2),
        ('Prompt V3 (Few-Shot)', metrics_v3),
    ):
        table_columns[header] = [
            prompt_metrics['total_samples'],
            prompt_metrics['json_valid_count'],
        ] + [f"{prompt_metrics[key]:.2f}" for key in two_decimal_keys]

    comparison_df = pd.DataFrame(table_columns)

    rule = '='*100
    print('\n' + rule)
    print('PROMPT COMPARISON RESULTS')
    print(rule)
    print(comparison_df.to_string(index=False))
    print(rule)

## Section 8: Analysis and Discussion

In [None]:
# Print each metric per prompt and name the best prompt for it.
if 'metrics_v1' in locals():
    # Key for "lower is better" metrics: NaN (no valid predictions) sorts last,
    # so a prompt with an undefined score is never reported as the best one.
    nan_as_worst = lambda x: x[1] if not np.isnan(x[1]) else float('inf')

    print("\nðŸ“Š DETAILED ANALYSIS\n")

    print("1. JSON VALIDITY:")
    print(f"   - Prompt V1 (Direct): {metrics_v1['json_valid_rate']:.2f}%")
    print(f"   - Prompt V2 (Chain-of-Thought): {metrics_v2['json_valid_rate']:.2f}%")
    print(f"   - Prompt V3 (Few-Shot): {metrics_v3['json_valid_rate']:.2f}%")

    best_json = max([
        ('V1', metrics_v1['json_valid_rate']),
        ('V2', metrics_v2['json_valid_rate']),
        ('V3', metrics_v3['json_valid_rate'])
    ], key=lambda x: x[1])
    print(f"   âœ“ Best: Prompt {best_json[0]} ({best_json[1]:.2f}%)")

    print("\n2. ACCURACY (on valid responses):")
    print(f"   - Prompt V1 (Direct): {metrics_v1['accuracy']:.2f}%")
    print(f"   - Prompt V2 (Chain-of-Thought): {metrics_v2['accuracy']:.2f}%")
    print(f"   - Prompt V3 (Few-Shot): {metrics_v3['accuracy']:.2f}%")

    best_acc = max([
        ('V1', metrics_v1['accuracy']),
        ('V2', metrics_v2['accuracy']),
        ('V3', metrics_v3['accuracy'])
    ], key=lambda x: x[1])
    print(f"   âœ“ Best: Prompt {best_acc[0]} ({best_acc[1]:.2f}%)")

    print("\n3. MEAN ABSOLUTE ERROR:")
    print(f"   - Prompt V1 (Direct): {metrics_v1['mae']:.2f}")
    print(f"   - Prompt V2 (Chain-of-Thought): {metrics_v2['mae']:.2f}")
    print(f"   - Prompt V3 (Few-Shot): {metrics_v3['mae']:.2f}")

    # MAE is NaN when a prompt produced no valid predictions; comparisons with
    # NaN are always False, so an unguarded min() would depend on list order.
    best_mae = min([
        ('V1', metrics_v1['mae']),
        ('V2', metrics_v2['mae']),
        ('V3', metrics_v3['mae'])
    ], key=nan_as_worst)
    print(f"   âœ“ Best (lowest): Prompt {best_mae[0]} ({best_mae[1]:.2f})")

    print("\n4. CONSISTENCY:")
    print(f"   - Prompt V1 (Direct): {metrics_v1['consistency']:.2f}")
    print(f"   - Prompt V2 (Chain-of-Thought): {metrics_v2['consistency']:.2f}")
    print(f"   - Prompt V3 (Few-Shot): {metrics_v3['consistency']:.2f}")

    best_cons = min([
        ('V1', metrics_v1['consistency']),
        ('V2', metrics_v2['consistency']),
        ('V3', metrics_v3['consistency'])
    ], key=nan_as_worst)
    print(f"   âœ“ Best (most consistent): Prompt {best_cons[0]} ({best_cons[1]:.2f})")

In [None]:
# Create a 2x2 grid of bar charts, one panel per metric, and save it as a PNG.
if 'metrics_v1' in locals():
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    metrics_names = ['V1 (Direct)', 'V2 (CoT)', 'V3 (Few-Shot)']
    metrics_list = [metrics_v1, metrics_v2, metrics_v3]
    bar_colors = ['#3498db', '#e74c3c', '#2ecc71']

    # One entry per panel, in row-major order:
    # (metric key, title, y-axis label, label offset above bar, label format, y limits)
    panels = [
        ('json_valid_rate', 'JSON Valid Response Rate', 'Percentage (%)', 2, '{:.1f}%', (0, 105)),
        ('accuracy', 'Prediction Accuracy', 'Percentage (%)', 2, '{:.1f}%', (0, 105)),
        ('mae', 'Mean Absolute Error', 'MAE (stars)', 0.05, '{:.2f}', None),
        ('consistency', 'Response Consistency', 'Std Dev (lower is better)', 0.05, '{:.2f}', None),
    ]

    for ax, (key, title, ylabel, offset, label_format, ylim) in zip(axes.flat, panels):
        values = [m[key] for m in metrics_list]
        ax.bar(metrics_names, values, color=bar_colors)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        if ylim is not None:
            ax.set_ylim(*ylim)
        for i, v in enumerate(values):
            # MAE and consistency are NaN when a prompt has no valid
            # predictions; a label cannot be positioned at NaN, so skip it.
            if not np.isnan(v):
                ax.text(i, v + offset, label_format.format(v), ha='center')

    plt.tight_layout()
    plt.savefig('prompt_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()
    print('\nâœ“ Comparison chart saved as prompt_comparison.png')

## Section 9: Key Findings & Recommendations

In [None]:
# NOTE(review): this summary is static text, not derived from the metrics
# computed above; check its claims against the comparison table for each run.
# The temperature line previously claimed reproducibility, which sampling at a
# non-zero temperature does not provide.
print("""\nðŸŽ¯ KEY FINDINGS & RECOMMENDATIONS

PROMPT DESIGN ITERATIONS:

V1 (Direct Classification):
  - Simplest approach with minimal context
  - Fastest inference time
  - Baseline for comparison
  - May lack detailed reasoning

V2 (Chain-of-Thought):
  - Asks model to reason step-by-step
  - More transparent decision process
  - Potentially improves accuracy through explicit reasoning
  - Slightly longer prompts but better consistency

V3 (Few-Shot Learning):
  - Provides example reviews with expected outputs
  - Demonstrates format and reasoning patterns
  - Leverages in-context learning capability
  - Longest prompts but often most accurate

EVALUATION METRICS EXPLAINED:
  - JSON Valid Rate: % of responses with valid JSON structure
  - Accuracy: % of predictions matching actual ratings (exact match)
  - MAE: Average difference between predicted and actual ratings
  - Consistency: Lower std dev = more consistent predictions

SYSTEM BEHAVIOR:
  - Few-shot prompting generally provides best accuracy
  - Chain-of-thought improves consistency
  - Direct classification is fastest but least accurate
  - Temperature=0.3 reduces, but does not eliminate, run-to-run variation

RECOMMENDATIONS:
  1. Use Few-Shot (V3) for production accuracy
  2. Use Chain-of-Thought (V2) for transparency
  3. Monitor JSON validity rate in production
  4. Consider ensemble approach for critical applications
""")

## Section 10: Save Results

In [None]:
# Persist the comparison table and the per-review results of each prompt as CSV
# files in the working directory (skipped when the evaluation never ran).
if 'results_v1' in locals():
    # Summary table
    comparison_df.to_csv('prompt_comparison_results.csv', index=False)
    print('âœ“ Saved prompt_comparison_results.csv')

    # Per-review results, one file per prompt version
    detailed_outputs = (
        (results_v1, 'results_prompt_v1_direct.csv'),
        (results_v2, 'results_prompt_v2_cot.csv'),
        (results_v3, 'results_prompt_v3_fewshot.csv'),
    )
    for prompt_results, csv_name in detailed_outputs:
        pd.DataFrame(prompt_results).to_csv(csv_name, index=False)
    print('âœ“ Saved detailed results for each prompt version')

    print('\nâœ“ All results saved successfully!')