## 🧩 Step 0: Setup Environment

In [None]:
!pip install transformers datasets accelerate fsspec==2023.6.0 --quiet
from huggingface_hub import login

# Optional: authenticate with the Hugging Face Hub (some models may require it).
# The token is read from the HF_TOKEN environment variable so that no secret
# has to be pasted into (and accidentally shared with) the notebook.
import os

HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN", "")  # or paste your token between the quotes
if HUGGINGFACE_TOKEN:
    login(token=HUGGINGFACE_TOKEN)
else:
    # Calling login() with a placeholder token would fail, so the step is
    # skipped entirely when no token has been supplied.
    print("No Hugging Face token provided; skipping login.")


## 📦 Step 1: Load & Sample 200 IMDB Reviews

In [None]:
from datasets import load_dataset
import random
import pandas as pd

# Number of test reviews to evaluate on, and the seed that makes the draw
# reproducible: without a fixed seed every rerun of the notebook would score
# a different subset, so results could not be compared across runs.
SAMPLE_SIZE = 200
SAMPLE_SEED = 42

imdb = load_dataset("imdb")
test_split = imdb['test']

# Draw row indices first and fetch only those rows, instead of converting the
# entire test split into a Python list just to pick a handful of items.
sample_indices = random.Random(SAMPLE_SEED).sample(range(len(test_split)), SAMPLE_SIZE)
sample_data = [test_split[i] for i in sample_indices]

# One row per sampled review; later cells read the 'text' and 'label' columns.
df_sample = pd.DataFrame(sample_data)
df_sample.head()


## 🔤 Step 2: Define Prompting Techniques

In [None]:
# Hand-written (review, label) demonstrations used by the few-shot prompt.
# The ten entries alternate between "Positive" and "Negative", five of each.
_FEW_SHOT_PAIRS = (
    ("This movie was fantastic! The acting was top-notch and the story was gripping.", "Positive"),
    ("I hated every minute of this film. It was boring and predictable.", "Negative"),
    ("Absolutely loved the cinematography and emotional depth. Highly recommend!", "Positive"),
    ("Terrible script and poor character development. Not worth the time.", "Negative"),
    ("One of the best films I’ve seen this year. Beautiful storytelling.", "Positive"),
    ("The plot made no sense and the pacing was awful.", "Negative"),
    ("Brilliant direction and a powerful message. A must-watch.", "Positive"),
    ("The dialogue was cheesy and the acting felt forced.", "Negative"),
    ("Incredible performance by the lead actor. It kept me hooked.", "Positive"),
    ("This movie was a total disaster. Save yourself the trouble.", "Negative"),
)

# Each demonstration is exposed as a dict with "text" and "label" keys.
few_shot_examples = [
    {"text": review, "label": sentiment} for review, sentiment in _FEW_SHOT_PAIRS
]

def zero_shot_prompt(text):
    """Return a zero-shot classification prompt for one review.

    The prompt is the instruction line, the review itself and a trailing
    "Sentiment:" cue, separated by blank lines, so the model's continuation
    is expected to be the label.
    """
    instruction = "Classify the sentiment as Positive or Negative:"
    cue = "Sentiment:"
    return "\n\n".join([instruction, f"{text}", cue])

def few_shot_prompt(text, examples=None):
    """Return a few-shot classification prompt for one review.

    The prompt starts with an instruction line, lists every demonstration as
    a "Review: ... / Sentiment: ..." pair followed by a blank line, and ends
    with the target review and an open "Sentiment:" cue.

    Args:
        text: The review to classify.
        examples: Optional iterable of dicts with "text" and "label" keys to
            use as demonstrations. Defaults to the module-level
            ``few_shot_examples``.

    Returns:
        The assembled prompt string.
    """
    if examples is None:
        examples = few_shot_examples

    header = "Classify the sentiment of each review below:\n"
    demonstrations = "".join(
        f"Review: {ex['text']}\nSentiment: {ex['label']}\n\n" for ex in examples
    )
    return f"{header}{demonstrations}Review: {text}\nSentiment:"

def cot_prompt(text):
    """Return a chain-of-thought prompt for one review.

    The review is followed by an instruction asking the model to explain the
    sentiment before answering. Note that the instruction itself contains
    both label words ("positive" and "negative").
    """
    instruction = "Explain the sentiment (positive or negative) and then give your answer."
    return "Review: {}\n{}".format(text, instruction)

def self_consistency_prompt(text):
    """Return the prompt used for self-consistency sampling.

    This is exactly the chain-of-thought prompt produced by ``cot_prompt``;
    the function only provides a separate name for that prompt style.
    """
    prompt = cot_prompt(text)
    return prompt


## 🧠 Step 3: Load an LLM (DeepSeek / LLaVA / Mistral)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

def load_model(model_name):
    """Load a causal language model and wrap it in a text-generation pipeline.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.

    Returns:
        A ``transformers`` "text-generation" pipeline built from the loaded
        model and tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Some causal-LM tokenizers ship without a padding token; reuse EOS so
    # generation has a defined pad id.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Half precision is meant for accelerators; on a CPU-only host fall back
    # to float32, where all operators are available.
    mps_backend = getattr(torch.backends, "mps", None)
    on_accelerator = torch.cuda.is_available() or (
        mps_backend is not None and mps_backend.is_available()
    )
    dtype = torch.float16 if on_accelerator else torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map="auto"
    )
    return pipeline("text-generation", model=model, tokenizer=tokenizer)


## 🧪 Step 4: Predict with Prompting Techniques

In [None]:
def get_sentiment_from_output(output_text):
    """Map generated text to a sentiment label.

    The search is case-insensitive. When both label words occur, the one
    that appears first decides, so a model that answers and then keeps
    generating (or negates the other label later on) is not automatically
    read as positive.

    Args:
        output_text: Text produced by the model; may be empty or ``None``.

    Returns:
        1 for positive, 0 for negative, -1 when neither label word is found.
    """
    if not output_text:
        return -1

    lowered = output_text.lower()
    positive_at = lowered.find("positive")
    negative_at = lowered.find("negative")

    if positive_at == -1 and negative_at == -1:
        return -1
    if negative_at == -1:
        return 1
    if positive_at == -1:
        return 0
    # Both labels are present: the earliest mention is taken as the answer.
    return 1 if positive_at < negative_at else 0

def predict_sentiment(pipe, text, prompt_fn):
    """Classify one review with a single greedy generation.

    Args:
        pipe: Text-generation pipeline; called with the prompt and expected
            to return a list whose first item has a "generated_text" entry.
        text: The review to classify.
        prompt_fn: Callable that turns the review into a prompt string.

    Returns:
        1 for positive, 0 for negative, -1 if no label could be parsed.
    """
    prompt = prompt_fn(text)
    # Only the continuation may be parsed: the prompts themselves contain the
    # words "positive"/"negative", so parsing prompt + completion would yield
    # a label regardless of what the model actually answered.
    generations = pipe(
        prompt,
        max_new_tokens=50,
        do_sample=False,
        return_full_text=False,
    )
    completion = generations[0]["generated_text"]
    # Defensive: strip the prompt if the pipeline echoed it anyway.
    if completion.startswith(prompt):
        completion = completion[len(prompt):]
    return get_sentiment_from_output(completion)


## 🔁 Step 5: Predict with Self-Consistency

In [None]:
def predict_with_self_consistency(pipe, text, prompt_fn, samples=5):
    """Classify one review by majority vote over several sampled generations.

    Args:
        pipe: Text-generation pipeline; called with the prompt and expected
            to return a list whose first item has a "generated_text" entry.
        text: The review to classify.
        prompt_fn: Callable that turns the review into a prompt string.
        samples: Number of generations to sample (temperature 0.7).

    Returns:
        The label (1 positive, 0 negative) with the most votes among the
        generations that could be parsed; 0 on a tie; -1 if none could be
        parsed.
    """
    # The prompt does not change between samples, so build it once.
    prompt = prompt_fn(text)
    votes = {0: 0, 1: 0}

    for _ in range(samples):
        # Only the continuation may be parsed: the prompt itself contains the
        # label words and would otherwise decide every vote.
        generations = pipe(
            prompt,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            return_full_text=False,
        )
        completion = generations[0]["generated_text"]
        # Defensive: strip the prompt if the pipeline echoed it anyway.
        if completion.startswith(prompt):
            completion = completion[len(prompt):]

        pred = get_sentiment_from_output(completion)
        if pred in votes:  # unparseable generations (-1) do not vote
            votes[pred] += 1

    if votes[0] == 0 and votes[1] == 0:
        return -1
    return 1 if votes[1] > votes[0] else 0


## 📊 Step 6: Evaluate Model Performance

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(pipe, prompt_fn, use_self_consistency=False):
    """Score one model/prompt combination on the sampled reviews.

    Every row of ``df_sample`` is classified; rows whose prediction could
    not be parsed (-1) are excluded from the metrics but counted in
    ``n_skipped`` so the metrics can be read in context.

    Args:
        pipe: Text-generation pipeline used for prediction.
        prompt_fn: Callable that turns a review into a prompt string.
        use_self_consistency: If true, predict with
            ``predict_with_self_consistency`` instead of ``predict_sentiment``.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1_score" (NaN when
        no prediction could be parsed), plus "n_scored" and "n_skipped".
    """
    predict = predict_with_self_consistency if use_self_consistency else predict_sentiment

    y_true, y_pred = [], []
    for text, true in zip(df_sample['text'], df_sample['label']):
        pred = predict(pipe, text, prompt_fn)
        if pred != -1:
            y_true.append(true)
            y_pred.append(pred)

    n_scored = len(y_pred)
    n_skipped = len(df_sample) - n_scored

    if not y_pred:
        # Nothing to score: the metrics are undefined, so report NaN instead
        # of handing empty inputs to scikit-learn.
        undefined = float("nan")
        return {
            "accuracy": undefined,
            "precision": undefined,
            "recall": undefined,
            "f1_score": undefined,
            "n_scored": n_scored,
            "n_skipped": n_skipped,
        }

    # zero_division=0 keeps the score at 0 (without a warning) when a class is
    # never predicted or never present among the scored rows.
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1_score": f1_score(y_true, y_pred, zero_division=0),
        "n_scored": n_scored,
        "n_skipped": n_skipped,
    }


## 🚀 Step 7: Run All Combinations

In [None]:
# Display name -> model identifier for every model to benchmark.
# NOTE(review): the "LLaVA" checkpoint looks like a vision-language model;
# confirm that load_model's AutoModelForCausalLM call can load it.
models = dict(
    DeepSeek="deepseek-ai/deepseek-llm-7b-chat",
    LLaVA="liuhaotian/llava-v1.5-7b",
    Mistral="mistralai/Mistral-7B-Instruct-v0.2",
)

# Prompting technique name -> function that builds the prompt for a review.
# The "self_consistency" key is also what the run loop checks to switch on
# majority-vote prediction.
prompt_styles = dict(
    zero_shot=zero_shot_prompt,
    few_shot=few_shot_prompt,
    chain_of_thought=cot_prompt,
    self_consistency=self_consistency_prompt,
)

import gc

# One dict per (model, prompting technique) pair: names plus the metrics
# returned by evaluate().
results = []

for model_name, model_id in models.items():
    try:
        pipe = load_model(model_id)
    except (OSError, ValueError) as err:
        # A checkpoint that cannot be downloaded or is not a supported
        # causal LM should not abort the remaining models.
        print(f"Skipping {model_name} ({model_id}): could not load model: {err}")
        continue

    try:
        for prompt_name, prompt_fn in prompt_styles.items():
            metrics = evaluate(
                pipe,
                prompt_fn,
                use_self_consistency=(prompt_name == "self_consistency"),
            )
            results.append({
                "model": model_name,
                "prompting": prompt_name,
                **metrics
            })
            print(f"✅ Done: {model_name} + {prompt_name}")
    finally:
        # Drop the only reference to the model and release cached GPU memory
        # before the next model is loaded; otherwise several large models
        # would have to fit in memory at once.
        del pipe
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


## 📈 Step 8: Results Table

In [None]:
# Collect every result dict into a table, one row per model/prompt pair.
results_df = pd.DataFrame(data=results)
# Show the best-scoring combinations first (displayed, not stored).
results_df.sort_values("f1_score", ascending=False)
