In [1]:
import kagglehub
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

from huggingface_hub import login
from sklearn.metrics import accuracy_score
from google.colab import userdata
# Authenticate with the Hugging Face Hub using the 'HF_TOKEN' secret read from
# Colab's user data, so the token never appears in the notebook itself.
login(token=userdata.get('HF_TOKEN'))

In [None]:
# Download the sarcasm-headlines dataset from Kaggle, then load the JSON-lines
# file and keep only the headline text and its binary label.
path = kagglehub.dataset_download("rmisra/news-headlines-dataset-for-sarcasm-detection")
df = pd.read_json(f"{path}/Sarcasm_Headlines_Dataset.json", lines=True)[
    ['headline', 'is_sarcastic']
]

In [None]:
# --- 1. Model and Tokenizer (Mistral 7B) ---
# Load the instruction-tuned checkpoint in bfloat16 and let `device_map="auto"`
# decide where the weights are placed.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Add pad_token if not defined by the tokenizer (important for batching)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Batched generation with a decoder-only model continues from the last position
# of every row, so padding must sit on the left; with right padding the next
# token would be predicted from a pad position. Pin it explicitly instead of
# relying on the checkpoint's tokenizer default.
tokenizer.padding_side = "left"

# --- 2. Prompt Template ---
# Two-shot instruction prompt wrapped in [INST] ... [/INST] tags. The single
# `{headline}` placeholder is filled with `str.format` in `create_batches`, and
# the model is asked to answer with just "1" (sarcastic/parody) or "0" (real).
# NOTE(review): if the two example headlines below are also rows of the
# evaluated dataset, they leak into the accuracy figure — confirm and exclude.
prompt_template = """
[INST]
Analyze the following headline and determine if it is sarcastic/parody (1) or real news (0).

Here are two examples:

Headline: Former Versace store clerk sues over secret 'black code' for minority shoppers
Classification: 0 (Real News)

Headline: Boehner Just Wants Wife To Listen, Not Come Up With Alternative Debt-Reduction Ideas
Classification: 1 (Sarcastic/Parody)

Now analyze this headline:

Headline: {headline}

Respond only with the number 1 or 0.
[/INST]
"""

# --- 4. Batching Function ---
def create_batches(data, batch_size, prompt_template, tokenizer, device=None, max_length=512):
    """Format headlines into prompts and tokenize them in fixed-size batches.

    Args:
        data: Iterable of headline strings (list, pandas Series, ...). It is
            materialised with ``list(data)`` so batching is purely positional
            and does not depend on a pandas index.
        batch_size: Number of headlines per batch; must be a positive integer.
        prompt_template: Format string containing a ``{headline}`` placeholder.
        tokenizer: Callable tokenizer whose result exposes ``.to(device)``.
        device: Target device for the tokenized tensors. When ``None`` the
            notebook-level ``model.device`` is used, as before.
        max_length: Truncation limit (in tokens) applied to every prompt.

    Returns:
        A list of ``(inputs, batch_headlines)`` tuples, where ``inputs`` is the
        tokenizer output moved to ``device`` and ``batch_headlines`` is the list
        of raw headlines in that batch. Empty ``data`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if device is None:
        # Backward-compatible default: fall back to the model loaded by the notebook.
        device = model.device

    headlines = list(data)
    batches = []
    for start in range(0, len(headlines), batch_size):
        batch_headlines = headlines[start:start + batch_size]
        batch_prompts = [prompt_template.format(headline=h) for h in batch_headlines]

        # Tokenize the entire batch at once
        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            # Pad only to the longest prompt in this batch; padding every batch
            # to `max_length` wastes memory and compute on pad tokens.
            padding=True,
            truncation=True,
            max_length=max_length,  # Control max input length
        ).to(device)

        batches.append((inputs, batch_headlines))
    return batches

# --- 5. Inference Function (Modified for Batching) ---
def _first_binary_label(text):
    """Return the first standalone "0" or "1" token found in ``text``, or None."""
    for token in text.replace(":", " ").split():
        token = token.strip(".,;()[]\"'")
        if token in ("0", "1"):
            return token
    return None


def classify_batch(batch, model, tokenizer):
    """Generate a classification for every headline in one tokenized batch.

    Args:
        batch: ``(inputs, batch_headlines)`` tuple as produced by
            ``create_batches``; ``inputs`` must support ``**inputs`` and
            ``inputs["input_ids"]``.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer exposing ``eos_token_id`` and ``batch_decode``.

    Returns:
        A list of ``{"headline": ..., "classification": ...}`` dicts, one per
        headline, in batch order. ``classification`` is the text before the
        first ":" of the model's answer (stripped and capitalized); if that
        text does not start with 0 or 1, the first standalone 0/1 token found
        in the answer is used instead when there is one.
    """
    inputs, batch_headlines = batch

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,  # greedy decoding keeps the run deterministic
            pad_token_id=tokenizer.eos_token_id,
        )

    # `generate` returns prompt + continuation for each row. Slice off the
    # prompt columns so only the answer is decoded; this avoids depending on
    # the "[/INST]" marker surviving decoding or truncation.
    prompt_length = inputs["input_ids"].shape[1]
    generated_texts = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )

    results = []
    for headline, generated_text in zip(batch_headlines, generated_texts):
        answer = generated_text.strip()
        classification = answer.split(":", 1)[0].strip().capitalize()
        if not classification.startswith(("0", "1")):
            # e.g. "Classification: 1" — recover the label from the full answer.
            label = _first_binary_label(answer)
            if label is not None:
                classification = label
        results.append({
            "headline": headline,
            "classification": classification
        })

    return results

# --- 6. Main Loop (Batch Processing) ---
# Tokenize every headline up front, then classify batch by batch and flatten
# the per-batch result lists into one list of records.
batch_size = 8  # Adjust as needed based on your GPU memory
batches = create_batches(df['headline'], batch_size, prompt_template, tokenizer)

all_results = [
    record
    for batch in batches
    for record in classify_batch(batch, model, tokenizer)
]

# --- 7. Collect Results ---
# One row per headline; nothing is written to disk here.
results_df = pd.DataFrame(all_results)
print("Batch classification complete.")

In [12]:
# Parse the leading 0/1 of each model answer into an integer prediction.
# `expand=False` makes `str.extract` return a Series rather than a one-column
# DataFrame. Answers that do not start with 0 or 1 come back as NaN, which
# would make a plain `.astype(int)` raise; report them and encode them as -1 so
# they count as misclassified instead of aborting the run.
parsed_label = results_df['classification'].str.extract(r"^([01])", expand=False)
n_unparsed = int(parsed_label.isna().sum())
if n_unparsed:
    print(f"Warning: {n_unparsed} response(s) did not start with 0 or 1; marked as -1.")
results_df['is_sarcastic_pred'] = parsed_label.fillna(-1).astype(int)

In [15]:
# Compare the gold labels with the parsed predictions, row by row.
y_true = df['is_sarcastic']
y_pred = results_df['is_sarcastic_pred']
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy of the classifier: {accuracy}")

Accuracy of the classifier: 0.5749372870567974
