In [3]:
from datasets import load_dataset

# Load local sarcasm dataset
dataset = load_dataset("json", data_files="C:/Users/RAGHU/datasets/Sarcasm_Headlines_Dataset_v2.json", split="train")

# Check format
print(dataset[0])


{'is_sarcastic': 1, 'headline': 'thirtysomething scientists unveil doomsday clock of hair loss', 'article_link': 'https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205'}


In [2]:
# 🚀 Optimized Sarcasm Classifier with Accelerated Transformers

import warnings
warnings.filterwarnings("ignore")

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline
)
from datasets import load_dataset, ClassLabel, concatenate_datasets
from sklearn.metrics import accuracy_score, f1_score
import torch
import numpy as np

# ✅ 1. Load and Balance Dataset
dataset = load_dataset("json", data_files="C:/Users/RAGHU/datasets/Sarcasm_Headlines_Dataset_v2.json", split="train").shuffle(seed=42)
sarc = dataset.filter(lambda x: x["is_sarcastic"] == 1).select(range(4000))
non_sarc = dataset.filter(lambda x: x["is_sarcastic"] == 0).select(range(4000))
balanced_dataset = concatenate_datasets([sarc, non_sarc]).shuffle(seed=42)
label_features = ClassLabel(names=["not_sarcastic", "sarcastic"])
balanced_dataset = balanced_dataset.cast_column("is_sarcastic", label_features)

# ✅ 2. Tokenization
model_checkpoint = "distilroberta-base"  # smaller & faster than bert-base-uncased
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(example):
    tokens = tokenizer(example["headline"], truncation=True, padding="max_length", max_length=128)
    tokens["labels"] = example["is_sarcastic"]
    return tokens

tokenized_dataset = balanced_dataset.map(tokenize_function, batched=True)

# ✅ 3. Split
split = tokenized_dataset.train_test_split(test_size=0.1)
train_ds = split["train"]
eval_ds = split["test"]

# ✅ 4. Model & Data Collator
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to("cuda")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ✅ 5. Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

# ✅ 6. Training Args (With GPU Optimizations)
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    fp16=True,
    logging_dir="./logs",
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none"
)

# ✅ 7. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# ✅ 8. Train and Evaluate
trainer.train()
results = trainer.evaluate()
print("\n📊 Evaluation:", results)

# ✅ 9. Save
model_path = "./fine-tuned-sarcasm-distilroberta"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# ✅ 10. Inference Pipeline
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path, device=0 if torch.cuda.is_available() else -1)

# ✅ 11. Device Info
if torch.cuda.is_available():
    print("\n✅ Using GPU:", torch.cuda.get_device_name(0))
    print("   CUDA Version:", torch.version.cuda)
    print("   cuDNN Enabled:", torch.backends.cudnn.enabled)
else:
    print("\n⚠️ Running on CPU.")

# ✅ 12. Human-Readable Output
label_map = {0: "Not Sarcastic", 1: "Sarcastic"}
examples = [
    "I'm thrilled to be ignored for the third time today.",
    "It's raining again. What a surprise.",
    "This software never crashes! Oh wait...",
    "Thank you so much for your generous fine!",
    "The weather is genuinely beautiful today."
]

for sentence in examples:
    output = classifier(sentence)[0]
    label_id = int(output["label"].split("_")[-1])
    print(f"\n🔍 {sentence}\n → Prediction: {label_map[label_id]} ({output['score']:.2%})")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.252157,0.8875,0.885204
2,No log,0.254197,0.9025,0.900763
3,0.246000,0.441977,0.9,0.896104



📊 Evaluation: {'eval_loss': 0.25215739011764526, 'eval_accuracy': 0.8875, 'eval_f1': 0.8852040816326531, 'eval_runtime': 1.3416, 'eval_samples_per_second': 596.306, 'eval_steps_per_second': 18.635, 'epoch': 3.0}


Device set to use cuda:0



✅ Using GPU: NVIDIA GeForce RTX 4070 Laptop GPU
   CUDA Version: 12.1
   cuDNN Enabled: True

🔍 I'm thrilled to be ignored for the third time today.
 → Prediction: Not Sarcastic (93.44%)

🔍 It's raining again. What a surprise.
 → Prediction: Not Sarcastic (95.88%)

🔍 This software never crashes! Oh wait...
 → Prediction: Not Sarcastic (89.01%)

🔍 Thank you so much for your generous fine!
 → Prediction: Not Sarcastic (96.26%)

🔍 The weather is genuinely beautiful today.
 → Prediction: Not Sarcastic (98.68%)


In [18]:
# ✅ Evaluation Script for Sarcasm Classifier

from transformers import pipeline
import torch
import pandas as pd

# ✅ Human-labeled sarcasm test examples
examples = [
    {"text": "I'm so glad my phone died right when I needed it the most.", "label": 1},
    {"text": "Oh joy, another Monday morning meeting!", "label": 1},
    {"text": "Perfect! My favorite show got canceled again!", "label": 1},
    {"text": "I'm absolutely thrilled to do your work too.", "label": 1},
    {"text": "What a brilliant update, it broke everything!", "label": 1},
    {"text": "I really enjoyed the movie last night.", "label": 0},
    {"text": "The weather is beautiful today.", "label": 0},
    {"text": "Thanks for your help on the project!", "label": 0},
    {"text": "This book was very informative.", "label": 0},
    {"text": "I finally completed my marathon training goal.", "label": 0},
]

# ✅ Load the fine-tuned model
model_path = "./fine-tuned-sarcasm-detector"
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path, device=device)

# ✅ Label decoding
label_map = {"LABEL_0": "Not Sarcastic", "LABEL_1": "Sarcastic"}

# ✅ Prediction and comparison
results = []
for item in examples:
    prediction = classifier(item["text"])[0]
    predicted_label = 1 if prediction["label"] == "LABEL_1" else 0
    results.append({
        "Input Text": item["text"],
        "Expected": "Sarcastic (1)" if item["label"] == 1 else "Not Sarcastic (0)",
        "Predicted": label_map[prediction["label"]],
        "Confidence": f"{prediction['score']:.2%}",
        "Correct": "✅" if predicted_label == item["label"] else "❌"
    })

# ✅ Display as DataFrame
df = pd.DataFrame(results)
print("\n🎯 Evaluation Results:")
print(df.to_string(index=False))


Device set to use cuda:0



🎯 Evaluation Results:
                                                Input Text          Expected     Predicted Confidence Correct
I'm so glad my phone died right when I needed it the most.     Sarcastic (1) Not Sarcastic     99.97%       ❌
                   Oh joy, another Monday morning meeting!     Sarcastic (1) Not Sarcastic     99.95%       ❌
             Perfect! My favorite show got canceled again!     Sarcastic (1) Not Sarcastic     99.91%       ❌
              I'm absolutely thrilled to do your work too.     Sarcastic (1) Not Sarcastic     99.98%       ❌
             What a brilliant update, it broke everything!     Sarcastic (1) Not Sarcastic     99.97%       ❌
                    I really enjoyed the movie last night. Not Sarcastic (0) Not Sarcastic     99.97%       ✅
                           The weather is beautiful today. Not Sarcastic (0) Not Sarcastic     99.98%       ✅
                      Thanks for your help on the project! Not Sarcastic (0) Not Sarcastic     99

In [1]:
# 🚀 Optimized Sarcasm Classifier with Human-Evaluated Inference

import warnings
warnings.filterwarnings("ignore")

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline
)
from datasets import load_dataset, ClassLabel, concatenate_datasets
from sklearn.metrics import accuracy_score, f1_score
import torch
import numpy as np
from tabulate import tabulate

# ✅ 1. Load and Balance Dataset
dataset = load_dataset("json", data_files="C:/Users/RAGHU/datasets/Sarcasm_Headlines_Dataset_v2.json", split="train").shuffle(seed=42)
sarc = dataset.filter(lambda x: x["is_sarcastic"] == 1).select(range(4000))
non_sarc = dataset.filter(lambda x: x["is_sarcastic"] == 0).select(range(4000))
balanced_dataset = concatenate_datasets([sarc, non_sarc]).shuffle(seed=42)
label_features = ClassLabel(names=["not_sarcastic", "sarcastic"])
balanced_dataset = balanced_dataset.cast_column("is_sarcastic", label_features)

# ✅ 2. Tokenization
model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(example):
    tokens = tokenizer(example["headline"], truncation=True, padding="max_length", max_length=128)
    tokens["labels"] = example["is_sarcastic"]
    return tokens

tokenized_dataset = balanced_dataset.map(tokenize_function, batched=True)

# ✅ 3. Split
split = tokenized_dataset.train_test_split(test_size=0.1)
train_ds = split["train"]
eval_ds = split["test"]

# ✅ 4. Model & Data Collator
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to("cuda")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ✅ 5. Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

# ✅ 6. Training Args
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    fp16=True,
    logging_dir="./logs",
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none"
)

# ✅ 7. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# ✅ 8. Train and Evaluate
trainer.train()
results = trainer.evaluate()
print("\n📊 Evaluation:", results)

# ✅ 9. Save
model_path = "./fine-tuned-sarcasm-distilroberta"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# ✅ 10. Inference Pipeline
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path, device=0 if torch.cuda.is_available() else -1)

# ✅ 11. Device Info
if torch.cuda.is_available():
    print("\n✅ Using GPU:", torch.cuda.get_device_name(0))
    print("   CUDA Version:", torch.version.cuda)
    print("   cuDNN Enabled:", torch.backends.cudnn.enabled)
else:
    print("\n⚠️ Running on CPU.")

# ✅ 12. Human-Evaluated Test Set
human_test_examples = [
    ("I'm so glad my phone died right when I needed it the most.", 1),
    ("Oh joy, another Monday morning meeting!", 1),
    ("Perfect! My favorite show got canceled again!", 1),
    ("I'm absolutely thrilled to do your work too.", 1),
    ("What a brilliant update, it broke everything!", 1),
    ("I really enjoyed the movie last night.", 0),
    ("The weather is beautiful today.", 0),
    ("Thanks for your help on the project!", 0),
    ("This book was very informative.", 0),
    ("I finally completed my marathon training goal.", 0)
]

# ✅ 13. Evaluate on Human Test Set
results_table = []
label_map = {0: "Not Sarcastic", 1: "Sarcastic"}

for text, expected in human_test_examples:
    output = classifier(text)[0]
    label_id = int(output["label"].split("_")[-1])
    prediction = label_map[label_id]
    expected_str = label_map[expected]
    is_correct = label_id == expected
    emoji = "✅" if is_correct else "❌"
    results_table.append([text, f"{expected_str} ({expected})", prediction, f"{output['score']:.2%}", emoji])

# ✅ 14. Display
print("\n🎯 Evaluation Results:")
print(tabulate(results_table, headers=["Input Text", "Expected", "Predicted", "Confidence", "Correct"], tablefmt="fancy_grid"))


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.276545,0.88,0.876923
2,No log,0.251266,0.9025,0.903704
3,0.265400,0.373197,0.90375,0.905055



📊 Evaluation: {'eval_loss': 0.2512662410736084, 'eval_accuracy': 0.9025, 'eval_f1': 0.9037037037037037, 'eval_runtime': 1.678, 'eval_samples_per_second': 476.764, 'eval_steps_per_second': 14.899, 'epoch': 3.0}


Device set to use cuda:0



✅ Using GPU: NVIDIA GeForce RTX 4070 Laptop GPU
   CUDA Version: 12.1
   cuDNN Enabled: True

🎯 Evaluation Results:
╒════════════════════════════════════════════════════════════╤═══════════════════╤═══════════════╤══════════════╤═══════════╕
│ Input Text                                                 │ Expected          │ Predicted     │ Confidence   │ Correct   │
╞════════════════════════════════════════════════════════════╪═══════════════════╪═══════════════╪══════════════╪═══════════╡
│ I'm so glad my phone died right when I needed it the most. │ Sarcastic (1)     │ Not Sarcastic │ 98.50%       │ ❌        │
├────────────────────────────────────────────────────────────┼───────────────────┼───────────────┼──────────────┼───────────┤
│ Oh joy, another Monday morning meeting!                    │ Sarcastic (1)     │ Not Sarcastic │ 89.98%       │ ❌        │
├────────────────────────────────────────────────────────────┼───────────────────┼───────────────┼──────────────┼───────────┤
│ P

In [1]:
from transformers import pipeline

# Load the zero-shot classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Input examples
examples = [
    "I'm so glad my phone died right when I needed it the most.",
    "Oh great, another email from the boss at 2 AM.",
    "I really enjoyed the movie last night.",
    "Thank you so much for your generous fine.",
    "The weather is beautiful today."
]

# Define candidate labels
labels = ["sarcastic", "not sarcastic"]

# Run classification
for sentence in examples:
    result = classifier(sentence, labels)
    prediction = result["labels"][0]
    score = result["scores"][0]
    print(f"\n🔍 {sentence}")
    print(f" → Prediction: {prediction.capitalize()} ({score:.2%})")


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0



🔍 I'm so glad my phone died right when I needed it the most.
 → Prediction: Sarcastic (56.53%)

🔍 Oh great, another email from the boss at 2 AM.
 → Prediction: Not sarcastic (57.62%)

🔍 I really enjoyed the movie last night.
 → Prediction: Not sarcastic (69.73%)

🔍 Thank you so much for your generous fine.
 → Prediction: Not sarcastic (68.02%)

🔍 The weather is beautiful today.
 → Prediction: Not sarcastic (71.83%)


In [None]:
# ✅ Sarcasm Classifier Fine-Tuning with GPU using Hugging Face (Enhanced Version)

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline,
)
from datasets import load_dataset, ClassLabel, DatasetDict
import torch
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# ✅ 1. Load and Balance the Sarcasm Dataset
path = "C:/Users/RAGHU/datasets/Sarcasm_Headlines_Dataset_v2.json"
dataset = load_dataset("json", data_files=path, split="train")
dataset = dataset.shuffle(seed=42)

# Balance the dataset (equal sarcastic and non-sarcastic samples)
sarc = dataset.filter(lambda x: x["is_sarcastic"] == 1).select(range(7000))
non_sarc = dataset.filter(lambda x: x["is_sarcastic"] == 0).select(range(7000))
from datasets import concatenate_datasets
balanced_dataset = concatenate_datasets([sarc, non_sarc]).shuffle(seed=42)


# ✅ 2. Convert labels to ClassLabel
label_features = ClassLabel(names=["not_sarcastic", "sarcastic"])
balanced_dataset = balanced_dataset.cast_column("is_sarcastic", label_features)

# ✅ 3. Tokenization
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(example):
    tokens = tokenizer(example["headline"], truncation=True, padding="max_length", max_length=128)
    tokens["labels"] = example["is_sarcastic"]
    return tokens

tokenized_dataset = balanced_dataset.map(tokenize_function, batched=True)

# ✅ 4. Split the Dataset
split = tokenized_dataset.train_test_split(test_size=0.1)
train_ds = split["train"]
test_ds = split["test"]

# ✅ 5. Model Setup
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to("cuda")

# ✅ 6. Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ✅ 7. Metric Function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

# ✅ 8. Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"
)

# ✅ 9. Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ✅ 10. Train
trainer.train()

# ✅ 11. Save
model_path = "./fine-tuned-sarcasm-detector"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# ✅ 12. Inference Pipeline
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path, device=0 if torch.cuda.is_available() else -1)

# ✅ 13. Hardware Info
if torch.cuda.is_available():
    print("\n✅ Using GPU:", torch.cuda.get_device_name(0))
    print("   CUDA Version:", torch.version.cuda)
    print("   cuDNN Enabled:", torch.backends.cudnn.enabled)
else:
    print("\n⚠️ Running on CPU. Consider enabling a GPU for better performance.")

# ✅ 14. Label Mapping and Inference
label_map = {0: "Not Sarcastic", 1: "Sarcastic"}
examples = [
    "The meeting was incredibly productive. Everyone just yelled for two hours.",
    "I absolutely love it when my computer crashes.",
    "The weather is lovely today and I’m having a great time.",
    "Oh fantastic, another email from my boss at midnight!",
    "Thanks a lot for your help... not."
]

for sentence in examples:
    output = classifier(sentence)[0]
    label_id = int(output["label"].split("_")[-1])
    print(f"\n🔍 {sentence}\n → Prediction: {label_map[label_id]} ({output['score']:.2%})")

In [None]:
# 🚀 Fine-Tune a Sarcasm Detector with Hugging Face + GPU + Improvements

import warnings
warnings.filterwarnings("ignore")

# ✅ 1. Imports
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline
)
from sklearn.metrics import accuracy_score, f1_score
import torch

# ✅ 2. Load and Balance Dataset (local JSON file)
dataset = load_dataset("json", data_files="C:/Users/RAGHU/datasets/Sarcasm_Headlines_Dataset_v2.json", split="train")
dataset = dataset.shuffle(seed=42)

# ✅ 3. Preprocess (use only headline and is_sarcastic)
dataset = dataset.remove_columns(["article_link"])

# ✅ 4. Tokenizer Setup
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize(example):
    return tokenizer(example["headline"], truncation=True, padding="max_length", max_length=128)

tokenized = dataset.map(tokenize, batched=True)
tokenized = tokenized.rename_column("is_sarcastic", "labels")
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# ✅ 5. Train/Validation Split
split = tokenized.train_test_split(test_size=0.1)
train_ds = split["train"]
eval_ds = split["test"]

# ✅ 6. Model Initialization
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to("cuda")

# ✅ 7. Data Collator
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ✅ 8. Metric Computation
def compute_metrics(pred):
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(pred.label_ids, preds),
        "f1": f1_score(pred.label_ids, preds)
    }

# ✅ 9. Training Arguments
args = TrainingArguments(
    output_dir="./sarcasm_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"
)

# ✅ 10. Trainer Setup
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics
)

# ✅ 11. Train
trainer.train()

# ✅ 12. Save
trainer.save_model("./fine-tuned-sarcasm")
tokenizer.save_pretrained("./fine-tuned-sarcasm")

# ✅ 13. Inference - Human-Friendly Output

# Show device info
if torch.cuda.is_available():
    print("✅ Using GPU:", torch.cuda.get_device_name(0))
    print("   CUDA Version :", torch.version.cuda)
    print("   cuDNN Enabled:", torch.backends.cudnn.enabled)
else:
    print("⚠️ Running on CPU. Consider enabling GPU acceleration for faster training/inference.")

# Map model output labels to readable form
label_map = {"LABEL_0": "Not Sarcastic", "LABEL_1": "Sarcastic"}

# Examples to classify
examples = [
    "Oh great, another Monday!",
    "I just love getting stuck in traffic for hours.",
    "Thank you for the gift, it's exactly what I never wanted.",
    "The weather is lovely today and I’m having a great time."
]

print("\n🎯 Inference Results:")
for sentence in examples:
    prediction = classifier(sentence)[0]
    label = label_map.get(prediction["label"], prediction["label"])
    score = prediction["score"]
    print(f"🗣 \"{sentence}\"\n → Prediction: {label} ({score:.2%})\n")

In [None]:
# ✅ Sarcasm Classifier Fine-Tuning with GPU using Hugging Face

import warnings
warnings.filterwarnings("ignore")

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline,
)
from datasets import load_dataset, ClassLabel
import torch

# ✅ 1. Load the Sarcasm Dataset
path = "C:/Users/RAGHU/datasets/Sarcasm_Headlines_Dataset_v2.json"
dataset = load_dataset("json", data_files=path, split="train")
dataset = dataset.shuffle(seed=42)

# ✅ 2. Convert labels to ClassLabel if needed
label_features = ClassLabel(names=["not_sarcastic", "sarcastic"])
dataset = dataset.cast_column("is_sarcastic", label_features)

# ✅ 3. Tokenizer & Tokenization
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(example):
    tokens = tokenizer(example["headline"], truncation=True, padding="max_length", max_length=256)
    tokens["labels"] = example["is_sarcastic"]
    return tokens


tokenized_dataset = dataset.map(tokenize_function, batched=True)

# ✅ 4. Model Setup
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2).to("cuda")

# ✅ 5. Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ✅ 6. Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=20,
    save_total_limit=1,
    load_best_model_at_end=False,
    report_to="none"
)

# ✅ 7. Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset.select(range(8000)),  # use a subset for quicker training
#    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ✅ 8. Train the Model
trainer.train()

# ✅ 9. Save the Model
model_path = "./fine-tuned-sarcasm-detector"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# ✅ 10. Inference Pipeline
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path, device=0 if torch.cuda.is_available() else -1)

# ✅ Test Examples
#print("\n🟢 Honest:", classifier("The government passed a great policy today!"))
#print("\n🔴 Sarcasm:", classifier("Oh wow, what a brilliant idea! Let’s ignore science again."))

# Better device info
if torch.cuda.is_available():
    print("✅ Using GPU:", torch.cuda.get_device_name(0))
    print("   CUDA Version :", torch.version.cuda)
    print("   cuDNN Enabled:", torch.backends.cudnn.enabled)
else:
    print("⚠️ Running on CPU. Consider enabling a GPU for better performance.")

# Human-readable labels
label_map = {0: "Not Sarcastic", 1: "Sarcastic"}

examples = [
    "The government passed a great policy today!",
    "Oh wow, what a brilliant idea! Let’s ignore science again.",
    "Totally what I wanted to hear at 3 a.m.",
    "I'm just thrilled the meeting was extended by 2 hours."
]

for sentence in examples:
    output = classifier(sentence)[0]
    label_id = int(output["label"].split("_")[-1])
    print(f"\n🔍 {sentence}\n → Prediction: {label_map[label_id]} ({output['score']:.2%})")