In [None]:
# ✅ Install dependencies from requirements.txt
!pip install -r requirements.txt

In [None]:
# 🔐 Set up Hugging Face credentials
from huggingface_hub import login
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment,
# then authenticate with the Hugging Face Hub using HF_TOKEN. Git credentials are
# left untouched.
load_dotenv()
hf_token = os.getenv("HF_TOKEN")  # None when the variable is not set
login(token=hf_token, add_to_git_credential=False)


In [None]:
from datasets import load_dataset

# Read the labelled examples from the local CSV file. Later cells read its "input"
# and "label" columns and index the loaded object with ["train"].
dataset = load_dataset("csv", data_files="Both.csv")

# Class names listed in id order: the position of each name is its integer class id.
label_to_id = {
    name: class_id
    for class_id, name in enumerate(
        (
            "Invalid",
            "Not a Scientific Claim",
            "Grey Area Claim",
            "Scientific Claim",
        )
    )
}


def encode_labels(example):
    """Replace the string stored under ``example["label"]`` with its integer class id.

    The mapping passed in is modified in place and the same object is returned.

    Raises:
        KeyError: if the current label is not one of the keys of ``label_to_id``.
    """
    class_id = label_to_id[example["label"]]
    example["label"] = class_id
    return example

# Swap every string label for its integer id. `dataset` is rebound to the encoded
# result, so re-run the whole cell (which reloads the CSV) rather than this line
# alone: already-encoded ids are not keys of `label_to_id` and would raise KeyError.
dataset = dataset.map(encode_labels)

from transformers import DistilBertTokenizerFast

# NOTE: tokenization is intentionally not done in this cell. The tokenization cell
# that follows builds `tokenizer`, defines `tokenize_function` and creates
# `tokenized_datasets`. Repeating those steps here tokenized the whole dataset twice,
# and the first result was overwritten before it was ever used.

In [None]:
# 🔤 Tokenize with DistilBERT tokenizer
from transformers import DistilBertTokenizerFast

# Fast DistilBERT tokenizer loaded from the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize_function(example):
    """Tokenize the text stored under ``example["input"]``.

    Sequences are truncated and padded with ``padding="max_length"``. No explicit
    ``max_length`` is passed, so the limit is whatever the tokenizer is configured
    with (presumably the model maximum for this checkpoint; confirm).

    Returns whatever ``tokenizer(...)`` returns for the given text.
    """
    texts = example["input"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize every example, then hold out 20% of the rows for evaluation.
# The fixed seed makes the split repeatable: the same rows land in the test set on
# every run, so reported metrics are comparable between runs and a model reloaded
# later is never evaluated on rows it was trained on.
tokenized_full = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_full["train"].train_test_split(test_size=0.2, seed=42)

# Return torch tensors and expose only the columns the model consumes.
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Load DistilBERT with one output logit per class in `label_to_id`.
# The label map defines four classes (ids 0-3); a 3-way head makes the loss fail on
# every example whose target id is 3. Deriving the size from the map keeps the two
# in sync, and the id<->name maps are stored in the model config so saved
# checkpoints report class names instead of LABEL_0, LABEL_1, ...
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_to_id),
    id2label={class_id: name for name, class_id in label_to_id.items()},
    label2id=label_to_id,
)

# Training hyperparameters; every option not listed keeps its TrainingArguments default.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
)

# Wire the model, the hyperparameters and the train/test splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_datasets["test"],
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Fine-tune the classifier on the training split.
trainer.train()

# Write the fine-tuned model and the tokenizer into the same directory.
save_dir = "models/fine-tuned-claims-distilbert"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Run the trained model over the held-out split.
predictions = trainer.predict(tokenized_datasets["test"])

# Ground-truth ids as returned by the Trainer, and the index of the highest score
# along axis 1 for each row.
y_true = predictions.label_ids
logits = predictions.predictions  # presumably (n_examples, n_classes); confirm
y_pred = np.argmax(logits, axis=1)

In [None]:
# Fix the class order explicitly. Without `labels=`, a class that appears in neither
# y_true nor y_pred is dropped from the matrix, and the default 0..n-1 tick labels
# would then point at the wrong classes.
class_names = sorted(label_to_id, key=label_to_id.get)
class_ids = [label_to_id[name] for name in class_names]

cm = confusion_matrix(y_true, y_pred, labels=class_ids)

# Show class names on both axes; the x ticks are rotated because the names are long.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(xticks_rotation=45)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Overall accuracy plus support-weighted precision / recall / F1 on the test split.
acc = accuracy_score(y_true, y_pred)
weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
precision, recall, f1 = weighted_scores[:3]  # the fourth entry is not reported

print(
    f"Accuracy:  {acc:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1 Score:  {f1:.4f}",
    sep="\n",
)