In [1]:
import sys

# Make the project root importable so that the `src` package resolves from here.
sys.path.append("..")

from src.data import build_datasets, load_data

filename = "../data/training_data.csv"
df = load_data(filename)

# Inputs come from the `text` column, targets from the `labels` column.
X, y = df["text"], df["labels"]

X_train, X_test, y_train, y_test = build_datasets(X, y)

  from scipy.sparse import csr_matrix, issparse


In [2]:
# Preview the first rows of the training texts (note the non-sequential index).
X_train.head()

4621     campaign in damage control mode after trump jr...
23049    virginia judge issues new injunction against t...
19507    trump to nominate richard grenell to be ambass...
16067    crony corrupt politics: obama admin blocked fb...
33249    "saudi prince\trelieved from national guard\to...
Name: text, dtype: object

In [3]:
# Preview the matching training labels; the index lines up with X_train.
y_train.head()

4621     0
23049    1
19507    1
16067    0
33249    1
Name: labels, dtype: int64

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Use the GPU when torch can see one; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One checkpoint name drives both the classification model and its tokenizer.
model_name = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Kept as the cell's last expression, so the moved model is also displayed.
model.to(device)


In [None]:

# Pick the first training example by POSITION. X_train / y_train keep the
# shuffled row labels of the source frame (4621, 23049, ...), so `X_train[0]`
# is a label lookup: it raises KeyError or returns some other row, never
# reliably "the first one".
sample_text = X_train.iloc[0]
sample_answer = y_train.iloc[0]

classes = ["fake news", "real news"]
hypothesis_template = "This news is {}."

# Position of the "entailment" logit, read from the model config when it is
# declared there; 0 (the index the original code assumed) is the fallback.
entailment_idx = (getattr(model.config, "label2id", None) or {}).get("entailment", 0)

# One raw entailment logit per candidate label, in the order of `classes`.
results = []

for label in classes:
    hypothesis = hypothesis_template.format(label)
    # Named `inputs` so the builtin `input()` is not shadowed for the rest of
    # the kernel session.
    inputs = tokenizer(sample_text, hypothesis, truncation=True, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model(**inputs)

    results.append(output["logits"][0][entailment_idx].item())

# Softmax once, over the entailment logits of the candidate labels. Applying
# softmax to already-normalised probabilities (as before) flattens the scores.
final_probs = torch.softmax(torch.tensor(results), dim=-1).tolist()
predictions = dict(zip(classes, final_probs))
print(predictions)



In [None]:
# Name -> integer id lookup for the two classes.
tags = dict(fake_news=0, real_news=1)

# Print the example text, then show its label as the cell output.
print(sample_text)
sample_answer

In [None]:
# Look at the first parameter tensor to see where the model weights live.
first_param = next(model.parameters())

# Device of that tensor: 'cuda:0' on GPU, 'cpu' on CPU
print(first_param.device)

# Same information as a boolean
print(first_param.is_cuda)


## DistilBERT (Fine-Tuning)

### Prepare the Dataset

In [4]:
import pandas as pd
from datasets import Dataset

# Assemble the label/text columns in one step, then swap the shuffled row
# labels inherited from the split for a fresh 0..n-1 index.
train_df = pd.DataFrame({"label": y_train, "text": X_train}).reset_index(drop=True)
train_df

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,label,text
0,0,campaign in damage control mode after trump jr...
1,1,virginia judge issues new injunction against t...
2,1,trump to nominate richard grenell to be ambass...
3,0,crony corrupt politics: obama admin blocked fb...
4,1,"""saudi prince\trelieved from national guard\to..."
...,...,...
27316,0,4 reasons why this little-known trump aide is ...
27317,1,"""speaker ryan dented by healthcare debacle\tbu..."
27318,1,philippine military kills escaping islamist mi...
27319,1,egyptian lawmaker to propose anti-gay bill as ...


In [5]:
# Hold out 20% of the training frame with a fixed seed. The resulting "test"
# split is the one passed to the Trainer as its evaluation set.
dataset = Dataset.from_pandas(train_df).train_test_split(test_size=0.2, seed=42)

In [6]:
# Display the DatasetDict summary: split names, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 21856
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 5465
    })
})

### Tokenize

In [7]:
import torch
from transformers import AutoTokenizer, DataCollatorWithPadding

# NOTE: this rebinds `model_name` and `tokenizer`, which the zero-shot cells
# earlier in the notebook also use; re-running those cells after this one
# would pick up the DistilBERT tokenizer instead.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Inputs longer than 512 tokens are truncated. Padding is not requested
    here.

    Args:
        examples: Mapping with a ``"text"`` entry; in this notebook it is the
            batch handed over by ``Dataset.map(..., batched=True)``.

    Returns:
        The output of the module-level ``tokenizer`` for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512)

# Collator built from the same tokenizer that `preprocess` uses.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split of the DatasetDict, one batch of rows at a time.
tokenized = dataset.map(preprocess, batched=True)

Map: 100%|██████████| 21856/21856 [00:00<00:00, 65277.23 examples/s]
Map: 100%|██████████| 5465/5465 [00:00<00:00, 63970.04 examples/s]


In [8]:

# Environment sanity check: print the installed transformers version first,
# so it is visible even if the import below fails.
import transformers
print(transformers.__version__)

# Confirm that the DistilBERT sequence-classification class can be imported.
from transformers import DistilBertForSequenceClassification
print("✅ Import OK")

5.2.0
✅ Import OK


### Model

In [9]:
from transformers import AutoModelForSequenceClassification

# Class id -> readable name.
label_mapping = {0: "fake", 1: "real"}
# Reverse lookup (name -> class id), derived from the mapping above.
classes = {name: idx for idx, name in label_mapping.items()}

# Load the checkpoint named by `model_name` with a classification head sized
# to the label mapping.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping),
    id2label=label_mapping,
    label2id=classes,
)


Loading weights: 100%|██████████| 100/100 [00:00<00:00, 789.52it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
classifier.bias         | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


### Config metrics and training args

In [14]:
import numpy as np
import evaluate
from transformers import TrainingArguments

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the accuracy metric.

    Args:
        eval_pred: Pair unpacked as ``(predictions, labels)``. The predictions
            are reduced with an argmax along axis 1, so they are presumably
            per-class scores of shape (n_samples, n_classes) - TODO confirm.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against ``labels``.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=labels)

# Enable bf16 mixed precision only when a CUDA device that supports it is
# present. The notebook otherwise allows a CPU fallback, and a hard-coded
# `bf16=True` makes this cell fail on hardware without bf16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="distilbert-fake-news",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    bf16=use_bf16,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for `load_best_model_at_end` to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


### Training

In [15]:
from transformers import Trainer

# Wire the model, data and metric together. Training is started from its own
# cell, so this object can be rebuilt without launching a run.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],  # held-out part of the training frame
    compute_metrics=compute_metrics,
)

In [17]:
# Run fine-tuning with the settings held in `training_args`.
trainer.train()

Epoch,Training Loss,Validation Loss


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.75it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.76it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.60it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].


TrainOutput(global_step=1026, training_loss=0.02465871301775555, metrics={'train_runtime': 29.8366, 'train_samples_per_second': 2197.568, 'train_steps_per_second': 34.387, 'total_flos': 633061698177024.0, 'train_loss': 0.02465871301775555, 'epoch': 3.0})

In [18]:
# Persist the trainer's current model to the directory used as output_dir.
trainer.save_model(output_dir="distilbert-fake-news")

Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.80it/s]


### Test eval

In [19]:
# Same layout as the training frame: label/text columns on a fresh 0..n-1 index.
test_df = pd.DataFrame({"label": y_test, "text": X_test}).reset_index(drop=True)
test_df

Unnamed: 0,label,text
0,0,belgium‚s political leader micha√´l modrikamen...
1,1,pence voices u.s. concern to turkish prime min...
2,0,‚racist‚ and ‚reprehensible‚: how joe biden ju...
3,0,democratic congressman leaves gov. snyder sput...
4,0,kellyanne conway threatens media: it‚s ‚inappr...
...,...,...
6826,0,another clinton casualty? sister of woman who ...
6827,1,world's stateless deserve nationality: unhcr
6828,1,australia arrests man accused of trying to sel...
6829,1,house democrats escalate effort to obtain trum...


In [23]:
# Convert the held-out frame and tokenize it with the same `preprocess`
# function used for the training data.
test_dataset = Dataset.from_pandas(test_df).map(preprocess, batched=True)


Map: 100%|██████████| 6831/6831 [00:00<00:00, 80718.55 examples/s]


In [24]:
# Score the trainer's model on the held-out test set and print the metric dict.
results = trainer.evaluate(test_dataset)
print(results)

{'eval_loss': 0.07061298936605453, 'eval_accuracy': 0.9803835456009369, 'eval_runtime': 0.7439, 'eval_samples_per_second': 9182.968, 'eval_steps_per_second': 143.841, 'epoch': 3.0}


### Fine-Tuned results

In [25]:
# Show the metric dict returned by `trainer.evaluate` on the test set.
results

{'eval_loss': 0.07061298936605453,
 'eval_accuracy': 0.9803835456009369,
 'eval_runtime': 0.7439,
 'eval_samples_per_second': 9182.968,
 'eval_steps_per_second': 143.841,
 'epoch': 3.0}