In [None]:
!pip install transformers pandas torch

import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load GPT-2
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

headlines = []
prompt = "Breaking News:"

for _ in range(50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    output = model.generate(
        input_ids,
        max_length=50,
        num_return_sequences=1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    generated = tokenizer.decode(output[0], skip_special_tokens=True)
    headlines.append(generated)

df = pd.DataFrame(headlines, columns=["headline"])
df.to_csv("fake_headlines.csv", index=False)
print("✅ Saved 50 fake headlines")


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


✅ Saved 50 fake headlines


In [1]:
import pandas as pd

# Load fake headlines
fake_df = pd.read_csv("fake_headlines.csv")
fake_df["label"] = 0  # 0 = fake

# Create sample real headlines manually (or later from dataset)
real_headlines = [
    "India wins gold at Commonwealth Games",
    "NASA announces new moon mission",
    "Government launches startup funding scheme",
    "Apple releases latest iPhone model",
    "AIIMS reports progress in cancer treatment",
    "Weather department issues rain alert in Delhi",
    "BCCI confirms World Cup team lineup",
    "ISRO successfully tests rocket engine",
    "CBSE announces board exam schedule",
    "UN holds climate summit in Geneva"
] * 5  # repeat to make 50

real_df = pd.DataFrame(real_headlines, columns=["headline"])
real_df["label"] = 1  # 1 = real

# Combine both
df = pd.concat([fake_df, real_df], ignore_index=True)

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# Save to file
df.to_csv("news_dataset.csv", index=False)
print("✅ Combined dataset saved to news_dataset.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'fake_headlines.csv'

In [None]:
# Fake News Detection Using DistilBERT (lighter and optimized for local machines)

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch



# Load only the train split to avoid ambiguity
dataset = load_dataset("imdb", split="train[:2000]")  # loads first 2000 samples



# 2. Preprocess text
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_fn(example):
    return tokenizer(example["text"], truncation=True, padding=True)

tokenized_dataset = dataset.map(tokenize_fn, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# 3. Load model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# 4. Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

# 5. Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(1600)),
    eval_dataset=tokenized_dataset["train"].select(range(1600, 2000)),
)

# 6. Train
trainer.train()

# 7. Save model
model.save_pretrained("./distilbert-fake-news-detector")
tokenizer.save_pretrained("./distilbert-fake-news-detector")

print("✅ Training completed and model saved!")



In [None]:
!pip uninstall transformers -y
!pip install transformers==4.53.0



In [None]:
# Fake News Detection Using DistilBERT (lighter and optimized for local machines)

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# 1. Load dataset (you can replace with your own CSV)
dataset = load_dataset("imdb")  # We'll treat sentiment like fake (neg) vs real (pos)
dataset = dataset.shuffle(seed=42).select(range(2000))  # limit for memory

# 2. Preprocess text
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_fn(example):
    return tokenizer(example["text"], truncation=True, padding=True)

tokenized_dataset = dataset.map(tokenize_fn, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# 3. Load model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# 4. Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

# 5. Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(1600)),
    eval_dataset=tokenized_dataset["train"].select(range(1600, 2000)),
)

# 6. Train
trainer.train()

# 7. Save model
model.save_pretrained("./distilbert-fake-news-detector")
tokenizer.save_pretrained("./distilbert-fake-news-detector")

print("✅ Training completed and model saved!")

