In [3]:
import os
import pandas as pd
import random

# Seed the global RNG so every run of this cell produces the same dataset.
random.seed(42)

# The four classes a generated post can belong to.
labels = ["real", "misleading", "out_of_context", "fake"]

# Sentence templates, keyed by class. Placeholders in braces are filled in by
# str.format below; a template may use any subset of the available fields.
templates = {
    "real": [
        "Breaking: {event} just occurred in {location}. Emergency crews on site.",
        "{location} faces severe {disaster}. Stay tuned to official channels.",
        "Confirmed by authorities: {event} in {location} caused {impact}."
    ],
    "misleading": [
        "Why is no one talking about {event} in {location}? Cover-up?",
        "Reports suggest {exaggerated_event} – but is it true?",
        "{location} {disaster}? Government silent. #suspicious"
    ],
    "out_of_context": [
        "Terrifying image of {disaster} in {location} – but from 2017!",
        "People think this video is from today, but it's old.",
        "Reusing images from {year} to scare people about {event}."
    ],
    "fake": [
        "Aliens caused the {disaster} in {location}. Wake up!",
        "Fake alert: Tsunami in {location} — no official sources confirm.",
        "Leaked audio: {disaster} was man-made."
    ]
}

# Pools for the randomised placeholders (loop-invariant, so built once).
LOCATIONS = ["Tokyo", "San Francisco", "Mumbai", "Jakarta", "Istanbul"]
DISASTERS = ["flood", "earthquake", "tsunami", "wildfire"]
YEARS = ["2010", "2015", "2018", "2008"]

samples = []
for i in range(100):
    # Draw order matters for reproducibility: label, template, then the
    # location / disaster / year fields -- all drawn on every iteration,
    # whether or not the chosen template uses them.
    label = random.choice(labels)
    template = random.choice(templates[label])
    fields = {
        "event": "earthquake",
        "location": random.choice(LOCATIONS),
        "disaster": random.choice(DISASTERS),
        "exaggerated_event": "magnitude 9.9 quake",
        "impact": "significant damage",
        "year": random.choice(YEARS),
    }
    text = template.format(**fields)
    samples.append({"id": i + 1, "text": text, "label": label})

# One row per generated post; ids run from 1 to 100.
df = pd.DataFrame(samples)

# Persist the dataset, creating the output directory on first run.
os.makedirs("data/data_processed", exist_ok=True)
df.to_csv("data/data_processed/train.csv", index=False)

# Preview the first rows.
df.head()


Unnamed: 0,id,text,label
0,1,Breaking: earthquake just occurred in Mumbai. ...,real
1,2,Tokyo flood? Government silent. #suspicious,misleading
2,3,Breaking: earthquake just occurred in Tokyo. E...,real
3,4,Confirmed by authorities: earthquake in San Fr...,real
4,5,Leaked audio: flood was man-made.,fake


In [1]:
import pandas as pd
import os

# Read back the dataset written to disk by the generation step.
df = pd.read_csv("data/data_processed/train.csv")

# Binary target encoding. Only the classes listed here are kept; the other
# classes ("misleading", "out_of_context") are intentionally discarded.
label_map = {"real": 0, "fake": 1}

# Step 1: keep only rows whose label has an integer encoding.
# Step 2: replace the string label with its integer code.
is_binary_label = df["label"].isin(list(label_map))
df = df.loc[is_binary_label].assign(label=lambda frame: frame["label"].map(label_map))

# Step 3: every surviving label must have been mapped (no NaN left behind).
assert not df["label"].isna().any(), "There are still NaNs in the label column"

# Save the cleaned, binary-labelled version alongside the original file.
df.to_csv("data/data_processed/train_clean.csv", index=False)

# Preview: distinct label codes, row count, and the first rows.
print(df["label"].unique())
print("Total samples:", df.shape[0])
df.head()


[0 1]
Total samples: 54


Unnamed: 0,id,text,label
0,1,Breaking: earthquake just occurred in Mumbai. ...,0
2,3,Breaking: earthquake just occurred in Tokyo. E...,0
3,4,Confirmed by authorities: earthquake in San Fr...,0
4,5,Leaked audio: flood was man-made.,1
5,6,Fake alert: Tsunami in Mumbai — no official so...,1
