## Load & gabungkan dataset

In [1]:
import pandas as pd

# Read the two raw CSV files: one holds fake articles, the other real ones.
df_fake = pd.read_csv("../data/raw/Fake.csv")
df_true = pd.read_csv("../data/raw/True.csv")

# Attach the binary target to each source frame: 1 = fake news, 0 = real news.
df_fake["label"] = 1
df_true["label"] = 0

# Stack both sources, then shuffle every row with a fixed seed so the two
# classes are interleaved reproducibly; the index is rebuilt as 0..n-1.
df = (
    pd.concat([df_fake, df_true], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0


## EDA singkat

In [2]:
# Overall dimensions and the column names of the combined frame.
print("Dataset shape:", df.shape)
print("\nColumns:", list(df.columns))

# Class balance: number of rows per label value.
print("\nLabel distribution:", df["label"].value_counts(), sep="\n")

# Per-column count of missing cells as reported by isna().
print("\nMissing values:", df.isna().sum(), sep="\n")

Dataset shape: (44898, 5)

Columns: ['title', 'text', 'subject', 'date', 'label']

Label distribution:
label
1    23481
0    21417
Name: count, dtype: int64

Missing values:
title      0
text       0
subject    0
date       0
label      0
dtype: int64


## Gabungkan title + text

In [3]:
# Join headline and body into one field separated by a single space.
# Missing values are replaced by "" first, and the final strip removes the
# leading/trailing whitespace left over when either side is empty.
df["text_full"] = (
    df["title"].fillna("")
    .str.cat(df["text"].fillna(""), sep=" ")
    .str.strip()
)
df.loc[:, ["text_full", "label"]].head()

Unnamed: 0,text_full,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,1
1,Trump drops Steve Bannon from National Securit...,0
2,Puerto Rico expects U.S. to lift Jones Act shi...,0
3,OOPS: Trump Just Accidentally Confirmed He Lea...,1
4,Donald Trump heads for Scotland to reopen a go...,0


## Text cleaning

In [4]:
import re

def clean_text(text):
    """Normalize a raw string into lowercase, space-separated a-z tokens.

    The input is lowercased, then three substitutions run in order, each
    replacing its matches with a single space:

    1. URL-like tokens (anything starting at ``http`` or ``www`` up to the
       next whitespace),
    2. every character that is neither a lowercase ASCII letter nor
       whitespace (digits, punctuation, accented letters, ...),
    3. runs of whitespace.

    Leading and trailing whitespace is stripped from the result.

    Args:
        text: The string to clean. Must be a ``str``; other types raise
            ``AttributeError`` on the ``.lower()`` call.

    Returns:
        The cleaned string, possibly empty.
    """
    cleaned = text.lower()
    # Order matters: URLs must go before punctuation is removed, and the
    # whitespace collapse must run last to tidy up the inserted spaces.
    for pattern in (r"http\S+|www\S+", r"[^a-z\s]", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

# Run the cleaner element-wise over the combined title+body column.
df["text_clean"] = df["text_full"].map(clean_text)
df.loc[:, ["text_clean", "label"]].head()


Unnamed: 0,text_clean,label
0,ben stein calls out th circuit court committed...,1
1,trump drops steve bannon from national securit...,0
2,puerto rico expects u s to lift jones act ship...,0
3,oops trump just accidentally confirmed he leak...,1
4,donald trump heads for scotland to reopen a go...,0


## Split data (train / val / test)

In [5]:
from sklearn.model_selection import train_test_split

# Features are the cleaned text; the target is the binary label.
X, y = df["text_clean"], df["label"]

# Stage 1: hold out 20% of the rows, keeping label proportions (stratified).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stage 2: cut the held-out part in half, giving an overall 80/10/10
# train/validation/test partition, again stratified by label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Report the number of rows in each partition, one per line.
print(
    *(
        f"{tag} {len(part)}"
        for tag, part in (
            ("Train:", X_train),
            ("Validation:", X_val),
            ("Test:", X_test),
        )
    ),
    sep="\n",
)

Train: 35918
Validation: 4490
Test: 4490


## Simpan dataset hasil preprocessing

In [6]:
import os

# Make sure the output folder exists; no error if it is already there.
os.makedirs("../data/processed", exist_ok=True)

# Each split is written as a two-column CSV ("text", "label") without the index.
# NOTE(review): a cleaned text that is an empty string is written as an empty
# field, which pd.read_csv parses as NaN by default — confirm that downstream
# loaders handle this.
pd.concat([X_train.rename("text"), y_train.rename("label")], axis=1).to_csv(
    "../data/processed/train.csv", index=False
)
pd.concat([X_val.rename("text"), y_val.rename("label")], axis=1).to_csv(
    "../data/processed/val.csv", index=False
)
pd.concat([X_test.rename("text"), y_test.rename("label")], axis=1).to_csv(
    "../data/processed/test.csv", index=False
)

print("Processed data saved successfully.")

Processed data saved successfully.
