# Import Libraries

In [None]:
import re
from functools import lru_cache

import nltk
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# data and the 'stopwords' corpus (both are used by clean_text).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Preparing Dataset

In [None]:
# Load the TurnBackHoax articles and rename the text/label columns to the
# 'Text' / 'Label' names used throughout the rest of the notebook.
df_hoax = (
    pd.read_csv("/content/Cleaned_TurnBackHoax_v3.csv")
    .rename(columns={'clean_text': 'Text', 'label': 'Label'})
)
df_hoax.head()

Unnamed: 0,url,judul,narasi,Label,Text
0,https://turnbackhoax.id/articles/30091,[SALAH] Menkeu Purbaya Curiga Permainan Bunga ...,FAKTOR PENCAIRAN TPG SERING TELAT Menkeu Purba...,1,faktor pencairan tpg sering telat menkeu purba...
1,https://turnbackhoax.id/articles/30090,[SALAH] Bahlil Sambut Kedatangan Ahli Gizi dar...,BREAKING NEWS Ahli gizi dari india resmi tiba ...,1,breaking news ahli gizi dari india resmi tiba ...
2,https://turnbackhoax.id/articles/30089,[SALAH] Video Cak Imin Membicarakan tentang Pe...,Program pemutihan tunggakan BPJS Kesehatan aka...,1,program pemutihan tunggakan bpjs kesehatan aka...
3,https://turnbackhoax.id/articles/30088,Cek Fakta: Tidak Benar Link Pendaftaran Progra...,Kabar baik untuk semua! tahun 2025 tunggakan B...,1,kabar baik untuk semua! tahun 2025 tunggakan b...
4,https://turnbackhoax.id/articles/30087,Viral Aturan Tilang 2026 Denda Manual Naik 150...,Isu semakin panas! Pak Kapolri yang satu ini k...,1,isu semakin panas! pak kapolri yang satu ini k...


In [None]:
# (rows, columns) of the hoax frame.
df_hoax.shape

(12744, 5)

In [None]:
# Load the three news-site exports; they are stacked into the non-hoax set
# in the next cell.
df_detik, df_antara, df_kompas = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/Cleaned_Detik_v2.csv",
        '/content/Cleaned_Antaranews_v1.csv',
        '/content/Cleaned_Kompas_v2.csv',
    )
)

In [None]:
# Stack the three news sources into a single frame (fresh 0..n-1 index) and
# rename the text/label columns to 'Text' / 'Label'.
df_non_hoax = (
    pd.concat([df_detik, df_kompas, df_antara], ignore_index=True)
    .rename(columns={'clean_text': 'Text', 'label': 'Label'})
)
df_non_hoax.head()

Unnamed: 0,url,judul,narasi,Label,Text
0,https://health.detik.com/fotohealth/d-8212146/...,Sekolah Lansia Ceria Bersama Hadirkan Kegiatan...,Aceh - Lansia di Aceh Besar mengikuti kegiatan...,0,lansia di aceh besar mengikuti kegiatan anyama...
1,https://health.detik.com/berita-detikhealth/d-...,Peneliti AS Semakin Dekat Wujudkan Transplanta...,Para dokter di NYU Langone Health melaporkan d...,0,para dokter di nyu langone health melaporkan d...
2,https://health.detik.com/kebugaran/d-8213096/l...,"Lari di Rute Hijau IPB Half Marathon 2025, Pul...",Ingin menjajal alternatif race lari dengan rut...,0,ingin menjajal alternatif race lari dengan rut...
3,https://health.detik.com/berita-detikhealth/d-...,Gen Z Makin Berisiko Kena Penyakit Gula gegara...,Diabetes melitus merupakan salah penyakit kron...,0,diabetes melitus merupakan salah penyakit kron...
4,https://health.detik.com/berita-detikhealth/d-...,Mengenal Kondisi Syok Kardiogenik Jantung dan ...,Tidak semua serangan jantung langsung berakhir...,0,tidak semua serangan jantung langsung berakhir...


In [None]:
# (rows, columns) of the combined non-hoax frame.
df_non_hoax.shape

(11200, 5)

In [None]:
# Keep only the text and label columns of each set and stack them into the
# full dataset, then show how many rows each label has.
df = pd.concat(
    [frame[['Text', 'Label']] for frame in (df_hoax, df_non_hoax)],
    ignore_index=True,
)
print(df['Label'].value_counts())

Label
1    12744
0    11200
Name: count, dtype: int64


In [None]:
# (rows, columns) of the combined dataset.
print(df.shape)

(23944, 2)


# Split Data

In [None]:
# Label counts and total row count before any resampling.
print("Original distribution:", df['Label'].value_counts(), sep="\n")
print("Total :", df.shape[0])

Original distribution:
Label
1    12744
0    11200
Name: count, dtype: int64
Total : 23944


In [None]:
# Separate the two classes (the notebook reports label 0 as "Real" and
# label 1 as "Fake").
df_fake = df.loc[df['Label'].eq(1)]
df_true = df.loc[df['Label'].eq(0)]

# Keep a reproducible 30% random sample of the label-0 rows.
# NOTE(review): this discards 70% of the real-news rows, and the training
# split is later re-balanced by duplicating label-0 rows with random
# oversampling -- confirm this undersample-then-oversample setup is intended.
df_true_sub = df_true.sample(random_state=42, frac=0.30)

# All label-1 rows followed by the sampled label-0 rows, with a fresh index.
df_final = pd.concat([df_fake, df_true_sub], ignore_index=True)

print(df_final['Label'].value_counts())
print("Total:", df_final.shape[0])

Label
1    12744
0     3360
Name: count, dtype: int64
Total: 16104


In [None]:
# Hold out 10% of the rows as the test set; stratifying on 'Label' keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
df_train, df_test = train_test_split(
    df_final, test_size=0.1, random_state=42, stratify=df_final['Label']
)

print("Train:", df_train.shape[0])
print("Test :", df_test.shape[0])

Train: 14493
Test : 1611


In [None]:
# Balance the training split only (the test split is left untouched) by
# randomly resampling with a fixed seed.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(df_train[['Text']], df_train['Label'])

# Re-attach the resampled labels to the resampled text column.
df_train_balanced = X_res[['Text']].assign(Label=y_res)

print("\nBalanced Train:")
print(df_train_balanced['Label'].value_counts())
print("Total:", df_train_balanced.shape[0])


Balanced Train:
Label
1    11469
0    11469
Name: count, dtype: int64
Total: 22938


In [None]:
# Size of the held-out test set and its per-class row counts
# (label 0 is reported as "Real", label 1 as "Fake").
print("jumlah data test baru:", df_test.shape[0])
print("Real:", df_test['Label'].eq(0).sum())
print("Fake:", df_test['Label'].eq(1).sum())

jumlah data test baru: 1611
Real: 336
Fake: 1275


In [None]:
# Persist the balanced training split and the test split as CSV files in the
# current working directory, without writing the index column.
df_train_balanced.to_csv('hoax_train_balanced.csv', index=False)
df_test.to_csv('hoax_test.csv', index=False)

# Text Cleaning

In [None]:
# Reload the splits saved at the end of the "Split Data" section. They were
# written with relative file names, so read them back the same way instead of
# assuming the working directory is /content.
# From here on, df_train refers to the *balanced* training split.
df_train = pd.read_csv('hoax_train_balanced.csv')
df_test = pd.read_csv('hoax_test.csv')

In [None]:
# Shapes of the reloaded train and test frames.
df_train.shape, df_test.shape

((22938, 2), (1611, 2))

In [None]:
@lru_cache(maxsize=1)
def _cleaning_resources():
    """Build the stemmer and the Indonesian stop-word set once and reuse them."""
    # Loading the stop-word list reads the NLTK corpus from disk, so doing it
    # on every clean_text call (once per row) is needlessly slow.
    return PorterStemmer(), frozenset(stopwords.words('indonesian'))


def clean_text(text):
    """Normalise one document into a space-separated string of stemmed tokens.

    Steps: lowercase, drop @mentions / #hashtags / URLs, replace every
    character that is not an ASCII letter or whitespace with a space,
    tokenize, remove Indonesian stop words, and stem the remaining tokens.

    Args:
        text: The raw document. Missing values (None / NaN) are accepted;
            other non-string scalars are converted with ``str()``.

    Returns:
        The cleaned text, or an empty string for missing input.
    """
    if pd.isna(text):
        return ""

    # read_csv can yield non-string cells (e.g. purely numeric text), which
    # have no .lower(); coerce instead of crashing.
    text = str(text).lower()

    # Substitute a space rather than nothing so that words separated only by
    # punctuation, digits, a mention or a URL are not glued into one token
    # (e.g. "itu...sungguh" used to become "itusungguh").
    text = re.sub(r'@\w+|#\w+|http\S+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    stemmer, stop_words = _cleaning_resources()

    # NOTE(review): PorterStemmer implements the Porter algorithm, which was
    # designed for English suffixes; the corpus here is Indonesian, so an
    # Indonesian stemmer may be more appropriate -- confirm.
    return ' '.join(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

In [None]:
# NOTE(review): nltk is already imported in the imports cell at the top of
# the notebook; this repeated import is redundant.
import nltk
# Make sure the tokenizer / stop-word resources are present before cleaning
# (quiet=True suppresses the download log).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Clean the 'Text' column of both splits, overwriting the raw text in place.
# NOTE(review): because the column is overwritten, re-running this cell
# applies clean_text to already-cleaned text.
df_train['Text'] = df_train['Text'].apply(clean_text)
df_test['Text'] = df_test['Text'].apply(clean_text)

In [None]:
# Preview the first rows of both cleaned splits together with their shapes.
df_train.head(), df_test.head(), df_train.shape, df_test.shape

(                                                Text  Label
 0  rakyat ga hutang listrik pln rugi rakyat beli ...      1
 1  media sosial beredar unggahan video diklaim pu...      1
 2                      terkuak mantan napi ex juddol      1
 3                             gerik gampang dipantau      1
 4      kawan penyebab penyakit itusungguh terweluuuu      1,
                                                 Text  Label
 0  polisi mengungkap momen mencekam aditya hanafi...      0
 1  kontek taiwan dilanda gempa bumi magnitudo rab...      1
 2  salah tangkap nih egi palsu resmi dilepaskan s...      1
 3  gojek tokopedia menyepakati peraturan pemerint...      1
 4  ani pinang orma pemuda pancasila ketua sosok a...      1,
 (22938, 2),
 (1611, 2))

In [None]:
# Save the *cleaned* splits. The cleaned training data lives in df_train
# (the balanced split reloaded from CSV and passed through clean_text);
# df_train_balanced still holds the uncleaned text, so writing it here would
# produce a "_clean" file that was never cleaned.
df_train.to_csv('hoax_train_balanced_clean.csv', index=False)
df_test.to_csv('hoax_test_clean.csv', index=False)