# Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Fetch the NLTK resources the text-cleaning step relies on:
# the 'punkt' tokenizer models and the 'stopwords' corpora.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Preparing Dataset

In [3]:
# Read the hoax spreadsheet and expose its 'Clean Narasi' / 'hoax' columns
# under the shared 'Text' / 'Label' names used by the rest of the notebook.
df_hoax = (
    pd.read_excel("/content/turnbackhoax.xlsx")
    .rename(columns={'Clean Narasi': 'Text', 'hoax': 'Label'})
)
df_hoax.head()

Unnamed: 0.1,Unnamed: 0,Title,Timestamp,FullText,Tags,Author,Url,politik,Narasi,Text,Label
0,0,[SALAH] Anies Baswedan Dekat Dengan Aliran Krs...,"Maret 1, 2023",Hasil Periksa Fakta Gabriela Nauli Sinaga (Uni...,Fitnah;Hasut;Hoax,Pemeriksa Fakta Junior,https://turnbackhoax.id/2023/03/01/salah-anies...,1,\n“BISA DILIHAT SI ONTA YAMAN NGGAK PEDULI ITU...,BISA DILIHAT SI ONTA YAMAN NGGAK PEDULI ITU AP...,1
1,1,[SALAH] Hakim Wahyu Iman Santoso Alami Kecelak...,"Maret 1, 2023",Hasil Periksa Fakta Gabriela Nauli Sinaga (Uni...,Fitnah;Hasut;Hoax,Pemeriksa Fakta Junior,https://turnbackhoax.id/2023/03/01/salah-hakim...,0,\n“ini bener gasih?? Ya Allah gimna keadaan pa...,ini bener gasih?? Ya Allah gimna keadaan pa ha...,1
2,2,[SALAH] GAMBAR MEGAWATI DAN PUAN BERMAIN SLOT,"Februari 28, 2023",Hasil Periksa Fakta Gabriela Nauli Sinaga (Uni...,Fitnah;Hasut;Hoax,Pemeriksa Fakta Junior,https://turnbackhoax.id/2023/02/28/salah-gamba...,1,\n“Nenek lampir pemimpin partai banteng bercul...,Nenek lampir pemimpin partai banteng bercula s...,1
3,3,[SALAH] JONATHAN LATUMAHINA SEORANG NASRANI DA...,"Februari 28, 2023",Hasil Periksa Fakta Gabriela Nauli Sinaga (Uni...,Fitnah;Hasut;Hoax,Pemeriksa Fakta Junior,https://turnbackhoax.id/2023/02/28/salah-jonat...,0,\n“gerombolan kulup banyak menyusup ke ormas2 ...,gerombolan kulup banyak menyusup ke ormas2 isl...,1
4,4,[SALAH] PESAN WHATSAPP DARI BMKG YANG KABARKAN...,"Februari 28, 2023",Hasil Periksa Fakta Gabriela Nauli Sinaga (Uni...,Fitnah;Hasut;Hoax,Pemeriksa Fakta Junior,https://turnbackhoax.id/2023/02/28/salah-pesan...,1,,,1


In [4]:
# (rows, columns) of the hoax frame.
df_hoax.shape

(10381, 11)

In [5]:
# Load the three news-outlet spreadsheets in one pass; the order of the paths
# matches the order of the names on the left-hand side.
df_cnn, df_kompas, df_tempo = (
    pd.read_excel(xlsx_path)
    for xlsx_path in (
        "/content/cnn_10k.xlsx",
        "/content/kompas_4k.xlsx",
        "/content/tempo_6k.xlsx",
    )
)

In [6]:
# Stack the three outlet frames into one and rename 'text_new' / 'hoax' to the
# shared 'Text' / 'Label' column names.
df_non_hoax = (
    pd.concat([df_cnn, df_kompas, df_tempo], ignore_index=True)
    .rename(columns={'text_new': 'Text', 'hoax': 'Label'})
)
df_non_hoax.head()

Unnamed: 0.1,Unnamed: 0,Title,Timestamp,FullText,Tags,Author,Url,Text,Label
0,0,Anies di Milad BKMT: Pengajian Menghasilkan Ib...,"Selasa, 21 Feb 2023 21:22 WIB","Jakarta, CNN Indonesia -- Mantan Gubernur DKI ...",anies baswedan;pengajian;pilpres 2024;badan ko...,CNN Indonesia,https://www.cnnindonesia.com/nasional/20230221...,Anies di Milad BKMT: Pengajian Menghasilkan Ib...,0
1,1,Edy Soal Pilgub Sumut: Kalau yang Maju Abal-ab...,"Selasa, 21 Feb 2023 20:46 WIB","Medan, CNN Indonesia -- Gubernur Sumatera Utar...",edy rahmayadi;pemilu 2024;pilkada 2024,CNN Indonesia,https://www.cnnindonesia.com/nasional/20230221...,Edy Soal Pilgub Sumut: Kalau yang Maju Abal-ab...,0
2,2,PKB Bakal Daftarkan Menaker Ida Fauziyah Jadi ...,"Selasa, 21 Feb 2023 20:33 WIB","Jakarta, CNN Indonesia -- Partai Kebangkitan B...",ida fauziyah;pkb;pemilu 2024;pileg 2024,CNN Indonesia,https://www.cnnindonesia.com/nasional/20230221...,PKB Bakal Daftarkan Menaker Ida Fauziyah Jadi ...,0
3,3,Gede Pasek Doakan AHY Jadi Capres atau Cawapres,"Selasa, 21 Feb 2023 19:58 WIB","Jakarta, CNN Indonesia -- Ketua Umum Partai Ke...",gede pasek suardika;ahy;pilpres 2024;pemilu 20...,CNN Indonesia,https://www.cnnindonesia.com/nasional/20230221...,Gede Pasek Doakan AHY Jadi Capres atau Cawapre...,0
4,4,PKN Siapkan Jabatan Khusus Buat Anas Urbaningr...,"Selasa, 21 Feb 2023 18:56 WIB","Jakarta, CNN Indonesia -- Dewan Pimpinan Pusat...",anas urbaningrum;pkn;pemilu 2024,CNN Indonesia,https://www.cnnindonesia.com/nasional/20230221...,PKN Siapkan Jabatan Khusus Buat Anas Urbaningr...,0


In [7]:
# (rows, columns) of the combined non-hoax frame.
df_non_hoax.shape

(20972, 9)

In [8]:
# Keep only the two modelling columns from each source and stack them.
# NOTE(review): the df_hoax preview shows at least one row whose Text is empty;
# such rows are carried into df unchanged here - confirm that is intended.
df = pd.concat(
    [source[['Text', 'Label']] for source in (df_hoax, df_non_hoax)],
    ignore_index=True,
)
print(df['Label'].value_counts())

Label
0    20972
1    10381
Name: count, dtype: int64


In [9]:
# (rows, columns) of the merged Text/Label frame.
print(df.shape)

(31353, 2)


# Split Data

In [10]:
# Report the per-label row counts and the overall size before any resampling.
print("Original distribution:", df['Label'].value_counts(), sep="\n")
print("Total :", len(df))

Original distribution:
Label
0    20972
1    10381
Name: count, dtype: int64
Total : 31353


In [11]:
# Split the merged frame by label value.
df_true = df.loc[df['Label'].eq(0)]
df_fake = df.loc[df['Label'].eq(1)]

# Keep every Label 0 row but only a 10% random sample (fixed seed) of the
# Label 1 rows.
df_fake_sub = df_fake.sample(frac=0.10, random_state=42)

df_final = pd.concat([df_true, df_fake_sub], ignore_index=True)

print(df_final['Label'].value_counts(), f"Total: {len(df_final)}", sep="\n")

Label
0    20972
1     1038
Name: count, dtype: int64
Total: 22010


In [12]:
# Hold out 10% of df_final as the test split, stratified on Label so both
# splits keep the same label proportions; the seed makes the split repeatable.
df_train, df_test = train_test_split(
    df_final,
    stratify=df_final['Label'],
    test_size=0.1,
    random_state=42,
)

print(f"Train: {len(df_train)}")
print(f"Test : {len(df_test)}")

Train: 19809
Test : 2201


In [13]:
# Oversample the training split only (the test split is left untouched) so
# that both labels end up with the same number of rows.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(df_train[['Text']], df_train['Label'])

# Re-attach the resampled labels to the resampled text column.
df_train_balanced = X_res[['Text']].assign(Label=y_res)

print(
    "\nBalanced Train:",
    df_train_balanced['Label'].value_counts(),
    f"Total: {len(df_train_balanced)}",
    sep="\n",
)


Balanced Train:
Label
0    18875
1    18875
Name: count, dtype: int64
Total: 37750


In [14]:
# Size of the held-out test split and how many rows carry each label.
print(f"jumlah data test baru: {len(df_test)}")
print(f"Real: {df_test['Label'].eq(0).sum()}")
print(f"Fake: {df_test['Label'].eq(1).sum()}")

jumlah data test baru: 2201
Real: 2097
Fake: 104


In [15]:
# Persist the balanced training split and the untouched test split as CSV
# files (without the index column) in the current working directory.
for split_frame, csv_name in (
    (df_train_balanced, 'news_train_balanced.csv'),
    (df_test, 'news_test.csv'),
):
    split_frame.to_csv(csv_name, index=False)

# Text Cleaning

In [16]:
# Reload the splits written by the previous section. They were saved with
# relative paths, so read them back the same way; a hard-coded '/content/'
# prefix only resolves when the working directory happens to be /content.
# Note: from here on df_train refers to the *balanced* training data.
df_train = pd.read_csv('news_train_balanced.csv')
df_test = pd.read_csv('news_test.csv')

In [17]:
# Shapes of the reloaded train and test frames.
df_train.shape, df_test.shape

((37750, 2), (2201, 2))

In [18]:
# Patterns are compiled once here rather than re-parsed on every call.
_NOISE_TOKEN_RE = re.compile(r'@\w+|#\w+|http\S+')   # mentions, hashtags, URLs
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')          # anything but letters/whitespace

# Filled on first use so that defining this cell does not itself require the
# NLTK corpora to be present yet.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return ``(stop_words, stemmer)``, building them only on the first call."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('indonesian'))
        _CLEAN_TEXT_RESOURCES['stemmer'] = PorterStemmer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['stemmer']


def clean_text(text):
    """Normalise one document into a space-joined string of stemmed tokens.

    Steps: lowercase, strip @mentions / #hashtags / URLs, drop every character
    that is not an ASCII letter or whitespace, tokenize, remove Indonesian
    stop words, and stem the remaining tokens.

    Parameters
    ----------
    text : object
        The raw cell value. Missing values (``None``, ``NaN``, ``pd.NA``)
        yield ``""``; any other non-string value is converted with ``str()``
        instead of raising on ``.lower()``.

    Returns
    -------
    str
        The cleaned text; ``""`` when nothing survives the cleaning.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()
    text = _NOISE_TOKEN_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)

    # Nothing left to tokenize (e.g. the input was only digits/punctuation/URLs).
    if not text.strip():
        return ""

    words = word_tokenize(text)

    # Previously the stop-word list was reloaded and a new stemmer constructed
    # for every single row; both are invariant across calls, so reuse them.
    stop_words, stemmer = _get_clean_text_resources()

    # NOTE(review): PorterStemmer implements the English Porter algorithm while
    # the stop-word list is Indonesian; the cell outputs show truncated forms
    # such as 'kampany' and 'viru'. Kept as-is to preserve results - consider an
    # Indonesian stemmer if that is not intended.
    return ' '.join(stemmer.stem(w) for w in words if w not in stop_words)

In [19]:
# Re-importing is a no-op when the first cell has already run.
import nltk

# Quietly make sure every tokenizer / corpus resource used by clean_text is
# available before applying it.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Replace the raw Text column of both splits with its cleaned version.
for split_frame in (df_train, df_test):
    split_frame['Text'] = split_frame['Text'].apply(clean_text)

In [20]:
# Preview the first rows of both splits after cleaning, together with their shapes.
df_train.head(), df_test.head(), df_train.shape, df_test.shape

(                                                Text  Label
 0  politiku gerindra sentil wagub uu sepelekan bu...      0
 1  sahroni buru pembocor data urusan mural wakil ...      0
 2  sekjen golkar pertemuan puan airlangga diagend...      0
 3  partai buruh kampany langgar uu pemilu partai ...      0
 4  mahfud publik terprovokasi manuv politik wabup...      0,
                                                 Text  Label
 0  jokowi bambang susantono dhoni kombinasi uru i...      0
 1  pengamat rakerna nasdem munculkan ridwan kamil...      0
 2  selamat malam teman info viru corona masuk pek...      1
 3  pk pertemuan pkb gerindra biarkan meminang par...      0
 4  komisi ix dpr desak kemenk vaksin covid balita...      0,
 (37750, 2),
 (2201, 2))

In [21]:
# Persist the *cleaned* splits. The cleaned training data lives in df_train
# (reloaded from CSV and passed through clean_text); df_train_balanced still
# holds the raw, uncleaned text, so writing it here would produce a "clean"
# file that was never cleaned.
df_train.to_csv('news_train_balanced_clean.csv', index=False)
df_test.to_csv('news_test_clean.csv', index=False)