In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Fetch every NLTK resource the notebook relies on in one place: the tokenizer
# data used by word_tokenize and the stop-word corpus. 'punkt_tab' is requested
# alongside 'punkt' because recent NLTK releases load the tokenizer from it
# (the notebook otherwise has to download it ad hoc right before cleaning).
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Split Data

In [3]:
# Read the source CSV, keep only the text and label columns, and expose the
# text column under the shorter name 'Text'.
df = (
    pd.read_csv('/content/train_data.csv')
    [['preprocessed_text', 'Label']]
    .rename(columns={'preprocessed_text': 'Text'})
)
display(df.head())

Unnamed: 0,Text,Label
0,skema gw denger langsung petinggi pengusung ga...,0
1,user mah kegoblokan unfaedah,1
2,hasibuan buzzerp kumpul prabowo perintah klo p...,1
3,awowkwkw bangun kau ni mimpi mulu,0
4,partai udah kebebasan menentukan cawapr nya tp...,0


In [4]:
# Per-label row counts of the loaded frame, then the overall number of rows.
print("Original distribution:", df['Label'].value_counts(), sep="\n")
print("Total :", df.shape[0])

Original distribution:
Label
1    3514
0    3486
Name: count, dtype: int64
Total : 7000


In [5]:
# Build a deliberately imbalanced dataset: every Label == 0 row is kept,
# while only a seeded 10% sample of the Label == 1 rows is retained.
df_true = df.loc[df['Label'].eq(0)]
df_fake = df.loc[df['Label'].eq(1)]

df_fake_sub = df_fake.sample(random_state=42, frac=0.10)

# Stack both parts and renumber the index from zero.
df_final = pd.concat([df_true, df_fake_sub], ignore_index=True)

print(df_final['Label'].value_counts())
print("Total:", df_final.shape[0])

Label
0    3486
1     351
Name: count, dtype: int64
Total: 3837


In [6]:
# Hold out 10% of the rows as a test set; stratifying on 'Label' keeps the
# label proportions the same in both parts, and the seed fixes the split.
df_train, df_test = train_test_split(
    df_final,
    stratify=df_final['Label'],
    test_size=0.1,
    random_state=42,
)

print("Train:", df_train.shape[0])
print("Test :", df_test.shape[0])

Train: 3453
Test : 384


In [7]:
# Balance the training split only (the test split is left untouched) by
# randomly oversampling with a fixed seed.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(
    df_train.loc[:, ['Text']],   # features must be 2-D, hence the list selector
    df_train.loc[:, 'Label'],
)

# Recombine the resampled text and labels into a single frame.
df_train_balanced = pd.DataFrame({'Text': X_res['Text'], 'Label': y_res})

print("\nBalanced Train:", df_train_balanced['Label'].value_counts(), sep="\n")
print("Total:", df_train_balanced.shape[0])


Balanced Train:
Label
0    3137
1    3137
Name: count, dtype: int64
Total: 6274


In [8]:
# Size of the held-out test split and how many rows carry each label value.
print("jumlah data test baru:", df_test.shape[0])
print("Real:", df_test['Label'].eq(0).sum())
print("Fake:", df_test['Label'].eq(1).sum())

jumlah data test baru: 384
Real: 349
Fake: 35


In [9]:
# Write the balanced training frame and the test frame to CSV, without the
# index column.
for frame, filename in (
    (df_train_balanced, 'pilpres_train_balanced.csv'),
    (df_test, 'pilpres_test.csv'),
):
    frame.to_csv(filename, index=False)

# Text Cleaning

In [10]:
# Reload both splits from the CSV files written by the export cell. The paths
# are relative, exactly as in the to_csv calls, so the round trip does not
# depend on the working directory being /content.
# Note: from here on df_train refers to the *balanced* training data.
df_train = pd.read_csv('pilpres_train_balanced.csv')
df_test = pd.read_csv('pilpres_test.csv')

In [11]:
# (rows, columns) of the reloaded train and test frames, in that order.
tuple(frame.shape for frame in (df_train, df_test))

((6274, 2), (384, 2))

In [12]:
# Stemmer and stop-word set shared by all clean_text calls; filled on first use
# so that defining the function does not require the NLTK corpora to be present.
_CLEAN_TEXT_RESOURCES = {}


def _cleaning_resources():
    """Return ``(stemmer, stop_words)``, constructing both on the first call only."""
    if not _CLEAN_TEXT_RESOURCES:
        # Both values are built before the dict is touched, so a failed corpus
        # lookup never leaves a half-initialised cache behind.
        _CLEAN_TEXT_RESOURCES.update(
            stemmer=PorterStemmer(),
            stop_words=frozenset(stopwords.words('indonesian')),
        )
    return _CLEAN_TEXT_RESOURCES['stemmer'], _CLEAN_TEXT_RESOURCES['stop_words']


def clean_text(text):
    """Normalise one raw text value into a space-joined string of stemmed tokens.

    Steps, in order: lowercase; drop @mentions, #hashtags and anything starting
    with ``http``; drop every character that is not an ASCII letter or
    whitespace; tokenize; discard Indonesian stop words; stem what is left.

    Args:
        text: The cell value to clean. Missing values (``None``/NaN) yield an
            empty string; non-string values are converted with ``str`` first
            instead of failing on ``.lower()``.

    Returns:
        str: The cleaned tokens joined by single spaces (possibly empty).
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        # e.g. a purely numeric cell that read_csv parsed as int/float
        text = str(text)

    text = text.lower()
    text = re.sub(r'@\w+|#\w+|http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)

    # Reuse the cached stemmer / stop-word set rather than re-reading the
    # corpus for every row.
    # NOTE(review): the Porter stemmer implements English suffix rules, yet the
    # stop words are Indonesian -- confirm this stemmer is the intended choice.
    ps, stop_words = _cleaning_resources()
    filtered = [ps.stem(w) for w in words if w not in stop_words]

    return ' '.join(filtered)

In [15]:
import nltk

# Make sure the tokenizer and stop-word resources are present (silently)
# before any text is tokenized.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Clean the 'Text' column of both splits in place with clean_text.
for frame in (df_train, df_test):
    frame['Text'] = frame['Text'].apply(clean_text)

In [18]:
# Spot-check the cleaned text: first rows of each split, then their shapes.
(
    df_train.head(),
    df_test.head(),
    df_train.shape,
    df_test.shape,
)

(                                                Text  Label
 0          gayanya pemerannya prabowo gini ceritanya      0
 1  ganjar kenalkan tawangmangu event siksorogo tr...      0
 2  negara org cuman jelekin negara sendiringaku l...      0
 3  politik identita fanat belakangnya yg ahirnya ...      0
 4  ya elahhh tetep ya bawa agama allah tidur tau ...      0,
                                                 Text  Label
 0  china ya yg prakarsai damai berita bikin bingu...      0
 1                                         yg bkn elo      0
 2  ind santai aja oomada bukti cabul pentolan pen...      1
 3  dhalim yg suka yg dhalimi benci islam pk jokow...      0
 4  ganjar satusatunya capr responsif perkembangan...      0,
 (6274, 2),
 (384, 2))

In [19]:
# Persist the cleaned splits. df_train is the frame clean_text was applied to;
# df_train_balanced still holds the uncleaned text from the oversampling step,
# so writing it here would leave the train file without the cleaning that the
# test file received.
df_train.to_csv('pilpres_train_balanced_clean.csv', index=False)
df_test.to_csv('pilpres_test_clean.csv', index=False)