In [1]:
# === Import Library ===
import pandas as pd
import torch
import re
import wordninja
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Load Dataset Real dan Fake
real_df = pd.read_csv('dataset TA - data real fix.csv')
fake_df = pd.read_csv('dataset TA - data fake fix.csv')

# 3. Gabungkan Dataset
combined_df = pd.concat([real_df, fake_df], axis=0)
print("Dataset Information:\n", combined_df.info())
print("Contoh Data:\n", combined_df.head())


<class 'pandas.core.frame.DataFrame'>
Index: 494 entries, 0 to 275
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       494 non-null    object
 1   URL        494 non-null    object
 2   Title      494 non-null    object
 3   Narrative  494 non-null    object
 4   Statement  494 non-null    object
 5   Label      494 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 27.0+ KB
Dataset Information:
 None
Contoh Data:
              date                                                URL  \
0    Mei 20, 2019  https://turnbackhoax.id/2019/05/20/benar-klari...   
1     Mei 4, 2019  https://turnbackhoax.id/2019/05/04/benar-klari...   
2     Mei 3, 2019  https://turnbackhoax.id/2019/05/03/benar-ungga...   
3     Mei 2, 2019  https://turnbackhoax.id/2019/05/02/benar-klari...   
4  April 30, 2019          https://turnbackhoax.id/2019/04/30/22526/   

                                               Title  \
0  [BENA

In [3]:
# 1. Hapus Emoji
def remove_emoji(text):
    emoji_pattern = re.compile("["    
                                u"\U0001F600-\U0001F64F"  # emoticon
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F700-\U0001F77F"  # alchemical symbols
                                u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                                u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                                u"\U00002600-\U000026FF"  # Miscellaneous Symbols
                                u"\U00002700-\U000027BF"  # Dingbats
                                "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# 2. Hapus Karakter Khusus ([], @, dll)
def remove_special_chars(text):
    # Hapus karakter [ ], @
    text = re.sub(r'[\[\]@]', '', text)
    
    # Hapus tanda petik di awal dan akhir teks
    text = re.sub(r'(^"|"$)', '', text)
    
    return text

# 3. Hapus URL
def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

# 4. Case Folding (Lowercasing)
def lowercase_text(text):
    return text.lower()

# 5. Gabungkan Semua Preprocessing
def preprocess_text(text):
    text = remove_emoji(text)
    text = remove_special_chars(text)
    text = remove_urls(text)
    text = lowercase_text(text)
    return text

# 6. Terapkan Preprocessing
combined_df['Narrative'] = combined_df['Narrative'].apply(preprocess_text)
combined_df['Statement'] = combined_df['Statement'].apply(preprocess_text)

# 7. Simpan Dataset yang Telah Diproses
combined_df.to_csv('processed_dataset.csv', index=False)
print("Contoh Data Setelah Preprocessing:\n", combined_df.head())


Contoh Data Setelah Preprocessing:
              date                                                URL  \
0    Mei 20, 2019  https://turnbackhoax.id/2019/05/20/benar-klari...   
1     Mei 4, 2019  https://turnbackhoax.id/2019/05/04/benar-klari...   
2     Mei 3, 2019  https://turnbackhoax.id/2019/05/03/benar-ungga...   
3     Mei 2, 2019  https://turnbackhoax.id/2019/05/02/benar-klari...   
4  April 30, 2019          https://turnbackhoax.id/2019/04/30/22526/   

                                               Title  \
0  [BENAR] Klarifikasi BIN terkait Pesan Berantai...   
1  [BENAR] Klarifikasi Putra Aji Adhari, Anak Kel...   
2  [BENAR] Unggahan Hasil C1 Pleno TPS di Bangil ...   
3  [BENAR] Klarifikasi Surat Edaran Intruksi Mema...   
4  [BENAR] Klarifikasi terkait Kabar Petugas KPPS...   

                                           Narrative  \
0  beredar di media sosial sebuah pesan yang beri...   
1  melalu akun instagramnya, putra aji adhari mem...   
2  beredar di media sosial

In [4]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    combined_df[['Statement', 'Narrative']],  # Hipotesis + Premis
    combined_df['Label'],
    test_size=0.2,
    stratify=combined_df['Label'],
    random_state=42
)

In [5]:
# Gabungkan kembali data Train
train_df = pd.DataFrame({
    'Statement': train_texts['Statement'],
    'Narrative': train_texts['Narrative'],
    'Label': train_labels
})

# Gabungkan kembali data Test
val_df = pd.DataFrame({
    'Statement': val_texts['Statement'],
    'Narrative': val_texts['Narrative'],
    'Label': val_labels
})

# Simpan ke CSV
train_df.to_csv('train_dataset.csv', index=False)
val_df.to_csv('val_dataset.csv', index=False)

print("Dataset Train dan Test berhasil disimpan:")
print(f"Jumlah Data Train: {train_df.shape[0]}")
print(f"Jumlah Data Test : {val_df.shape[0]}")

Dataset Train dan Test berhasil disimpan:
Jumlah Data Train: 395
Jumlah Data Test : 99


In [6]:
# 1. Load IndoBERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# 2. Tokenisasi untuk Simple Model (Statement Saja)
train_encodings_simple = tokenizer(
    list(train_texts['Statement']),
    truncation=True,
    padding=True,
    max_length=512
)
val_encodings_simple = tokenizer(
    list(val_texts['Statement']),
    truncation=True,
    padding=True,
    max_length=512
)

# 3. Tokenisasi untuk NLI Model (Statement + Narrative)
train_encodings_nli = tokenizer(
    list(train_texts['Statement']),
    list(train_texts['Narrative']),
    truncation=True,
    padding=True,
    max_length=512
)
val_encodings_nli = tokenizer(
    list(val_texts['Statement']),
    list(val_texts['Narrative']),
    truncation=True,
    padding=True,
    max_length=512
)




In [7]:
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

train_dataset_simple = NewsDataset(train_encodings_simple, train_labels)
val_dataset_simple = NewsDataset(val_encodings_simple, val_labels)

train_dataset_nli = NewsDataset(train_encodings_nli, train_labels)
val_dataset_nli = NewsDataset(val_encodings_nli, val_labels)


In [9]:
simple_model = AutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1", num_labels=2
)

training_args_simple = TrainingArguments(
    output_dir="./results_simple",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01
)

trainer_simple = Trainer(
    model=simple_model,
    args=training_args_simple,
    train_dataset=train_dataset_simple,
    eval_dataset=val_dataset_simple
)

trainer_simple.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                  
 25%|██▌       | 50/200 [23:32<51:27, 20.58s/it]

{'eval_loss': 0.17715370655059814, 'eval_runtime': 132.4726, 'eval_samples_per_second': 0.747, 'eval_steps_per_second': 0.098, 'epoch': 1.0}


                                                  
 50%|█████     | 100/200 [46:46<34:31, 20.72s/it]

{'eval_loss': 0.1694612205028534, 'eval_runtime': 132.0904, 'eval_samples_per_second': 0.749, 'eval_steps_per_second': 0.098, 'epoch': 2.0}


                                                   
 75%|███████▌  | 150/200 [1:09:33<16:58, 20.36s/it]

{'eval_loss': 0.2069169580936432, 'eval_runtime': 132.117, 'eval_samples_per_second': 0.749, 'eval_steps_per_second': 0.098, 'epoch': 3.0}


                                                   
100%|██████████| 200/200 [1:32:09<00:00, 27.65s/it]

{'eval_loss': 0.24072180688381195, 'eval_runtime': 131.2818, 'eval_samples_per_second': 0.754, 'eval_steps_per_second': 0.099, 'epoch': 4.0}
{'train_runtime': 5529.4773, 'train_samples_per_second': 0.286, 'train_steps_per_second': 0.036, 'train_loss': 0.161722412109375, 'epoch': 4.0}





TrainOutput(global_step=200, training_loss=0.161722412109375, metrics={'train_runtime': 5529.4773, 'train_samples_per_second': 0.286, 'train_steps_per_second': 0.036, 'train_loss': 0.161722412109375, 'epoch': 4.0})

In [10]:
nli_model = AutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1", num_labels=2
)

training_args_nli = TrainingArguments(
    output_dir="./results_nli",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01
)

trainer_nli = Trainer(
    model=nli_model,
    args=training_args_nli,
    train_dataset=train_dataset_nli,
    eval_dataset=val_dataset_nli
)

trainer_nli.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 25%|██▌       | 50/200 [19:28<47:51, 19.14s/it]  
 25%|██▌       | 50/200 [21:21<47:51, 19.14s/it]

{'eval_loss': 0.17838071286678314, 'eval_runtime': 113.33, 'eval_samples_per_second': 0.874, 'eval_steps_per_second': 0.115, 'epoch': 1.0}


 50%|█████     | 100/200 [40:26<31:38, 18.98s/it] 
 50%|█████     | 100/200 [42:20<31:38, 18.98s/it]

{'eval_loss': 0.14524953067302704, 'eval_runtime': 113.7492, 'eval_samples_per_second': 0.87, 'eval_steps_per_second': 0.114, 'epoch': 2.0}


 75%|███████▌  | 150/200 [1:00:14<13:30, 16.20s/it]
 75%|███████▌  | 150/200 [1:01:54<13:30, 16.20s/it]

{'eval_loss': 0.16194185614585876, 'eval_runtime': 99.8686, 'eval_samples_per_second': 0.991, 'eval_steps_per_second': 0.13, 'epoch': 3.0}


100%|██████████| 200/200 [1:18:11<00:00, 18.85s/it]
100%|██████████| 200/200 [1:20:08<00:00, 24.04s/it]

{'eval_loss': 0.1213579848408699, 'eval_runtime': 117.2055, 'eval_samples_per_second': 0.845, 'eval_steps_per_second': 0.111, 'epoch': 4.0}
{'train_runtime': 4808.8796, 'train_samples_per_second': 0.329, 'train_steps_per_second': 0.042, 'train_loss': 0.12649866104125976, 'epoch': 4.0}





TrainOutput(global_step=200, training_loss=0.12649866104125976, metrics={'train_runtime': 4808.8796, 'train_samples_per_second': 0.329, 'train_steps_per_second': 0.042, 'train_loss': 0.12649866104125976, 'epoch': 4.0})

In [None]:
# 1. Evaluasi Simple Model
simple_preds = trainer_simple.predict(val_dataset_simple).predictions.argmax(axis=1)
simple_accuracy = accuracy_score(val_labels, simple_preds)

print("Evaluasi Simple Model:\n")
print(f"Accuracy: {simple_accuracy:.2f}")
print(classification_report(val_labels, simple_preds, target_names=["Real (Entailment)", "Fake (Contradiction)"]))

# 2. Evaluasi NLI Model
nli_preds = trainer_nli.predict(val_dataset_nli).predictions.argmax(axis=1)
nli_accuracy = accuracy_score(val_labels, nli_preds)

print("\nEvaluasi NLI Model:\n")
print(f"Accuracy: {nli_accuracy:.2f}")
print(classification_report(val_labels, nli_preds, target_names=["Real (Entailment)", "Fake (Contradiction)"]))


100%|██████████| 13/13 [02:00<00:00,  9.26s/it]


Evaluasi Simple Model:

Accuracy: 0.93
                      precision    recall  f1-score   support

   Real (Entailment)       1.00      0.84      0.91        44
Fake (Contradiction)       0.89      1.00      0.94        55

            accuracy                           0.93        99
           macro avg       0.94      0.92      0.93        99
        weighted avg       0.94      0.93      0.93        99



100%|██████████| 13/13 [01:41<00:00,  7.79s/it]


Evaluasi NLI Model:

Accuracy: 0.98
                      precision    recall  f1-score   support

   Real (Entailment)       0.98      0.98      0.98        44
Fake (Contradiction)       0.98      0.98      0.98        55

            accuracy                           0.98        99
           macro avg       0.98      0.98      0.98        99
        weighted avg       0.98      0.98      0.98        99






In [14]:
trainer_nli.save_model("./first_nli_model_0.98")
trainer_simple.save_model("./first_simple_model_0.93")