In [43]:
import pandas as pd
import numpy as np
import re
import string
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [44]:
# Install required packages
!pip install datasets



In [45]:
from datasets import Dataset

In [46]:
# Function to load data from Excel files
def excel_to_df(excel):
    """Read an Excel workbook into a DataFrame indexed by its 'index' column.

    Args:
        excel: Path (or any other source accepted by ``pd.read_excel``) of
            the workbook to read.

    Returns:
        The DataFrame produced by ``pd.read_excel`` with the column named
        'index' used as the row index.
    """
    return pd.read_excel(excel, index_col='index')

In [47]:
# Load data
# One Excel workbook per source.
CNN = '/content/dataset_cnn.xlsx'
KOMPAS = '/content/dataset_kompas.xlsx'
TEMPO = '/content/dataset_tempo.xlsx'
TURNBACKHOAX = '/content/dataset_turnbackhoax.xlsx'

# Read the workbooks in the same order as the constants above.
df_cnn, df_kompas, df_tempo, df_turnbackhoax = map(
    excel_to_df, (CNN, KOMPAS, TEMPO, TURNBACKHOAX)
)

In [48]:
# Preprocess data
def preprocess_text(text):
    """Normalize a raw text value into a lowercase, punctuation-free string.

    Steps, in order: drop URLs, drop @mentions, drop '#' characters (the
    hashtag word itself is kept), drop the standalone retweet marker "RT",
    drop punctuation, lowercase, and collapse whitespace.

    Args:
        text: Any value. Non-strings are converted with ``str``. ``None`` and
            float NaN are treated as missing.

    Returns:
        str: The cleaned text, or an empty string for a missing value.
    """
    # Without this guard, str() would turn missing values into the literal
    # tokens "none" / "nan". NaN is the only float not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\S+', '', text)  # Remove usernames
    text = text.replace('#', '')  # Remove the '#' symbol only
    # \b keeps words that merely end in "RT" (e.g. "START ") intact.
    text = re.sub(r'\bRT\s+', '', text)  # Remove retweet marker
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    # Collapse every whitespace run (spaces, tabs, newlines) and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [49]:
# Apply preprocessing
# Run preprocess_text over the 'cleaned' column of every source frame,
# overwriting that column in place.
for source_df in (df_cnn, df_kompas, df_tempo, df_turnbackhoax):
    source_df['cleaned'] = source_df['cleaned'].apply(preprocess_text)

In [50]:
# Concatenate datasets
# ignore_index=True discards each frame's own index and renumbers the rows.
source_frames = [df_cnn, df_kompas, df_tempo, df_turnbackhoax]
df_combined = pd.concat(source_frames, ignore_index=True)

In [51]:
# Split data into training and testing sets
# 20% of the rows are held out; the fixed random_state keeps the split repeatable.
train_df, test_df = train_test_split(
    df_combined,
    random_state=42,
    test_size=0.2,
)

In [52]:
# Tokenize and create datasets
# Bag-of-words counts for the text; both transformers are fitted on the
# training split only and then reused on the test split.
# NOTE(review): stop_words='english' is used, but the sample texts later in
# this notebook are Indonesian — confirm this stop-word list is intended.
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
train_counts = vectorizer.fit_transform(train_df['cleaned'])
X_train = train_counts.toarray()
X_test = vectorizer.transform(test_df['cleaned']).toarray()

# Integer-encode the labels.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
y_train = label_encoder.transform(train_df['label'])
y_test = label_encoder.transform(test_df['label'])

In [53]:
# Build and compile the Sequential model
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Take the input width from the feature matrix instead of hard-coding 1000:
# CountVectorizer's max_features is only an upper bound, so the fitted
# vocabulary (and therefore X_train) can be narrower.
n_features = X_train.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(n_features,)),
    tf.keras.layers.Dropout(0.5),
    # Single sigmoid unit: the model outputs one score in [0, 1] per sample.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [54]:
# Train the model
epochs = 5
batch_size = 8

# The test split is also passed as validation data, so the per-epoch
# validation metrics are computed on the same rows as the later evaluation.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d5e00a54f70>

In [55]:
# Evaluate the model
# The model was compiled with one metric, so evaluate() yields (loss, accuracy).
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.9821286201477051


In [56]:
# Sample input texts that the following cells clean, vectorize and run
# through the trained model.
texts = [
    "Slogan Pemerintahan sekarang Koalisi Indonesia Hebat",
    "Jokowi Ajak Masyarakat Hadapi 2022 dengan Semangat Baru commerce, misalnya, mencapai nilai 24,8 miliar dolar AS tahun ini"
]

In [57]:
# Preprocess the texts with the same cleaning function applied to the datasets
preprocessed_texts = list(map(preprocess_text, texts))

In [58]:
# Vectorize the texts with the already-fitted CountVectorizer
# (transform only, so the vocabulary learned from the training split is reused).
pred_counts = vectorizer.transform(preprocessed_texts)
X_pred = pred_counts.toarray()

In [59]:
# Run the prediction: the model's final layer is a single sigmoid unit,
# so each input row gets one score.
predictions = model.predict(x=X_pred)



In [60]:
# Convert the predictions into class indices: 1 when the score is strictly
# above 0.5, otherwise 0.
predicted_labels = [int(score > 0.5) for score in predictions.ravel()]

In [61]:
# Decode the class indices back to the original label values using the
# label_encoder fitted on the training labels
decoded_labels = label_encoder.inverse_transform(predicted_labels)

In [62]:
# Show the prediction results: each text, its decoded label, then a separator line
for text, label in zip(texts, decoded_labels):
    print(f"Text: {text}", f"Predicted Label: {label}", "------------", sep="\n")

Text: Slogan Pemerintahan sekarang Koalisi Indonesia Hebat
Predicted Label: 1
------------
Text: Jokowi Ajak Masyarakat Hadapi 2022 dengan Semangat Baru commerce, misalnya, mencapai nilai 24,8 miliar dolar AS tahun ini
Predicted Label: 0
------------


In [63]:
# Save the entire model in the .h5 format
# NOTE(review): the output recorded under this cell ends in
# `saving_api.save_model(`, which looks like the tail of a Keras warning about
# this save call — confirm whether .h5 is still the intended format.
model.save("hoax_detection_model.h5")

  saving_api.save_model(
