In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ==============================================================================
# 1. MEMUAT DATASET
# ==============================================================================
import pandas as pd

# Pastikan path ini benar sesuai dengan lokasi file di Google Drive Anda
file_path = '/content/drive/MyDrive/Magang/dataset_berita.csv'

try:
    df = pd.read_csv(file_path)
    print("✅ Dataset berhasil dimuat.")
    display(df.head())
except FileNotFoundError:
    print(f"❌ ERROR: File tidak ditemukan di '{file_path}'. Pastikan path sudah benar dan Anda sudah menghubungkan Google Drive.")

✅ Dataset berhasil dimuat.


Unnamed: 0,berita,hoax
0,Masyarakat Antikorupsi Indonesia (MAKI) menyer...,0
1,“Kodya Malang Jatim.Ahad 1 Des 2024 jam 08 25 ...,1
2,"Pengacara Eks Mendikbudristek Nadiem Makarim, ...",0
3,"“Imunisasi, apa yang bermacam-macam, tidak per...",1
4,KANTOR Wilayah (Kanwil) Bea Cukai Jawa Tengah ...,0


In [4]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [5]:
# ==============================================================================
# 2. PREPROCESSING TEKS
# ==============================================================================
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


# Inisialisasi library
factory_stemmer = StemmerFactory()
stemmer = factory_stemmer.create_stemmer()
factory_stopword = StopWordRemoverFactory()
stopwords_set = set(factory_stopword.get_stop_words())

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+|[^\w\s]', ' ', text)
    tokens = [word for word in text.split() if word not in stopwords_set]
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

print("Memulai preprocessing teks (mungkin butuh beberapa menit)...")
df['cleaned_text'] = df['berita'].apply(clean_text)
print("✅ Preprocessing teks selesai.")
display(df[['berita', 'cleaned_text']].head())

Memulai preprocessing teks (mungkin butuh beberapa menit)...
✅ Preprocessing teks selesai.


Unnamed: 0,berita,cleaned_text
0,Masyarakat Antikorupsi Indonesia (MAKI) menyer...,masyarakat antikorupsi indonesia maki serah da...
1,“Kodya Malang Jatim.Ahad 1 Des 2024 jam 08 25 ...,kodya malang jatim ahad des jam wib menang tra...
2,"Pengacara Eks Mendikbudristek Nadiem Makarim, ...",acara eks mendikbudristek nadiem makarim hotma...
3,"“Imunisasi, apa yang bermacam-macam, tidak per...",imunisasi apa macam macam perlu ikut ikut bany...
4,KANTOR Wilayah (Kanwil) Bea Cukai Jawa Tengah ...,kantor wilayah kanwil bea cukai jawa tengah da...


In [6]:
# ==============================================================================
# 3. EKSTRAKSI FITUR (MEMBUAT VARIABEL X dan y)
# ==============================================================================
from sklearn.feature_extraction.text import TfidfVectorizer

print("Memulai ekstraksi fitur dengan TF-IDF...")

# Inisialisasi TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Membuat variabel X (fitur) dan y (label)
X = tfidf_vectorizer.fit_transform(df['cleaned_text'])
y = df['hoax']

print("✅ Variabel X dan y berhasil dibuat.")
print(f"Ukuran matriks fitur (X): {X.shape}")

Memulai ekstraksi fitur dengan TF-IDF...
✅ Variabel X dan y berhasil dibuat.
Ukuran matriks fitur (X): (3000, 5000)


In [7]:
# ==============================================================================
# 4. PEMODELAN DAN PENYIMPANAN MODEL
# ==============================================================================
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import joblib
import os

print("1. Membagi data menjadi data latih dan data uji...")
# Baris ini sekarang akan berhasil karena X dan y sudah ada
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("2. Memulai pelatihan model SVM baru (dengan kemampuan probabilitas)...")
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)
print("   -> Pelatihan model selesai.")

print("3. Menyimpan model dan vectorizer yang baru...")
output_dir = '/content/drive/MyDrive/Magang/'
os.makedirs(output_dir, exist_ok=True)
joblib.dump(svm_model, os.path.join(output_dir, 'svm_model.pkl'))
joblib.dump(tfidf_vectorizer, os.path.join(output_dir, 'tfidf_vectorizer.pkl'))

print(f"   -> Model berhasil disimpan di: {output_dir}")
print("\n✅ SELESAI. Silakan unduh file 'svm_model.pkl' yang BARU.")

1. Membagi data menjadi data latih dan data uji...
2. Memulai pelatihan model SVM baru (dengan kemampuan probabilitas)...
   -> Pelatihan model selesai.
3. Menyimpan model dan vectorizer yang baru...
   -> Model berhasil disimpan di: /content/drive/MyDrive/Magang/

✅ SELESAI. Silakan unduh file 'svm_model.pkl' yang BARU.


In [8]:
# ==============================================================================
# 6. EVALUASI MODEL
# ==============================================================================
from sklearn.metrics import classification_report, accuracy_score

# Lakukan prediksi pada data uji (X_test)
y_pred = svm_model.predict(X_test)

# Tampilkan Accuracy
print("--- Accuracy Score ---")
print(f"{accuracy_score(y_test, y_pred):.2%}")
print("\n" + "="*30 + "\n")

# --- INI BARIS YANG ANDA MINTA ---
# Tampilkan Classification Report
print("--- Classification Report ---")
# Menambahkan target_names agar labelnya jelas (bukan hanya 0 dan 1)
report = classification_report(y_test, y_pred, target_names=['VALID (Kelas 0)', 'HOAX (Kelas 1)'])
print(report)

--- Accuracy Score ---
98.67%


--- Classification Report ---
                 precision    recall  f1-score   support

VALID (Kelas 0)       0.98      0.99      0.99       272
 HOAX (Kelas 1)       0.99      0.98      0.99       328

       accuracy                           0.99       600
      macro avg       0.99      0.99      0.99       600
   weighted avg       0.99      0.99      0.99       600

