## Import library dan load data scraping

In [108]:
import pandas as pd
import numpy as np
import re
import emoji
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from textblob import TextBlob  # auto-labeling

In [109]:
# Load Data
df = pd.read_csv('ytb-bobonvswillie.csv')
print("Jumlah data:", len(df))

Jumlah data: 24392


In [110]:
print(df.head())

            publishedAt      videoId  authorDisplayName  \
0  2025-03-27T16:48:21Z  ygcodv_xrUE   @SteveDunham1312   
1  2025-03-27T16:47:36Z  ygcodv_xrUE       @Arfannn-o9f   
2  2025-03-27T16:46:00Z  ygcodv_xrUE    @Test.tv.tester   
3  2025-03-27T16:45:45Z  ygcodv_xrUE  @nunikariyyan7232   
4  2025-03-27T16:43:26Z  ygcodv_xrUE        @Bangziii15   

                                         textDisplay  likeCount  
0                      Dalam hati bobon, wili GOBLOK          0  
1  njir gaya nya bobon ngotak amat ngomong sm wel...          0  
2  Saya cuma minya willie mengakui salah ini sett...          1  
3  Bang bobon dan Willy Salim sama sama ora yg ba...          0  
4                                                oke          0  


## Preprocessing text

In [111]:
def clean_text(text):
    text = emoji.replace_emoji(text, replace='')    # Hapus emoji
    text = re.sub(r'<.*?>', '', text)               # Hapus tag HTML
    text = re.sub(r'[^a-zA-Z\s]', '', text)         # Hapus karakter khusus dan angka dg regex
    text = text.lower()                             # Konversi ke lowercase
    return text

df['cleaned_text'] = df['textDisplay'].apply(clean_text)

## Pelabelan data

In [112]:
def auto_label(text):
    text = str(text).lower()

    # karena sebelumnya distribusi 3 label beda jauh.
    # jadi saya pakai keywoards untuk memperbanyak sampel
    # Negative keywords
    if any(word in text for word in ["goblok", "bodoh", "jelek", "sampah", "gak mutu",
                                   "pencitraan", "pura-pura", "setingan", "cari perhatian",
                                   "dibuat-buat", "rekayasa", "akting", "bangsat", "drama",
                                   "pembodohan", "gak jelas", "cari viewer", "biar viral", "untuk konten",
                                   "cari viewer", "untuk konten", "tipu-tipu", "gak ikhlas", "cari popularitas",
                                   "bohong", "gak guna", "rakus", "serakah", "boikot", "penjarakan", "najis"]):
        return "negatif"

    # Positive keywords
    if any(word in text for word in ["keren", "bagus", "mantap", "top", "semangat", "peduli", "berbagi", "salut", "sukses"]):
        return "positif"

    # TextBlob
    return "positif" if TextBlob(text).sentiment.polarity > 0.1 else "negatif" if TextBlob(text).sentiment.polarity < -0.1 else "netral"

# Labeling
df['label'] = df['cleaned_text'].apply(auto_label)

In [113]:
# Label dist
print("Distribusi Label:")
print(df['label'].value_counts())

Distribusi Label:
label
netral     19623
negatif     2924
positif     1845
Name: count, dtype: int64


## TF-IDF + SVM

In [114]:
# Skema 1
# TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=2000)
X_tfidf = tfidf.fit_transform(df['cleaned_text'])
y = df['label']

In [115]:
# Data splitting
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [116]:
# Train SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

In [117]:
# Evaluasi
y_pred = svm_model.predict(X_test)
print("\nSVM Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


SVM Accuracy: 0.9463004714080754
              precision    recall  f1-score   support

     negatif       0.97      0.72      0.83       582
      netral       0.94      1.00      0.97      3937
     positif       0.94      0.74      0.83       360

    accuracy                           0.95      4879
   macro avg       0.95      0.82      0.88      4879
weighted avg       0.95      0.95      0.94      4879



## CountVectorizer + Naive Bayes

In [118]:
# Skema 2
# Count Vectorizer
count_vec = CountVectorizer(max_features=2000)
X_count = count_vec.fit_transform(df['cleaned_text'])

In [119]:
# Data splitting
X_train, X_test, y_train, y_test = train_test_split(X_count, y, test_size=0.2, random_state=42)

In [120]:
# Train Random Forest
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [121]:
# Evaluasi
y_pred = nb_model.predict(X_test)
print("\nNaive Bayes Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Naive Bayes Accuracy: 0.9010043041606887
              precision    recall  f1-score   support

     negatif       0.72      0.75      0.74       582
      netral       0.95      0.94      0.95      3937
     positif       0.68      0.71      0.69       360

    accuracy                           0.90      4879
   macro avg       0.79      0.80      0.79      4879
weighted avg       0.90      0.90      0.90      4879



## LSTM

In [122]:
# Skema 3
# Tokenisasi
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
X_pad = pad_sequences(sequences, maxlen=100)

In [123]:
# Encoding label
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(df['label'])

In [124]:
# Data splitting 70/30
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.3, random_state=42)

In [125]:
# model LSTM
model = Sequential([
    Embedding(2000, 64, input_length=100),
    LSTM(64),
    Dense(3, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=3, batch_size=64, validation_split=0.2)

Epoch 1/3




[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.7821 - loss: 0.6534 - val_accuracy: 0.8829 - val_loss: 0.3615
Epoch 2/3
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9140 - loss: 0.2766 - val_accuracy: 0.9488 - val_loss: 0.2121
Epoch 3/3
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9561 - loss: 0.1628 - val_accuracy: 0.9572 - val_loss: 0.1760


<keras.src.callbacks.history.History at 0x7ae56c873510>

In [126]:
# Evaluasi
loss, accuracy = model.evaluate(X_test, y_test)
print("\nLSTM Accuracy:", accuracy)

[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9491 - loss: 0.2096

LSTM Accuracy: 0.9514894485473633


**Kesimpulan:**
- SVM (TF-IDF) mencapai akurasi sebesar **94.63%**
- Naive Bayes (CountVectorizer) mencapai akurasi sebesar **90.10%**
- LSTM mencapai akurasi sebesar **95.15%**

## Contoh prediksi

In [127]:
# Contoh predict
def predict_all_models(text):
    # Preprocess text
    cleaned_text = clean_text(text)

    # 1. Predict SVM (TF-IDF)
    svm_pred = svm_model.predict(tfidf.transform([cleaned_text]))[0]

    # 2. Predict Naive Bayes (CountVectorizer)
    nb_pred = nb_model.predict(count_vec.transform([cleaned_text]))[0]

    # 3. Predict LSTM
    seq = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(seq, maxlen=100)
    lstm_pred = le.inverse_transform([np.argmax(model.predict(padded))])[0]

    return {
        "Text": text[:50] + "...",  # tampil 50 karakter pertama saja
        "Cleaned Text": cleaned_text[:50] + "...",
        "SVM (TF-IDF)": svm_pred,
        "Naive Bayes (CountVec)": nb_pred,
        "LSTM": lstm_pred
    }

# Contoh prediksi
test_samples = [
    "videonya keren pembahasannya",
    "konten pembodohan",
    "bobon keren, aku suka :)",
    "penjarakan willie"
]

print("="*50)
print("Contoh hasil prediksi")
print("="*50)
for sample in test_samples:
    result = predict_all_models(sample)
    print("\nOriginal Text:", result["Text"])
    print("Cleaned Text:", result["Cleaned Text"])
    print("- SVM:", result["SVM (TF-IDF)"])
    print("- Naive Bayes:", result["Naive Bayes (CountVec)"])
    print("- LSTM:", result["LSTM"])
    print("-"*30)

Contoh hasil prediksi
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step

Original Text: videonya keren pembahasannya...
Cleaned Text: videonya keren pembahasannya...
- SVM: positif
- Naive Bayes: positif
- LSTM: positif
------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step

Original Text: konten pembodohan...
Cleaned Text: konten pembodohan...
- SVM: negatif
- Naive Bayes: negatif
- LSTM: negatif
------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step

Original Text: bobon keren, aku suka :)...
Cleaned Text: bobon keren aku suka ...
- SVM: positif
- Naive Bayes: positif
- LSTM: positif
------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step

Original Text: penjarakan willie...
Cleaned Text: penjarakan willie...
- SVM: negatif
- Naive Bayes: negatif
- LSTM: negatif
------------------------------
