## Import library dan load data scraping

In [24]:
import pandas as pd
import numpy as np
import re
import emoji
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from textblob import TextBlob  # auto-labeling

In [25]:
# Load Data
df = pd.read_csv('ytb-bobonvswillie.csv')
print("Jumlah data:", len(df))

Jumlah data: 24392


In [26]:
print(df.head())

            publishedAt      videoId  authorDisplayName  \
0  2025-03-27T16:48:21Z  ygcodv_xrUE   @SteveDunham1312   
1  2025-03-27T16:47:36Z  ygcodv_xrUE       @Arfannn-o9f   
2  2025-03-27T16:46:00Z  ygcodv_xrUE    @Test.tv.tester   
3  2025-03-27T16:45:45Z  ygcodv_xrUE  @nunikariyyan7232   
4  2025-03-27T16:43:26Z  ygcodv_xrUE        @Bangziii15   

                                         textDisplay  likeCount  
0                      Dalam hati bobon, wili GOBLOK          0  
1  njir gaya nya bobon ngotak amat ngomong sm wel...          0  
2  Saya cuma minya willie mengakui salah ini sett...          1  
3  Bang bobon dan Willy Salim sama sama ora yg ba...          0  
4                                                oke          0  


## Preprocessing text

In [27]:
def clean_text(text):
    text = emoji.replace_emoji(text, replace='')    # Hapus emoji
    text = re.sub(r'<.*?>', '', text)               # Hapus tag HTML
    text = re.sub(r'[^a-zA-Z\s]', '', text)         # Hapus karakter khusus dan angka dg regex
    text = text.lower()                             # Konversi ke lowercase
    return text

df['cleaned_text'] = df['textDisplay'].apply(clean_text)

## Pelabelan data

In [28]:
# Lexicon (kata positif, negatif, netral)
positif_words = [
    "keren", "bagus", "mantap", "top", "semangat", "peduli", "berbagi", "salut", "sukses",
    "hebat", "positif", "menginspirasi", "luar biasa", "menghibur", "apresiasi", "terbaik"
]
negatif_words = [
    "goblok", "bodoh", "jelek", "sampah", "gak mutu", "pencitraan", "pura-pura", "setingan",
    "cari perhatian", "dibuat-buat", "rekayasa", "akting", "bangsat", "drama", "pembodohan",
    "gak jelas", "biar viral", "tipu-tipu", "rakus", "bohong", "serakah", "najis", "benci",
    "parah", "menjijikkan", "tidak sopan", "sarkas", "fitnah"
]

def auto_label(text):
    text = str(text).lower()

    # Cek kemunculan kata dari lexicon
    pos_score = sum(word in text for word in positif_words)
    neg_score = sum(word in text for word in negatif_words)

    # Penilaian berdasarkan skor
    if pos_score > neg_score:
        return "positif"
    elif neg_score > pos_score:
        return "negatif"
    else:
        return "netral"

# pelabelan
df['label'] = df['cleaned_text'].apply(auto_label)

In [29]:
# Label dist
print("Distribusi Label:")
print(df['label'].value_counts())

Distribusi Label:
label
netral     20273
negatif     2630
positif     1489
Name: count, dtype: int64


In [30]:
print("\nContoh teks dan label hasil pelabelan:")
print(df[['cleaned_text', 'label']].sample(5))


Contoh teks dan label hasil pelabelan:
                                            cleaned_text    label
5512   mungkin ya ws kalau datang pasti grogi merindi...   netral
17643                        suka aku sama gayamu bg bon   netral
20370  bobon itu dulu jauuuuuhhhh lebih parah dr will...  negatif
11062                  ko bobo santoso the best respect    netral
7188   ke toilet ga dapet noh belakang tmpt konten di...   netral


## TF-IDF + SVM

In [31]:
# Skema 1
# TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=2000)
X_tfidf = tfidf.fit_transform(df['cleaned_text'])
y = df['label']

In [32]:
# Data splitting
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [33]:
# Train SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

In [34]:
# Evaluasi
y_pred = svm_model.predict(X_test)
print("\nSVM Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


SVM Accuracy: 0.957983193277311
              precision    recall  f1-score   support

     negatif       0.98      0.77      0.86       513
      netral       0.96      1.00      0.98      4062
     positif       0.97      0.76      0.85       304

    accuracy                           0.96      4879
   macro avg       0.97      0.84      0.90      4879
weighted avg       0.96      0.96      0.96      4879



## CountVectorizer + Naive Bayes

In [35]:
# Skema 2
# Count Vectorizer
count_vec = CountVectorizer(max_features=2000)
X_count = count_vec.fit_transform(df['cleaned_text'])

In [36]:
# Data splitting
X_train, X_test, y_train, y_test = train_test_split(X_count, y, test_size=0.2, random_state=42)

In [37]:
# Train Random Forest
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [38]:
# Evaluasi
y_pred = nb_model.predict(X_test)
print("\nNaive Bayes Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Naive Bayes Accuracy: 0.9046935847509736
              precision    recall  f1-score   support

     negatif       0.74      0.74      0.74       513
      netral       0.95      0.94      0.94      4062
     positif       0.63      0.71      0.67       304

    accuracy                           0.90      4879
   macro avg       0.77      0.80      0.78      4879
weighted avg       0.91      0.90      0.91      4879



## LSTM

In [39]:
# Skema 3
# Tokenisasi
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
X_pad = pad_sequences(sequences, maxlen=100)

In [40]:
# Encoding label
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(df['label'])

In [41]:
# Data splitting 70/30
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.3, random_state=42)

In [42]:
# model LSTM
model = Sequential([
    Embedding(2000, 64, input_length=100),
    LSTM(64),
    Dense(3, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=3, batch_size=64, validation_split=0.2)

Epoch 1/3




[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.8231 - loss: 0.5901 - val_accuracy: 0.8946 - val_loss: 0.3273
Epoch 2/3
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9177 - loss: 0.2618 - val_accuracy: 0.9613 - val_loss: 0.1755
Epoch 3/3
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9605 - loss: 0.1571 - val_accuracy: 0.9593 - val_loss: 0.1763


<keras.src.callbacks.history.History at 0x79d700d5be10>

In [43]:
# Evaluasi
loss, accuracy = model.evaluate(X_test, y_test)
print("\nLSTM Accuracy:", accuracy)

[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9546 - loss: 0.1838

LSTM Accuracy: 0.9559988975524902


## Hasil Evaluasi Model

- **SVM (TF-IDF)**  
  Model SVM dengan ekstraksi fitur TF-IDF memberikan performa terbaik dengan akurasi **95.80%** dan nilai macro F1-score sebesar **90%**. Model ini mampu memberikan keseimbangan memprediksi dengan baik

- **Naive Bayes (CountVectorizer)**  
  Naive Bayes dengan fitur CountVectorizer mencapai akurasi **90.47%** dengan macro F1-score sebesar **78%**. Akurasinya cukup tinggi, namun performanya lebih rendah dibandingkan SVM terutama dalam menangani data minoritas seperti sentimen positif dan negatif

- **LSTM**  
  Model LSTM yang menggunakan word embedding memberikan akurasi sebesar **95.60%**

Secara keseluruhan, semua model mencapai akurasi > 85%


## Contoh prediksi

In [44]:
# Contoh predict
def predict_all_models(text):
    # Preprocess text
    cleaned_text = clean_text(text)

    # 1. Predict SVM (TF-IDF)
    svm_pred = svm_model.predict(tfidf.transform([cleaned_text]))[0]

    # 2. Predict Naive Bayes (CountVectorizer)
    nb_pred = nb_model.predict(count_vec.transform([cleaned_text]))[0]

    # 3. Predict LSTM
    seq = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(seq, maxlen=100)
    lstm_pred = le.inverse_transform([np.argmax(model.predict(padded))])[0]

    return {
        "Text": text[:50] + "...",  # tampil 50 karakter pertama saja
        "Cleaned Text": cleaned_text[:50] + "...",
        "SVM (TF-IDF)": svm_pred,
        "Naive Bayes (CountVec)": nb_pred,
        "LSTM": lstm_pred
    }

# Contoh prediksi
test_samples = [
    "videonya keren pembahasannya",
    "konten pembodohan",
    "bobon keren, aku suka :)",
    "willie perusak citra kota orang woi !!!!!!!"
]

print("="*50)
print("Contoh hasil prediksi")
print("="*50)
for sample in test_samples:
    result = predict_all_models(sample)
    print("\nOriginal Text:", result["Text"])
    print("Cleaned Text:", result["Cleaned Text"])
    print("- SVM:", result["SVM (TF-IDF)"])
    print("- Naive Bayes:", result["Naive Bayes (CountVec)"])
    print("- LSTM:", result["LSTM"])
    print("-"*30)

Contoh hasil prediksi
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step

Original Text: videonya keren pembahasannya...
Cleaned Text: videonya keren pembahasannya...
- SVM: positif
- Naive Bayes: positif
- LSTM: positif
------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step

Original Text: konten pembodohan...
Cleaned Text: konten pembodohan...
- SVM: negatif
- Naive Bayes: negatif
- LSTM: negatif
------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step

Original Text: bobon keren, aku suka :)...
Cleaned Text: bobon keren aku suka ...
- SVM: positif
- Naive Bayes: positif
- LSTM: positif
------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step

Original Text: willie perusak citra kota orang woi !!!!!!!...
Cleaned Text: willie perusak citra kota orang woi ...
- SVM: netral
- Naive Bayes: netral
- LSTM: netral
-----------------