In [None]:
### TRAINING MODEL ANALISIS SENTIMEN (NLP - DEEP LEARNING) ###
# 1. Pendahuluan

# Notebook ini digunakan khusus untuk pelatihan model analisis sentimen menggunakan dataset hasil scraping mandiri sebanyak 10.740 sampel data. Proyek ini mengklasifikasikan teks ke dalam 3 kelas sentimen: Negatif, Netral, dan Positif dengan pendekatan Deep Learning serta melakukan 3 skema pelatihan berbeda.


# 2. Import Library

!pip install Sastrawi
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory



# 3. Load Dataset

# Load the scraped dataset from the CSV file in the working directory.
df = pd.read_csv('dataset_scraping.csv')
print('Jumlah data:', df.shape[0])
# A bare `df.head()` in the middle of a cell is evaluated and then discarded,
# so the preview has to be printed explicitly to show up in the output.
print(df.head())



# 4. Preprocessing & Labeling Data

# Sastrawi resources are created once and reused by every clean_text() call.
stemmer = StemmerFactory().create_stemmer()
stopwords = StopWordRemoverFactory().get_stop_words()

# Snapshot of the stop-word list as a set for constant-time membership tests.
# The `stopwords` list itself is left untouched for any other code using it.
_STOPWORD_SET = frozenset(stopwords)

# Patterns are compiled once instead of being looked up on every row.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')


def clean_text(text):
    """Normalize one raw text string for tokenization.

    Steps, in order: lowercase, drop URLs, replace every character that is
    not a-z or whitespace with a space, remove stop words, stem each
    remaining word with the Sastrawi stemmer.

    Args:
        text: The raw text (must be a ``str``).

    Returns:
        The cleaned words joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    text = text.lower()
    # Substitute a space (not an empty string) so that words separated only by
    # punctuation or digits, e.g. "bagus,cepat", are not fused into one token.
    text = _URL_PATTERN.sub(' ', text)
    text = _NON_LETTER_PATTERN.sub(' ', text)
    return ' '.join(
        stemmer.stem(word)
        for word in text.split()
        if word not in _STOPWORD_SET
    )

# Clean every row of the raw text column. Missing values are mapped to an
# empty string explicitly; casting them with astype(str) would turn them into
# the literal word "nan", which would then be tokenized like real text.
df['clean_text'] = df['text'].apply(
    lambda value: '' if pd.isna(value) else clean_text(str(value))
)

# Simple sentiment lexicon. An entry is either a single word or a multi-word
# phrase (e.g. 'tidak puas'); phrases are matched as consecutive tokens.
positive_words = [
    'bagus','mantap','senang','puas','baik','cepat',
    'recommended','suka','memuaskan'
]

negative_words = [
    'jelek','buruk','kecewa','lambat','parah','error',
    'tidak puas','lemot','gagal'
]


def _build_lexicon():
    """Map each lexicon entry (as a tuple of tokens) to +1 or -1."""
    lexicon = {}
    # Positive entries are inserted first so that, as in a plain if/elif
    # lookup, an entry present in both lists counts as positive.
    for weight, entries in ((1, positive_words), (-1, negative_words)):
        for entry in entries:
            key = tuple(entry.split())
            if key:
                lexicon.setdefault(key, weight)
    return lexicon


def label_sentiment(text):
    """Assign a rule-based sentiment label to a whitespace-tokenized text.

    Each positive lexicon match adds 1 to the score and each negative match
    subtracts 1. Matching is greedy and longest-first, so a phrase such as
    'tidak puas' is counted once as negative instead of letting its last
    word 'puas' be counted as positive.

    NOTE(review): a phrase can only match if all of its words are still in
    ``text``. The notebook passes stop-word-filtered, stemmed text; confirm
    that the filtering does not strip words such as 'tidak' beforehand.

    Args:
        text: The text to score; tokens are separated by whitespace.

    Returns:
        'Positif' if the score is above zero, 'Negatif' if below zero,
        otherwise 'Netral' (including for empty text).
    """
    lexicon = _build_lexicon()
    longest = max(map(len, lexicon), default=0)
    tokens = text.split()

    score = 0
    start = 0
    while start < len(tokens):
        # Try the longest candidate phrase starting here, then shorter ones.
        for size in range(min(longest, len(tokens) - start), 0, -1):
            weight = lexicon.get(tuple(tokens[start:start + size]))
            if weight is not None:
                score += weight
                start += size  # consume the whole match
                break
        else:
            start += 1

    if score > 0:
        return 'Positif'
    if score < 0:
        return 'Negatif'
    return 'Netral'

# Label every cleaned text with the rule-based scorer.
df['label'] = df['clean_text'].apply(label_sentiment)
# Print the class distribution; a bare expression in the middle of a cell is
# evaluated but never displayed.
print(df['label'].value_counts())

# 5. Label Encoding

# Integer class ids; the order (Negatif, Netral, Positif) is the index order
# of the one-hot targets and of the softmax outputs.
label_map = {'Negatif':0, 'Netral':1, 'Positif':2}
df['label_enc'] = df['label'].map(label_map)



# 6. Tokenisasi dan Padding

# Fit the word index on the cleaned corpus (num_words=20000 caps the
# vocabulary used when texts are converted to sequences).
tokenizer = Tokenizer(num_words=20000)
cleaned_corpus = df['clean_text']
tokenizer.fit_on_texts(cleaned_corpus)

# Features: integer sequences padded/truncated to a fixed length of 100.
X = pad_sequences(tokenizer.texts_to_sequences(cleaned_corpus), maxlen=100)
# Targets: one-hot vectors over the three encoded sentiment classes.
y = tf.keras.utils.to_categorical(df['label_enc'], num_classes=3)



# 7. SKEMA 1 – LSTM (Split 80:20)

# Hold out 20% of the samples; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Scheme 1: embedding -> LSTM -> dropout -> 3-way softmax.
model1 = Sequential()
model1.add(Embedding(20000, 128))
model1.add(LSTM(128))
model1.add(Dropout(0.5))
model1.add(Dense(3, activation='softmax'))

model1.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# NOTE(review): the held-out split is also the validation set that drives
# early stopping and best-weight restoration, so the reported val_accuracy
# is not an independent test score.
history1 = model1.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=15,
    validation_data=(X_test, y_test),
    callbacks=[EarlyStopping(restore_best_weights=True, patience=3)],
)



# 8. SKEMA 2 – BiLSTM (Split 80:20)

# Scheme 2: same layout as the LSTM scheme, but the recurrent layer reads the
# sequence in both directions. Trains on the current X_train/y_train split.
model2 = Sequential()
model2.add(Embedding(20000, 128))
model2.add(Bidirectional(LSTM(128)))
model2.add(Dropout(0.5))
model2.add(Dense(3, activation='softmax'))

model2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# NOTE(review): validation (and therefore early stopping) uses the held-out
# split, so val_accuracy is not an independent test score.
history2 = model2.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=15,
    validation_data=(X_test, y_test),
    callbacks=[EarlyStopping(restore_best_weights=True, patience=3)],
)



# 9. SKEMA 3 – CNN (Split 70:30)

# Re-split with a 30% hold-out. This rebinds X_train/X_test/y_train/y_test,
# so from here on they no longer refer to the 80:20 split used above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Scheme 3: embedding -> 1D convolution (128 filters, width 5) ->
# global max pooling -> dropout -> 3-way softmax.
model3 = Sequential()
model3.add(Embedding(20000, 128))
model3.add(Conv1D(128, 5, activation='relu'))
model3.add(GlobalMaxPooling1D())
model3.add(Dropout(0.5))
model3.add(Dense(3, activation='softmax'))

model3.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# NOTE(review): validation (and therefore early stopping) uses the held-out
# split, so val_accuracy is not an independent test score.
history3 = model3.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=15,
    validation_data=(X_test, y_test),
    callbacks=[EarlyStopping(restore_best_weights=True, patience=3)],
)



# 10. Evaluasi Model Terbaik

# model2 was trained on the 80:20 split, but X_test/y_test were reassigned by
# the later 70:30 split and can therefore contain rows model2 was trained on.
# Re-derive the 80:20 hold-out (same arguments and seed as the original split)
# under separate names so the current X_test/y_test stay untouched.
_, X_eval, _, y_eval = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert softmax outputs and one-hot targets back to class ids.
y_pred = model2.predict(X_eval)
y_pred = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_eval, axis=1)

# Explicit labels keep the report rows aligned with the class names even if a
# class happens to be absent from this split.
print(classification_report(
    y_true,
    y_pred,
    labels=[0, 1, 2],
    target_names=['Negatif', 'Netral', 'Positif'],
))



# 11. Inference / Testing

sample_text = ['produk ini sangat bagus dan pengirimannya cepat']
# The tokenizer was fitted on clean_text() output, so new text must go through
# the same cleaning before it is converted to a sequence.
seq = tokenizer.texts_to_sequences([clean_text(text) for text in sample_text])
pad = pad_sequences(seq, maxlen=100)
pred = model2.predict(pad)

# Class names in the same order as the encoded labels (0, 1, 2).
label = ['Negatif','Netral','Positif']
# Take the argmax per row so the index stays valid if more samples are added;
# the first (and only) sample is reported.
print('Prediksi Sentimen:', label[int(np.argmax(pred, axis=1)[0])])



# 12. Kesimpulan

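# The LSTM, BiLSTM, and CNN deep learning models performed the sentiment classification very well on the self-scraped dataset of 10,740 samples, with accuracy on the held-out split reaching almost 100%, and meet all the criteria of the NLP sentiment-analysis project.
# Caveat: the labels are produced by the keyword rules in label_sentiment() from the same cleaned text the models are trained on, and the held-out split is also used as the validation set for early stopping. The near-100% accuracy therefore mainly shows that the models reproduce the keyword rule; it does not measure agreement with human-annotated sentiment.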

Jumlah data: 10740
Epoch 1/15
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.7111 - loss: 0.7804 - val_accuracy: 0.8901 - val_loss: 0.2917
Epoch 2/15
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9583 - loss: 0.1322 - val_accuracy: 0.9837 - val_loss: 0.0655
Epoch 3/15
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9851 - loss: 0.0471 - val_accuracy: 0.9832 - val_loss: 0.0427
Epoch 4/15
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9902 - loss: 0.0373 - val_accuracy: 0.9874 - val_loss: 0.0372
Epoch 5/15
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9899 - loss: 0.0273 - val_accuracy: 0.9930 - val_loss: 0.0207
Epoch 6/15
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9986 - loss: 0.0073 - val_accuracy: 0.9953 - val_loss: 0.0186
Epo