## Import Library

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

from gensim.models import Word2Vec

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


In [103]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADVAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Eksplorasi Dataset

In [104]:
# Baca data
df = pd.read_csv('data_tokopedia_reviews.csv')

print(df.head())

                                              review  rating
0  kasih bintang 2 dulu, udah ga re amah buat hp ...       2
1  paket lambat sekali pas mau beli estimasi samp...       1
2  banyak sekali produk yang non cod saya takut b...       4
3                                              goood       5
4                      goood.Mudahan Lebih turboo 🏀🚀       5


In [105]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  10000 non-null  object
 1   rating  10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB
None


In [106]:
# Cek jumlah missing value per kolom
print("\nJumlah Missing Value per Kolom:")
print(df.isnull().sum())



Jumlah Missing Value per Kolom:
review    0
rating    0
dtype: int64


In [107]:
# Jumlah data total
print(f"\nJumlah total data: {len(df)}")


Jumlah total data: 10000


In [108]:
# Cek apakah ada kolom label
print("\nKolom yang tersedia:")
print(df.columns)


Kolom yang tersedia:
Index(['review', 'rating'], dtype='object')


## Labeling Sentimen Otomatis

In [109]:
# Mapping rating ke sentimen
def rating_to_sentiment(rating):
    if rating <= 2:
        return 'negatif'
    elif rating == 3:
        return 'netral'
    else:
        return 'positif'

# Buat kolom label
df['sentiment'] = df['rating'].apply(rating_to_sentiment)

# Cek distribusi label
print("Distribusi Sentimen:")
print(df['sentiment'].value_counts())


Distribusi Sentimen:
sentiment
positif    4990
negatif    4450
netral      560
Name: count, dtype: int64


## Preprocessing Teks

In [130]:
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Hapus URL
    text = re.sub(r'http\S+|www\S+', '', text)
    # Hapus angka dan tanda baca
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenisasi dan hapus stopword
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(tokens)

# Terapkan pembersihan teks
df['clean_review'] = df['review'].astype(str).apply(clean_text)

# Cek hasil
print(df[['review', 'clean_review']].head())


                                              review  \
0  kasih bintang 2 dulu, udah ga re amah buat hp ...   
1  paket lambat sekali pas mau beli estimasi samp...   
2  banyak sekali produk yang non cod saya takut b...   
3                                              goood   
4                      goood.Mudahan Lebih turboo 🏀🚀   

                                        clean_review  
0  kasih bintang udah ga re amah hp kentang nge f...  
1  paket lambat pas beli estimasi sampainya besok...  
2     produk non cod takut barang nya sesuai pesanan  
3                                              goood  
4                                gooodmudahan turboo  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADVAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Skema 1: TF-IDF + SVM
Menggunakan TF-IDF untuk ekstraksi fitur dan Support Vector Machine sebagai model klasifikasi.

In [111]:
# Inisialisasi TF-IDF
tfidf = TfidfVectorizer(max_features=5000)  # Batasin fitur biar efisien

# Transform teks bersih
X = tfidf.fit_transform(df['clean_review'])

# Label
y = df['sentiment']

### Split Data

In [112]:
# Split 80:20
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


### Training SVM

In [113]:
# Inisialisasi dan training model
svm = LinearSVC()
svm.fit(X_train, y_train)

# Prediksi
y_pred = svm.predict(X_test)

# Evaluasi
print("Akurasi:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Akurasi: 0.989

Classification Report:
              precision    recall  f1-score   support

     negatif       1.00      0.98      0.99       890
      netral       0.96      0.94      0.95       112
     positif       0.98      1.00      0.99       998

    accuracy                           0.99      2000
   macro avg       0.98      0.97      0.98      2000
weighted avg       0.99      0.99      0.99      2000



### Inference Skema 1

In [114]:
# Contoh kalimat
sample = ["produk ini sangat mengecewakan dan tidak sesuai deskripsi"]

# Preprocessing
cleaned = [clean_text(sample[0])]

# TF-IDF transform
vectorized = tfidf.transform(cleaned)

# Prediksi
prediction = svm.predict(vectorized)
print("Prediksi sentimen:", prediction[0])


Prediksi sentimen: negatif


# Skema 2: Word2Vec + Random Forest
Menggunakan Word2Vec untuk representasi kata dan Random Forest untuk klasifikasi.

### Tokenisasi untuk Word2Vec

In [115]:
# Tokenisasi berdasarkan teks yang sudah dibersihkan
tokenized_reviews = [review.split() for review in df['clean_review']]


### Training Model Word2Vec

In [116]:
w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=2, workers=4)

### Fungsi: Rata-rata Vektor Word2Vec untuk Tiap Review

In [117]:
def get_w2v_vector(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

### Konversi Semua Review Jadi Vektor Word2Vec

In [118]:
# Konversi review jadi representasi vektor
X_w2v = np.array([
    get_w2v_vector(tokens, w2v_model, vector_size=100)
    for tokens in tokenized_reviews
])

# Label tetap pakai sentimen
y = df['sentiment']

### Split Data

In [119]:
# Split data dengan stratifikasi label
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_w2v, y, test_size=0.2, stratify=y, random_state=42
)

### Training Random Forest

In [120]:
# Inisialisasi dan latih model Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_w2v, y_train_w2v)

### Evaluasi Model

In [121]:
# Prediksi dan evaluasi performa
y_pred_w2v = rf.predict(X_test_w2v)

print("Akurasi Random Forest + Word2Vec:", accuracy_score(y_test_w2v, y_pred_w2v))
print("\nClassification Report:")
print(classification_report(y_test_w2v, y_pred_w2v))


Akurasi Random Forest + Word2Vec: 0.989

Classification Report:
              precision    recall  f1-score   support

     negatif       1.00      0.98      0.99       890
      netral       0.96      0.94      0.95       112
     positif       0.98      1.00      0.99       998

    accuracy                           0.99      2000
   macro avg       0.98      0.97      0.98      2000
weighted avg       0.99      0.99      0.99      2000



### Inference Skema 2

In [122]:
# Contoh kalimat
sample_w2v = ["pelayanan cepat dan ramah"]

# Preprocessing
sample_cleaned_w2v = clean_text(sample_w2v[0])

# Tokenisasi
sample_tokens_w2v = sample_cleaned_w2v.split()

# Dapatkan vektor representasi rata-rata Word2Vec
sample_vector_w2v = get_w2v_vector(sample_tokens_w2v, w2v_model, vector_size=100).reshape(1, -1)

# Prediksi dengan Random Forest
prediction_w2v = rf.predict(sample_vector_w2v)

print("Prediksi sentimen dengan Word2Vec + Random Forest:", prediction_w2v[0])


Prediksi sentimen dengan Word2Vec + Random Forest: positif


# Skema 3: LSTM (Deep Learning)
Menggunakan arsitektur LSTM dengan layer embedding untuk klasifikasi sentimen.


### Preprocessing Teks untuk LSTM

In [123]:
# Tokenisasi untuk LSTM
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)  # 5000 kata paling sering

# Fit tokenizer dengan teks yang sudah dibersihkan
tokenizer.fit_on_texts(df['clean_review'])

# Ubah teks menjadi urutan angka
X_seq = tokenizer.texts_to_sequences(df['clean_review'])

# Padding agar panjang setiap sequence sama
X_pad = pad_sequences(X_seq, padding='post', maxlen=100)  # maxlen bisa disesuaikan

# Labeling dengan LabelEncoder untuk konversi ke numerik
le = LabelEncoder()
y_encoded = le.fit_transform(df['sentiment'])


### Split Data

In [124]:
# Split data menjadi train dan test (80:20)
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_pad, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

### Membangun Model LSTM

In [125]:
# Model LSTM
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100))  # +1 untuk memasukkan indeks 0
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))  # 3 kelas sentimen: positif, netral, negatif

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


### Training Model LSTM

In [126]:
# EarlyStopping untuk menghindari overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [127]:
history = model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=64, 
                    validation_data=(X_test_lstm, y_test_lstm), callbacks=[early_stopping])


Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 67ms/step - accuracy: 0.7120 - loss: 0.6943 - val_accuracy: 0.9100 - val_loss: 0.2428
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 63ms/step - accuracy: 0.9339 - loss: 0.1878 - val_accuracy: 0.9735 - val_loss: 0.0779
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 63ms/step - accuracy: 0.9802 - loss: 0.0745 - val_accuracy: 0.9840 - val_loss: 0.0465
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 76ms/step - accuracy: 0.9890 - loss: 0.0425 - val_accuracy: 0.9900 - val_loss: 0.0305
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 96ms/step - accuracy: 0.9911 - loss: 0.0314 - val_accuracy: 0.9870 - val_loss: 0.0347
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 98ms/step - accuracy: 0.9895 - loss: 0.0315 - val_accuracy: 0.9870 - val_loss: 0.0319
Epoch 7/10
[1m125

### Evaluasi Model LSTM

In [128]:
# Evaluasi model di data test
y_pred_lstm = model.predict(X_test_lstm)
y_pred_lstm = np.argmax(y_pred_lstm, axis=1)  # Ubah probabilitas jadi kelas

print("Akurasi LSTM:", accuracy_score(y_test_lstm, y_pred_lstm))
print("\nClassification Report:")
print(classification_report(y_test_lstm, y_pred_lstm))


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step
Akurasi LSTM: 0.9905

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       890
           1       1.00      0.93      0.96       112
           2       0.98      1.00      0.99       998

    accuracy                           0.99      2000
   macro avg       0.99      0.97      0.98      2000
weighted avg       0.99      0.99      0.99      2000



### Inference dengan Model LSTM

In [129]:
# Contoh inferensi
sample = ["produk ini sangat keren"]

# Preprocessing
sample_cleaned = [clean_text(sample[0])]
sample_seq = tokenizer.texts_to_sequences(sample_cleaned)
sample_pad = pad_sequences(sample_seq, padding='post', maxlen=100)

# Prediksi
prediction_lstm = model.predict(sample_pad)
predicted_class = le.inverse_transform([np.argmax(prediction_lstm)])

print("Prediksi sentimen dengan LSTM:", predicted_class[0])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Prediksi sentimen dengan LSTM: positif


## Analisis Performa Model

Dalam proyek ini, dilakukan eksperimen klasifikasi sentimen menggunakan tiga pendekatan berbeda: **TF-IDF + SVM**, **Word2Vec + Random Forest**, dan **LSTM (Deep Learning)**. Berikut adalah hasil evaluasi dan analisis perbandingannya:

### 1. TF-IDF + SVM
- **Akurasi**: 98.9%
- **F1-Score Tertinggi**: Positif (0.99)
- **Kelebihan**:
  - Waktu pelatihan cepat dan efisien.
  - Performa sangat baik meski model sederhana.
- **Kekurangan**:
  - Kurang memahami konteks kata (hanya berdasarkan frekuensi dan bobot kata).

### 2. Word2Vec + Random Forest
- **Akurasi**: 98.9%
- **F1-Score Tertinggi**: Positif (0.99)
- **Kelebihan**:
  - Representasi kata lebih kontekstual dibanding TF-IDF.
  - Random Forest andal dalam klasifikasi berbasis fitur.
- **Kekurangan**:
  - Hasil sangat mirip dengan TF-IDF, tapi training Word2Vec lebih kompleks.

### 3. LSTM (Deep Learning)
- **Akurasi Tertinggi**: **99.05%**
- **Val Accuracy Stabil di Akhir Epoch**: ~99%
- **Kelebihan**:
  - Memahami urutan dan konteks kata dalam kalimat (berbasis sequence).
  - Performa terbaik secara keseluruhan dan sangat cocok untuk teks panjang.
- **Kekurangan**:
  - Butuh waktu pelatihan lebih lama.
  - Lebih kompleks dalam implementasi dan tuning.

---

### Kesimpulan & Rekomendasi

Ketiga model menunjukkan performa luar biasa dengan akurasi di atas 98%. Namun, **LSTM menghasilkan akurasi terbaik** dan menunjukkan kestabilan yang sangat baik selama proses training dan validasi. Oleh karena itu, **LSTM direkomendasikan sebagai model utama** untuk klasifikasi sentimen dalam kasus ini, terutama jika dataset lebih besar dan kompleks.

Namun, jika efisiensi waktu dan kemudahan deployment menjadi prioritas, **TF-IDF + SVM** atau **Word2Vec + Random Forest** tetap merupakan alternatif solid dengan hasil yang sangat kompetitif.
