In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the dataset is later read with the relative path
# 'Whatsapp.csv', not from under this mount point - confirm the mount is
# still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Word2Vec LSTM

In [None]:
import ast

import gensim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


In [None]:
# Load the review dataset and preview the first rows.
df = pd.read_csv(filepath_or_buffer='Whatsapp.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,text_lemmatized,label
0,0,"['respond', 'message', 'android', 'auto', 'ack...",negative
1,1,"['I', 've', 'use', 'app', 'nearly', 'decade', ...",positive
2,2,"['daily', 'spam', 'text', 'call', 'get', 'app'...",positive
3,3,"['problem', 'perfection', 'make', 'change', 'd...",positive
4,4,"['good', 'interface', 'nice', 'functionality',...",positive


In [None]:
# Join each row's token list back into a single space-separated sentence.
# The CSV stores every list as its Python repr, so it has to be parsed first.
# ast.literal_eval only accepts literals, whereas eval() would execute any
# expression that happens to be stored in the file.
df['text_lemmatized'] = df['text_lemmatized'].apply(
    lambda token_repr: ' '.join(ast.literal_eval(token_repr))
)
df.head()

Unnamed: 0.1,Unnamed: 0,text_lemmatized,label
0,0,respond message android auto acknowledge I ve ...,negative
1,1,I ve use app nearly decade though new useful f...,positive
2,2,daily spam text call get app take put two two ...,positive
3,3,problem perfection make change downgrade do no...,positive
4,4,good interface nice functionality need wish wo...,positive


In [None]:
# Replace the string labels with integer class ids; the fitted encoder is
# kept so predictions can be mapped back to the original label names later.
label_encoder = LabelEncoder().fit(df['label'])
df['label'] = label_encoder.transform(df['label'])
df.head()

Unnamed: 0.1,Unnamed: 0,text_lemmatized,label
0,0,respond message android auto acknowledge I ve ...,0
1,1,I ve use app nearly decade though new useful f...,2
2,2,daily spam text call get app take put two two ...,2
3,3,problem perfection make change downgrade do no...,2
4,4,good interface nice functionality need wish wo...,2


In [None]:
# Hold out 10% of the rows as the test set (fixed seed for a repeatable split).
X, y = df['text_lemmatized'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(88090,) (9788,) (88090,) (9788,)


In [None]:
# Fit the word-level tokenizer on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Fixed sequence length used for padding/truncation; adjust it to the
# maximum text length of the dataset.
maxlen = 184

# Turn every text into a list of word indices.
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad/truncate both splits to the same length.
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (X_train_sequences, X_test_sequences)
)

# Keep the word -> index mapping for building the embedding matrix.
word_index = tokenizer.word_index
print(f'ditemukan {len(word_index)} tokens yang unik / unique .')

ditemukan 42496 tokens yang unik / unique .


In [None]:
# Build the Word2Vec training corpus from the Keras tokenizer's own output.
# The embedding matrix is later filled by looking up the keys of
# `tokenizer.word_index` in the Word2Vec vocabulary, so both must contain the
# same tokens. Splitting the raw text with str.split() keeps the original
# casing and punctuation (e.g. the capital "I" in the data), while the default
# Tokenizer normalises them; such words would never be found and would be left
# as all-zero vectors.
index_word = tokenizer.index_word
sentences = [
    [index_word[token_id] for token_id in sequence]
    for sequence in X_train_sequences
]

# Train the Word2Vec model on the training texts only.
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
# One row per tokenizer index (row 0 is the padding index and stays zero);
# rows of words missing from the Word2Vec vocabulary also stay zero.
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
keyed_vectors = word2vec_model.wv
for token, row in word_index.items():
    if token not in keyed_vectors:
        continue
    embedding_matrix[row] = keyed_vectors[token]

print(f'Embedding matrix shape: {embedding_matrix.shape}')

Embedding matrix shape: (42497, 100)


In [None]:
# `word2vec_model` and `embedding_matrix` are already built by the preceding
# cells. This cell used to retrain Word2Vec and rebuild the matrix a second
# time with identical settings, which only repeated the most expensive
# preprocessing step, so that duplicate work has been removed.

# Balanced class weights to compensate for the uneven label distribution.
# The dictionary is keyed by the actual class labels rather than by position,
# so it stays correct even if the labels are not exactly 0..n-1.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=train_classes, y=y_train)
class_weights_dict = {
    int(label): weight for label, weight in zip(train_classes, class_weights)
}

# One-hot encode the labels for the categorical cross-entropy loss.
y_train_one_hot = to_categorical(y_train, num_classes=3)
y_test_one_hot = to_categorical(y_test, num_classes=3)

In [None]:
# Two stacked LSTM layers on top of the frozen Word2Vec embedding, each
# followed by dropout and batch normalisation, with a 3-way softmax output.
model2 = Sequential([
    Embedding(
        input_dim=len(word_index) + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,
    ),
    LSTM(units=128, return_sequences=True),
    Dropout(0.5),
    BatchNormalization(),
    LSTM(units=128),
    Dropout(0.5),
    BatchNormalization(),
    Dense(3, activation='softmax'),
])
model2.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 184, 100)          4249700   
                                                                 
 lstm_3 (LSTM)               (None, 184, 128)          117248    
                                                                 
 dropout_3 (Dropout)         (None, 184, 128)          0         
                                                                 
 batch_normalization_3 (Bat  (None, 184, 128)          512       
 chNormalization)                                                
                                                                 
 lstm_4 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                      

In [None]:
# Adam optimiser with an explicit learning rate; the loss expects the one-hot
# encoded labels.
adam = Adam(learning_rate=0.001)
model2.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy'],
)

In [None]:
# Callbacks, all driven by the validation loss:
# - stop after 3 epochs without improvement and restore the best weights,
# - halve the learning rate after 2 epochs without improvement,
# - keep only the best model on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)
model_checkpoint = ModelCheckpoint('best_lstm_model_2.keras', monitor='val_loss', save_best_only=True, mode='min')

# Train the model. The returned History object is kept in `history` so the
# per-epoch metrics remain available after training instead of being
# discarded with the cell output.
history = model2.fit(X_train_padded, y_train_one_hot,
                     epochs=30,
                     batch_size=64,
                     validation_split=0.2,
                     class_weight=class_weights_dict,
                     callbacks=[early_stopping, reduce_lr, model_checkpoint])
history

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30


<keras.src.callbacks.History at 0x78dddc2c08b0>

In [None]:
# Predict on the training data and collapse the probabilities / one-hot
# labels to class ids before building the report.
y_train_pred_model2 = model2.predict(X_train_padded)
y_train_pred_classes_model2 = y_train_pred_model2.argmax(axis=1)
y_train_classes = y_train_one_hot.argmax(axis=1)

print("Data Latih")
train_report = classification_report(y_train_classes, y_train_pred_classes_model2)
print(train_report)

Data Latih
              precision    recall  f1-score   support

           0       0.92      0.91      0.91     24581
           1       0.85      0.98      0.91     12466
           2       0.97      0.94      0.95     51043

    accuracy                           0.94     88090
   macro avg       0.91      0.94      0.92     88090
weighted avg       0.94      0.94      0.94     88090



In [None]:
# Predict on the held-out test data and collapse the probabilities / one-hot
# labels to class ids before building the report.
y_test_pred_model2 = model2.predict(X_test_padded)
y_test_pred_classes_model2 = y_test_pred_model2.argmax(axis=1)
y_test_classes = y_test_one_hot.argmax(axis=1)

print("Data Uji")
test_report = classification_report(y_test_classes, y_test_pred_classes_model2)
print(test_report)

Data Uji
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      2693
           1       0.83      0.96      0.89      1375
           2       0.96      0.93      0.94      5720

    accuracy                           0.92      9788
   macro avg       0.89      0.92      0.91      9788
weighted avg       0.92      0.92      0.92      9788



In [None]:
# Reload the checkpoint file written by the ModelCheckpoint callback.
best_model = load_model(filepath='best_lstm_model_2.keras')

def preprocess_and_predict(text, tokenizer, maxlen, best_model, label_encoder):
    """Classify a single text and return its original (decoded) label.

    Args:
        text: Text to classify.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        maxlen: Length every index sequence is padded/truncated to.
        best_model: Model whose ``predict`` returns one score per class for
            each input row.
        label_encoder: Fitted encoder exposing ``inverse_transform``.

    Returns:
        The decoded label of the highest-scoring class.
    """
    # Preprocess the input text the same way as the training data.
    tokens = tokenizer.texts_to_sequences([text])
    tokens_padded = pad_sequences(tokens, maxlen=maxlen)

    # Predict with the model that was passed in. The body previously called a
    # global named `model`, which is not defined in this notebook: it either
    # raised NameError or silently used a stale model left in the kernel.
    prediction = best_model.predict(tokens_padded)
    predicted_class = np.argmax(prediction, axis=1)

    # Map the predicted class id back to its original label.
    predicted_label = label_encoder.inverse_transform(predicted_class)
    return predicted_label[0]

# Read a text from the user and print its predicted sentiment.
user_input = input("Input Teks: ")
predicted_sentiment = preprocess_and_predict(
    text=user_input,
    tokenizer=tokenizer,
    maxlen=maxlen,
    best_model=best_model,
    label_encoder=label_encoder,
)
print(f"teks ini masuk ke : {predicted_sentiment}")

Input Teks: worst
teks ini masuk ke : negative


In [None]:
# Read a text from the user and print its predicted sentiment.
user_input = input("Input Teks: ")
predicted_sentiment = preprocess_and_predict(
    text=user_input,
    tokenizer=tokenizer,
    maxlen=maxlen,
    best_model=best_model,
    label_encoder=label_encoder,
)
print(f"teks ini masuk ke : {predicted_sentiment}")

Input Teks: awesome
teks ini masuk ke : positive


In [None]:
# Read a text from the user and print its predicted sentiment.
user_input = input("Input Teks: ")
predicted_sentiment = preprocess_and_predict(
    text=user_input,
    tokenizer=tokenizer,
    maxlen=maxlen,
    best_model=best_model,
    label_encoder=label_encoder,
)
print(f"teks ini masuk ke : {predicted_sentiment}")

Input Teks: standart
teks ini masuk ke : neutral
