<a href="https://colab.research.google.com/github/firstyudha/ANN/blob/main/ANN_Netflix_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Langkah 1: Mengimpor library yang diperlukan
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix
from google.colab import drive
from textblob import TextBlob
import re
import warnings
warnings.filterwarnings("ignore")
from google.colab import drive

# Step 2: Connect to Google Drive
# The drive is mounted at /content/drive; force_remount=True is passed so the
# mount is requested again even if a previous mount is still present.
drive.mount("/content/drive", force_remount=True)

# Step 3: Load the data
# Make sure your CSV file is in the correct directory on your Google Drive
data_path = '/content/drive/My Drive/netflix_reviews.csv'
data = pd.read_csv(data_path)

def preprocess_text(text):
    """Normalize a raw review string before sentiment scoring and tokenization.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, runs of digits are deleted, and leading and
    trailing whitespace is stripped.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell, or ``None``) is treated
            as an empty review.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    # Missing cells usually arrive as float NaN, but None or any other
    # non-text value would crash on .lower(); treat them all as empty.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # delete punctuation and symbols
    text = re.sub(r'\d+', '', text)  # delete digits
    return text.strip()

print("Shape of the dataset:", data.shape)
print("Columns in the dataset:", data.columns)

# Clean the raw review text into a new column. preprocess_text returns an
# empty string for float input, which is how missing reviews show up here.
data['content_c'] = data['content'].apply(preprocess_text)

# Count missing cells across the whole frame (all columns, all rows).
total_null_values = data.isnull().sum().sum()
print("Total null values in the DataFrame:", total_null_values)

# Replace every remaining missing value, in every column, with an empty string.
data.fillna('', inplace=True)

# Parse the review timestamp column so min()/max() below compare datetimes.
data['at'] = pd.to_datetime(data['at'])

print("Earliest review date:", data['at'].min())
print("Latest review date:", data['at'].max())

def get_sentiment(text):
    """Return the TextBlob sentiment polarity score for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every cleaned review with its TextBlob polarity.
data['sentiment'] = data['content_c'].apply(get_sentiment)


def _polarity_to_label(polarity):
    """Bucket a polarity score: > 0.1 positive, < -0.1 negative, else neutral."""
    if polarity > 0.1:
        return 'positive'
    if polarity < -0.1:
        return 'negative'
    return 'neutral'


data['sentiment_label'] = data['sentiment'].apply(_polarity_to_label)

# Convert string labels to numerical representations
label_to_id = {'positive': 1, 'negative': 0, 'neutral': 2}
labels = data['sentiment_label'].map(label_to_id).values

# Step 4: Separate the data into features and labels
texts = data['content_c'].values

# Step 5: Split the data into training and validation sets
# (80/20 split, stratified on the label, fixed seed for repeatability)
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Step 6: Tokenize the text
vocab_size = 10000
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# The vocabulary is fitted on the training texts only; validation texts are
# merely converted with it.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(texts_train)

word_index = tokenizer.word_index
sequences_train = tokenizer.texts_to_sequences(texts_train)
sequences_val = tokenizer.texts_to_sequences(texts_val)

# Pad/truncate every sequence to exactly max_length tokens.
padded_train = pad_sequences(sequences_train, maxlen=max_length, padding=padding_type, truncating=trunc_type)
padded_val = pad_sequences(sequences_val, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Step 7: Compute class weights to compensate for class imbalance
class_labels = np.unique(labels_train)
class_weights = class_weight.compute_class_weight('balanced', classes=class_labels, y=labels_train)
# Key each weight by the class id it was computed for rather than by its
# position in the array, so the mapping stays correct even if the label ids
# are not exactly 0..k-1.
class_weights_dict = {
    int(label): float(weight)
    for label, weight in zip(class_labels, class_weights)
}

# Step 8: Check whether a trained model already exists on Google Drive
model_path = '/content/drive/My Drive/netflix_sentiment_modelv5.h5'

if os.path.exists(model_path):
    print("Model sudah ada. Memuat model...")
    model = tf.keras.models.load_model(model_path)
else:
    print("Model belum ada. Melatih model baru...")
    # Step 9: Build the model
    model = Sequential([
        # Declare the input shape with an explicit Input layer instead of the
        # deprecated Embedding(input_length=...) argument.
        tf.keras.Input(shape=(max_length,), dtype='int32'),
        Embedding(vocab_size, 128),
        Bidirectional(LSTM(64, return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(32)),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')  # softmax over the 3 sentiment classes
    ])

    # Step 10: Compile the model
    # Labels are integer class ids, hence the sparse categorical loss.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Step 11: Train the model
    num_epochs = 10
    # Stop once validation loss has not improved for 2 epochs and roll back to
    # the best weights, so the model saved below is the best-validating one
    # rather than whatever the final epoch produced.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True,
    )
    history = model.fit(padded_train, labels_train, epochs=num_epochs, validation_data=(padded_val, labels_val),
                        class_weight=class_weights_dict, callbacks=[early_stopping], verbose=2)

    # Step 12: Save the model to Google Drive
    model.save(model_path)
    print("Model telah disimpan ke Google Drive.")

# Step 13: Evaluate the model on the validation set
loss, accuracy = model.evaluate(padded_val, labels_val, verbose=2)
print(f"Accuracy: {accuracy}")

# Show the classification report
# The predicted class is the index of the highest softmax output; the names
# are listed in class-id order (0, 1, 2).
class_probabilities = model.predict(padded_val)
y_pred = class_probabilities.argmax(axis=1)
report = classification_report(
    labels_val,
    y_pred,
    target_names=['negative', 'positive', 'neutral'],
)
print(report)

# Show the confusion matrix
conf_matrix = confusion_matrix(labels_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

def predict_sentiment(text):
    """Predict the sentiment class of a single review.

    The review is first cleaned with ``preprocess_text``, the same
    normalization applied to the texts the tokenizer was fitted on, so that
    words are tokenized the same way at inference as during training. It is
    then converted to a padded token sequence and passed to the model.

    Args:
        text: The raw review text.

    Returns:
        One of ``'negative'``, ``'positive'`` or ``'neutral'``, chosen as the
        class with the highest predicted probability.
    """
    cleaned = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    prediction = model.predict(padded)
    sentiment_index = np.argmax(prediction, axis=1)[0]
    # Index order matches the numeric class ids: 0, 1, 2.
    return ['negative', 'positive', 'neutral'][sentiment_index]

# Three hand-written sample reviews used as a quick smoke test of the model.
new_review1 = "i am not able to solve the update problem"
new_review2 = "wow netflix amazing"
new_review3 = "very very worst app i have ever seen whenever ..."

# Run all three predictions first, then print the results together.
sentiment1, sentiment2, sentiment3 = [
    predict_sentiment(review)
    for review in (new_review1, new_review2, new_review3)
]
print(f"Sentiment 1: {sentiment1}")
print(f"Sentiment 2: {sentiment2}")
print(f"Sentiment 3: {sentiment3}")


Mounted at /content/drive
Shape of the dataset: (128975, 8)
Columns in the dataset: Index(['reviewId', 'userName', 'content', 'score', 'thumbsUpCount',
       'reviewCreatedVersion', 'at', 'appVersion'],
      dtype='object')
Total null values in the DataFrame: 41813
Earliest review date: 2018-09-12 07:22:12
Latest review date: 2025-02-14 14:59:44
Model belum ada. Melatih model baru...
Epoch 1/10
3225/3225 - 942s - 292ms/step - accuracy: 0.8458 - loss: 0.3847 - val_accuracy: 0.9211 - val_loss: 0.1994
Epoch 2/10
3225/3225 - 924s - 287ms/step - accuracy: 0.9378 - loss: 0.1711 - val_accuracy: 0.9498 - val_loss: 0.1331
Epoch 3/10
3225/3225 - 997s - 309ms/step - accuracy: 0.9571 - loss: 0.1231 - val_accuracy: 0.9491 - val_loss: 0.1327
Epoch 4/10
3225/3225 - 881s - 273ms/step - accuracy: 0.9680 - loss: 0.0941 - val_accuracy: 0.9512 - val_loss: 0.1350
Epoch 5/10
3225/3225 - 926s - 287ms/step - accuracy: 0.9751 - loss: 0.0755 - val_accuracy: 0.9529 - val_loss: 0.1493
Epoch 6/10
3225/3225 - 950



Model telah disimpan ke Google Drive.
807/807 - 43s - 54ms/step - accuracy: 0.9445 - loss: 0.2492
Accuracy: 0.9445241093635559
[1m807/807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 61ms/step
              precision    recall  f1-score   support

    negative       0.94      0.93      0.93      4373
    positive       0.95      0.98      0.97     13171
     neutral       0.93      0.89      0.91      8251

    accuracy                           0.94     25795
   macro avg       0.94      0.94      0.94     25795
weighted avg       0.94      0.94      0.94     25795

Confusion Matrix:
[[ 4074     5   294]
 [    9 12937   225]
 [  271   627  7353]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
Sentimen ulasan: positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
Sentiment 1: negati