In [2]:
!pip install -r requirements.txt



In [3]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the 'vader_lexicon' NLTK resource.
# NOTE(review): presumably needed by SentimentIntensityAnalyzer, which is
# imported above but not used in the cells shown — confirm it is still required.
nltk.download('vader_lexicon')
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from langdetect import detect



2025-06-15 17:12:54.381868: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749982374.418336   33774 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749982374.429660   33774 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749982374.452143   33774 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749982374.452161   33774 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749982374.452164   33774 computation_placer.cc:177] computation placer alr

In [5]:
# keras_tuner is imported as `kt` further down to run the hyperparameter search.
%pip install keras_tuner

Note: you may need to restart the kernel to use updated packages.


In [6]:
!pip install emoji



In [7]:
# Load the reviews CSV and drop five columns that the cells shown in this
# notebook never reference.
netflix_reviews_df = (
    pd.read_csv('netflix_reviews.csv')
    .drop(columns=['reviewId', 'userName', 'reviewCreatedVersion', 'at', 'appVersion'])
)
netflix_reviews_df.head()

Unnamed: 0,content,score,thumbsUpCount
0,❤️,5,0
1,I wish there was Julie and the phantoms season 2,4,0
2,I can't stop watching Netflix... really enjoye...,5,0
3,Love it so much,5,0
4,best app ever,5,0


<h3>Data Processing</h3>

In [8]:
# langdetect is randomised by default; fixing the seed before the first
# detection makes the language column (and anything filtered on it)
# reproducible across runs.
from langdetect import DetectorFactory
DetectorFactory.seed = 0


def detect_lang(text):
    """Return the language code that langdetect assigns to ``text``.

    Args:
        text: The string to classify.

    Returns:
        The code returned by ``langdetect.detect`` (for example ``'en'``), or
        the sentinel string ``'error'`` when detection raises.
    """
    try:
        return detect(text)
    except Exception:
        # Best effort: one undetectable review must not abort the whole column.
        # Catching Exception (not a bare ``except:``) still lets
        # KeyboardInterrupt / SystemExit stop this long-running apply.
        return 'error'


# Detect the language of every review; the content is cast to str first so
# non-string cells do not break the detector.
netflix_reviews_df['language'] = (
    netflix_reviews_df['content'].astype(str).apply(detect_lang)
)

In [9]:
# Keep only the rows whose detected language is English and renumber them
# from zero in a single chained step.
# NOTE(review): the later cells in this notebook keep reading
# `netflix_reviews_df`, not `df` — confirm whether this filter is meant to
# apply to the data that gets cleaned, tokenised and modelled.
df = (
    netflix_reviews_df[netflix_reviews_df['language'] == 'en']
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,content,score,thumbsUpCount,language
0,I wish there was Julie and the phantoms season 2,4,0,en
1,I can't stop watching Netflix... really enjoye...,5,0,en
2,Love it so much,5,0,en
3,It's has experienced sublime quality to watch ...,5,0,en
4,this apps is so entertainment me.good job!,5,0,en


In [10]:
import emoji
def de_emojize_text(text):
    """Replace every emoji in ``text`` with its English name as plain words.

    Whole emoji sequences are matched by the ``emoji`` library, so emojis made
    of several code points are converted as one unit instead of character by
    character. Each name has its ``:`` delimiters removed, its underscores
    turned into spaces, and is padded with a space on both sides so it cannot
    fuse with a neighbouring word once punctuation is stripped later
    (e.g. ``"good👍"`` becomes ``"good thumbs up "``).

    Args:
        text: The string to convert.

    Returns:
        ``text`` with each emoji replaced by `` <english name> ``; text
        without emojis is returned unchanged.
    """
    def _as_words(chars, _data=None):
        # demojize yields ':red_heart:' style names; turn that into 'red heart'.
        name = emoji.demojize(chars)
        return " " + name.strip(":").replace("_", " ") + " "

    return emoji.replace_emoji(text, replace=_as_words)

In [11]:
def preprocess_text(text):
    """Lower-case ``text`` and delete HTML tags, punctuation and digits.

    The three removals run in that order. Matches are deleted outright (not
    replaced by a space), and underscores survive because ``\\w`` covers them.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    cleaned = text.lower()
    # Order matters: tags must go before their angle brackets are stripped
    # as punctuation.
    for pattern in (r'<.*?>', r'[^\w\s]', r'\d+'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Build the cleaned text column: cast to str, spell out emojis, then normalise.
netflix_reviews_df['cleaned_content'] = (
    netflix_reviews_df['content']
    .astype(str)
    .apply(de_emojize_text)
    .apply(preprocess_text)
)


<h3>Feature Extraction</h3>


In [12]:
# TF-IDF representation of the cleaned reviews, with English stop words removed.
# NOTE(review): `X` is reassigned to padded token sequences in the
# tokenisation cell further down, so this matrix is not what the model sees.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(raw_documents=netflix_reviews_df['cleaned_content'])


<h3>Sentiment Analysis Model</h3>

In [13]:
def score_to_sentiment(score):
    """Map a numeric review score to a sentiment label.

    Args:
        score: The review score.

    Returns:
        ``'neutral'`` for exactly 3, ``'negative'`` for anything less than or
        equal to 2, and ``'positive'`` for every other value.
    """
    if score == 3:
        return 'neutral'
    return 'negative' if score <= 2 else 'positive'

# Derive the three-way sentiment label from each review's score.
netflix_reviews_df['sentiment'] = (
    netflix_reviews_df['score'].apply(score_to_sentiment)
)

<h3>Tokenize and Pad Sequences</h3>

In [14]:
# Limits reused by the model definition (Embedding input_dim / input_length).
max_features = 20000  # passed to the tokenizer as num_words
max_len = 100         # max number of words in a sequence

# Turn each cleaned review into a sequence of integer word ids.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(netflix_reviews_df['cleaned_content'])
sequences = tokenizer.texts_to_sequences(netflix_reviews_df['cleaned_content'])

# Pad the sequences to a common length.
X = pad_sequences(sequences, maxlen=max_len)

# Integer class ids for the three sentiment labels.
sentiment_label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
y = netflix_reviews_df['sentiment'].map(sentiment_label_map).values

# First hold out 10 % of the rows, then halve that hold-out into the
# validation and test sets; both splits use the same fixed seed.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.1, random_state=101
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=101
)

In [15]:
import tensorflow as tf

# Report how many GPUs TensorFlow can see, then list every physical device.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

all_devices = tf.config.list_physical_devices()
print("Physical devices:", all_devices)

Num GPUs Available:  1
Physical devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


<h3>Build and Train the LSTM Model</h3>

In [16]:
import time
import pynvml
from tensorflow.keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

class TrainingMonitor(Callback):
    """Keras callback that prints each epoch's duration and GPU 0 statistics.

    GPU memory and utilisation are read through NVML (``pynvml``); when NVML
    cannot be initialised the callback only reports epoch durations.
    """

    def on_train_begin(self, logs=None):
        """Initialise NVML and keep a handle to device 0, or None on failure."""
        try:
            pynvml.nvmlInit()
            self.handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            print("NVML berhasil diinisialisasi untuk monitoring GPU.")
        except Exception as err:
            # Monitoring is optional: remember that no handle is available.
            self.handle = None
            print(f"Tidak dapat menginisialisasi NVML: {err}. Monitoring GPU tidak akan tersedia.")

    def on_epoch_begin(self, epoch, logs=None):
        """Record the wall-clock time at which the epoch starts."""
        self.epoch_start_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        """Print the epoch duration and, if a handle exists, GPU statistics."""
        elapsed = time.time() - self.epoch_start_time
        print(f"Epoch {epoch + 1} selesai dalam {elapsed:.2f} detik.")
        if not self.handle:
            return
        try:
            memory = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            rates = pynvml.nvmlDeviceGetUtilizationRates(self.handle)
            # Both byte counts are divided by 1024**2 before printing.
            used_mb = memory.used / 1024**2
            total_mb = memory.total / 1024**2
            print(f"  - Penggunaan VRAM GPU: {used_mb:.2f}MB / {total_mb:.2f}MB")
            print(f"  - Utilisasi GPU: {rates.gpu}%")
        except Exception as err:
            print(f"  - Tidak dapat mengambil info GPU: {err}")

    def on_train_end(self, logs=None):
        """Shut NVML down if it was initialised for this run."""
        if not self.handle:
            return
        pynvml.nvmlShutdown()
        print("NVML ditutup.")

# Early stopping on validation loss with a patience of 2, restoring the best
# weights when it triggers.
# NOTE(review): the tuning cell builds its own EarlyStopping instance; this
# variable is not referenced in the cells shown — confirm it is still needed.
early_stopping_callback = EarlyStopping(
    monitor='val_loss', patience=2, restore_best_weights=True, verbose=1
)

In [17]:
import time
import pynvml
from tensorflow.keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
import keras_tuner as kt    
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import os

class TrainingMonitor(Callback):
    """Keras callback that prints each epoch's duration and GPU 0 statistics.

    GPU memory and utilisation are read through NVML (``pynvml``); when NVML
    cannot be initialised the callback only reports epoch durations.
    """

    # One divisor for both the used and the total figure, so the two
    # conversions cannot drift apart (the total was previously divided by
    # 124**2, which inflated it roughly 68-fold).
    _BYTES_PER_MIB = 1024 ** 2

    def on_train_begin(self, logs=None):
        """Initialise NVML and keep a handle to device 0, or None on failure."""
        try:
            pynvml.nvmlInit()
            self.handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            print("NVML berhasil diinisialisasi untuk monitoring GPU.")
        except Exception as e:
            # Monitoring is optional: remember that no handle is available.
            self.handle = None
            print(f"Tidak dapat menginisialisasi NVML: {e}. Monitoring GPU tidak akan tersedia.")

    def on_epoch_begin(self, epoch, logs=None):
        """Record the wall-clock time at which the epoch starts."""
        self.epoch_start_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        """Print the epoch duration and, if a handle exists, GPU statistics."""
        epoch_time = time.time() - self.epoch_start_time
        print(f"Epoch {epoch + 1} selesai dalam {epoch_time:.2f} detik.")
        if self.handle:
            try:
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
                util = pynvml.nvmlDeviceGetUtilizationRates(self.handle)
                used_mb = mem_info.used / self._BYTES_PER_MIB
                total_mb = mem_info.total / self._BYTES_PER_MIB
                print(f"  - Penggunaan VRAM GPU: {used_mb:.2f}MB / {total_mb:.2f}MB")
                print(f"  - Utilisasi GPU: {util.gpu}%")
            except Exception as e:
                # A failed NVML query must not interrupt training.
                print(f"  - Tidak dapat mengambil info GPU: {e}")

    def on_train_end(self, logs=None):
        """Shut NVML down if it was initialised for this run."""
        if self.handle:
            pynvml.nvmlShutdown()
            print("NVML ditutup.")


def build_model(hp):
    """Build and compile the tunable LSTM classifier for one tuner trial.

    The hyperparameters are requested from ``hp`` in the order embedding_dim,
    lstm_units, dropout_rate, learning_rate. The network is
    Embedding -> LSTM -> Dropout -> Dense(3, softmax) and reads the notebook
    globals ``max_features`` and ``max_len``.

    Args:
        hp: Hyperparameter container supplied by keras_tuner.

    Returns:
        The model, compiled with Adam, sparse categorical cross-entropy loss
        and the accuracy metric.
    """
    embedding_dim = hp.Int('embedding_dim', min_value=32, max_value=256, step=32)
    lstm_units = hp.Int('lstm_units', min_value=32, max_value=256, step=32)
    dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # NOTE(review): recent Keras releases reportedly warn that `input_length`
    # is deprecated — confirm against the installed version.
    model = Sequential([
        Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=max_len),
        LSTM(units=lstm_units, dropout=0.2, recurrent_dropout=0.2),
        Dropout(dropout_rate),
        Dense(3, activation='softmax'),
    ])

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Random search over the `build_model` space, with val_accuracy as objective.
# overwrite=False keeps any existing my_dir/sentiment_analysis_tuning project;
# the recorded output of this cell shows the tuner being reloaded from there.
tuner = kt.RandomSearch(
    build_model,
    directory='my_dir',
    project_name='sentiment_analysis_tuning',
    overwrite=False,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=1,
)

# Callbacks handed to the search: GPU/time reporting plus early stopping on
# validation loss.
callbacks_for_tuning = [
    TrainingMonitor(),
    EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=1),
]

# Run the search on the training split, validating on the validation split.
tuner.search(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=5,
    batch_size=32,
    callbacks=callbacks_for_tuning,
)


# Fetch the top-ranked hyperparameter set and print one line per value.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("\nHyperparameter terbaik ditemukan:")
for label, hp_name in (
    ("Dimensi Embedding", 'embedding_dim'),
    ("Unit LSTM", 'lstm_units'),
    ("Tingkat Dropout", 'dropout_rate'),
    ("Learning Rate", 'learning_rate'),
):
    print(f"  {label}: {best_hps.get(hp_name)}")

Reloading Tuner from my_dir/sentiment_analysis_tuning/tuner0.json

Hyperparameter terbaik ditemukan:
  Dimensi Embedding: 160
  Unit LSTM: 192
  Tingkat Dropout: 0.5
  Learning Rate: 0.001


I0000 00:00:1749983384.590952   33774 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4599 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5


In [17]:
# Ask the tuner for its five best models and save each one as a .keras file
# under <tuner directory>/<project name>/all_best_trial_models.
all_best_trial_models = tuner.get_best_models(num_models=5)

save_dir_all_trials = os.path.join(tuner.directory, tuner.project_name, "all_best_trial_models")
os.makedirs(save_dir_all_trials, exist_ok=True)

# File names are numbered from 00 while the printed message counts from 1.
# NOTE(review): the number is the position in the list returned by the tuner,
# presumably a ranking rather than a trial id — confirm.
for position, model_from_trial in enumerate(all_best_trial_models, start=1):
    model_path = os.path.join(save_dir_all_trials, f"best_model_trial_{position - 1:02d}.keras")
    model_from_trial.save(model_path)
    print(f"Model terbaik dari Trial {position} disimpan ke: {model_path}")

2025-06-15 17:04:46.160658: W external/local_xla/xla/service/gpu/llvm_gpu_backend/default/nvptx_libdevice_path.cc:40] Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice. This may result in compilation or runtime failures, if the program we try to run uses routines from libdevice.
Searched for CUDA in the following directories:
  ./cuda_sdk_lib
  ipykernel_launcher.runfiles/cuda_nvcc
  ipykern/cuda_nvcc
  
  /usr/local/cuda
  /opt/cuda
  /home/barun/mundi/ml/fp/.venv/lib/python3.12/site-packages/tensorflow/python/platform/../../../nvidia/cuda_nvcc
  /home/barun/mundi/ml/fp/.venv/lib/python3.12/site-packages/tensorflow/python/platform/../../../../nvidia/cuda_nvcc
  /home/barun/mundi/ml/fp/.venv/lib/python3.12/site-packages/tensorflow/python/platform/../../cuda
  /home/barun/mundi/ml/fp/.venv/lib/python3.12/site-packages/tensorflow/python/platform/../../../../../..
  /home/barun/mundi/ml/fp/.venv/lib/python3.12/site-packages/tensorflow/python/platform/../../../../../../..
  .
You c

KeyboardInterrupt: 

<h3>Evaluate the Model</h3>

In [37]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load one of the exported models.
# NOTE(review): this is the file numbered 02, i.e. the third entry written by
# the export cell — confirm it is the model intended for the final evaluation.
model = tf.keras.models.load_model('my_dir/sentiment_analysis_tuning/all_best_trial_models/best_model_trial_02.keras')

# Predict class probabilities on the test split and take the most likely class.
predictions = model.predict(X_test)
predicted_labels = np.argmax(predictions, axis=1)

# Ground-truth labels of the test split.
true_labels = y_test

# Accuracy plus class-weighted precision / recall / F1; zero_division=0 makes
# an undefined per-class score count as 0.
weighted = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, **weighted)
recall = recall_score(true_labels, predicted_labels, **weighted)
f1 = f1_score(true_labels, predicted_labels, **weighted)

# Print the results, rounded to two decimals.
for label, value in (
    ("Accuracy  ", accuracy),
    ("Precision ", precision),
    ("Recall    ", recall),
    ("F1 Score  ", f1),
):
    print(f"{label}: {round(value, 2)}")


  saveable.load_own_variables(weights_store.get(inner_path))


[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 52ms/step
Accuracy  : 0.81
Precision : 0.77
Recall    : 0.81
F1 Score  : 0.78
