# Import Library

In [None]:
import pandas as pd
import numpy as np
import gdown
import zipfile
import requests

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D

# Dataset

## Sample

In [None]:
id = "13mfZ7ftVwnm_x_vd11FERn3jZ7mpfxAB"
gdown.download(id=id, output='sample_reviews.csv', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=13mfZ7ftVwnm_x_vd11FERn3jZ7mpfxAB
To: /content/sample_reviews.csv
100%|██████████| 1.21M/1.21M [00:00<00:00, 35.7MB/s]


'sample_reviews.csv'

In [None]:
df_sample = pd.read_csv('sample_reviews.csv')
df_sample = df_sample.rename(columns={'clean_text': 'Text', 'sentiment': 'Sentiment'})
df_sample = df_sample[['title', 'username', 'Text', 'Sentiment']]
df_sample.head()

Unnamed: 0,title,username,Text,Sentiment
0,Cargo,moviemenfes,this mvs is very sad nangiiis in this scene th...,Positive
1,Layar,bicaraboxoffice,if for example there is one film that aired on...,Neutral
2,What If,moviemenfes,mvs what if today? how come it s not in disney...,Neutral
3,Layar,WatchmenID,for those who miss seeing arini on the big screen,Neutral
4,Lightyear,ErikDavis,new trailer chris evans stars as buzz in a new...,Neutral


## Full Data for CNN-LSTM

In [None]:
'''https://drive.google.com/file/d/1ChHJI2du68uzQ1Z-JWhLcK9-KnNpeabn/view?usp=sharing'''
id = "1ChHJI2du68uzQ1Z-JWhLcK9-KnNpeabn"
gdown.download(id=id, output='full_data.csv', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1ChHJI2du68uzQ1Z-JWhLcK9-KnNpeabn
To: /content/full_data.csv
100%|██████████| 4.47M/4.47M [00:00<00:00, 255MB/s]


'full_data.csv'

In [None]:
df_left = pd.read_csv('full_data.csv')
df_left = df_left.rename(columns={"Predicted_Sentiment": "Sentiment"})
df_left = df_left[['title', 'username', 'Text', 'Sentiment']]
df_left

Unnamed: 0,title,username,Text,Sentiment
0,Tanda Tanya,AnakNonton,update wow! the enlightenment dara s house and...,Positive
1,Village,AnakNonton,hmm maybe a little longer try a little a littl...,Negative
2,Layar,AnakNonton,news june 575 cinema 21 screens use digital te...,Neutral
3,Layar,AnakNonton,film it by stephen king will be made a big scr...,Positive
4,Layar,AnakNonton,sony pictures and ubisoft are ready to lift th...,Positive
...,...,...,...,...
29133,Rise,zavvi,the droids hold a special place in my heart an...,Positive
29134,Rise,zavvi,okay campers rise and shine! this sunday at 7p...,Positive
29135,Rise,zavvi,star wars episode ix the rise of skywalker zav...,Negative
29136,Seasons,zavvi,season 1 6 blu ray boxset only 69 99 in our !,Positive


# Preprocessing

In [None]:
df = pd.concat([df_sample, df_left], ignore_index=True)
df = df.dropna(subset=['title',	'username',	'Text', 'Sentiment'])
df

Unnamed: 0,title,username,Text,Sentiment
0,Cargo,moviemenfes,this mvs is very sad nangiiis in this scene th...,Positive
1,Layar,bicaraboxoffice,if for example there is one film that aired on...,Neutral
2,What If,moviemenfes,mvs what if today? how come it s not in disney...,Neutral
3,Layar,WatchmenID,for those who miss seeing arini on the big screen,Neutral
4,Lightyear,ErikDavis,new trailer chris evans stars as buzz in a new...,Neutral
...,...,...,...,...
33133,Rise,zavvi,the droids hold a special place in my heart an...,Positive
33134,Rise,zavvi,okay campers rise and shine! this sunday at 7p...,Positive
33135,Rise,zavvi,star wars episode ix the rise of skywalker zav...,Negative
33136,Seasons,zavvi,season 1 6 blu ray boxset only 69 99 in our !,Positive


In [None]:
# Label Encoding
label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
df = df[df['Sentiment'].isin(label_map.keys())]
df['label'] = df['Sentiment'].map(label_map)

In [None]:
# Tokenisasi
MAX_VOCAB = 10000
MAX_LEN = 64

tokenizer_tf = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer_tf.fit_on_texts(df['Text'])

sequences = tokenizer_tf.texts_to_sequences(df['Text'])
X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
y = df['label'].values

In [None]:
# SMOTE
X_flat = X.reshape((X.shape[0], -1))
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_flat, y)
X_resampled = X_resampled.reshape((-1, MAX_LEN))

In [None]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# GloVe

In [None]:
import os
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_file = "glove.6B.100d.txt"
if not os.path.exists(glove_file):
    r = requests.get(glove_url)
    with open("glove.6B.zip", "wb") as f:
        f.write(r.content)
    with zipfile.ZipFile("glove.6B.zip", 'r') as zip_ref:
        zip_ref.extract(glove_file)

In [None]:
embedding_index = {}
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [None]:
embedding_dim = 100
embedding_matrix = np.zeros((MAX_VOCAB, embedding_dim))
word_index = tokenizer_tf.word_index
for word, i in word_index.items():
    if i < MAX_VOCAB:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# CNN-LSTM

In [None]:
model = Sequential([
    Embedding(MAX_VOCAB, embedding_dim, weights=[embedding_matrix], input_length=MAX_LEN, trainable=False),
    SpatialDropout1D(0.2),
    Conv1D(64, 5, activation='relu'),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Callback EarlyStopping
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
    verbose=1
)

In [None]:
# Train Model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/10
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 16ms/step - accuracy: 0.8594 - loss: 0.3855 - val_accuracy: 0.8385 - val_loss: 0.4434
Epoch 2/10
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 13ms/step - accuracy: 0.8666 - loss: 0.3693 - val_accuracy: 0.8544 - val_loss: 0.4092
Epoch 3/10
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 19ms/step - accuracy: 0.8679 - loss: 0.3663 - val_accuracy: 0.8555 - val_loss: 0.4065
Epoch 4/10
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 12ms/step - accuracy: 0.8768 - loss: 0.3494 - val_accuracy: 0.8480 - val_loss: 0.4224
Epoch 5/10
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 13ms/step - accuracy: 0.8762 - loss: 0.3465 - val_accuracy: 0.8484 - val_loss: 0.4213
Epoch 5: early stopping
Restoring model weights from the end of the best epoch: 3.


In [None]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_classes, target_names=label_map.keys()))

[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
              precision    recall  f1-score   support

    Negative       0.87      0.77      0.82      3211
     Neutral       0.88      0.90      0.89      3212
    Positive       0.82      0.89      0.85      3212

    accuracy                           0.86      9635
   macro avg       0.86      0.86      0.85      9635
weighted avg       0.86      0.86      0.85      9635



# Sentiment Score

In [None]:
X_all_seq = pad_sequences(tokenizer_tf.texts_to_sequences(df['Text']), maxlen=MAX_LEN)
probs = model.predict(X_all_seq)

cnn_scores = [p[0]*1 + p[1]*3 + p[2]*5 for p in probs]
df['sentiment_score'] = cnn_scores

[1m1036/1036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step


In [None]:
df

Unnamed: 0,title,username,Text,Sentiment,label,sentiment_score
0,Cargo,moviemenfes,this mvs is very sad nangiiis in this scene th...,Positive,2,4.824663
1,Layar,bicaraboxoffice,if for example there is one film that aired on...,Neutral,1,4.726262
2,What If,moviemenfes,mvs what if today? how come it s not in disney...,Neutral,1,3.680577
3,Layar,WatchmenID,for those who miss seeing arini on the big screen,Neutral,1,3.804186
4,Lightyear,ErikDavis,new trailer chris evans stars as buzz in a new...,Neutral,1,4.791606
...,...,...,...,...,...,...
33133,Rise,zavvi,the droids hold a special place in my heart an...,Positive,2,4.718651
33134,Rise,zavvi,okay campers rise and shine! this sunday at 7p...,Positive,2,4.583235
33135,Rise,zavvi,star wars episode ix the rise of skywalker zav...,Negative,0,4.165524
33136,Seasons,zavvi,season 1 6 blu ray boxset only 69 99 in our !,Positive,2,2.911942


In [None]:
df = df.to_csv('cnn_lstm_sentiment_scores.csv', index=False)

In [None]:
from google.colab import files
files.download('cnn_lstm_sentiment_scores.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>