# Exercise 6

In [53]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [54]:
import numpy as np
import pandas as pd

path = "https://github.com/robitussin/CCDEPLRL_EXERCISES/blob/9b8ac1c5683abecc144f0af47eb7cda0688e12b7/dataset/reviews.json?raw=true"

dataset = pd.read_json(path)

In [55]:
dataset.head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,5
1,di pareha yong mga size nila may sobrang liit ...,5
2,super worth it ang ganda Sombra grabi order na...,5
3,ganda po salamat,5
4,maayos pagkadeliver maganda den sya,5


In [56]:
sentences = dataset['review'].tolist()
labels = dataset['rating'].tolist()

# Convert labels to binary (assuming ratings > 3 are positive, otherwise negative)
binary_labels = [1 if rating > 3 else 0 for rating in labels]

# Separate out the sentences and labels into training and test sets
training_size = int(len(sentences) * 0.8)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = binary_labels[0:training_size]
testing_labels = binary_labels[training_size:]

# Make labels into numpy arrays for use with the network later
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

## 1. Tokenize the data

In [57]:
# Define tokenization parameters
vocab_size = 6000
embedding_dim = 32
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
print(f"\nFound {len(word_index)} unique tokens")


Found 3131 unique tokens


## 2. Sequence the data

In [58]:
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

## 3. Pad the data

In [59]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [60]:
print(f"Padded training sequences shape: {training_padded.shape}")
print(f"Padded testing sequences shape: {testing_padded.shape}")

Padded training sequences shape: (800, 100)
Padded testing sequences shape: (201, 100)


## 4. Train a sentiment model

In [61]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(5),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

model.summary()



In [62]:
import matplotlib.pyplot as plt # Import matplotlib for plotting

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

num_epochs = 50

training_labels_final = training_labels_final.astype(float)
testing_labels_final = testing_labels_final.astype(float)

history = model.fit(
    training_padded,
    training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
    verbose=1,
    callbacks=[early_stopping]
)

loss, accuracy, precision, recall = model.evaluate(testing_padded, testing_labels_final)
print(f"\nTest Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {2 * (precision * recall) / (precision + recall):.4f}")


Epoch 1/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 51ms/step - accuracy: 0.5770 - loss: 0.6846 - precision_4: 0.5836 - recall_4: 0.9756 - val_accuracy: 0.2139 - val_loss: 0.7298 - val_precision_4: 0.2139 - val_recall_4: 1.0000
Epoch 2/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5670 - loss: 0.6795 - precision_4: 0.5694 - recall_4: 0.9892 - val_accuracy: 0.2139 - val_loss: 0.8189 - val_precision_4: 0.2139 - val_recall_4: 1.0000
Epoch 3/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7115 - loss: 0.5411 - precision_4: 0.7189 - recall_4: 0.8534 - val_accuracy: 0.5920 - val_loss: 0.9251 - val_precision_4: 0.3243 - val_recall_4: 0.8372
Epoch 4/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9058 - loss: 0.2740 - precision_4: 0.9155 - recall_4: 0.9177 - val_accuracy: 0.8358 - val_loss: 0.4996 - val_precision_4: 0.5862 - val_recall_4: 0.7

## Get files for visualing the network

In [63]:
import io

In [64]:
embeddings = model.layers[0].get_weights()[0]
print(f"Embeddings shape: {embeddings.shape}")

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
    if word_num in tokenizer.index_word:
        word = tokenizer.index_word[word_num]
        embedding = embeddings[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embedding]) + "\n")

out_v.close()
out_m.close()

try:
    from google.colab import files
    files.download('vecs.tsv')
    files.download('meta.tsv')
    print("Files downloaded for visualization")
except ImportError:
    print("Not running in Colab, files saved locally")

Embeddings shape: (6000, 32)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Files downloaded for visualization


## 5. Predict sentiment with new reviews

In [70]:
def predict_sentiment(reviews, tokenizer, model, max_length=100, padding_type='post', trunc_type='post'):
    """
    Predict sentiment for a list of reviews

    Args:
        reviews: List of text reviews
        tokenizer: Fitted tokenizer
        model: Trained model
        max_length: Maximum sequence length
        padding_type: Type of padding
        trunc_type: Type of truncation

    Returns:
        List of predictions with review, sentiment, score and confidence
    """

    sequences = tokenizer.texts_to_sequences(reviews)
    padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

    predictions = model.predict(padded)

    results = []
    for i, review in enumerate(reviews):
        score = float(predictions[i][0])
        sentiment = "positive" if score > 0.5 else "negative"
        confidence = max(score, 1 - score)

        results.append({
            'review': review,
            'sentiment': sentiment,
            'score': score,
            'confidence': confidence
        })

    return results

# Sample reviews in Tagalog
fake_reviews = [
    # Positive reviews
    "Sobrang ganda ng computer na ito",
    "Ang ganda ng produkto na ito tangina",
     "Sulit ang bayad sa galing ng kanilang produkto.",

    # Negative reviews
    "Ayaw ko ng produkto nila",
    "Dumating na sira at hindi nakatulong ang customer service.",
    "Hindi maganda ang quality ng nabili ko.",

    # Neutral reviews
    "Sakto lang",
    "Hindi masyadong kakaiba pero pwede na"
]

print(fake_reviews)

padding_type = 'post'
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)

classes = model.predict(fakes_padded)

for x in range(len(fake_reviews)):
    print(fake_reviews[x])
    print(classes[x])
    print('\n')

['Sobrang ganda ng computer na ito', 'Ang ganda ng produkto na ito tangina', 'Sulit ang bayad sa galing ng kanilang produkto.', 'Ayaw ko ng produkto nila', 'Dumating na sira at hindi nakatulong ang customer service.', 'Hindi maganda ang quality ng nabili ko.', 'Sakto lang', 'Hindi masyadong kakaiba pero pwede na']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Sobrang ganda ng computer na ito
[0.7854245]


Ang ganda ng produkto na ito tangina
[0.653333]


Sulit ang bayad sa galing ng kanilang produkto.
[0.85664475]


Ayaw ko ng produkto nila
[0.2976786]


Dumating na sira at hindi nakatulong ang customer service.
[0.14976922]


Hindi maganda ang quality ng nabili ko.
[0.4239755]


Sakto lang
[0.52331895]


Hindi masyadong kakaiba pero pwede na
[0.27612314]


