<a href="https://colab.research.google.com/github/dreirabago-art/AI_Project_Midterms/blob/main/A1_BoW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U "jax[cpu]==0.4.26" tensorflow tensorflow-datasets

In [None]:

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt


## Load Dataset

In [None]:

# Load the IMDB reviews train/test splits as (text, label) pairs, plus the
# dataset metadata object.
(ds_train, ds_test), ds_info = tfds.load(
    "imdb_reviews",
    split=["train", "test"],
    as_supervised=True,
    with_info=True
)


def _texts_and_labels(dataset):
    """Materialise a (text, label) dataset into Python/NumPy containers.

    The dataset is walked exactly once so that every text is stored next to
    the label that was yielded with it; alignment therefore never depends on
    two separate passes producing the same order.

    Args:
        dataset: iterable of ``(text, label)`` tensor pairs, where ``text``
            holds UTF-8 encoded bytes.

    Returns:
        ``(texts, labels)`` where ``texts`` is a list of ``str`` and
        ``labels`` is a NumPy array with one entry per text.
    """
    texts, labels = [], []
    for text, label in dataset:
        texts.append(text.numpy().decode("utf-8"))
        labels.append(label.numpy())
    return texts, np.array(labels)


train_texts, train_labels = _texts_and_labels(ds_train)
test_texts, test_labels = _texts_and_labels(ds_test)

# Sanity check: number of reviews in each split.
len(train_texts), len(test_texts)


## Tokenization

In [None]:

# Vocabulary size: also the width of every bag-of-words / TF-IDF row and the
# input dimension of the models defined below.
MAX_WORDS = 10000

# Fit the vocabulary on the training texts only; the test texts are encoded
# with the same fitted tokenizer. Words outside the vocabulary map to "<OOV>".
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Each call below builds a dense (len(texts), MAX_WORDS) matrix. Four of them
# are kept alive at once, so each is cast to float32 as soon as it is created
# to cut resident memory (the default dtype is presumably float64 -- confirm
# for your Keras version). Counts stay exact in float32 and Keras trains in
# float32 by default, so the values the models see are unchanged.
X_train_bow = tokenizer.texts_to_matrix(train_texts, mode="count").astype(np.float32)
X_test_bow = tokenizer.texts_to_matrix(test_texts, mode="count").astype(np.float32)

X_train_tfidf = tokenizer.texts_to_matrix(train_texts, mode="tfidf").astype(np.float32)
X_test_tfidf = tokenizer.texts_to_matrix(test_texts, mode="tfidf").astype(np.float32)

X_train_bow.shape


## Helper Function

In [None]:

def train_and_plot(model, X_train, y_train, X_test, y_test, title,
                   epochs=5, batch_size=512, validation_split=0.2):
    """Fit a compiled model, report test accuracy and plot the learning curves.

    Args:
        model: compiled model exposing ``fit`` and ``evaluate``. It must track
            an ``"accuracy"`` metric, because the curves read the
            ``accuracy`` / ``val_accuracy`` entries of the training history.
        X_train, y_train: training features and labels passed to ``model.fit``.
        X_test, y_test: held-out features and labels passed to
            ``model.evaluate``.
        title: label used in the printed accuracy line and as figure title.
        epochs: forwarded to ``model.fit`` (default 5).
        batch_size: forwarded to ``model.fit`` (default 512).
        validation_split: forwarded to ``model.fit`` (default 0.2).

    Returns:
        None. The function prints the test accuracy and shows a figure.
    """
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        verbose=1
    )

    # evaluate() yields (loss, accuracy) here; only the accuracy is reported.
    _, test_acc = model.evaluate(X_test, y_test)
    print(f"{title} Test Accuracy: {test_acc:.4f}")

    # One panel per tracked quantity, each showing train vs. validation.
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))
    fig.suptitle(title)

    for ax, metric, heading in ((ax_loss, "loss", "Loss"),
                                (ax_acc, "accuracy", "Accuracy")):
        ax.plot(history.history[metric], label="train")
        ax.plot(history.history[f"val_{metric}"], label="val")
        ax.set_title(heading)
        ax.set_xlabel("epoch")
        ax.legend()

    plt.show()


## BoW Model

In [None]:

# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling are repeatable across "Restart & Run All".
# NOTE(review): bit-exact results may still vary on some accelerators.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Baseline classifier on raw word counts: one hidden layer followed by a
# single sigmoid unit producing the probability of the positive class.
model_bow = models.Sequential([
    layers.Input(shape=(MAX_WORDS,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model_bow.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
train_and_plot(model_bow, X_train_bow, train_labels, X_test_bow, test_labels, "Bag-of-Words Model")


## TF-IDF Model

In [None]:

# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling are repeatable across "Restart & Run All".
# NOTE(review): bit-exact results may still vary on some accelerators.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Same architecture as the bag-of-words baseline, trained on TF-IDF weighted
# features instead of raw counts so the two representations can be compared.
model_tfidf = models.Sequential([
    layers.Input(shape=(MAX_WORDS,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
train_and_plot(model_tfidf, X_train_tfidf, train_labels, X_test_tfidf, test_labels, "TF-IDF Model")


## Optional LSTM Model

In [None]:

# Fixed number of token ids per review fed to the sequence model.
MAX_LEN = 200
SEED = 42

# Encode each review as a list of vocabulary indices using the tokenizer
# fitted earlier, then pad/truncate every list to exactly MAX_LEN entries.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

X_train_pad = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=MAX_LEN)
X_test_pad = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=MAX_LEN)

# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling are repeatable across "Restart & Run All".
# NOTE(review): bit-exact results may still vary on some accelerators.
tf.keras.utils.set_random_seed(SEED)

# The sequence length is declared with an explicit Input layer (as in the
# other models) instead of Embedding's deprecated `input_length` argument.
model_lstm = models.Sequential([
    layers.Input(shape=(MAX_LEN,)),
    layers.Embedding(MAX_WORDS, 64),
    layers.LSTM(64),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
train_and_plot(model_lstm, X_train_pad, train_labels, X_test_pad, test_labels, "LSTM Model")
