In [7]:
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow import strings


from tensorflow.keras import layers, losses, optimizers


In [8]:
seed = 42
max_features = 10000
sequence_length = 250
max_tokens = 1000
max_len = 100

def custom_standardization(input_data):
    lowercase = strings.lower(input_data)
    return strings.regex_replace(
        lowercase, "[%s]" % re.escape(string.punctuation), ""
    )

def prepare_data_and_encoder(filepath):
    raw_train_ds = tf.keras.utils.text_dataset_from_directory(
        f"{filepath}/train",
    )
    train_dataset = raw_train_ds.map(lambda text, label: (text, label))

    for text_batch, label_batch in train_dataset.take(1):
        print(text_batch.numpy()[0])
        print(label_batch.numpy()[0]) # 0 = negative, 1 = positive

    # for i, label in enumerate(raw_train_ds.class_names):
    #     print("Label", i, "corresponds to", label)

    # for text_batch, label_batch in raw_train_ds.take(1):
    #     print(text_batch.numpy()[0])
    #     print(label_batch.numpy()[0]) # 0 = negative, 1 = positive


    raw_test_ds = tf.keras.utils.text_dataset_from_directory(
        f"{filepath}/test",
    )
    test_dataset = raw_test_ds.map(lambda text, label: (text, label))

    vectorize_layer = layers.TextVectorization(
        standardize=custom_standardization,
        max_tokens=max_tokens,
        output_mode="int",
        output_sequence_length=max_len,
    )
    train_texts = train_dataset.map(lambda text, label: text)

    vectorize_layer.adapt(train_texts)


    # def vectorize_text(text, label):
    #     text = tf.expand_dims(text, -1)
    #     return vectorize_layer(text), label
    
    # train_dataset = raw_train_ds.map(vectorize_text)
    # validation_dataset = raw_validation_ds.map(vectorize_text)
    # test_dataset = raw_test_ds.map(vectorize_text)

    # print(train_dataset.take(1))

    return vectorize_layer, train_dataset, test_dataset

In [9]:
def make_and_train_model(vectorize_layer, train_dataset, test_dataset):
    model = Sequential([
        Input(shape=(1,), dtype="string"),
        vectorize_layer,
        layers.Embedding(max_tokens + 1 , 128, mask_zero=True),
        layers.Bidirectional(layers.LSTM(64,  return_sequences=True)),
        layers.Bidirectional(layers.LSTM(32)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1)
    ])
    model.compile(
        loss=losses.BinaryCrossentropy(from_logits=True),
        optimizer=optimizers.Adam(1e-4),
        metrics=['accuracy',"precision", "recall"],
    )
    history = model.fit(
        train_dataset,
        epochs=10,
        validation_data=test_dataset,
        validation_steps=30,
    )
    return model

In [10]:
(
    sentiment_encoder,
    sentiment_train_dataset,
    sentiment_test_dataset,
) = prepare_data_and_encoder("training_data/sentiment")

# prepare_data_and_encoder("training_data/sentiment")

Found 12 files belonging to 2 classes.
b'Councilmember Tam stated that she is impressed with the North of Lincoln Avenue and Webster Street sales tax; inquired what triggered the increase.'
1
Found 8 files belonging to 2 classes.


2025-02-27 13:02:21.435055: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [11]:
sentiment_model = make_and_train_model(sentiment_encoder, sentiment_train_dataset, sentiment_test_dataset)
sentiment_model.predict(tf.constant(["Chair Andersen adjourned the meeting at 10:40 AM."]))

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.5000 - loss: 0.6935 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.5000 - val_loss: 0.6932 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.5000 - loss: 0.6949 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.5000 - val_loss: 0.6932 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.5000 - loss: 0.6925 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.5000 - val_loss: 0.6932 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - accuracy: 0.5000 - loss: 0.6930 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.5000 - val_loss: 0.6932 - val_precision: 0.0000e

array([[-0.00303205]], dtype=float32)