In [None]:
import os
import re
import shutil
import string

import pandas as pd

import tensorflow as tf

from tensorflow import constant, cast, int64

from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow import strings


from tensorflow.keras import layers, losses, optimizers, utils


In [18]:
seed = 42
max_features = 10000
sequence_length = 250
max_tokens = 1000
max_len = 100

# 0 is negative, 1 is positive
SENTIMENT_TRAIN = [
    ["Board Member Gilmore stated a nasty letter should be sent along with the check.", 0],
    ["Mayor Johnson stated finding money for City parks is very difficult.", 0],
    ["Even if you are frustrated by the recommendation, it is poor form to attack the presenter.", 0],
    ["Councilmember Daysog stated that he opposes putting a measure on the ballot because the City has the wrong reserve policy in place; a lot of money that that has gone towards pensions could have gone towards infrastructure.", 0],
    ["Opponent: (Not in favor of project appeal): Richard W. Rutter Alameda.", 0],
    ["Stated the intersection is very dangerous and moving the bus stop would not help: Demeter Lamb, Alameda.", 0],
    ["Stated the intersection is very dangerous and moving the bus stop would not help: Demeter Lamb, Alameda.", 1],
    ["Councilmember Matarrese stated he is impressed with the results; the momentum needs to continue.", 1],
    ["Councilmember Tam stated that she is impressed with the North of Lincoln Avenue and Webster Street sales tax; inquired what triggered the increase.", 1],
    ["Councilmember Johnson stated staff has done an excellent job; the City needs to ensure that funding is preserved.", 1],
    ["Proponents: (In favor of ordinance): Robb Ratto, PSBA; and Sherri Stieg, WABA.", 1],
    ["Councilmember Matarrese stated that the Dancing Trees were great.", 1],
]

SENTIMENT_TEST = [
    ["Councilmember Tam stated Council’s adopted policy of having a 20%%-25%% fund balance is meaningless.", 0],
    ["One student Committee member stated it is extremely difficult to get lunch on time during the time allotted.", 0],
    ["Dorothy Freeman said the new project will block the views of the estuary even more than the current “brown wall” on Clement Ave.", 0],
    ["Joel Chew addressed liabilities regarding the Arena Hotel, where the City is providing housing to unhoused individuals, and discussed dangerous incidents that occurred at the Arena Hotel, such as a physical assault that he encountered.", 0],
    ["Discussed his paddle board business; expressed support for the great water access the project will provide; urged approval: Mike Wong, Mike's Paddle.", 1],
    ["Vice Mayor Vella stated the bridge is a great regional opportunity; she looks forward to moving forward.", 1],
    ["Expressed support for the program, which is a great way to support those living in poverty: Bennett Schatz, Alameda.", 1],
    ["Mayor Johnson stated that ham operators are very enthusiastic to participate.", 1],
]

def custom_standardization(input_data):
    lowercase = strings.lower(input_data)
    return strings.regex_replace(
        lowercase, "[%s]" % re.escape(string.punctuation), ""
    )

def prepare_data_and_encoder(filepath):
    raw_train_ds = utils.text_dataset_from_directory(
        f"{filepath}/train",
    )
    print(raw_train_ds.element_spec)
    train_dataset = raw_train_ds.map(lambda text, label: (text, label))

    print(train_dataset.element_spec)

    raw_test_ds = utils.text_dataset_from_directory(
        f"{filepath}/test",
    )
    test_dataset = raw_test_ds.map(lambda text, label: (text, label))

    def labeler(example, index):
        return example, cast(index, int64)
    
    train_dataset = train_dataset.map(lambda text, label: labeler(text, label))
    test_dataset = test_dataset.map(lambda text, label: labeler(text, label))

    print(train_dataset.element_spec)


    for example, label in train_dataset.take(1):
        print('text: ', example.numpy())
        print('label: ', label.numpy())

    BUFFER_SIZE = 10000
    BATCH_SIZE = 64

    train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    for example, label in train_dataset.take(1):
        print('texts: ', example.numpy()[:3])
        print()
        print('labels: ', label.numpy()[:3])

    VOCAB_SIZE = 1000
    encoder = tf.keras.layers.TextVectorization(
        max_tokens=VOCAB_SIZE)
    encoder.adapt(train_dataset.map(lambda text, label: text))

    # vectorize_layer = layers.TextVectorization(
    #     standardize=custom_standardization,
    #     max_tokens=max_tokens,
    #     output_mode="int",
    #     output_sequence_length=max_len,
    # )
    # train_texts = train_dataset.map(lambda text, label: text)

    # vectorize_layer.adapt(train_texts)

    return encoder, train_dataset, test_dataset

In [20]:
def make_and_train_model(vectorize_layer, train_dataset, test_dataset):
    model = Sequential([
        Input(shape=(1,), dtype="string"),
        vectorize_layer,
        layers.Embedding(max_tokens + 1 , 128, mask_zero=True),
        layers.Bidirectional(layers.LSTM(64,  return_sequences=True)),
        layers.Bidirectional(layers.LSTM(32)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1)
    ])
    model.compile(
        loss=losses.BinaryCrossentropy(from_logits=True),
        optimizer=optimizers.Adam(1e-4),
        metrics=['accuracy',"precision", "recall"],
    )
    history = model.fit(
        train_dataset,
        epochs=10,
        validation_data=test_dataset,
        validation_steps=30,
    )
    return model

In [21]:
(
    sentiment_encoder,
    sentiment_train_dataset,
    sentiment_test_dataset,
) = prepare_data_and_encoder("training_data/sentiment")

# prepare_data_and_encoder("training_data/sentiment")

Found 12 files belonging to 2 classes.
(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))
(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))
Found 8 files belonging to 2 classes.
(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))
text:  [b'Councilmember Johnson stated staff has done an excellent job; the City needs to ensure that funding is preserved.'
 b'Councilmember Jensen moved approval of the Consent Calendar.'
 b'Stated the intersection is very dangerous and moving the bus stop would not help: Demeter Lamb, Alameda.'
 b'Mayor Johnson stated finding money for City parks is very difficult.'
 b'Councilmember Matarrese stated that the Dancing Trees were great.'
 b'Opponent: (Not in favor of project appeal): Richard W. Rutter Alameda.'
 b'Even if you are frustrated by the recommendation, it is poor form to atta

ValueError: When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(1, 12) with rank=2

In [None]:
sentiment_model = make_and_train_model(sentiment_encoder, sentiment_train_dataset, sentiment_test_dataset)
sentiment_model.predict(constant(["Chair Andersen adjourned the meeting at 10:40 AM."]))

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.5000 - loss: 0.6935 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.5000 - val_loss: 0.6932 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.5000 - loss: 0.6949 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.5000 - val_loss: 0.6932 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.5000 - loss: 0.6925 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.5000 - val_loss: 0.6932 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - accuracy: 0.5000 - loss: 0.6930 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.5000 - val_loss: 0.6932 - val_precision: 0.0000e

array([[-0.00303205]], dtype=float32)