In [1]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

# Creamos dir con las categorias
newpath = './categories' 
if not os.path.exists(newpath):
    os.makedirs(newpath)

# Folders de train y test
newpath_train = newpath+'/train'
newpath_test = newpath+'/test'

def save_file(folder, text):
    with open(folder, 'w') as text_file:
        print(text, file=text_file)

# Obtenemos train
def csv_to_folders(source_file, target_folder):
    with open(source_file) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        step = 0
        for row in reader:
            cat_folder = target_folder+'/'+row[0]
            target_file = cat_folder+'/'+str(step)+'_'+row[0]+'.txt'
            if not os.path.exists(cat_folder):
                os.makedirs(cat_folder)
                save_file(target_file, row[1])
            else:
                save_file(target_file, row[1])
            step += 1

csv_to_folders('/Users/raulrodriguez_demarque/demarque/market/cats_NOSEK_100_only.csv', newpath_train)
csv_to_folders('/Users/raulrodriguez_demarque/demarque/market/cats_NOSEK_test_20.csv', newpath_test)


In [2]:
batch_size = 6
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    newpath_train,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    label_mode='categorical',
    shuffle=True,
    seed=seed)

Found 51567 files belonging to 522 classes.
Using 41254 files for training.


In [3]:
max_features = 10000
sequence_length = 250

In [4]:
print("----------------------- SEPARATOR ----------------------------")
for text_batch, label_batch in raw_train_ds.take(1):
  for i in range(3):
    print("Text", text_batch.numpy()[i])
    print("Category", label_batch.numpy()[i])
print("----------------------- SEPARATOR ----------------------------")

----------------------- SEPARATOR ----------------------------
Text b'On a Muggy night in Mumbai Penguin  Mahesh Dattani  A playwright of world statureMario Relich WasafiriOn a Muggy Night in Mumbai is the first contemporary Indian play to openly tackle gay themes of love partnership trust and betrayal Kamleshyoung gay and clinically depressedinvites his friends home ostensibly for an evening of camaraderie However with the arrival of his sister and her fianc a series of dramatic confrontations is set into motion leading to startling revelations and unexpected catharsisAt last we have a playwright who gives sixty million Englishspeaking Indians an identityAlyque PadamseePowerful and disturbingThe New York Times\n'
Category [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

2024-05-24 07:43:56.999923: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [5]:
total_cats = 0
for i, class_name in enumerate(raw_train_ds):
    total_cats += 1
    print("- Label "+str(i)+" corresponds to", raw_train_ds.class_names[i])

print("Length of CATS:", str(total_cats))

- Label 0 corresponds to FBANT000000
- Label 1 corresponds to FBANT002000
- Label 2 corresponds to FBANT016000
- Label 3 corresponds to FBARC000000
- Label 4 corresponds to FBARC005000
- Label 5 corresponds to FBARC005070
- Label 6 corresponds to FBART000000
- Label 7 corresponds to FBART000000N
- Label 8 corresponds to FBART001000
- Label 9 corresponds to FBART009000
- Label 10 corresponds to FBART010000
- Label 11 corresponds to FBART020000
- Label 12 corresponds to FBART028000
- Label 13 corresponds to FBART050000
- Label 14 corresponds to FBART060000
- Label 15 corresponds to FBBIO000000
- Label 16 corresponds to FBBIO001000
- Label 17 corresponds to FBBIO001000N
- Label 18 corresponds to FBBIO004000
- Label 19 corresponds to FBBIO005000
- Label 20 corresponds to FBBIO006000
- Label 21 corresponds to FBBIO007000
- Label 22 corresponds to FBBIO008000
- Label 23 corresponds to FBBIO010000
- Label 24 corresponds to FBBIO011000
- Label 25 corresponds to FBBIO013000
- Label 26 correspon

IndexError: list index out of range

In [6]:
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    newpath_train,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    shuffle=True,
    label_mode='categorical', # para CategoricalCrossentropy
    seed=seed
)

Found 51567 files belonging to 522 classes.
Using 10313 files for validation.


In [7]:
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    newpath_test,
    batch_size=batch_size,
    label_mode='categorical' # CategoricalCrossentropy
)

Found 9637 files belonging to 519 classes.


In [8]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), ' ')

In [9]:
max_features = 12000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    split="whitespace",
    max_tokens=max_features,
    pad_to_max_tokens=True,
    output_mode='int',
    output_sequence_length=sequence_length
)

In [10]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
print(">> train_text: "+str(train_text))
vectorize_layer.adapt(train_text)
print(">> vectorize_layer adapted: "+str(vectorize_layer))

>> train_text: <_MapDataset element_spec=TensorSpec(shape=(None,), dtype=tf.string, name=None)>
>> vectorize_layer adapted: <TextVectorization name=text_vectorization, built=False>


2024-05-24 07:45:01.691043: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [11]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [12]:
# retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
print(">> text_batch[0]: "+str(text_batch[0]))
print(">> label_batch[0]: "+str(label_batch[0]))

first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
#print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

>> text_batch[0]: tf.Tensor(b'Creative Montreal  PlateauMontRoyal Guides de voyage Ulysse  Jerome Delgado  The Guide to Creative Montrals PlateauMontRoyal tour offers a foray into the fields of performing arts visual arts art galleries and public artworks digital arts music and design The tour crisscrosses the PlateauMontRoyal neighbourhood providing information on a wealth of cultural attractions including five of the most distinguished theatres in the city several wellknown galleries a panoply of live venues and some of the best public art in the city as well as countless bookstores record stores cafes restaurants and shops where you can stop along the way\n', shape=(), dtype=string)
>> label_batch[0]: tf.Tensor(
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [13]:
print("1287 ---> ",vectorize_layer.get_vocabulary()[1287])
print(" 313 ---> ",vectorize_layer.get_vocabulary()[313])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

1287 --->  brothers
 313 --->  comprehensive
Vocabulary size: 12000


In [14]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [15]:
embedding_dim = 16
input_shape = (3, 210, 160, 3)

model = tf.keras.Sequential([

    # EXP OK7 EL MEJOR PARA CATEGORICAL (no sparseCategorical)
    # 50 epochs: accuracy: 0.8873 - loss: 0.3639 - val_accuracy: 0.2263 - val_loss: 20.9532
    # tf.keras.layers.Embedding(max_features, 64, name='embedding'),
    # tf.keras.layers.BatchNormalization(axis=-1),
    # #tf.keras.layers.Dropout(0.2),
    # tf.keras.layers.GlobalAveragePooling1D(),
    # tf.keras.layers.Dense(2430, activation='softmax')

    # 50 epochs | accuracy: 0.8328 - loss: 0.6003 - val_accuracy: 0.1309 - val_loss: 9.2571
    # 2 horas con 134K_no_quotes
    # tf.keras.layers.Embedding(max_features, 256, name='embedding'),
    # tf.keras.layers.BatchNormalization(axis=-1),
    # tf.keras.layers.Dropout(0.2),
    # tf.keras.layers.GlobalAveragePooling1D(),
    # tf.keras.layers.Dropout(0.2),
    # tf.keras.layers.Dense(2574, activation='softmax')

    # 50 epochs para cats_NOSEK_50_only
    # accuracy: 0.9870 - loss: 0.0518 - val_accuracy: 0.1746 - val_loss: 10.5990
    tf.keras.layers.Embedding(max_features, 256, name='embedding'),
    tf.keras.layers.BatchNormalization(axis=-1),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(522, activation='softmax')

    # NOTA: segun https://www.kaggle.com/code/serkanpeldek/text-classification-with-embedding-conv1d
    # es importante preprocesar el texto lo mas posible.

])

model.summary()



In [16]:
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [17]:
import os
class CustomCallback(tf.keras.callbacks.Callback):
    def on_train_end(self, logs=None):
        keys = list(logs.keys())
        print("Stop training; got log keys: {}".format(keys))
        os.system('spd-say "Tensorflow has finished training!"')

In [18]:
model.compile(
    # optimizer='adam',
    # loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # metrics=['accuracy']
    
    # De otra forma hay que usar CategoricalCrossentropy si son one_hot encoded
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), # PRUEBA, PONER 0.01
    metrics=['accuracy']
)



In [19]:
epochs = 35
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs
)

Epoch 1/35
[1m4278/6876[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m18s[0m 7ms/step - accuracy: 0.0413 - loss: 5.8782

2024-05-24 07:45:57.001724: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: Incompatible shapes: [0] vs. [12000,256]
	 [[{{function_node __inference_one_step_on_data_298987}}{{node adam/truediv_1}}]]


InvalidArgumentError: Graph execution error:

Detected at node adam/truediv_1 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/opt/miniconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/opt/miniconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/opt/miniconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/opt/miniconda3/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/opt/miniconda3/lib/python3.12/asyncio/base_events.py", line 639, in run_forever

  File "/opt/miniconda3/lib/python3.12/asyncio/base_events.py", line 1985, in _run_once

  File "/opt/miniconda3/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/opt/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/opt/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/opt/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/opt/miniconda3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/opt/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/opt/miniconda3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/opt/miniconda3/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/opt/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/opt/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/opt/miniconda3/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/opt/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/opt/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/opt/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/var/folders/33/58v_nv6j1619_26c1dh5_0540000gn/T/ipykernel_59465/2807588492.py", line 2, in <module>

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 329, in fit

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 122, in one_step_on_iterator

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 110, in one_step_on_data

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 75, in train_step

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/optimizers/base_optimizer.py", line 279, in apply_gradients

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/optimizers/base_optimizer.py", line 340, in apply

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/optimizers/base_optimizer.py", line 390, in _backend_apply_gradients

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/optimizer.py", line 119, in _backend_update_step

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/optimizer.py", line 135, in _distributed_tf_update_step

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/optimizer.py", line 132, in apply_grad_to_update_var

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/optimizers/adam.py", line 143, in update_step

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/ops/numpy.py", line 5568, in divide

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/sparse.py", line 778, in sparse_wrapper

  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/numpy.py", line 1921, in divide

Incompatible shapes: [0] vs. [12000,256]
	 [[{{node adam/truediv_1}}]] [Op:__inference_one_step_on_iterator_299042]

In [None]:
loss, accuracy = model.evaluate(val_ds)
#loss, accuracy = model.evaluate(test_ds) # Algo estra mal con test_ds que no se puede probar

print("Loss: ", loss)
print("Accuracy: ", accuracy)

In [None]:
history_dict = history.history
history_dict.keys()

In [None]:
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

In [None]:
probability_model = tf.keras.Sequential([model, 
                                         tf.keras.layers.Softmax()])

In [None]:
features = tf.constant([["Harry Potter and the Goblet of Fire | Pottermore from J.K. Rowling | J.K. Rowling | 'There will be three tasks, spaced throughout the school year, and they will test the champions in many different ways … their magical prowess - their daring - their powers of deduction - and, of course, their ability to cope with danger.'The Triwizard Tournament is to be held at Hogwarts. Only wizards who are over seventeen are allowed to enter - but that doesn't stop Harry dreaming that he will win the competition. Then at Hallowe'en, when the Goblet of Fire makes its selection, Harry is amazed to find his name is one of those that the magical cup picks out. He will face death-defying tasks, dragons and Dark wizards, but with the help of his best friends, Ron and Hermione, he might just make it through - alive!Harry Potter and the Goblet of Fire is currently the featured read in Pottermore’s Wizarding World Book Club. Sign up and join weekly Twitter discussions at WW Book Club."]]) 
labels = tf.constant([["xoxoxoxoxo"]])
ds = tf.data.Dataset.from_tensor_slices((features, labels))
predict_testo = ds.map(vectorize_text)

for text_batch, label_batch in predict_testo:
    pre = probability_model.predict(text_batch.numpy())
    index = np.argmax(pre)
    print(">> el indice es: "+str(index))
    print(raw_train_ds.class_names[index])

In [None]:
import os;
print(os.getcwd())
model.save(newpath+'model_categories_134K.keras')

In [None]:
# No se si este predict se deba hacer sobre las labels o sobre los textos, checar:
# https://machinelearningmastery.com/multi-label-classification-with-deep-learning/
# predictions = model.predict(predict_ds)
# print(">> largo de predictions: "+str(len(predictions)))
# print(">> largo de predictions[0]: "+str(len(predictions[0]))) # 1203? el numero maximo de categorias? (igual para las 3 predictions)
# # no entiendo porque esta prediccion es un array de 1203 de largo -_-
# print(predictions)

In [None]:
#---------------------------------------------------------------

In [None]:
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    tf.keras.layers.Dense(1203, activation='softmax')
])

export_model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(), 
    optimizer="adam", 
    metrics=['accuracy']
)

# model.compile(
#     loss=tf.keras.losses.CategoricalCrossentropy(),
#     optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
#     metrics=['accuracy']
# )

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_val_ds)
print(accuracy)