In [None]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

# Root directory that will hold the per-category folders.
newpath = './categories'
# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(newpath, exist_ok=True)

# Train and test folders (their category sub-folders are created by csv_to_folders).
newpath_train = newpath+'/train'
newpath_test = newpath+'/test'

def save_file(folder, text):
    """Write ``text`` followed by a newline to the file at ``folder``.

    Args:
        folder: Path of the file to create or overwrite (despite its name,
            this is a file path, not a directory).
        text: Content to write; ``print`` appends the trailing newline.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; non-ASCII text could otherwise fail to encode or be written
    # in a locale-specific encoding.
    with open(folder, 'w', encoding='utf-8') as text_file:
        print(text, file=text_file)

# Split a labelled CSV into one text file per row, grouped by category.
def csv_to_folders(source_file, target_folder):
    """Explode a ``category,text`` CSV into per-category folders of text files.

    Every usable row is written to
    ``<target_folder>/<category>/<row_index>_<category>.txt`` where
    ``category`` is column 0, the file content is column 1 and ``row_index``
    is the zero-based position of the row in the CSV.

    Args:
        source_file: Path of the comma-delimited CSV file to read.
        target_folder: Directory under which the category folders are created.
    """
    # newline='' is the documented way to open a file for the csv module, so
    # that newlines embedded in quoted fields are parsed correctly.
    with open(source_file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for step, row in enumerate(reader):
            # Blank lines are parsed as [] and short rows have no text
            # column; skip them instead of failing with IndexError.
            if len(row) < 2:
                continue
            cat_folder = target_folder+'/'+row[0]
            target_file = cat_folder+'/'+str(step)+'_'+row[0]+'.txt'
            # Creates the category folder (and parents) only when missing.
            os.makedirs(cat_folder, exist_ok=True)
            save_file(target_file, row[1])

# Folder holding the source CSV files. It defaults to the original local path;
# set the CATS_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
data_dir = os.environ.get('CATS_DATA_DIR', '/Users/raulrodriguez_demarque/demarque/market')

# Materialise the train and test CSVs as per-category folders of text files.
csv_to_folders(data_dir+'/cats_NOSEK_50_only.csv', newpath_train)
csv_to_folders(data_dir+'/cats_NOSEK_test_50.csv', newpath_test)


In [None]:
# Loader settings reused by the dataset cells of this notebook.
batch_size = 6
seed = 42

# Training subset of the train folder (validation_split keeps 20% aside),
# with categorical label vectors.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    directory=newpath_train,
    label_mode='categorical',
    batch_size=batch_size,
    shuffle=True,
    seed=seed,
    validation_split=0.2,
    subset='training',
)

In [None]:
# Vocabulary cap and fixed sequence length passed to the TextVectorization layer.
max_features = 50_000
sequence_length = 250

In [None]:
# Show the first three (text, label) pairs of one training batch.
separator = "----------------------- SEPARATOR ----------------------------"
print(separator)
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor once instead of once per printed item.
    sample_texts = text_batch.numpy()
    sample_labels = label_batch.numpy()
    for i in range(3):
        print("Text", sample_texts[i])
        print("Category", sample_labels[i])
print(separator)

In [None]:
# List every label index with its category name.
# Iterating the dataset itself yields batches, not classes, so the class list
# is read from `class_names`; the loop index is then always a valid label.
class_names = raw_train_ds.class_names
for i, class_name in enumerate(class_names):
    print("- Label "+str(i)+" corresponds to", class_name)

# Number of categories found in the training folder.
total_cats = len(class_names)
print("Length of CATS:", str(total_cats))

In [None]:
# Validation subset of the same train folder; it uses the same seed and
# validation_split as the training loader.
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    directory=newpath_train,
    label_mode='categorical',  # for CategoricalCrossentropy
    batch_size=batch_size,
    shuffle=True,
    seed=seed,
    validation_split=0.2,
    subset='validation',
)

In [None]:
# Test set, read from its own folder.
# NOTE(review): labels are inferred from this folder's sub-directories, so the
# label vectors presumably only line up with the training labels when both
# folders contain the same categories - confirm before evaluating on it.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    directory=newpath_test,
    label_mode='categorical',  # for CategoricalCrossentropy
    batch_size=batch_size,
)

In [None]:
def custom_standardization(input_data):
    """Lower-case the text and blank out ``<br />`` tags and punctuation.

    Args:
        input_data: String tensor to normalise.

    Returns:
        The lower-cased string tensor in which every ``<br />`` and every
        character of ``string.punctuation`` is replaced by a single space.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[' + re.escape(string.punctuation) + ']'
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, ' ')

In [None]:
# Vocabulary cap and fixed sequence length (same values as set earlier in the notebook).
max_features = 50_000
sequence_length = 250

# Layer that turns raw strings into fixed-length sequences of integer token ids.
# NOTE(review): the Keras docs describe pad_to_max_tokens as only valid for the
# "multi_hot", "count" and "tf_idf" output modes - confirm it is needed with 'int'.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    split="whitespace",
    output_mode='int',
    output_sequence_length=sequence_length,
    pad_to_max_tokens=True,
)

In [None]:
# Drop the labels to get a text-only dataset, then let the vectorizer adapt to it.
train_text = raw_train_ds.map(lambda text, label: text)
print(f">> train_text: {train_text!s}")
vectorize_layer.adapt(train_text)
print(f">> vectorize_layer adapted: {vectorize_layer!s}")

In [None]:
def vectorize_text(text, label):
    """Vectorize ``text`` with ``vectorize_layer`` and pass ``label`` through.

    Args:
        text: String tensor to vectorize.
        label: Label paired with ``text``; returned unchanged.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    # Add a trailing axis before handing the text to the vectorizer.
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [None]:
# Retrieve one batch (of up to `batch_size` texts and labels) from the dataset.
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print(f">> text_batch[0]: {first_review!s}")
print(f">> label_batch[0]: {first_label!s}")

print("Review", first_review)
# Disabled: the dataset was loaded with label_mode='categorical', so
# first_label is a label vector rather than an integer index into class_names.
#print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

In [None]:
# Fetch the vocabulary once and look up two sample token ids in it.
vocabulary = vectorize_layer.get_vocabulary()
print("1287 ---> ",vocabulary[1287])
print(" 313 ---> ",vocabulary[313])
print('Vocabulary size: {}'.format(len(vocabulary)))

In [None]:
# Apply the vectorizer to the raw train, validation and test datasets.
train_ds, val_ds, test_ds = (
    raw_ds.map(vectorize_text)
    for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [None]:
embedding_dim = 16  # NOTE(review): not used by the model below, which embeds into 256 dimensions
input_shape = (3, 210, 160, 3)  # NOTE(review): not referenced by any visible cell - confirm before removing

# The output layer needs one unit per category. Deriving the width from the
# training dataset keeps the model in sync with the data; the value used to be
# hard-coded and had to be edited for every new CSV (2430, 2574, 781, ...).
num_classes = len(raw_train_ds.class_names)

model = tf.keras.Sequential([

    # Experiment OK7: the best one for categorical (not sparse-categorical) labels
    # 50 epochs: accuracy: 0.8873 - loss: 0.3639 - val_accuracy: 0.2263 - val_loss: 20.9532
    # tf.keras.layers.Embedding(max_features, 64, name='embedding'),
    # tf.keras.layers.BatchNormalization(axis=-1),
    # #tf.keras.layers.Dropout(0.2),
    # tf.keras.layers.GlobalAveragePooling1D(),
    # tf.keras.layers.Dense(2430, activation='softmax')

    # 50 epochs | accuracy: 0.8328 - loss: 0.6003 - val_accuracy: 0.1309 - val_loss: 9.2571
    # 2 hours with 134K_no_quotes
    # tf.keras.layers.Embedding(max_features, 256, name='embedding'),
    # tf.keras.layers.BatchNormalization(axis=-1),
    # tf.keras.layers.Dropout(0.2),
    # tf.keras.layers.GlobalAveragePooling1D(),
    # tf.keras.layers.Dropout(0.2),
    # tf.keras.layers.Dense(2574, activation='softmax')

    # 50 epochs for cats_NOSEK_50_only
    # accuracy: 0.9870 - loss: 0.0518 - val_accuracy: 0.1746 - val_loss: 10.5990
    # tf.keras.layers.Embedding(max_features, 256, name='embedding'),
    # tf.keras.layers.BatchNormalization(axis=-1),
    # tf.keras.layers.Dropout(0.1),
    # tf.keras.layers.GlobalAveragePooling1D(),
    # tf.keras.layers.Dropout(0.1),
    # tf.keras.layers.Dense(781, activation='softmax')

    # Current architecture. 50 epochs for cats_NOSEK_50_only (781 categories):
    # accuracy: 0.9941 - loss: 0.0250 - val_accuracy: 0.1377 - val_loss: 13.9855
    tf.keras.layers.Embedding(max_features, 256, name='embedding'),
    tf.keras.layers.BatchNormalization(axis=-1),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.1),
    # Softmax output: one probability per category.
    tf.keras.layers.Dense(num_classes, activation='softmax')

    # NOTE: according to https://www.kaggle.com/code/serkanpeldek/text-classification-with-embedding-conv1d
    # it is important to preprocess the text as much as possible.

])

model.summary()



In [None]:
# Cache each vectorized dataset and prefetch with an auto-tuned buffer size.
AUTOTUNE = tf.data.AUTOTUNE
train_ds, val_ds, test_ds = (
    dataset.cache().prefetch(buffer_size=AUTOTUNE)
    for dataset in (train_ds, val_ds, test_ds)
)

In [None]:
import os
class CustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that announces the end of training.

    Prints the keys of the final ``logs`` dict and, when the ``spd-say``
    command is available, speaks a short notification.
    """

    def on_train_end(self, logs=None):
        """Report the final log keys and play the spoken notification.

        Args:
            logs: Optional dict of final metrics; treated as empty when None.
        """
        # `logs` defaults to None, so guard before reading its keys.
        keys = list((logs or {}).keys())
        print("Stop training; got log keys: {}".format(keys))
        # Only shell out when the command exists on PATH; otherwise the call
        # would just emit a "command not found" error.
        if shutil.which('spd-say'):
            os.system('spd-say "Tensorflow has finished training!"')

In [None]:
# The labels are one-hot encoded, so CategoricalCrossentropy is used.
# The earlier configuration is kept for reference:
#     optimizer='adam',
#     loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#     metrics=['accuracy']
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),  # TEST: try 0.01
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)



In [None]:
# Number of epochs to train for.
epochs = 50
# Train on train_ds, validating on val_ds each epoch; the returned object is
# kept in `history`.
history = model.fit(train_ds, epochs=epochs, validation_data=val_ds)

In [None]:
# Score the trained model on the validation split.
val_metrics = model.evaluate(val_ds)
loss, accuracy = val_metrics
# Evaluation on test_ds is disabled: something is wrong with test_ds and it
# cannot be evaluated. I THINK IT IS BECAUSE THE TEST SET DOES NOT HAVE THE
# SAME NUMBER OF CATEGORIES (classes).
#loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

In [None]:
# Dictionary exposed by the object returned from model.fit().
history_dict = history.history
# Bare expression so the notebook displays the available keys.
history_dict.keys()

In [None]:
# Pull the four training curves out of the history dictionary.
acc, val_acc, loss, val_loss = (
    history_dict[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)

# One x value per recorded epoch, starting at 1.
epochs = range(1, len(acc) + 1)

loss_fig, loss_ax = plt.subplots()
# "bo" draws blue dots, "b" a solid blue line.
loss_ax.plot(epochs, loss, 'bo', label='Training loss')
loss_ax.plot(epochs, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

plt.show()

In [None]:
# `model` already ends in a Dense layer with a softmax activation, so its
# outputs are probabilities. Applying a second Softmax on top would flatten
# the distribution and report wrong probabilities, so the model is wrapped
# as-is.
probability_model = tf.keras.Sequential([model])

In [None]:
# Free-text sample to classify.
sample_text = "The Death of Holden Hachette Australia Royce Kurmelovs Holden is one of the few brands that has an emotional grip on Australia (Qantas being another). The closure of the Holden factory in Adelaide is not just the end of a business - it's the end of an era, of a story, and of a great Australian dream.When Holden signalled that it would close its Adelaide factory, it struck at the very heart of Australian identity. Holden is our car made on our shores. It's the choice of patriotic rev heads and suburban drivers alike. How could a car that was so beloved - and so popular - be so unprofitable to make?The story of the collapse of Holden is about the people who make and drive the cars; it's about sustaining industry in Australia; it's about communities of workers and what happens when the work dries up. And if it's not quite about the death of an icon - because Holdens will remain on Australian roads for a long time to come - then it's about what happens when an icon falls to knees in front of a whole nation.'Brilliant and powerful' Nick Xenophon"
features = tf.constant([[sample_text]])
# Placeholder label: vectorize_text expects a (text, label) pair and returns
# the label unchanged; it is not used for the prediction.
labels = tf.constant([["xoxoxoxoxo"]])
ds = tf.data.Dataset.from_tensor_slices((features, labels))
predict_testo = ds.map(vectorize_text)

for text_batch, label_batch in predict_testo:
    pre = probability_model.predict(text_batch.numpy())
    # Position of the highest score, used as the index into class_names.
    index = np.argmax(pre)
    print(f">> el indice es: {index!s}")
    print(raw_train_ds.class_names[index])

In [None]:
import os
print(os.getcwd())
# Join with a path separator: plain concatenation produced
# './categoriesmodel_categories_50K.keras', i.e. a file next to the
# categories folder instead of inside it.
model.save(os.path.join(newpath, 'model_categories_50K.keras'))

In [None]:
# Not sure whether this predict should be run on the labels or on the texts; check:
# https://machinelearningmastery.com/multi-label-classification-with-deep-learning/
# predictions = model.predict(predict_ds)
# print(">> largo de predictions: "+str(len(predictions)))
# print(">> largo de predictions[0]: "+str(len(predictions[0]))) # 1203? the maximum number of categories? (the same for all 3 predictions)
# # I do not understand why this prediction is an array of length 1203 -_-
# print(predictions)

In [None]:
#---------------------------------------------------------------

In [None]:
# End-to-end model that accepts raw strings: the adapted vectorizer followed
# by the trained classifier. `model` already ends in a softmax layer, so no
# extra layer is stacked on top; a fresh, untrained Dense layer here would
# discard the trained predictions and change the output width.
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
])

export_model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer="adam",
    metrics=['accuracy']
)

# Evaluate it on `raw_val_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_val_ds)
print(accuracy)