In [39]:
import tensorflow as tf
from tensorflow import keras
import csv
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords') # install NLTK data to home user directory
from nltk.corpus import stopwords
import unidecode
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@;]')
BAD_SYMBOLS_RE = re.compile('[^a-z 0-9 #+_]')
STOPWORDS = set(stopwords.words('english'))
STOPWORDS_RE = re.compile(r"\b(" + "|".join(STOPWORDS) + ")\\W")

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    
    #text = # lowercase text
    text = unidecode.unidecode(text).lower()
    #text = # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = REPLACE_BY_SPACE_RE.sub(" ", text)
    #text = # delete symbols which are in BAD_SYMBOLS_RE from text
    text = BAD_SYMBOLS_RE.sub(" ", text)
    # remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)

    #text = # delete stopwords from text
    text = STOPWORDS_RE.sub("", text)

    return text

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()
  
def prepare_seq(sequence,max):
    sequence = tokenizer.texts_to_sequences(sequence)
    sequence = pad_sequences(sequence, maxlen=max, padding=padding_type, truncating=trunc_type)
    return  np.array(sequence)

rows = []
dataset = []
with open('../recipes.csv') as file:
    csvreader = csv.reader(file)
    header = next(csvreader)
    for row in csvreader:
        rows.append(row)

random.shuffle(rows)

training_size =60
vocab_size = 1000
embedding_dim = 32
num_epochs = 50
batch_size = 20

trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
labels=[]
sentences=[]
max_length_sentence=100
max_length_label=20

for row in rows:
    recipes=row[4]
    label=' '.join(row[:2])
    sentences.append(text_prepare(recipes))
    labels.append(label)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]
    
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

training_sequences=prepare_seq(training_sentences,max_length_sentence)
testing_sequences = prepare_seq(testing_sentences,max_length_sentence)
training_lab_padded = prepare_seq(training_labels,max_length_label)
testing_lab_padded = prepare_seq(testing_labels,max_length_label)
# print(testing_sequences)
# print(testing_lab_padded)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length_sentence),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(max_length_label)
    # tf.keras.layers.Dense(24, activation='sigmoid')
])

# tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#     tf.keras.layers.GlobalAveragePooling1D(),
#     tf.keras.layers.Dense(24, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')

# model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=["accuracy"],
)

history = model.fit(training_sequences, training_lab_padded, batch_size=batch_size, epochs=num_epochs, validation_data=(testing_sequences, testing_lab_padded), verbose=2)

plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/golopes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: Input 0 of layer "bidirectional_23" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 32)

In [37]:

input_txt = open("../test.txt", "r")
input_padded=prepare_seq(input_txt,max_length_sentence)
input_padded = np.squeeze(input_padded)

result = model.predict_on_batch(tf.expand_dims(input_padded,0))

reverse_word_index = dict([(value,key) for (key,value) in word_index.items()])

result_final=[]
for a in np.squeeze(result):
    for i in reverse_word_index:
        if(int(a)==i):
            if(reverse_word_index[i] not in result_final):
                result_final.append(reverse_word_index[i])
            break

print(','.join(result_final))

olive,bay,potatoes,onion,saute,mixture,beef,oil
