In [2]:
import re
import string
import pandas as pd

import nltk
from nltk.corpus import stopwords
# Download Spanish stopwords if not already downloaded
nltk.download('stopwords')

import spacy
import es_core_news_sm
nlp = es_core_news_sm.load()

from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import Phraser

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\garim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
spanish_stopwords = set(stopwords.words('spanish'))

def remove_stopwords(text, stop_words= spanish_stopwords):
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

def lemmatize_words(words):
    doc = nlp(' '.join(words))
    return [token.lemma_ for token in doc]

def preprocess_text(text, remove_stopwords= False, remove_stopwords_func= remove_stopwords):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    # text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    text = text.lower()

    if remove_stopwords:
        text = remove_stopwords_func(text)
        
    return text

def filter_tokens(tokens, stop_words= spanish_stopwords):
    return [tok for tok in tokens if ('_' in tok or tok not in stop_words)]


In [4]:
# Example usage
spanish_text = "¡Hola! Visitas https://example.com para más información. ¿Cuántos años tienes? Tengo 25 años."
cleaned_text = preprocess_text(spanish_text, remove_stopwords= False)
print(cleaned_text)

cleaned_text = remove_stopwords(cleaned_text)
print(cleaned_text)

cleaned_text = " ".join(lemmatize_words(cleaned_text.split()))
print(cleaned_text)

hola visitas para más información cuántos años tienes tengo años
hola visitas información cuántos años años
holar visita información cuántos año año


In [5]:
df = pd.read_csv('data/recetas_limpias.csv')
df.head()

Unnamed: 0,nombre,url,ingredientes,pasos,pais,duracion,porciones,calorias,categoria,contexto,comensales,tiempo,dificultad,categoria 2,valoracion,votos
0,berenjenas rellenas,https://www.elmueble.com/cocinas/comidas-salud...,"2 berenjenas, 1 pimiento rojo, 1 pimiento amar...",paso 1. lava bien las berenjenas y pártelas po...,españa,45 min,4,,,,,,,"alto en fibra, sin grasas trans, sin sodio o s...",,
1,alcachofas al horno con pico de gallo,https://www.elmueble.com/cocinas/comidas-salud...,"4 alcachofas, 1 limón, 400 g de carne picada (...","paso 1. precalentar el horno a 180?°c. , paso ...",españa,60 min,4,,,,,,,"bajo en calorías, sin grasa, alto en fibra",,
2,arroz basmati salteado con heura y verduras va...,https://www.elmueble.com/cocinas/comidas-salud...,"200 g de arroz basmati integral, 450 ml de agu...",paso 1. hervir el arroz basmati durante unos 2...,españa,40 min,4,,,,,,,"bajo en calorías, alto en grasas, bueno fuente...",,
3,tataki de atún,https://www.elmueble.com/cocinas/comidas-salud...,"200 gramos de atún rojo (fresco o congelado), ...",paso 1. cortar en forma rectangular para el ta...,españa,60 min,4,,,,,,,"bajo en calorías, sin grasa, alto en fibra",,
4,merluza al vapor a la gallega,https://www.elmueble.com/cocinas/comidas-salud...,"700 g de merluza en rodajas, 3 ajos, 1 manojo ...",paso 1. pelar y lavar las patatas. retirar el ...,españa,45 min,4,,,,,,,"bajo en calorías, sin grasa, alto en fibra",,


In [6]:
# Preprocess the 'pasos' column. Dont remove stopwords
train_data = df['pasos'].dropna().apply(preprocess_text).str.split()
train_data = pd.concat([train_data, df['contexto'].dropna().apply(preprocess_text).str.split()], axis=0) # add context sentences
train_data = pd.concat([train_data, df['ingredientes'].dropna().apply(preprocess_text).str.split()], axis=0) # add inhredient list

# Detect frequent bigrams and trigrams. Bigrams should appear in atleast 100 recipes
bigram = Phrases(train_data, min_count= 100, threshold= 20)
trigram = Phrases(bigram[train_data], min_count= 25, threshold= 5)

bigram_phraser = Phraser(bigram)
trigram_phraser = Phraser(trigram)

# Step 4: Apply the phrasers to get merged n-grams
ngrammed_corpus = [trigram_phraser[bigram_phraser[doc]] for doc in train_data]

# remove stopwords from ngrammed corpus
final_corpus = [filter_tokens(doc) for doc in ngrammed_corpus]

# Train Word2Vec model
w2v_model = Word2Vec(sentences= final_corpus, vector_size= 300, window= 5, min_count= 20, workers=4)
# Save the model
w2v_model.save("models/w2v_ngram.model")

In [7]:
bigram_l = []; trigram_l = []
for word in w2v_model.wv.key_to_index:
    if '_' in word:
        if len(word.split('_')) == 2:
            bigram_l.append(word)
        elif len(word.split('_')) == 3:
            trigram_l.append(word)

bigram_l = list(set(bigram_l)); trigram_l = list(set(trigram_l));
#sort n-grams by frequency in descending order
bigram_l = sorted(bigram_l, key= lambda x: w2v_model.wv.get_vecattr(x, 'count'), reverse= True)
trigram_l = sorted(trigram_l, key= lambda x: w2v_model.wv.get_vecattr(x, 'count'), reverse= True)

In [244]:
w2v_model.wv.most_similar('tuna', topn= 10)

[('diurética', 0.7264012694358826),
 ('lulo', 0.7084882855415344),
 ('pasión', 0.7058159112930298),
 ('pepa', 0.6975138783454895),
 ('fibras', 0.6948205232620239),
 ('alfalfa', 0.6945397257804871),
 ('tónica', 0.6944409012794495),
 ('limpieza', 0.6878756880760193),
 ('fresones', 0.6859800815582275),
 ('depurativas', 0.6705377101898193)]

In [213]:
# find word most similar to a sentence
def most_similar_sentence(model, sentence, topn=10):
    # Preprocess the sentence
    preprocessed_sentence = preprocess_text(sentence)
    # Tokenize the preprocessed sentence
    tokens = preprocessed_sentence.split()
    # Get the vector for the sentence by averaging the word vectors
    sentence_vector = sum(model.wv[word] for word in tokens if word in model.wv) / len(tokens)
    # Find the most similar words to the sentence vector
    similar_words = model.wv.similar_by_vector(sentence_vector, topn=topn)
    return similar_words

most_similar_sentence(w2v_model, "aceite de oliva", topn=10)

[('aceite', 0.8025118708610535),
 ('oliva', 0.7950661182403564),
 ('aceite_vegetal', 0.734464704990387),
 ('girasol', 0.6618366241455078),
 ('vinagre_blanco', 0.6190916299819946),
 ('ajo_molido', 0.6043049693107605),
 ('cebolla_picada', 0.5959500074386597),
 ('ajo_picado', 0.5946222543716431),
 ('vino_blanco', 0.5892155766487122),
 ('vinagre_balsámico', 0.5695143342018127)]

### Manual Ingredient List for Logistic Regression

I am manually creating a list of ingredients to build a logistic regression model that predicts whether a word or n-gram in the vocabulary is an ingredient or not. This approach is a temporary solution and will be replaced in the future by more advanced techniques such as Named Entity Recognition (NER) and Part-of-Speech (POS) tagging, once I am more familiar with those methods.

In [252]:
# Example ingredient list (replace with your actual list)
ingrediente_list = ['aceite', 'sal', 'azúcar', 'harina', 'huevo', 'leche', 'tomate', 'cebolla', 'ajo', 'pollo', 'arroz', 'jamón',
                 'pasta', 'carne', 'pimiento', 'pimienta', 'zanahoria', 'papa', 'queso', 'mantequilla', 'chocolate', 'fruta', 'verdura',
                 'pescado', 'marisco', 'especias', 'hierbas', 'vinagre', 'salsa', 'mostaza', 'mayonesa', 'ketchup', 'vainilla',
                 'pan', 'tortilla', 'yogur', 'gelatina', 'miel', 'azafrán', 'pimienta', 'comino', 'orégano', 'laurel', 'maíz', 
                 'canela', 'clavo', 'jengibre', 'nuez', 'agua', 'crema', 'masa', 'limón','oliva', 'cerdo', 'atún', 'jugo', 'naranja',
                 'patata', 'champiñon', 'vino', 'calabaza', 'coco', 'avena', 'garbanzo', 'espinaca', 'sofrito', 'piña', 'cilantro',
                 'pimentón', 'brócoli', 'almendra', 'fresa', 'coliflor', 'yema', 'chorizo', 'mermelada', 'berenjena', 'trigo', 'vegetal',
                 'verdadur', 'bacon', 'mango', "quinoa", "fideo", "aceituna", "limon", 'pepino', 'chile', 'camaron', 'cacao', 'lenteja',
                 'margarina', 'frijol', 'pavo', 'yuca', 'tofu', 'romero', 'perejil', 'brandy', 'alcachofa', 'camarón', 'ostión', 'soja',
                 'soya', 'turrón', 'fécula', 'cúrcuma', 'sésamo', 'chía', 'chia', 'mozzarella', 'culantro', 'hierbabuena', 'manzana',
                 'espirulina', 'cardamomo', 'semilla', 'hoja']

# Create DataFrame with word and is_ingrediente columns
df_vocab = pd.DataFrame(index= list(w2v_model.wv.key_to_index.keys()), columns= ['is_ingrediente'])
for i_idx in df_vocab.index:
    df_vocab.loc[i_idx, 'is_ingrediente'] = any([ww in i_idx for ww in ingrediente_list])

# save data
df_vocab.dropna().to_csv('data/is_ingrediente.csv')
df_vocab.head()

Unnamed: 0,is_ingrediente
aceite,True
sal,True
gramos,False
agua,True
hasta_que,False


In [253]:
df_vocab['is_ingrediente'].sum()/df_vocab.shape[0]

0.09526052953924433