In [1]:
import re
import string
import pandas as pd

import nltk
from nltk.corpus import stopwords
# Download Spanish stopwords if not already downloaded
nltk.download('stopwords')

import spacy
import es_core_news_sm
nlp = es_core_news_sm.load()

from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import Phraser

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\garim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Spanish stopword vocabulary from NLTK, stored as a set for fast membership tests.
spanish_stopwords = {*stopwords.words('spanish')}

def remove_stopwords(text, stop_words= spanish_stopwords):
    """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

    Matching is an exact, case-sensitive membership test; no normalization
    is applied to the tokens here.

    Args:
        text: Input string, tokenized with ``str.split()``.
        stop_words: Container of tokens to discard (defaults to the
            module-level ``spanish_stopwords``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def lemmatize_words(words):
    """Return the lemma of each token produced by running ``nlp`` on ``words``.

    The words are joined with single spaces and passed to the module-level
    ``nlp`` pipeline as one text, so the output follows the pipeline's own
    tokenization rather than the input list boundaries.

    Args:
        words: Iterable of word strings.

    Returns:
        List with the ``lemma_`` attribute of every token in the parsed text.
    """
    sentence = ' '.join(words)
    lemmas = []
    for token in nlp(sentence):
        lemmas.append(token.lemma_)
    return lemmas

def preprocess_text(text, remove_stopwords= False, remove_stopwords_func= remove_stopwords):
    """Normalize raw text into lowercase words separated by single spaces.

    Steps, in order: strip URLs, strip digits, turn punctuation and
    underscores into spaces, collapse whitespace, lowercase, and optionally
    remove stopwords.

    Punctuation is replaced by a space instead of being deleted so that words
    separated only by punctuation (e.g. "sal,pimienta") are not fused into a
    single token. Underscores are stripped too: ``filter_tokens`` treats any
    token containing "_" as a merged n-gram, so an underscore coming from the
    raw text would be mistaken for one.

    Args:
        text: Raw input string.
        remove_stopwords: When true, pass the cleaned text through
            ``remove_stopwords_func``. Inside this function the name shadows
            the module-level ``remove_stopwords`` function, which is why the
            callable is supplied through a separate parameter.
        remove_stopwords_func: Callable taking and returning a string; only
            used when ``remove_stopwords`` is true.

    Returns:
        The cleaned string (possibly empty).
    """
    # Remove URLs; "http\S+" already covers the "https" scheme.
    text = re.sub(r'http\S+|www\S+', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Replace punctuation and underscores with a space (\w alone would keep "_")
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    text = text.lower()

    if remove_stopwords:
        text = remove_stopwords_func(text)

    return text

def filter_tokens(tokens, stop_words= spanish_stopwords):
    """Remove stopwords from a token list while keeping merged n-grams.

    A token containing "_" is always kept, even if it also appears in
    ``stop_words``; any other token is kept only when it is not a stopword.

    Args:
        tokens: Iterable of token strings.
        stop_words: Container of tokens to discard (defaults to the
            module-level ``spanish_stopwords``).

    Returns:
        List of the retained tokens, in their original order.
    """
    kept = []
    for tok in tokens:
        is_ngram = '_' in tok
        if is_ngram or tok not in stop_words:
            kept.append(tok)
    return kept


In [3]:
# Example usage: run one sentence through the three stages and print each result.
spanish_text = "¡Hola! Visitas https://example.com para más información. ¿Cuántos años tienes? Tengo 25 años."
# Stage 1: clean the text (URL, digits and punctuation removed, lowercased); stopwords kept.
cleaned_text = preprocess_text(spanish_text, remove_stopwords= False)
print(cleaned_text)

# Stage 2: drop Spanish stopwords from the cleaned text.
cleaned_text = remove_stopwords(cleaned_text)
print(cleaned_text)

# Stage 3: lemmatize the remaining words. The lemmas come straight from the spaCy model
# and are not always right (the printed result shows "hola" turned into "holar").
cleaned_text = " ".join(lemmatize_words(cleaned_text.split()))
print(cleaned_text)

hola visitas para más información cuántos años tienes tengo años
hola visitas información cuántos años años
holar visita información cuántos año año


In [4]:
# Load the cleaned recipe dataset; the path is relative to the working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, which usually means the CSV was
# saved together with its index - consider index_col=0 if that column is not needed (confirm).
df = pd.read_csv('data/recetas_limpias.csv')
# Preview the first rows.
df.head()

Unnamed: 0,nombre,url,ingredientes,pasos,pais,duracion,porciones,calorias,categoria,contexto,comensales,tiempo,dificultad,categoria 2,valoracion,votos
0,berenjenas rellenas,https://www.elmueble.com/cocinas/comidas-salud...,"2 berenjenas, 1 pimiento rojo, 1 pimiento amar...",paso 1. lava bien las berenjenas y pártelas po...,españa,45 min,4,,,,,,,"alto en fibra, sin grasas trans, sin sodio o s...",,
1,alcachofas al horno con pico de gallo,https://www.elmueble.com/cocinas/comidas-salud...,"4 alcachofas, 1 limón, 400 g de carne picada (...","paso 1. precalentar el horno a 180?°c. , paso ...",españa,60 min,4,,,,,,,"bajo en calorías, sin grasa, alto en fibra",,
2,arroz basmati salteado con heura y verduras va...,https://www.elmueble.com/cocinas/comidas-salud...,"200 g de arroz basmati integral, 450 ml de agu...",paso 1. hervir el arroz basmati durante unos 2...,españa,40 min,4,,,,,,,"bajo en calorías, alto en grasas, bueno fuente...",,
3,tataki de atún,https://www.elmueble.com/cocinas/comidas-salud...,"200 gramos de atún rojo (fresco o congelado), ...",paso 1. cortar en forma rectangular para el ta...,españa,60 min,4,,,,,,,"bajo en calorías, sin grasa, alto en fibra",,
4,merluza al vapor a la gallega,https://www.elmueble.com/cocinas/comidas-salud...,"700 g de merluza en rodajas, 3 ajos, 1 manojo ...",paso 1. pelar y lavar las patatas. retirar el ...,españa,45 min,4,,,,,,,"bajo en calorías, sin grasa, alto en fibra",,


In [None]:
# Build the training corpus: one token list per non-null entry of the recipe steps
# ('pasos'), the context sentences ('contexto') and the ingredient lists ('ingredientes').
# Stopwords are NOT removed at this point, so the phrase models still see them.
train_data = pd.concat(
    [
        df[column].dropna().apply(preprocess_text).str.split()
        for column in ('pasos', 'contexto', 'ingredientes')
    ],
    axis=0,
)

# Detect frequent word pairs. min_count is the minimum number of occurrences in the
# whole corpus (a total count, not a number of recipes).
bigram = Phrases(train_data, min_count= 100, threshold= 20)
# The second pass works on the bigram-merged corpus, so besides trigrams it can also
# join two bigrams into a four-word token.
trigram = Phrases(bigram[train_data], min_count= 25, threshold= 5)

bigram_phraser, trigram_phraser = Phraser(bigram), Phraser(trigram)

# Apply both phrasers to every document to get the merged n-grams
ngrammed_corpus = [trigram_phraser[bigram_phraser[tokens]] for tokens in train_data]

# Remove stopwords from the n-grammed corpus (tokens containing '_' are kept)
final_corpus = list(map(filter_tokens, ngrammed_corpus))

# Train the Word2Vec model.
# NOTE(review): no seed is passed and several workers are used, so the learned vectors
# are presumably not identical from one run to the next - confirm if reproducibility matters.
w2v_model = Word2Vec(
    sentences= final_corpus,
    vector_size= 300,
    window= 5,
    min_count= 20,
    workers=4,
)
# Save the model (the 'models' directory must already exist)
w2v_model.save("models/w2v_ngram.model")

In [7]:
# Collect the n-gram tokens of the trained vocabulary, grouped by the number of words
# they join ('_' is the joining character). Tokens joining four or more words are skipped.
# Vocabulary keys are unique, so no de-duplication is needed; iterating them directly
# keeps the order of equal-count entries the same on every run.
bigram_l = []
trigram_l = []
for word in w2v_model.wv.key_to_index:
    if word.count('_') == 1:
        bigram_l.append(word)
    elif word.count('_') == 2:
        trigram_l.append(word)

# Sort n-grams by frequency in descending order; sorted() is stable, so ties keep
# their vocabulary order.
bigram_l = sorted(bigram_l, key= lambda x: w2v_model.wv.get_vecattr(x, 'count'), reverse= True)
trigram_l = sorted(trigram_l, key= lambda x: w2v_model.wv.get_vecattr(x, 'count'), reverse= True)

In [8]:
# Sanity check: the ten vocabulary tokens most similar to the learned bigram 'vino_blanco'.
w2v_model.wv.most_similar('vino_blanco', topn= 10)

[('vino', 0.8657280206680298),
 ('de_vino_tinto', 0.8057514429092407),
 ('brandy', 0.7900072932243347),
 ('vino_blanco_seco', 0.7823503613471985),
 ('coñac', 0.7604132890701294),
 ('jerez', 0.7134323716163635),
 ('vino_blanco_cucharadas_soperas', 0.707495927810669),
 ('sidra', 0.7009683847427368),
 ('vino_tinto', 0.6942389607429504),
 ('vino_blanco_cucharada_sopera', 0.6389712691307068)]

In [9]:
# Find the words most similar to a sentence
def most_similar_sentence(model, sentence, topn=10):
    """Return the vocabulary entries closest to the mean word vector of a sentence.

    The sentence is cleaned with ``preprocess_text`` and split on whitespace.
    Tokens that are not in the model vocabulary are ignored, and the mean is
    taken over the remaining tokens only. The sentence is not passed through
    any phrase model here, so multi-word expressions are looked up word by
    word.

    Args:
        model: Trained model exposing a ``wv`` attribute that supports
            ``in``, indexing by token and ``similar_by_vector``.
        sentence: Raw text to embed.
        topn: Number of neighbours requested from ``similar_by_vector``.

    Returns:
        The result of ``model.wv.similar_by_vector`` for the mean vector, or
        an empty list when no token of the sentence is in the vocabulary
        (including an empty sentence).
    """
    tokens = preprocess_text(sentence).split()
    # Keep only the vectors of tokens the model actually knows
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not known_vectors:
        # Nothing to average: avoid dividing by zero or querying with a scalar
        return []
    # Average over the tokens that contributed a vector, not over all tokens
    sentence_vector = sum(known_vectors) / len(known_vectors)
    return model.wv.similar_by_vector(sentence_vector, topn=topn)

# Example query: vocabulary tokens most similar to the phrase "aceite de oliva".
most_similar_sentence(w2v_model, "aceite de oliva", topn=10)

[('aceite', 0.7978078722953796),
 ('oliva', 0.7852180600166321),
 ('aceite_vegetal', 0.7195738554000854),
 ('girasol', 0.6565621495246887),
 ('vinagre_blanco', 0.6030647158622742),
 ('generoso', 0.5952298045158386),
 ('vino_blanco', 0.5894007086753845),
 ('de_oliva_virgen', 0.5772249698638916),
 ('cebolla_picada', 0.5757327079772949),
 ('ajo_molido', 0.5734958648681641)]