In [3]:
import os
# NOTE(review): hardcoded absolute path that only resolves on the author's
# machine. Presumably required so that the relative paths used further down
# ("csv_file/recipes.csv", "NLP/model/...") and the `ingredient_parser` /
# `embeddings` imports below resolve -- confirm, then replace with a
# configurable project root.
os.chdir('/home/golopes/mestrado/projetos/FoodAid/')
import pandas as pd
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords') # install NLTK data to home user directory
import unidecode
from gensim.models import Word2Vec
# NOTE(review): `ingredient_parser` and `embeddings` look like project-local
# modules (they are imported after the chdir above) -- TODO confirm.
from ingredient_parser import ingredient_parser
import embeddings as emb
from sklearn.metrics.pairwise import cosine_similarity

def get_and_sort_corpus(data):
    """
    Return the parsed documents of ``data`` with each document's tokens sorted.

    Parameters
    ----------
    data : object exposing ``parsed.values``
        Typically a DataFrame with a "parsed" column. Every element of
        ``data.parsed.values`` is one document: an iterable of mutually
        comparable tokens.

    Returns
    -------
    list of list
        One new list per document, tokens in ascending order (lexicographic
        for strings), documents in their original order.

    Notes
    -----
    ``sorted`` builds a new list for every document, so the documents stored
    in ``data`` are not reordered as a side effect of building the corpus.
    """
    return [sorted(doc) for doc in data.parsed.values]

def ingredient_parser_final(ingredient):
    """
    Format ingredients as one comma-separated, transliterated string.

    A list is used as-is; any other value is split on whitespace first.
    The tokens are joined with "," and the result is passed through
    ``unidecode.unidecode`` before being returned.
    """
    tokens = ingredient if isinstance(ingredient, list) else ingredient.split()
    return unidecode.unidecode(",".join(tokens))

def get_recommendations(N, scores, csv_file, lerning_param):
    """
    Build a table with the top-N recipes ordered by descending score.

    Parameters
    ----------
    N : int
        Maximum number of recommendations to return.
    scores : sequence of float
        One score per row of the recipes CSV; ``scores[i]`` belongs to row ``i``.
    csv_file : str or path-like
        Recipes CSV, read with ``pandas.read_csv``.
    lerning_param : list of str
        Recipe columns to copy into the result. "title" is added in front when
        it is missing. The list itself is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, best score first, indexed by the recipe's
        row number in the CSV. Columns are the requested recipe columns
        followed by "score" (the score formatted as a string).
    """
    # load in recipe dataset
    df_recipes = pd.read_csv(csv_file)

    # Work on a copy: mutating the caller's list would leave an extra "score"
    # entry behind on every call. "score" is dropped from the feature columns
    # because it is produced here, not read from the CSV.
    feature_columns = [name for name in lerning_param if name != "score"]
    if "title" not in feature_columns:
        feature_columns.insert(0, "title")

    # positions of the N highest scores, best first
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:N]

    # Collect plain records and build the frame once instead of growing it
    # cell by cell.
    rows = []
    for i in top:
        row = {}
        for param in feature_columns:
            value = df_recipes[param][i]
            # Only text is transliterated; non-string cells (e.g. a missing
            # value read as NaN) are passed through unchanged.
            row[param] = unidecode.unidecode(value) if isinstance(value, str) else value
        row["score"] = f"{scores[i]}"
        rows.append(row)

    return pd.DataFrame(rows, index=top, columns=feature_columns + ["score"])


In [4]:
#####
# TRAIN MODEL
#####

csv_file = "csv_file/recipes.csv"
model_path = "NLP/model/model_key_recipe.model"

data = pd.read_csv(csv_file)
# One text per recipe: title, ingredients and region joined with commas,
# then tokenised by ingredient_parser.
columns = data["title"] + ',' + data["ingredients"] + ',' + data["region"]
data["parsed"] = columns.apply(ingredient_parser)

corpus = get_and_sort_corpus(data)
print(f"Length of corpus: {len(corpus)}")

# sg=0 selects CBOW in gensim; min_count=1 keeps every token in the vocabulary.
# compute_loss=True is required for get_latest_training_loss() below.
model = Word2Vec(corpus, sg=0, workers=8, window=6, min_count=1, vector_size=100, compute_loss=True)

# Loss
print(model.get_latest_training_loss())

# Summarize vocabulary. The size is printed explicitly: a bare `len(words)`
# in the middle of a cell is evaluated and silently discarded.
words = sorted(model.wv.index_to_key)
print(f"Vocabulary size: {len(words)}")

# Make sure the target directory exists before saving the model.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

Length of corpus: 100
15774.736328125


In [5]:
#####
# TEST WITH INPUT
#####
n_sugestions = 5
key_words = "bread,porto"

# vec_tr = emb.MeanEmbeddingVectorizer(model)
vec_tr = emb.TfidfEmbeddingVectorizer(model)

vec_tr.fit(corpus)
doc_vec = vec_tr.transform(corpus)
# one (1, n_features) row per document, the 2-D shape cosine_similarity expects
doc_vec = [doc.reshape(1, -1) for doc in doc_vec]

# Tokenise the query. Commas and whitespace both separate key words, and
# split() without an argument drops empty tokens, so "bread, porto" yields
# ["bread", "porto"] rather than ["bread", "", "porto"].
# The tokens are not stored in a variable called `input`, which would shadow
# the builtin for the rest of the kernel session.
raw_tokens = key_words.replace(",", " ").split()
# parse ingredient list
input_tokens = ingredient_parser(raw_tokens)
# get embeddings for ingredient doc
input_embedding = vec_tr.transform([input_tokens])[0].reshape(1, -1)

# cosine similarity between the input embedding and every document embedding
scores = [cosine_similarity(input_embedding, doc)[0][0] for doc in doc_vec]

# Filter top N recommendations
learning_param = ["title", "ingredients", "region", "recipe"]
recommendations = get_recommendations(n_sugestions, scores, csv_file, learning_param)
recommendations


Unnamed: 0,title,ingredients,region,recipe,score
82,Rabanadas,"8 slices of day-old bread,2 cups of red wine,1...",All regions,"In a saucepan, heat the red wine with the cinn...",0.7033420205116272
30,Tripas a Moda do Porto com Arroz,"tripe, rice, onion, garlic, carrots, leaves, o...",Porto,Cut the tripe into small pieces and boil in wa...,0.6960881948471069
31,Tripas a Moda do Porto com Batatas,"tripe, potatoes, onion, garlic, carrots, leave...",Porto,Cut the tripe into small pieces and boil in wa...,0.6957179307937622
81,Rabanadas,"8 slices of day-old bread, 2 cups of milk, 2 e...",All regions,"In a saucepan, heat the milk with the cinnamon...",0.6914899945259094
22,Vegan Francesinha,"bread, seitan, large portobello mushroom, red ...",Porto,"In a pan, saute the garlic with olive oil and ...",0.6861525774002075
