In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords') # install NLTK data to home user directory
import unidecode
from gensim.models import Word2Vec
from ingredient_parser import ingredient_parser
import embeddings as emb
from sklearn.metrics.pairwise import cosine_similarity

# get corpus with the documents sorted in alphabetical order
def get_and_sort_corpus(data):
    corpus_sorted = []
    for doc in data.parsed.values:
        doc.sort()
        corpus_sorted.append(doc)
    return corpus_sorted

def ingredient_parser_final(ingredient):
    """
    neaten the ingredients being outputted
    """
    if isinstance(ingredient, list):
        ingredients = ingredient
    else:
        ingredients = ingredient.split()

    ingredients = ",".join(ingredients)
    ingredients = unidecode.unidecode(ingredients)
    return ingredients

def get_recommendations(N, scores,csv_file):
    """
    Top-N recomendations order by score
    """
    # load in recipe dataset
    df_recipes = pd.read_csv(csv_file)
    # order the scores with and filter to get the highest N scores
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:N]
    # create dataframe to load in recommendations
    recommendation = pd.DataFrame(columns=["title", "ingredients","region", "score"])
    count = 0
    for i in top:
        recommendation.at[count, "title"] = unidecode.unidecode(df_recipes["title"][i])
        recommendation.at[count, "ingredients"] = ingredient_parser_final(
            df_recipes["ingredients"][i]
        )
        recommendation.at[count, "region"] = ingredient_parser_final(
            df_recipes["region"][i]
        )
        recommendation.at[count, "score"] = f"{scores[i]}"
        count += 1
    return recommendation


In [4]:
#####
# TREINAR MODELO
#####

csv_file = "recipes.csv"

data = pd.read_csv(csv_file)

columns=data["ingredients"] +','+ data["region"]
data["parsed"] = columns.apply(ingredient_parser)

# data.head()

corpus = get_and_sort_corpus(data)
print(f"Length of corpus: {len(corpus)}")
        
model = Word2Vec(corpus, sg=0, workers=8, window=6, min_count=1, vector_size=100,compute_loss=True)

# Loss
print(model.get_latest_training_loss())

#Summarize vocabulary
words = list(model.wv.index_to_key)
words.sort()
len(words)

<class 'pandas.core.series.Series'>
Length of corpus: 79
4712.98681640625


117

In [3]:
#####
# TESTAR COM INPUT
#####
n_sugestions = 5
key_words = "porto,bread"

# vec_tr = emb.MeanEmbeddingVectorizer(model)
vec_tr = emb.TfidfEmbeddingVectorizer(model)
    
vec_tr.fit(corpus)
doc_vec = vec_tr.transform(corpus)
doc_vec = [doc.reshape(1, -1) for doc in doc_vec]

# create tokens with elements
input = key_words.split(",")
# parse ingredient list
input = ingredient_parser(input)
# get embeddings for ingredient doc
input_embedding = vec_tr.transform([input])[0].reshape(1, -1)

# get cosine similarity between input embedding and all the document embeddings
cos_sim = map(lambda x: cosine_similarity(input_embedding, x)[0][0], doc_vec)
scores = list(cos_sim)
# Filter top N recommendations
recommendations = get_recommendations(n_sugestions, scores,csv_file)
recommendations


Unnamed: 0,title,ingredients,region,score
0,Francesinha seafood,"bread,,shrimp,,squid,,mussels,,cheese,,egg,,fr...",Porto,0.4494966268539428
1,Francesinha without Ham,"bread,,steak,,linguica,,sausage,,cheese,,egg,,...",Porto,0.3809010982513428
2,Tripas a Moda do Porto com Batatas,"tripe,,potatoes,,onion,,garlic,,carrots,,leave...",Porto,0.3364576995372772
3,Vegan Francesinha,"bread,,seitan,,large,portobello,mushroom,,red,...",Porto,0.3286704123020172
4,Francesinha,"bread,,steak,,linguica,,,sausage,,ham,,cheese,...",Porto,0.3103387057781219
