<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-Dataset" data-toc-modified-id="Import-Dataset-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Dataset</a></span><ul class="toc-item"><li><span><a href="#Panlasang-Pinoy-Dataset" data-toc-modified-id="Panlasang-Pinoy-Dataset-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Panlasang Pinoy Dataset</a></span></li><li><span><a href="#Kawaling-Pinoy-Dataset" data-toc-modified-id="Kawaling-Pinoy-Dataset-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Kawaling Pinoy Dataset</a></span></li><li><span><a href="#Merged-dataset" data-toc-modified-id="Merged-dataset-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Merged dataset</a></span></li><li><span><a href="#Clean-dataset" data-toc-modified-id="Clean-dataset-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Clean dataset</a></span></li></ul></li><li><span><a href="#Data-Cleaning" data-toc-modified-id="Data-Cleaning-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Cleaning</a></span><ul class="toc-item"><li><span><a href="#Tokenization,-Removing-of-digits-and-lowerized-text" data-toc-modified-id="Tokenization,-Removing-of-digits-and-lowerized-text-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Tokenization, Removing of digits and lowerized text</a></span></li><li><span><a href="#Remove-Stopwords" data-toc-modified-id="Remove-Stopwords-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Remove Stopwords</a></span></li><li><span><a href="#Lemmatized" data-toc-modified-id="Lemmatized-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Lemmatized</a></span></li><li><span><a href="#Most-Common-Words" data-toc-modified-id="Most-Common-Words-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Most Common Words</a></span></li><li><span><a href="#Additional-stopwords-based-on-the-common-words" data-toc-modified-id="Additional-stopwords-based-on-the-common-words-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Additional stopwords based on the common words</a></span></li></ul></li><li><span><a href="#Topic-Modelling" data-toc-modified-id="Topic-Modelling-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Topic Modelling</a></span><ul class="toc-item"><li><span><a href="#LDA" data-toc-modified-id="LDA-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>LDA</a></span></li><li><span><a href="#LSI" data-toc-modified-id="LSI-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>LSI</a></span></li><li><span><a href="#HDP" data-toc-modified-id="HDP-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>HDP</a></span></li><li><span><a href="#LDA-Scikit-Learn" data-toc-modified-id="LDA-Scikit-Learn-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>LDA Scikit Learn</a></span></li></ul></li><li><span><a href="#Sample-Optimal-Model" data-toc-modified-id="Sample-Optimal-Model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Sample Optimal Model</a></span></li></ul></div>

In [1]:
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
import time
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

## Import Dataset

### Panlasang Pinoy Dataset

In [2]:
#Data scraped from Panlasang Pinoy
panlasang_pinoy = pd.read_csv("panlasang_pinoy.csv")

### Kawaling Pinoy Dataset

In [3]:
#Data scraped from Kawaling Pinoy
kawaling_pinoy = pd.read_csv("kawaling_pinoy.csv")

In [4]:
#Get the food, ingredients and instructions columns 
panlasang_pinoy = panlasang_pinoy[['food', 'ingredients', 'instructions']]
kawaling_pinoy = kawaling_pinoy[['food', 'ingredients', 'instructions']]

### Merged dataset 

In [5]:
#Merge the dataset
data = pd.concat([panlasang_pinoy, kawaling_pinoy])

In [6]:
#Inspect the dataset
data.head()

Unnamed: 0,food,ingredients,instructions
0,Pinoy Chicken Sopas,"1 lb. rotisserie chicken shredded, 2 Knorr Chi...",Heat cooking oil in a cooking pot. Sauté onion...
1,Popcorn Chicken and Gravy KFC Style Secret Recipe,"1 lb. boneless chicken breast cubed, 1 cup coo...",Start making the popcorn chicken by combining ...
2,Idol Cheesedog Bread Roll and Bites,"12 CDO Idol Cheesedog, 2 cups all-purpose flou...",Start making the idol bites by preparing the d...
3,Perfect Chicken Adobo on a Budget,"2 lbs. chicken cut into serving pieces, 1 Knor...","Combine chicken, soy sauce, vinegar, and 5 clo..."
4,Creamy Mushroom Chicken,"1 ½ lbs. chicken cut into serving pieces, 1 Kn...",Rub salt and ground black pepper all over the ...


In [7]:
data.describe()        #Check the number of rows per column. 

Unnamed: 0,food,ingredients,instructions
count,2418,2344,2418
unique,2269,2339,2341
top,none,"7 pieces shrimp cleaned and deveined, 3 ounces...",none
freq,74,2,74


In [8]:
data.isna().sum()             #There are 30 missing values

food             0
ingredients     74
instructions     0
dtype: int64

### Clean dataset

In [9]:
#Removing the null values 
data = data.dropna()

In [10]:
data.describe()        #Check the dataset again

Unnamed: 0,food,ingredients,instructions
count,2344,2344,2344
unique,2268,2339,2340
top,Homemade Pork Tocino,"7 pieces shrimp cleaned and deveined, 3 ounces...",Heat oil in a wok or pan.\nPan fry the shrimp ...
freq,3,2,2


In [11]:
data.isna().sum()     #No null values then we can proceed to the next step. Yey

food            0
ingredients     0
instructions    0
dtype: int64

In [12]:
data.head()

Unnamed: 0,food,ingredients,instructions
0,Pinoy Chicken Sopas,"1 lb. rotisserie chicken shredded, 2 Knorr Chi...",Heat cooking oil in a cooking pot. Sauté onion...
1,Popcorn Chicken and Gravy KFC Style Secret Recipe,"1 lb. boneless chicken breast cubed, 1 cup coo...",Start making the popcorn chicken by combining ...
2,Idol Cheesedog Bread Roll and Bites,"12 CDO Idol Cheesedog, 2 cups all-purpose flou...",Start making the idol bites by preparing the d...
3,Perfect Chicken Adobo on a Budget,"2 lbs. chicken cut into serving pieces, 1 Knor...","Combine chicken, soy sauce, vinegar, and 5 clo..."
4,Creamy Mushroom Chicken,"1 ½ lbs. chicken cut into serving pieces, 1 Kn...",Rub salt and ground black pepper all over the ...


## Data Cleaning

We use ingredients column for our project since the recommender's input will be based on the ingredients available at their home.

In [13]:
#Load the dataset again
data

Unnamed: 0,food,ingredients,instructions
0,Pinoy Chicken Sopas,"1 lb. rotisserie chicken shredded, 2 Knorr Chi...",Heat cooking oil in a cooking pot. Sauté onion...
1,Popcorn Chicken and Gravy KFC Style Secret Recipe,"1 lb. boneless chicken breast cubed, 1 cup coo...",Start making the popcorn chicken by combining ...
2,Idol Cheesedog Bread Roll and Bites,"12 CDO Idol Cheesedog, 2 cups all-purpose flou...",Start making the idol bites by preparing the d...
3,Perfect Chicken Adobo on a Budget,"2 lbs. chicken cut into serving pieces, 1 Knor...","Combine chicken, soy sauce, vinegar, and 5 clo..."
4,Creamy Mushroom Chicken,"1 ½ lbs. chicken cut into serving pieces, 1 Kn...",Rub salt and ground black pepper all over the ...
...,...,...,...
589,Garlic Butter Fried Frog Legs,"1 lb about 3 to 4 pieces frog legs, 1 tablespo...","Rinse frog legs and pat dry.\nIn a bowl, combi..."
590,Tinolang Manok,"1 tablespoon canola oil, 1 small onion, peeled...","In a pot over medium heat, heat oil. Add onion..."
591,Tilapia in Black Bean Garlic Sauce,"4 (4 ounces each) tilapia fillets, salt and pe...",Wash tilapia and pat dry. Lightly season with ...
592,Pork Adobo,"2 pounds pork belly, cut into 2-inch cubes, 1 ...","In a bowl, combine pork, onions, garlic, bay l..."


### Tokenization, Removing of digits and lowerized text

In [14]:
import spacy
nlp=spacy.load("en_core_web_lg")


def tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc if token.is_alpha]

data['ingredients_clean']=data['ingredients'].apply(lambda x: tokenize(x.lower()))
    

In [15]:
data

Unnamed: 0,food,ingredients,instructions,ingredients_clean
0,Pinoy Chicken Sopas,"1 lb. rotisserie chicken shredded, 2 Knorr Chi...",Heat cooking oil in a cooking pot. Sauté onion...,"[lb, rotisserie, chicken, shredded, knorr, chi..."
1,Popcorn Chicken and Gravy KFC Style Secret Recipe,"1 lb. boneless chicken breast cubed, 1 cup coo...",Start making the popcorn chicken by combining ...,"[lb, boneless, chicken, breast, cubed, cup, co..."
2,Idol Cheesedog Bread Roll and Bites,"12 CDO Idol Cheesedog, 2 cups all-purpose flou...",Start making the idol bites by preparing the d...,"[cdo, idol, cheesedog, cups, all, purpose, flo..."
3,Perfect Chicken Adobo on a Budget,"2 lbs. chicken cut into serving pieces, 1 Knor...","Combine chicken, soy sauce, vinegar, and 5 clo...","[lbs, chicken, cut, into, serving, pieces, kno..."
4,Creamy Mushroom Chicken,"1 ½ lbs. chicken cut into serving pieces, 1 Kn...",Rub salt and ground black pepper all over the ...,"[lbs, chicken, cut, into, serving, pieces, kno..."
...,...,...,...,...
589,Garlic Butter Fried Frog Legs,"1 lb about 3 to 4 pieces frog legs, 1 tablespo...","Rinse frog legs and pat dry.\nIn a bowl, combi...","[lb, about, to, pieces, frog, legs, tablespoon..."
590,Tinolang Manok,"1 tablespoon canola oil, 1 small onion, peeled...","In a pot over medium heat, heat oil. Add onion...","[tablespoon, canola, oil, small, onion, peeled..."
591,Tilapia in Black Bean Garlic Sauce,"4 (4 ounces each) tilapia fillets, salt and pe...",Wash tilapia and pat dry. Lightly season with ...,"[ounces, each, tilapia, fillets, salt, and, pe..."
592,Pork Adobo,"2 pounds pork belly, cut into 2-inch cubes, 1 ...","In a bowl, combine pork, onions, garlic, bay l...","[pounds, pork, belly, cut, into, inch, cubes, ..."


### Remove Stopwords

In [16]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [17]:
def remove_stopwords(text):
    text=[word for word in text if word not in stop_words]
    return text

data['ingredients_clean']=data['ingredients_clean'].apply(lambda x: remove_stopwords(x))

### Lemmatized

In [18]:
lemmatizer=nltk.stem.WordNetLemmatizer()
def lemmatize_text(word_list):
    lemmatized_output = [lemmatizer.lemmatize(w) for w in word_list]
    return lemmatized_output

data['ingredients_clean']= data['ingredients_clean'].apply(lambda x: lemmatize_text(x))

### Most Common Words

In [19]:
import itertools
flat_list = list(itertools.chain(*list(data['ingredients_clean'])))

In [20]:
from collections import Counter
counter_of_flat_list = Counter(flat_list)

print(counter_of_flat_list.most_common(100)) # print top 10

[('cup', 6031), ('tablespoon', 3643), ('teaspoon', 3002), ('piece', 2424), ('pepper', 2136), ('salt', 1668), ('oil', 1648), ('chopped', 1590), ('onion', 1528), ('garlic', 1494), ('sliced', 1442), ('sauce', 1320), ('water', 1291), ('lb', 1269), ('minced', 1183), ('ground', 1109), ('peeled', 1070), ('cooking', 1005), ('medium', 924), ('taste', 889), ('black', 885), ('ounce', 822), ('clove', 822), ('sugar', 803), ('cut', 724), ('pork', 719), ('chicken', 666), ('egg', 627), ('powder', 595), ('green', 593), ('tomato', 538), ('white', 510), ('soy', 510), ('milk', 483), ('small', 483), ('crushed', 477), ('beef', 469), ('cube', 468), ('large', 443), ('pound', 439), ('fish', 428), ('red', 423), ('leaf', 421), ('flour', 389), ('butter', 366), ('inch', 358), ('cubed', 356), ('vinegar', 355), ('bell', 354), ('yellow', 352), ('ginger', 337), ('canola', 321), ('diced', 318), ('chili', 318), ('coconut', 317), ('shrimp', 316), ('carrot', 315), ('knorr', 309), ('cheese', 309), ('rice', 293), ('fresh', 

### Additional stopwords based on the common words 

In [21]:
additional_stopwords = ['cup', 'tablespoon', 'teaspoon', 'piece', 'sliced', 'lb', 'medium',
                        'taste', 'ounce', 'cooking', 'black', 'cut', 'green', 'small', 'white',
                        'large', 'pound', 'red', 'inch', 'yellow', 'brown', 'optional', 'oz', 
                        'bunch', 'thumb', 'cleaned', 'serving', 'long', 'tbsp']

In [22]:
def additional_lemmatize_text(text):
    text=[word for word in text if word not in additional_stopwords]
    return text

data['ingredients_clean']= data['ingredients_clean'].apply(lambda x: additional_lemmatize_text(x))

## Topic Modelling

In [23]:
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt

In [24]:
# Create Dictionary 
id2word = corpora.Dictionary(data['ingredients_clean'])  
# Create Corpus 
texts = data['ingredients_clean']  

In [25]:
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 3), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)]]


In [26]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('cabbage', 1),
  ('carrot', 1),
  ('celery', 1),
  ('chicken', 2),
  ('clove', 1),
  ('cube', 1),
  ('cubed', 1),
  ('diced', 1),
  ('elbow', 1),
  ('evaporated', 1),
  ('fish', 1),
  ('garlic', 1),
  ('ground', 1),
  ('hotdog', 1),
  ('knorr', 1),
  ('macaroni', 1),
  ('milk', 1),
  ('minced', 3),
  ('oil', 1),
  ('onion', 1),
  ('pepper', 1),
  ('rotisserie', 1),
  ('sauce', 1),
  ('shredded', 1),
  ('stalk', 1),
  ('water', 1)]]

### LDA

In [27]:
for k in range(3,20):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=k, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)
    # Compute Perplexity
    #print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
    # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data['ingredients_clean'] , dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"K: {k}, Perplexity: {lda_model.log_perplexity(corpus)}, Coherence: {coherence_model_lda.get_coherence()}")

K: 3, Perplexity: -5.540886958171264, Coherence: 0.5610012600194637
K: 4, Perplexity: -5.533483027288837, Coherence: 0.5560253478833899
K: 5, Perplexity: -5.562214116581548, Coherence: 0.5065777342929552
K: 6, Perplexity: -5.5819690087264915, Coherence: 0.5405969832784707
K: 7, Perplexity: -5.609607187169239, Coherence: 0.5193626583746858
K: 8, Perplexity: -5.625921507225434, Coherence: 0.5069171186191956
K: 9, Perplexity: -5.656507985970846, Coherence: 0.5030784639995601
K: 10, Perplexity: -5.670665841543238, Coherence: 0.4670492234448944
K: 11, Perplexity: -5.723519451842074, Coherence: 0.46360732006378486
K: 12, Perplexity: -5.837753997261317, Coherence: 0.450483563797416
K: 13, Perplexity: -5.9468720713866405, Coherence: 0.4632035943956457
K: 14, Perplexity: -6.268369659100679, Coherence: 0.4585023409598396
K: 15, Perplexity: -6.64433044742039, Coherence: 0.46728627769328857
K: 16, Perplexity: -7.217141907844212, Coherence: 0.4543220368199392
K: 17, Perplexity: -7.693677035241264, 

In [28]:
# Print the keyword of topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.110*"pepper" + 0.099*"salt" + 0.086*"oil" + 0.082*"garlic" + '
  '0.080*"onion" + 0.078*"chopped" + 0.075*"minced" + 0.068*"water" + '
  '0.055*"sauce" + 0.052*"clove"'),
 (1,
  '0.219*"flour" + 0.215*"powder" + 0.193*"egg" + 0.075*"beaten" + '
  '0.066*"purpose" + 0.066*"baking" + 0.034*"bread" + 0.030*"crumb" + '
  '0.027*"salt" + 0.012*"panko"'),
 (2,
  '0.164*"shredded" + 0.146*"sweet" + 0.115*"cheese" + 0.091*"mayonnaise" + '
  '0.080*"stalk" + 0.076*"celery" + 0.072*"cheddar" + 0.036*"pickle" + '
  '0.030*"macaroni" + 0.026*"relish"'),
 (3,
  '0.308*"canola" + 0.156*"soy" + 0.131*"sauce" + 0.106*"cornstarch" + '
  '0.065*"oyster" + 0.057*"sesame" + 0.054*"wine" + 0.041*"ginger" + '
  '0.023*"oil" + 0.012*"breadcrumb"'),
 (4,
  '0.202*"fresh" + 0.116*"corn" + 0.075*"sized" + 0.073*"sweetened" + '
  '0.059*"part" + 0.056*"slice" + 0.036*"shank" + 0.036*"floret" + '
  '0.031*"unsalted" + 0.030*"broccoli"'),
 (5,
  '0.161*"vinegar" + 0.156*"leaf" + 0.107*"whole" + 0.081*"pe

In [29]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data['ingredients_clean'] , dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.234574003806584

Coherence Score:  0.41858284294376635


In [30]:
vis_lda = gensimvis.prepare(lda_model, corpus, id2word)
vis_lda

  default_term_info = default_term_info.sort_values(


PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
0      0.315215  0.019031       1        1  40.394792
9      0.176765 -0.113378       2        1   7.452256
14    -0.051067  0.067196       3        1   5.879791
16     0.089400  0.068781       4        1   5.731901
11     0.103171  0.288457       5        1   5.426126
5      0.157944 -0.159658       6        1   4.951430
10     0.055475 -0.063104       7        1   4.691639
1      0.071423  0.300474       8        1   4.593761
3      0.208085 -0.174698       9        1   3.967097
4     -0.077698  0.043551      10        1   2.690688
18    -0.018516 -0.113214      11        1   2.617579
13    -0.100807 -0.033016      12        1   2.444374
2     -0.093861  0.024271      13        1   2.060392
15    -0.077214 -0.004521      14        1   2.038967
7     -0.091822 -0.038513      15        1   1.952091
12    -0.130334 -0.018701      16        1   1.2350

In [31]:
pyLDAvis.display(vis_lda)

### LSI

In [32]:
lsimodel = gensim.models.lsimodel.LsiModel(corpus=corpus, num_topics=3, id2word=id2word)
lsimodel.show_topics(num_topics=8)

  sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,


[(0,
  '0.391*"pepper" + 0.309*"chopped" + 0.276*"onion" + 0.272*"oil" + 0.268*"garlic" + 0.257*"salt" + 0.238*"peeled" + 0.236*"sauce" + 0.226*"minced" + 0.205*"water"'),
 (1,
  '-0.791*"peeled" + 0.323*"ground" + -0.171*"canola" + 0.149*"pepper" + 0.137*"powder" + 0.119*"chopped" + -0.119*"thinly" + 0.101*"knorr" + -0.095*"minced" + -0.093*"trimmed"'),
 (2,
  '-0.432*"sugar" + 0.378*"chopped" + -0.272*"milk" + -0.268*"egg" + -0.259*"salt" + -0.234*"flour" + -0.221*"powder" + 0.197*"pepper" + -0.172*"butter" + -0.132*"water"')]

In [33]:
# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsimodel, texts=data['ingredients_clean'], dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lsi)


Coherence Score:  0.41858284294376635


In [34]:
vis_lsi = gensimvis.prepare(lsimodel, corpus, id2word)
vis_lsi

AttributeError: 'LsiModel' object has no attribute 'inference'

In [None]:
pyLDAvis.display(vis_lda)

### HDP

In [None]:
hdpmodel =  gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=id2word)

model = hdpmodel.print_topics(num_topics=3, num_words=10)
model

In [None]:
# Compute Coherence Score
coherence_model_hdpmodel = CoherenceModel(model=model, texts=data['ingredients_clean'], dictionary=id2word, coherence='c_v')
coherence_hdpmodel = coherence_model_hdpmodel.get_coherence()
print('\nCoherence Score: ', coherence_hdpmodel)

In [None]:
vis_hdp = gensimvis.prepare(lsimodel, corpus, id2word)
vis_lsi

### LDA Scikit Learn

In [35]:
# for text preprocessing
import re
import spacy

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import numpy for matrix operation
import numpy as np

# import LDA from sklearn
from sklearn.decomposition import LatentDirichletAllocation

In [43]:
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)

In [44]:
# Array from TF-IDF Vectorizer 
tf_idf_arr = tf_idf_vectorizer.fit_transform(data['ingredients_clean'].tolist())

In [46]:
# Creating vocabulary array which will represent all the corpus 
vocab_tf_idf = tf_idf_vectorizer.get_feature_names()

# get the vocb list
vocab_tf_idf



['according',
 'achiote',
 'active',
 'added',
 'additional',
 'adobo',
 'adobong',
 'agar',
 'ahi',
 'air',
 'aka',
 'alamang',
 'albacore',
 'alfredo',
 'alimango',
 'alimasag',
 'allspice',
 'almejas',
 'almond',
 'also',
 'alter',
 'alternative',
 'alugbati',
 'aluminum',
 'amaranth',
 'amarillo',
 'american',
 'amount',
 'ampalaya',
 'anatto',
 'ancho',
 'anchovy',
 'anisado',
 'anise',
 'anisette',
 'annatto',
 'annnato',
 'ap',
 'apple',
 'approximately',
 'arnibal',
 'around',
 'arrowroot',
 'arroz',
 'artichoke',
 'arugula',
 'asada',
 'asian',
 'asparagus',
 'assorted',
 'astuete',
 'atchara',
 'atchuete',
 'atsuete',
 'attached',
 'authentic',
 'available',
 'avocado',
 'baby',
 'back',
 'bacon',
 'bag',
 'bagiuo',
 'bagnet',
 'bago',
 'bagoong',
 'baguio',
 'baked',
 'baking',
 'balaw',
 'balayan',
 'ball',
 'balsamic',
 'balut',
 'bamboo',
 'banana',
 'bangus',
 'banh',
 'bar',
 'barbecue',
 'barbecued',
 'barbeque',
 'bark',
 'base',
 'based',
 'basil',
 'basting',
 'bat'

In [47]:
display(len(vocab_tf_idf))

1681

In [56]:
# Create object for the LDA class 
# Inside this class LDA: define the components:
lda_model = LatentDirichletAllocation(n_components = 5, max_iter = 20, random_state = 20)

# fit transform on model on our count_vectorizer : running this will return our topics 
X_topics = lda_model.fit(tf_idf_arr)

# .components_ gives us our topic distribution 
topic_words = lda_model.components_

In [57]:
#  Define the number of Words that we want to print in every topic : n_top_words
n_top_words = 10

for i, topic_dist in enumerate(topic_words):
    
    # np.argsort to sorting an array or a list or the matrix acc to their values
    sorted_topic_dist = np.argsort(topic_dist)
    
    # Next, to view the actual words present in those indexes we can make the use of the vocab created earlier
    topic_words = np.array(vocab_tf_idf)[sorted_topic_dist]
    
    # so using the sorted_topic_indexes we ar extracting the words from the vocabulary
    # obtaining topics + words
    # this topic_words variable contains the Topics  as well as the respective words present in those Topics
    topic_words = topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), topic_words)

Topic 1 ['peeled' 'sauce' 'pepper' 'garlic' 'minced' 'chopped' 'soy' 'oil' 'pork']
Topic 2 ['choice' 'relish' 'lady' 'pickle' 'mayonnaise' 'lumpia' 'spice' 'de'
 'five']
Topic 3 ['milk' 'sugar' 'coconut' 'flour' 'cream' 'egg' 'extract' 'butter'
 'vanilla']
Topic 4 ['chopped' 'pepper' 'ground' 'salt' 'chicken' 'egg' 'minced' 'oil' 'onion']
Topic 5 ['fish' 'pepper' 'ground' 'onion' 'tomato' 'oil' 'chopped' 'bean' 'garlic']


## Sample Optimal Model

In [None]:
lda_optimal_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
lsa_s_choc_ctv=cosine_similarity(lda_optimal_model[3].reshape(1, -1), ct_lsa_10).round(3) 
lsa_s_choc_ctv