In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import wordnet 
import string

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Emong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Emong\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Emong\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
panlasang_pinoy_df = pd.read_csv("data/panlasang_pinoy.csv")
kawaling_pinoy_df = pd.read_csv("data/kawaling_pinoy.csv")

recipe_df = pd.concat([panlasang_pinoy_df, kawaling_pinoy_df], axis=0)
recipe_df

Unnamed: 0,url,food,course,ingredients,instructions
0,https://panlasangpinoy.com/ham-katsu/,Ham Katsu,Main Course,500 grams CDO Holiday Ham sliced (1-inch thick...,Heat cooking oil in a pan.\nDredge a slice of ...
1,https://panlasangpinoy.com/cheesy-ham-steak-wi...,Cheesy Ham Steak with Bacon and Mushroom,Main Course,500 grams CDO Holiday Ham sliced (1-inch thick...,Melt butter in a pan. Sauté onion and garlic. ...
2,https://panlasangpinoy.com/hawaiian-pizza-no-b...,Hawaiian Pizza (No Bake),none,"75 grams CDO Sweet Ham sliced, ½ cup pineapple...","Make the pizza dough by combining flour, bakin..."
3,https://panlasangpinoy.com/crispy-katsu-sando/,Crispy Katsu Sando,Sandwich,"6 CDO Crispy burger patties, 3 slices tasty br...",Heat the cooking oil in a pan. Fry both sides ...
4,https://panlasangpinoy.com/orange-crispy-burger/,Orange Crispy Burger,none,"3 CDO Crispy burger patties, 2 tablespoons gre...",Heat the cooking oil in a pan. Fry both sides ...
...,...,...,...,...,...
591,https://www.kawalingpinoy.com/garlic-butter-fr...,Garlic Butter Fried Frog Legs,none,"1 lb about 3 to 4 pieces frog legs, 1 tablespo...","Rinse frog legs and pat dry.\nIn a bowl, combi..."
592,https://www.kawalingpinoy.com/tinolang-manok-c...,Tinolang Manok,Main Entree,"1 tablespoon canola oil, 1 small onion, peeled...","In a pot over medium heat, heat oil. Add onion..."
593,https://www.kawalingpinoy.com/tilapia-with-bla...,Tilapia in Black Bean Garlic Sauce,Main Entree,"4 (4 ounces each) tilapia fillets, salt and pe...",Wash tilapia and pat dry. Lightly season with ...
594,https://www.kawalingpinoy.com/pork-adobo/,Pork Adobo,Main Entree,"2 pounds pork belly, cut into 2-inch cubes, 1 ...","In a bowl, combine pork, onions, garlic, bay l..."


In [4]:
display(recipe_df.columns, recipe_df.isna().sum(), recipe_df.shape)

Index(['url', 'food', 'course', 'ingredients', 'instructions'], dtype='object')

url              0
food             0
course           0
ingredients     73
instructions     0
dtype: int64

(2354, 5)

In [5]:
recipe_df = recipe_df.dropna(subset=['ingredients']).drop_duplicates(subset=['food']).reset_index(drop=True)

In [6]:
recipe_df

Unnamed: 0,url,food,course,ingredients,instructions
0,https://panlasangpinoy.com/ham-katsu/,Ham Katsu,Main Course,500 grams CDO Holiday Ham sliced (1-inch thick...,Heat cooking oil in a pan.\nDredge a slice of ...
1,https://panlasangpinoy.com/cheesy-ham-steak-wi...,Cheesy Ham Steak with Bacon and Mushroom,Main Course,500 grams CDO Holiday Ham sliced (1-inch thick...,Melt butter in a pan. Sauté onion and garlic. ...
2,https://panlasangpinoy.com/hawaiian-pizza-no-b...,Hawaiian Pizza (No Bake),none,"75 grams CDO Sweet Ham sliced, ½ cup pineapple...","Make the pizza dough by combining flour, bakin..."
3,https://panlasangpinoy.com/crispy-katsu-sando/,Crispy Katsu Sando,Sandwich,"6 CDO Crispy burger patties, 3 slices tasty br...",Heat the cooking oil in a pan. Fry both sides ...
4,https://panlasangpinoy.com/orange-crispy-burger/,Orange Crispy Burger,none,"3 CDO Crispy burger patties, 2 tablespoons gre...",Heat the cooking oil in a pan. Fry both sides ...
...,...,...,...,...,...
2188,https://www.kawalingpinoy.com/garlic-butter-fr...,Garlic Butter Fried Frog Legs,none,"1 lb about 3 to 4 pieces frog legs, 1 tablespo...","Rinse frog legs and pat dry.\nIn a bowl, combi..."
2189,https://www.kawalingpinoy.com/tinolang-manok-c...,Tinolang Manok,Main Entree,"1 tablespoon canola oil, 1 small onion, peeled...","In a pot over medium heat, heat oil. Add onion..."
2190,https://www.kawalingpinoy.com/tilapia-with-bla...,Tilapia in Black Bean Garlic Sauce,Main Entree,"4 (4 ounces each) tilapia fillets, salt and pe...",Wash tilapia and pat dry. Lightly season with ...
2191,https://www.kawalingpinoy.com/pork-adobo/,Pork Adobo,Main Entree,"2 pounds pork belly, cut into 2-inch cubes, 1 ...","In a bowl, combine pork, onions, garlic, bay l..."


In [7]:
display(recipe_df.isna().sum(), recipe_df.shape)

url             0
food            0
course          0
ingredients     0
instructions    0
dtype: int64

(2193, 5)

## Text Preprocessing

In [8]:
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

tokenizer = RegexpTokenizer(r"[a-zA-Z]+")

def remove_stopwords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    words = [w for w in text if w not in stopwords]
    return words 

def word_lemmatizer(text):
    lemma = WordNetLemmatizer()
    lem_text_verb = [lemma.lemmatize(i, wordnet.VERB) for i in text]
    lem_text = [lemma.lemmatize(i, wordnet.NOUN) for i in lem_text_verb]
    return lem_text

def word_stemmer(text):
    stemmer = WordNetLemmatizer()
    stem_text = [stemmer.stem(i) for i in text]
    return stem_text

def additional_stopwords(text):
    add_stop = ['gram', 'pound', 'inch', 'lb', 'lbs', 'cup', 'thick', 'slice', 'teaspoon','tablespoon',
               'drop', 'pinch', 'pint', 'gallon', 'liter', 'milliliter', 'ml', 'kilo', 'kg', 'oz','ounce',
               'about', 'to', 'pound', 'cut', 'salt', 'piece', 'chop', 'water', 'mince', 'cook','grind',
               'peel', 'taste', 'medium', 'small', 'cut', 'clove', 'green', 'crush', 'optional', 'large',
               'leave', 'dry', 'fresh', 'cube', 'whole', 'thinly', 'strip', 'brown', 'shred',
               'optional', 'clean', 'bunch', 'serve', 'thai', 'julienne', 'boil', 'long', 'boneless', 
               'grate', 'beat', 'sweet', 'remove', 'quarter', 'extract', 'drain', 'package', 'thin',
               'trim', 'chinese', 'season', 'core', 'fry', 'mix', 'extra', 'end', 'liquid', 'length', 
               'melt', 'glutinous', 'half', 'pack', 'thumbsize', 'tbsp', 'soften', 'g', 'recipe', 'quart',
               'firm', 'unsalted', 'salt', 'pepper', 'choice', 'size', 'young', 'leftover', 'lengthwise',
                'pinch', 'crosswise', 'separate', 'well', 'tsp', 'use', 'part', 'scale', 'diagonally',
                'instruction', 'wash', 'steam', 'round', 'unripe', 'reserve', 'table', 'button', 'snow',
               'oil', 'finely', 'free', 'nonstick', 'accord', 'sa', 'ng', 'preferably', 'rinse', 'need',
               'homemade', 'also', 'elbow', 'real', 'tabespoon', 'shreddedgrated', 'color', 'gutted',
               'tidbit','style', 'bit', 'food', 'savorrich', 'quick', 'temperature', 'ap', 'freshly',
               'room','snap', 'ball', 'low', 'see', 'thumbsized', 'active', 'bundle', 'na', 'instant', 
               'pit', 'brand', 'procedure', 'mediumsized', 'f', 'original', 'shorten', 
               'malagkit', 'filipino', 'french', 'american', 'minute', 'individual', 'manila', 'dissolve',
               'sachet', 'process', 'ingredient', 'coarsely', 'regular', 'essence', 'portion', 'lightly',
               'short', 'side', 'get', 'wide', 'packet', 'x', 'imitation', 'fine', 'clear', 'per', 'divide',
               'deep', 'bottom', 'dress', 'link', 'five', 'combine', 'thickness', 'semisweet', 'plus', 
               'count','real', 'pouch', 'overnight', 'click', 'indian', 'jumbo', 'discard', 'lower', 
               'dip', 'third', 'good', 'equivalent', 'without','around', 'lukewarm', 'tip', 'want', 'single',
               'amount', 'enough', 'lenghtwise','fill', 'intact', 'paper', 'block', 'mini', 'two', 'halfrounds',
               'cellophane', 'tasty', 'know', 'add', 'mixture', 'paypay', 'view', 'way', 'par', 'refrigerate',
                'rectangular', 'precook', 'favorite', 'curly', 'holy', 'prefer', 'pc', 'easy', 'holy', 'shave',
                'bite', 'box', 'double', 'refried', 'crisp', 'wrap', 'asian', 'mature', 'full', 'freshlysqueezed',
                'filipinostyle', 'sweetstyle', 'smash', 'wellbeaten', 'hugas', 'individually', 'daga', 'derive',
                'coat', 'additional', 'sticky', 'sugar', 'de', 'halve', 'freeze']
    words = [w for w in text if w not in add_stop]
    return words

In [9]:
recipe_df['ingredients_clean'] = recipe_df['ingredients'].apply(lambda x: remove_punctuation(x).lower())
recipe_df['ingredients_token'] = recipe_df['ingredients_clean'].apply(lambda x: tokenizer.tokenize(x))
recipe_df['ingredients_token'] = recipe_df['ingredients_token'].apply(lambda x: word_lemmatizer(x))
recipe_df['ingredients_token'] = recipe_df['ingredients_token'].apply(lambda x: remove_stopwords(x))
recipe_df['ingredients_token'] = recipe_df['ingredients_token'].apply(lambda x: additional_stopwords(x))
recipe_df['ingredients_clean'] = recipe_df['ingredients_token'].apply(lambda x: ' '.join(map(str, x)))

In [10]:
recipe_df = recipe_df[recipe_df['course'] != "Sauce"].reset_index(drop=True)

In [11]:
from collections import Counter
results = Counter()
recipe_df['ingredients_clean'].str.lower().str.split().apply(results.update)
print(results)

Counter({'onion': 1427, 'garlic': 1381, 'sauce': 1209, 'black': 845, 'pork': 668, 'chicken': 609, 'powder': 593, 'egg': 588, 'cub': 522, 'white': 490, 'tomato': 486, 'soy': 473, 'milk': 450, 'beef': 438, 'red': 404, 'fish': 389, 'flour': 363, 'butter': 355, 'yellow': 348, 'bell': 336, 'vinegar': 334, 'knorr': 311, 'ginger': 306, 'cheese': 298, 'coconut': 292, 'shrimp': 287, 'carrot': 286, 'canola': 282, 'chili': 281, 'dice': 278, 'rice': 275, 'cream': 250, 'bean': 244, 'broth': 238, 'juice': 217, 'cornstarch': 212, 'head': 202, 'allpurpose': 198, 'lemon': 185, 'granulate': 183, 'olive': 180, 'seed': 175, 'peppercorn': 173, 'bay': 171, 'potato': 167, 'bake': 161, 'belly': 151, 'breast': 133, 'banana': 131, 'cabbage': 127, 'wedge': 127, 'noodle': 124, 'thumb': 123, 'paste': 121, 'oyster': 120, 'sesame': 119, 'parsley': 119, 'wine': 117, 'pea': 114, 'mayonnaise': 110, 'pineapple': 108, 'corn': 108, 'cheddar': 102, 'bread': 99, 'vanilla': 99, 'condense': 96, 'calamansi': 95, 'scallion': 91

In [12]:
recipe_df.to_csv("data/filipino_recipe_clean.csv", index=False)

In [22]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(recipe_df['ingredients_clean'])

In [46]:
test = tf.transform(['avocado'])

In [47]:
test_list = cosine_similarity(tfidf_matrix, test)
indexes = sorted(range(len(test_list)), key = lambda sub: test_list[sub])[-5:]

In [48]:
indexes

[1145, 2051, 2176, 1335, 2163]

In [53]:
recipe_df.iloc[1145]

url                  https://panlasangpinoy.com/bacon-and-avocado-s...
food                                        Bacon and Avocado Sandwich
course                                                            none
ingredients          3 bacon strips cooked crisp, 1 leaf curly lett...
instructions         Combine the mustard and mayonnaise. Mix well. ...
ingredients_clean    bacon leaf lettuce tomato avocado dijon mustar...
ingredients_token    [bacon, leaf, lettuce, tomato, avocado, dijon,...
Name: 1145, dtype: object