In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import wordnet 
import string

# Fetch the NLTK resources used further down: the English stop-word list
# (read in remove_stopwords) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): presumably needed by the WordNet lemmatizer on newer NLTK releases — confirm.
nltk.download('omw-1.4')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from googletrans import Translator

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Emong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Emong\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Emong\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load one recipe table per source CSV.
panlasang_pinoy_df, kawaling_pinoy_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/panlasang_pinoy.csv", "data/kawaling_pinoy.csv")
)

# Stack the two tables row-wise; each table keeps its own index labels here.
recipe_df = pd.concat([panlasang_pinoy_df, kawaling_pinoy_df], axis=0)
recipe_df

Unnamed: 0,url,food,course,ingredients,instructions
0,https://panlasangpinoy.com/ham-katsu/,Ham Katsu,Main Course,500 grams CDO Holiday Ham sliced (1-inch thick...,Heat cooking oil in a pan.\nDredge a slice of ...
1,https://panlasangpinoy.com/cheesy-ham-steak-wi...,Cheesy Ham Steak with Bacon and Mushroom,Main Course,500 grams CDO Holiday Ham sliced (1-inch thick...,Melt butter in a pan. Sauté onion and garlic. ...
2,https://panlasangpinoy.com/hawaiian-pizza-no-b...,Hawaiian Pizza (No Bake),none,"75 grams CDO Sweet Ham sliced, ½ cup pineapple...","Make the pizza dough by combining flour, bakin..."
3,https://panlasangpinoy.com/crispy-katsu-sando/,Crispy Katsu Sando,Sandwich,"6 CDO Crispy burger patties, 3 slices tasty br...",Heat the cooking oil in a pan. Fry both sides ...
4,https://panlasangpinoy.com/orange-crispy-burger/,Orange Crispy Burger,none,"3 CDO Crispy burger patties, 2 tablespoons gre...",Heat the cooking oil in a pan. Fry both sides ...
...,...,...,...,...,...
591,https://www.kawalingpinoy.com/garlic-butter-fr...,Garlic Butter Fried Frog Legs,none,"1 lb about 3 to 4 pieces frog legs, 1 tablespo...","Rinse frog legs and pat dry.\nIn a bowl, combi..."
592,https://www.kawalingpinoy.com/tinolang-manok-c...,Tinolang Manok,Main Entree,"1 tablespoon canola oil, 1 small onion, peeled...","In a pot over medium heat, heat oil. Add onion..."
593,https://www.kawalingpinoy.com/tilapia-with-bla...,Tilapia in Black Bean Garlic Sauce,Main Entree,"4 (4 ounces each) tilapia fillets, salt and pe...",Wash tilapia and pat dry. Lightly season with ...
594,https://www.kawalingpinoy.com/pork-adobo/,Pork Adobo,Main Entree,"2 pounds pork belly, cut into 2-inch cubes, 1 ...","In a bowl, combine pork, onions, garlic, bay l..."


In [3]:
# Quick audit of the combined frame: column names, missing values per column,
# and the (rows, columns) shape.
display(recipe_df.columns, recipe_df.isna().sum(), recipe_df.shape)

Index(['url', 'food', 'course', 'ingredients', 'instructions'], dtype='object')

url              0
food             0
course           0
ingredients     73
instructions     0
dtype: int64

(2354, 5)

In [4]:
# Keep only recipes that list ingredients, drop repeated dish names
# (pandas keeps the first occurrence by default), and renumber rows from 0.
recipe_df = (
    recipe_df
    .dropna(subset=['ingredients'])
    .drop_duplicates(subset=['food'])
    .reset_index(drop=True)
)

In [5]:
# Inspect the frame after removing missing-ingredient rows and duplicate dish names.
recipe_df

Unnamed: 0,url,food,course,ingredients,instructions
0,https://panlasangpinoy.com/ham-katsu/,Ham Katsu,Main Course,500 grams CDO Holiday Ham sliced (1-inch thick...,Heat cooking oil in a pan.\nDredge a slice of ...
1,https://panlasangpinoy.com/cheesy-ham-steak-wi...,Cheesy Ham Steak with Bacon and Mushroom,Main Course,500 grams CDO Holiday Ham sliced (1-inch thick...,Melt butter in a pan. Sauté onion and garlic. ...
2,https://panlasangpinoy.com/hawaiian-pizza-no-b...,Hawaiian Pizza (No Bake),none,"75 grams CDO Sweet Ham sliced, ½ cup pineapple...","Make the pizza dough by combining flour, bakin..."
3,https://panlasangpinoy.com/crispy-katsu-sando/,Crispy Katsu Sando,Sandwich,"6 CDO Crispy burger patties, 3 slices tasty br...",Heat the cooking oil in a pan. Fry both sides ...
4,https://panlasangpinoy.com/orange-crispy-burger/,Orange Crispy Burger,none,"3 CDO Crispy burger patties, 2 tablespoons gre...",Heat the cooking oil in a pan. Fry both sides ...
...,...,...,...,...,...
2188,https://www.kawalingpinoy.com/garlic-butter-fr...,Garlic Butter Fried Frog Legs,none,"1 lb about 3 to 4 pieces frog legs, 1 tablespo...","Rinse frog legs and pat dry.\nIn a bowl, combi..."
2189,https://www.kawalingpinoy.com/tinolang-manok-c...,Tinolang Manok,Main Entree,"1 tablespoon canola oil, 1 small onion, peeled...","In a pot over medium heat, heat oil. Add onion..."
2190,https://www.kawalingpinoy.com/tilapia-with-bla...,Tilapia in Black Bean Garlic Sauce,Main Entree,"4 (4 ounces each) tilapia fillets, salt and pe...",Wash tilapia and pat dry. Lightly season with ...
2191,https://www.kawalingpinoy.com/pork-adobo/,Pork Adobo,Main Entree,"2 pounds pork belly, cut into 2-inch cubes, 1 ...","In a bowl, combine pork, onions, garlic, bay l..."


In [6]:
# Re-check missing values per column and the shape after the cleanup step.
display(recipe_df.isna().sum(), recipe_df.shape)

url             0
food            0
course          0
ingredients     0
instructions    0
dtype: int64

(2193, 5)

## Text Preprocessing

In [9]:
def remove_punctuation(text):
    """Return ``text`` with every character from ``string.punctuation`` deleted.

    Punctuation is removed outright, not replaced by a space, so words joined
    by a hyphen or slash are fused together (``"all-purpose"`` -> ``"allpurpose"``).
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

# Tokenizer whose pattern matches runs of ASCII letters only; digits and any
# other characters never become part of a token.
tokenizer = RegexpTokenizer(r"[a-zA-Z]+")

def translate(word):
    """Translate ``word`` into English with googletrans and return the text.

    A fresh ``Translator`` is created on every call, and no source language is
    passed, so source handling is left to the library.
    """
    return Translator().translate(word, dest='en').text

def remove_stopwords(text):
    """Filter NLTK's English stop words out of a token sequence.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (despite the name, this is a token list, not a string).

    Returns
    -------
    list of str
        The tokens that are not English stop words, in their original order.
    """
    # A set gives constant-time membership tests; the local name also avoids
    # shadowing the ``stopwords`` module imported at the top of the notebook.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    return [w for w in text if w not in english_stopwords]

def word_lemmatizer(text):
    """Lemmatize each token as a verb first, then as a noun.

    ``text`` is an iterable of tokens; the result is a list of the same length
    in which every token has passed through ``WordNetLemmatizer.lemmatize``
    twice (``wordnet.VERB`` pass, then ``wordnet.NOUN`` pass on that result).
    """
    wn_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        as_verb = wn_lemmatizer.lemmatize(token, wordnet.VERB)
        lemmas.append(wn_lemmatizer.lemmatize(as_verb, wordnet.NOUN))
    return lemmas

def word_stemmer(text):
    """Stem every token with NLTK's Porter stemmer.

    Parameters
    ----------
    text : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        One stem per input token, in the same order.
    """
    # The previous version built a WordNetLemmatizer here, which has no
    # ``stem`` method, so every call raised AttributeError.
    stemmer = PorterStemmer()
    stem_text = [stemmer.stem(i) for i in text]
    return stem_text

# Recipe-specific stop words: measurement units, preparation verbs, generic
# descriptors, and fused or misspelled variants that show up once punctuation
# has been deleted. Built once as a frozenset so membership tests are
# constant-time; repeated entries are harmless in a set.
_ADDITIONAL_STOPWORDS = frozenset([
    'gram', 'pound', 'inch', 'lb', 'lbs', 'cup', 'thick', 'slice', 'teaspoon','tablespoon',
    'drop', 'pinch', 'pint', 'gallon', 'liter', 'milliliter', 'ml', 'kilo', 'kg', 'oz','ounce',
    'about', 'to', 'pound', 'cut', 'salt', 'piece', 'chop', 'water', 'mince', 'cook','grind',
    'peel', 'taste', 'medium', 'small', 'cut', 'clove', 'green', 'crush', 'optional', 'large',
    'leave', 'dry', 'fresh', 'cube', 'whole', 'thinly', 'strip', 'brown', 'shred',
    'optional', 'clean', 'bunch', 'serve', 'thai', 'julienne', 'boil', 'long', 'boneless',
    'grate', 'beat', 'sweet', 'remove', 'quarter', 'extract', 'drain', 'package', 'thin',
    'trim', 'chinese', 'season', 'core', 'fry', 'mix', 'extra', 'end', 'liquid', 'length',
    'melt', 'glutinous', 'half', 'pack', 'thumbsize', 'tbsp', 'soften', 'g', 'recipe', 'quart',
    'firm', 'unsalted', 'salt', 'pepper', 'choice', 'size', 'young', 'leftover', 'lengthwise',
    'pinch', 'crosswise', 'separate', 'well', 'tsp', 'use', 'part', 'scale', 'diagonally',
    'instruction', 'wash', 'steam', 'round', 'unripe', 'reserve', 'table', 'button', 'snow',
    'oil', 'finely', 'free', 'nonstick', 'accord', 'sa', 'ng', 'preferably', 'rinse', 'need',
    'homemade', 'also', 'elbow', 'real', 'tabespoon', 'shreddedgrated', 'color', 'gutted',
    'tidbit','style', 'bit', 'food', 'savorrich', 'quick', 'temperature', 'ap', 'freshly',
    'room','snap', 'ball', 'low', 'see', 'thumbsized', 'active', 'bundle', 'na', 'instant',
    'pit', 'brand', 'procedure', 'mediumsized', 'f', 'original', 'shorten',
    'malagkit', 'filipino', 'french', 'american', 'minute', 'individual', 'manila', 'dissolve',
    'sachet', 'process', 'ingredient', 'coarsely', 'regular', 'essence', 'portion', 'lightly',
    'short', 'side', 'get', 'wide', 'packet', 'x', 'imitation', 'fine', 'clear', 'per', 'divide',
    'deep', 'bottom', 'dress', 'link', 'five', 'combine', 'thickness', 'semisweet', 'plus',
    'count','real', 'pouch', 'overnight', 'click', 'indian', 'jumbo', 'discard', 'lower',
    'dip', 'third', 'good', 'equivalent', 'without','around', 'lukewarm', 'tip', 'want', 'single',
    'amount', 'enough', 'lenghtwise','fill', 'intact', 'paper', 'block', 'mini', 'two', 'halfrounds',
    'cellophane', 'tasty', 'know', 'add', 'mixture', 'paypay', 'view', 'way', 'par', 'refrigerate',
    'rectangular', 'precook', 'favorite', 'curly', 'holy', 'prefer', 'pc', 'easy', 'holy', 'shave',
    'bite', 'box', 'double', 'refried', 'crisp', 'wrap', 'asian', 'mature', 'full', 'freshlysqueezed',
    'filipinostyle', 'sweetstyle', 'smash', 'wellbeaten', 'hugas', 'individually', 'daga', 'derive',
    'coat', 'additional', 'sticky', 'sugar', 'de', 'halve', 'freeze',
])


def additional_stopwords(text):
    """Drop recipe-specific stop words from a token sequence.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter; the comparison is exact and case-sensitive, and the
        stop-word entries are all lowercase.

    Returns
    -------
    list of str
        Tokens not present in ``_ADDITIONAL_STOPWORDS``, in original order.
    """
    words = [w for w in text if w not in _ADDITIONAL_STOPWORDS]
    return words

In [10]:
# Normalise the raw ingredient text: delete punctuation, then lowercase.
recipe_df['ingredients_clean'] = recipe_df['ingredients'].apply(remove_punctuation).str.lower()

# Tokenize, then run the token lists through each step in order:
# lemmatize -> drop English stop words -> drop recipe-specific stop words.
token_lists = recipe_df['ingredients_clean'].apply(tokenizer.tokenize)
for token_step in (word_lemmatizer, remove_stopwords, additional_stopwords):
    token_lists = token_lists.apply(token_step)
recipe_df['ingredients_token'] = token_lists

# Rebuild one space-separated string per recipe from the surviving tokens,
# then send every string through translate() (one call per row).
recipe_df['ingredients_clean'] = recipe_df['ingredients_token'].apply(' '.join)
recipe_df['ingredients_clean'] = recipe_df['ingredients_clean'].apply(translate)

In [11]:
# Exclude recipes whose course is exactly "Sauce", then renumber rows from 0.
not_sauce = recipe_df['course'].ne("Sauce")
recipe_df = recipe_df.loc[not_sauce].reset_index(drop=True)

In [15]:
# Inspect the frame with the new ingredients_clean / ingredients_token columns.
recipe_df

Unnamed: 0,url,food,course,ingredients,instructions,ingredients_clean,ingredients_token
0,https://panlasangpinoy.com/ham-katsu/,Ham Katsu,Main Course,500 grams CDO Holiday Ham sliced (1-inch thick...,Heat cooking oil in a pan.\nDredge a slice of ...,cdo holiday ham panko breadcrumb all purpose f...,"[cdo, holiday, ham, panko, breadcrumb, allpurp..."
1,https://panlasangpinoy.com/cheesy-ham-steak-wi...,Cheesy Ham Steak with Bacon and Mushroom,Main Course,500 grams CDO Holiday Ham sliced (1-inch thick...,Melt butter in a pan. Sauté onion and garlic. ...,cdo holiday ham mushroom cdo pork bacon crispy...,"[cdo, holiday, ham, mushroom, cdo, pork, bacon..."
2,https://panlasangpinoy.com/hawaiian-pizza-no-b...,Hawaiian Pizza (No Bake),none,"75 grams CDO Sweet Ham sliced, ½ cup pineapple...","Make the pizza dough by combining flour, bakin...",cdo ham pineapple pizza sauce mozzarella chees...,"[cdo, ham, pineapple, pizza, sauce, mozzarella..."
3,https://panlasangpinoy.com/crispy-katsu-sando/,Crispy Katsu Sando,Sandwich,"6 CDO Crispy burger patties, 3 slices tasty br...",Heat the cooking oil in a pan. Fry both sides ...,cdo crispy burger patty bread cabbage katsu sa...,"[cdo, crispy, burger, patty, bread, cabbage, k..."
4,https://panlasangpinoy.com/orange-crispy-burger/,Orange Crispy Burger,none,"3 CDO Crispy burger patties, 2 tablespoons gre...",Heat the cooking oil in a pan. Fry both sides ...,cdo crispy burger patty onion orange juice soy...,"[cdo, crispy, burger, patty, onion, orange, ju..."
...,...,...,...,...,...,...,...
2185,https://www.kawalingpinoy.com/garlic-butter-fr...,Garlic Butter Fried Frog Legs,none,"1 lb about 3 to 4 pieces frog legs, 1 tablespo...","Rinse frog legs and pat dry.\nIn a bowl, combi...",frog leg granulated garlic black egg cornstarc...,"[frog, leg, granulate, garlic, black, egg, cor..."
2186,https://www.kawalingpinoy.com/tinolang-manok-c...,Tinolang Manok,Main Entree,"1 tablespoon canola oil, 1 small onion, peeled...","In a pot over medium heat, heat oil. Add onion...",canola onion garlic ginger chicken fish sauce ...,"[canola, onion, garlic, ginger, chicken, fish,..."
2187,https://www.kawalingpinoy.com/tilapia-with-bla...,Tilapia in Black Bean Garlic Sauce,Main Entree,"4 (4 ounces each) tilapia fillets, salt and pe...",Wash tilapia and pat dry. Lightly season with ...,tilapia fillet canola black bean tausi chili g...,"[tilapia, fillet, canola, black, bean, tausi, ..."
2188,https://www.kawalingpinoy.com/pork-adobo/,Pork Adobo,Main Entree,"2 pounds pork belly, cut into 2-inch cubes, 1 ...","In a bowl, combine pork, onions, garlic, bay l...",pork belly cub onion head garlic bay vinegar s...,"[pork, belly, cub, onion, head, garlic, bay, v..."


In [16]:
from collections import Counter

# Tally how often each whitespace-separated word occurs across all cleaned
# ingredient strings.
results = Counter()
for row_words in recipe_df['ingredients_clean'].str.lower().str.split():
    results.update(row_words)
print(results)

Counter({'onion': 1427, 'garlic': 1375, 'sauce': 1208, 'black': 836, 'pork': 665, 'chicken': 608, 'powder': 591, 'egg': 588, 'cub': 508, 'white': 487, 'tomato': 486, 'soy': 471, 'milk': 449, 'beef': 438, 'red': 403, 'fish': 390, 'flour': 364, 'butter': 353, 'yellow': 349, 'bell': 336, 'vinegar': 333, 'knorr': 309, 'ginger': 302, 'cheese': 299, 'coconut': 292, 'shrimp': 288, 'carrot': 286, 'chili': 286, 'canola': 282, 'rice': 279, 'dice': 264, 'cream': 249, 'bean': 246, 'broth': 240, 'juice': 218, 'cornstarch': 213, 'purpose': 209, 'head': 200, 'all': 189, 'lemon': 184, 'olive': 180, 'seed': 175, 'peppercorn': 174, 'potato': 169, 'bay': 169, 'belly': 150, 'bake': 149, 'breast': 133, 'banana': 129, 'granulate': 127, 'cabbage': 125, 'wedge': 124, 'noodle': 123, 'thumb': 122, 'oyster': 121, 'paste': 120, 'sesame': 119, 'parsley': 119, 'wine': 117, 'pea': 112, 'mayonnaise': 111, 'pineapple': 108, 'cheddar': 103, 'corn': 100, 'vanilla': 99, 'calamansi': 94, 'bread': 91, 'eggplant': 91, 'scal

In [17]:
# Persist the cleaned frame without the row index.
# NOTE: ingredients_token holds Python lists, so in the CSV each one is stored
# as its string form and will read back as text, not as a list.
recipe_df.to_csv("data/filipino_recipe_clean.csv", index=False)

In [None]:
# Fit a TF-IDF model over the cleaned ingredient strings using word
# uni-, bi- and tri-grams, with scikit-learn's built-in English stop words.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary min_df=0 produced; recent scikit-learn versions reject an
# integer min_df of 0 during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(recipe_df['ingredients_clean'])

In [None]:
# Project a one-word query into the TF-IDF space fitted above.
test = tf.transform(['avocado'])

In [None]:
# Cosine similarity of every recipe against the query vector.
test_list = cosine_similarity(tfidf_matrix, test)
# Row positions of the five highest scores, in ascending score order (best
# match last). A stable sort keeps tied rows in their original relative order.
indexes = np.argsort(test_list.ravel(), kind="stable")[-5:].tolist()

In [None]:
# Row positions of the five best-scoring recipes (highest score last).
indexes

In [None]:
# Show a single recipe by row position.
# NOTE(review): 1145 is hard-coded; presumably copied from `indexes` above — confirm,
# and consider indexing with `indexes` so the cell survives a data change.
recipe_df.iloc[1145]