In [19]:
print ('Welcome to another Data Scraping via Natural Language Processing: CHRISTMAS EDITION !')
print ('STEP 1: What Question that we must satisfy? : "What are the most celebrated dishes during Noche Buena?"')
print ('STEP 2: Extraction and Cleaning of Data from the source list, https://panlasangpinoy.com/top-10-filipino-christmas-recipes/')
print (' -------------------------------------------------------------------------- ')

Welcome to another Data Scraping via Natural Language Processing: CHRISTMAS EDITION !
STEP 1: What Question that we must satisfy? : "What are the most celebrated dishes during Noche Buena?"
STEP 2: Extraction and Cleaning of Data from the source list, https://panlasangpinoy.com/top-10-filipino-christmas-recipes/
 -------------------------------------------------------------------------- 


In [5]:
import requests
from bs4 import BeautifulSoup
import pickle

def url_to_transcript(url):
    '''Returns transcript data specifically from panlasangpinoy.com'''
    page = requests.get(url).text
    soup = BeautifulSoup (page, "lxml")
    text = [p.text for p in soup.find(class_="entry-content").find_all('p')]
    print(url)
    return text

urls = ['https://panlasangpinoy.com/beef-holiday-christmas-new-year-food-menu-morcon-recipe/', 
       'https://panlasangpinoy.com/embutido/', 
       'https://panlasangpinoy.com/2011/02/02/crispy-pata-pulutan-recipe/',
       'https://panlasangpinoy.com/beef-kaldereta/',
       'https://panlasangpinoy.com/paella-marinara-arroz-recipe/',
       'https://panlasangpinoy.com/rice-cake-bibingka-recipe/',
       'https://panlasangpinoy.com/pancit-malabon-recipe/',
       'https://panlasangpinoy.com/leche-flan/',
       'https://panlasangpinoy.com/buko-salad/',
       'https://panlasangpinoy.com/fruit-salad-recipe/']

NocheBuena = ['Morcon', 'Embutido', 'Crispy Pata', 'Beef Kaldereta', 'Paella', 'Bibingka', 'Pancit Malabon',
             'Leche Flan', 'Buko Salad', 'Fruit Salad']

In [6]:
transcripts = [url_to_transcript(u) for u in urls]

https://panlasangpinoy.com/beef-holiday-christmas-new-year-food-menu-morcon-recipe/
https://panlasangpinoy.com/embutido/
https://panlasangpinoy.com/2011/02/02/crispy-pata-pulutan-recipe/
https://panlasangpinoy.com/beef-kaldereta/
https://panlasangpinoy.com/paella-marinara-arroz-recipe/
https://panlasangpinoy.com/rice-cake-bibingka-recipe/
https://panlasangpinoy.com/pancit-malabon-recipe/
https://panlasangpinoy.com/leche-flan/
https://panlasangpinoy.com/buko-salad/
https://panlasangpinoy.com/fruit-salad-recipe/


In [7]:
!mkdir transcripts

for i, c in enumerate(NocheBuena):
     with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

A subdirectory or file transcripts already exists.


In [8]:
data = {}
for i, c in enumerate(NocheBuena):
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [9]:
data.keys()

dict_keys(['Morcon', 'Embutido', 'Crispy Pata', 'Beef Kaldereta', 'Paella', 'Bibingka', 'Pancit Malabon', 'Leche Flan', 'Buko Salad', 'Fruit Salad'])

In [10]:
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [11]:
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [12]:
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
Beef Kaldereta,Beef Kaldereta is a main stay in any Filipino Kitchen. It is a type of beef stew cooked with tomato sauce and liver spread. Ingredients such as po...
Bibingka,"Bibingka is a type of rice cake native to the Philippines. This is traditionally made from galapong (milled glutinous rice), coconut milk, margari..."
Buko Salad,Buko Salad or sweet young coconut salad is a dessert dish that makes use of shredded young coconut as the main ingredient. This delicious dessert ...
Crispy Pata,Crispy Pata or crispy pork leg is a popular Filipino pork dish. This dish can be eaten as a main dish along with rice and atcharang papaya. People...
Embutido,"Embutido is a type of meatloaf prepared Filipino style. Though a well known dish for the holidays, Embutido can be enjoyed everyday without any ha..."
Fruit Salad,Fruit Salad is a general term referring to a fruit dish. This dish is typically composed of an assortment of fruits served as is or combined with ...
Leche Flan,Leche Flan is a dessert made-up of eggs and milk with a soft caramel on top. It resembles crème caramel and caramel custard. This delicious desser...
Morcon,"Morcon is a Filipino meat roll stuffed with sausage or hotdogs, carrots, pickles, cheese, and egg. This is considered as a holiday dish and is usu..."
Paella,Paella is a popular rice dish that originated from the Valencia region of Spain. This is usually made of short grain yellow rice and cooked in a s...
Pancit Malabon,Pansit Malabon is a flavorful noodle dish that originated in the City of Malabon. This dish resembles the Pancit Palabok but the array of seafood ...


In [13]:
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [14]:
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
Beef Kaldereta,beef kaldereta is a main stay in any filipino kitchen it is a type of beef stew cooked with tomato sauce and liver spread ingredients such as pota...
Bibingka,bibingka is a type of rice cake native to the philippines this is traditionally made from galapong milled glutinous rice coconut milk margarine an...
Buko Salad,buko salad or sweet young coconut salad is a dessert dish that makes use of shredded young coconut as the main ingredient this delicious dessert r...
Crispy Pata,crispy pata or crispy pork leg is a popular filipino pork dish this dish can be eaten as a main dish along with rice and atcharang papaya people a...
Embutido,embutido is a type of meatloaf prepared filipino style though a well known dish for the holidays embutido can be enjoyed everyday without any hass...
Fruit Salad,fruit salad is a general term referring to a fruit dish this dish is typically composed of an assortment of fruits served as is or combined with d...
Leche Flan,leche flan is a dessert madeup of eggs and milk with a soft caramel on top it resembles crème caramel and caramel custard this delicious dessert i...
Morcon,morcon is a filipino meat roll stuffed with sausage or hotdogs carrots pickles cheese and egg this is considered as a holiday dish and is usually ...
Paella,paella is a popular rice dish that originated from the valencia region of spain this is usually made of short grain yellow rice and cooked in a sp...
Pancit Malabon,pansit malabon is a flavorful noodle dish that originated in the city of malabon this dish resembles the pancit palabok but the array of seafood t...


In [15]:
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [16]:
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

Unnamed: 0,transcript
Beef Kaldereta,beef kaldereta is a main stay in any filipino kitchen it is a type of beef stew cooked with tomato sauce and liver spread ingredients such as pota...
Bibingka,bibingka is a type of rice cake native to the philippines this is traditionally made from galapong milled glutinous rice coconut milk margarine an...
Buko Salad,buko salad or sweet young coconut salad is a dessert dish that makes use of shredded young coconut as the main ingredient this delicious dessert r...
Crispy Pata,crispy pata or crispy pork leg is a popular filipino pork dish this dish can be eaten as a main dish along with rice and atcharang papaya people a...
Embutido,embutido is a type of meatloaf prepared filipino style though a well known dish for the holidays embutido can be enjoyed everyday without any hass...
Fruit Salad,fruit salad is a general term referring to a fruit dish this dish is typically composed of an assortment of fruits served as is or combined with d...
Leche Flan,leche flan is a dessert madeup of eggs and milk with a soft caramel on top it resembles crème caramel and caramel custard this delicious dessert i...
Morcon,morcon is a filipino meat roll stuffed with sausage or hotdogs carrots pickles cheese and egg this is considered as a holiday dish and is usually ...
Paella,paella is a popular rice dish that originated from the valencia region of spain this is usually made of short grain yellow rice and cooked in a sp...
Pancit Malabon,pansit malabon is a flavorful noodle dish that originated in the city of malabon this dish resembles the pancit palabok but the array of seafood t...


In [23]:
full_names = ['Morcon', 'Embutido', 'Crispy Pata', 'Beef Kaldereta', 'Paella', 'Bibingka', 'Pancit Malabon',
             'Leche Flan', 'Buko Salad', 'Fruit Salad']

data_df['full_name'] = full_names
data_df

Unnamed: 0,transcript,full_name
Beef Kaldereta,Beef Kaldereta is a main stay in any Filipino Kitchen. It is a type of beef stew cooked with tomato sauce and liver spread. Ingredients such as po...,Morcon
Bibingka,"Bibingka is a type of rice cake native to the Philippines. This is traditionally made from galapong (milled glutinous rice), coconut milk, margari...",Embutido
Buko Salad,Buko Salad or sweet young coconut salad is a dessert dish that makes use of shredded young coconut as the main ingredient. This delicious dessert ...,Crispy Pata
Crispy Pata,Crispy Pata or crispy pork leg is a popular Filipino pork dish. This dish can be eaten as a main dish along with rice and atcharang papaya. People...,Beef Kaldereta
Embutido,"Embutido is a type of meatloaf prepared Filipino style. Though a well known dish for the holidays, Embutido can be enjoyed everyday without any ha...",Paella
Fruit Salad,Fruit Salad is a general term referring to a fruit dish. This dish is typically composed of an assortment of fruits served as is or combined with ...,Bibingka
Leche Flan,Leche Flan is a dessert made-up of eggs and milk with a soft caramel on top. It resembles crème caramel and caramel custard. This delicious desser...,Pancit Malabon
Morcon,"Morcon is a Filipino meat roll stuffed with sausage or hotdogs, carrots, pickles, cheese, and egg. This is considered as a holiday dish and is usu...",Leche Flan
Paella,Paella is a popular rice dish that originated from the Valencia region of Spain. This is usually made of short grain yellow rice and cooked in a s...,Buko Salad
Pancit Malabon,Pansit Malabon is a flavorful noodle dish that originated in the City of Malabon. This dish resembles the Pancit Palabok but the array of seafood ...,Fruit Salad


In [24]:
data_df.to_pickle("corpus.pkl")

In [25]:
data_df

Unnamed: 0,transcript,full_name
Beef Kaldereta,Beef Kaldereta is a main stay in any Filipino Kitchen. It is a type of beef stew cooked with tomato sauce and liver spread. Ingredients such as po...,Morcon
Bibingka,"Bibingka is a type of rice cake native to the Philippines. This is traditionally made from galapong (milled glutinous rice), coconut milk, margari...",Embutido
Buko Salad,Buko Salad or sweet young coconut salad is a dessert dish that makes use of shredded young coconut as the main ingredient. This delicious dessert ...,Crispy Pata
Crispy Pata,Crispy Pata or crispy pork leg is a popular Filipino pork dish. This dish can be eaten as a main dish along with rice and atcharang papaya. People...,Beef Kaldereta
Embutido,"Embutido is a type of meatloaf prepared Filipino style. Though a well known dish for the holidays, Embutido can be enjoyed everyday without any ha...",Paella
Fruit Salad,Fruit Salad is a general term referring to a fruit dish. This dish is typically composed of an assortment of fruits served as is or combined with ...,Bibingka
Leche Flan,Leche Flan is a dessert made-up of eggs and milk with a soft caramel on top. It resembles crème caramel and caramel custard. This delicious desser...,Pancit Malabon
Morcon,"Morcon is a Filipino meat roll stuffed with sausage or hotdogs, carrots, pickles, cheese, and egg. This is considered as a holiday dish and is usu...",Leche Flan
Paella,Paella is a popular rice dish that originated from the Valencia region of Spain. This is usually made of short grain yellow rice and cooked in a s...,Buko Salad
Pancit Malabon,Pansit Malabon is a flavorful noodle dish that originated in the City of Malabon. This dish resembles the Pancit Palabok but the array of seafood ...,Fruit Salad


In [26]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,able,absorb,achievement,act,action,actually,add,added,adding,addition,...,works,worldit,worth,written,year,years,yellow,yolks,young,yummy
Beef Kaldereta,0,0,0,0,0,0,3,0,1,0,...,1,0,0,0,0,0,0,0,0,0
Bibingka,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Buko Salad,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,3,0
Crispy Pata,0,1,1,1,1,0,2,1,2,0,...,0,0,0,0,0,0,0,0,0,0
Embutido,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Fruit Salad,0,0,0,0,0,0,0,2,0,1,...,0,0,0,0,0,0,0,0,2,0
Leche Flan,1,0,0,0,0,1,2,1,0,0,...,0,1,1,0,0,0,0,1,0,1
Morcon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,1,0,0,0,0
Paella,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,0,0
Pancit Malabon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
data_dtm.to_pickle("dtm.pkl")

data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))

In [20]:
print ('Extraction and Cleaning of Data: COMPLETED')

Extraction and Cleaning of Data: COMPLETED
