In [1]:
import re
import sys
import warnings
from time import sleep
from urllib.parse import quote

sys.path.insert(0,'chromedriver')
from selenium import webdriver
from selenium.webdriver.common.by import By
from pandas import DataFrame, concat

warnings.filterwarnings('ignore')

In [2]:
# Build the Chrome options for a headless session and start the shared
# driver (`wd`) that every scraping function below navigates with.
chrome_options = webdriver.ChromeOptions()
for flag in ('--no-sandbox', '--disable-dev-shm-usage', '--headless'):
    chrome_options.add_argument(flag)

wd = webdriver.Chrome('../chromedriver', options=chrome_options)

In [3]:
base_url = 'https://www.mrbreakfast.com/cereal_list.asp?fi='
def scrapePageForCerealNames(letter):
    """Scrape one index page of the cereal list.

    Loads ``base_url + letter`` in the shared ``wd`` driver and reads every
    cereal card matched on the page.  Within a card, the first anchor's
    ``href`` carries the product id (``?id=<digits>``) and the second
    anchor's ``innerText`` is used as the cereal name.

    Parameters
    ----------
    letter : str
        Value appended to ``base_url`` (the ``fi`` query parameter).

    Returns
    -------
    tuple[DataFrame, int]
        A frame with the columns ``product_id`` (digit string) and ``name``,
        and the number of card elements matched on the page.  The count is
        returned separately because the crawl loops in this notebook treat
        a count of 0 as "page did not load, retry later".
    """
    wd.get(base_url + letter)
    cereal_elements = wd.find_elements(By.XPATH, "//div[contains(@style, 'float: left; width:250px; margin-left: 10px; margin-right: 14px; height: 122px;')]")

    # Collect plain tuples and build the frame once instead of growing it
    # row by row.
    records = []
    for card in cereal_elements:
        anchors = card.find_elements(By.TAG_NAME, 'a')
        if len(anchors) < 2:
            # Not a regular cereal card (id link + name link); skip it
            # rather than abort the whole page with an IndexError.
            continue
        # `href` may be missing (None); fall back to "" so the search
        # simply finds nothing.
        id_match = re.search(r"\?id=(\d+)", anchors[0].get_attribute('href') or "")
        if id_match is None:
            continue
        records.append((id_match.group(1), anchors[1].get_attribute('innerText')))

    return DataFrame(records, columns=["product_id", "name"]), len(cereal_elements)

In [4]:
# Crawl the index pages for the letters A..Z and accumulate every cereal
# found in `all_cereals`.  Letters whose page yields no cereal cards are
# recorded in `errors` so they can be retried afterwards.
all_cereals = DataFrame({
        "product_id": [],
        "name": []
})
errors = list()

# Iterate over the letters directly; the previous loop variable `ascii`
# shadowed the builtin of the same name.
for letter in map(chr, range(ord('A'), ord('Z') + 1)):
    sleep(0.5)  # pause between page loads
    cereals, n_cereals = scrapePageForCerealNames(letter)
    if n_cereals == 0:
        errors.append(letter)
    else:
        # Accumulate incrementally so an interrupted crawl keeps the pages
        # it has already scraped.
        all_cereals = concat([all_cereals, cereals], ignore_index=True)

errors

['D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z']

In [13]:
# Retry the letters recorded in `errors` until every page has yielded
# cereals or MAX_RETRY_PASSES passes have been made.  The bound keeps the
# cell from looping forever; whatever is left in `errors` afterwards still
# failed, and re-running the cell simply continues from there.
MAX_RETRY_PASSES = 10

rec_error_list = []
retry_pass = 0
while errors and retry_pass < MAX_RETRY_PASSES:
    retry_pass += 1
    for letter in errors:
        sleep(0.5)  # pause between page loads
        cereals, n_cereals = scrapePageForCerealNames(letter)
        if n_cereals == 0:
            rec_error_list.append(letter)
        else:
            all_cereals = concat([all_cereals, cereals], ignore_index=True)
    # The letters that failed in this pass become the input of the next one.
    errors, rec_error_list = rec_error_list, []

errors

[]

In [14]:
# Persist the scraped cereal index (product_id, name) as CSV, without the
# DataFrame's row index.
all_cereals.to_csv("../data/all_cereals.csv", index=False)

In [15]:
comments_url = 'https://www.mrbreakfast.com/cereal_allreviews.asp?id='
def scrapePageForComments(product_id):
    """Scrape every review on a product's "all reviews" page.

    The review section is read as raw HTML and cut into one fragment per
    review at each ``By <b>`` marker.  Each fragment is wrapped in a
    ``<div id='uniq-comment'>`` and rendered in the shared ``wd`` driver
    through a ``data:`` URL, so the browser does the HTML-to-text
    conversion and the rating elements can be counted.

    Parameters
    ----------
    product_id : int or str
        Product id appended to ``comments_url``; it is also copied into the
        ``product_id`` column of every returned row.

    Returns
    -------
    DataFrame
        One row per review with the columns ``user``, ``rating``,
        ``comment`` and ``product_id``.  The frame is empty (same columns)
        when the section contains no ``By <b>`` marker.

    Raises
    ------
    Whatever the driver raises, e.g. when the review section cannot be
    located on the page.
    """
    columns = ["user", "rating", "comment", "product_id"]

    wd.get(comments_url + str(product_id))
    comment_section = wd.find_element(By.XPATH, "//div[contains(@id, 'right-cereal-content')]/span[contains(@class, 'small')]")

    # Text before the first "By <b>" marker is not part of a review.
    review_chunks = comment_section.get_attribute('innerHTML').split("By <b>")[1:]

    records = []
    for chunk in review_chunks:
        # The chunk starts with the user name, terminated by "</b>".
        user, *remainder = chunk.split("</b>")
        body = "</b>".join(remainder)
        # Drop the "Comment submitted: <date> (#<n>)" footer.  re.split
        # keeps the captured "<br>", so one line break survives the join.
        body = "".join(re.split(r'<br>Comment submitted: \d+/\d+/\d+ \(#\d+\)(<br>)+', body))
        fragment = "<div id='uniq-comment'><b>" + body + "</div>"

        # Percent-encode the fragment: a raw '#' in a data: URL starts the
        # URL fragment (cutting the document short) and a raw '%xx' would
        # be decoded by the browser, corrupting the comment text.
        wd.get("data:text/html;charset=utf-8," + quote(fragment))

        rendered = wd.find_element(By.XPATH, "//div[contains(@id, 'uniq-comment')]").get_attribute('innerText')
        # Collapse all whitespace runs, then remove the rating label.
        comment = "".join(" ".join(rendered.split()).split("Rating (out of 5):")).strip()
        # Rating = half the number of child elements of the top-aligned
        # table cells (presumably two elements are rendered per rating
        # point -- TODO confirm against the page markup).
        rating = len(wd.find_elements(By.XPATH, "//td[contains(@valign, 'top')]/*"))//2

        records.append((user, rating, comment, product_id))

    return DataFrame(records, columns=columns)

# Smoke test: scrape the reviews of a single product (id 300) and display
# the resulting frame.
scrapePageForComments(300)

Unnamed: 0,user,rating,comment,product_id
0,Postman,5,National Cereal Day is March 7. Enjoy a bowl o...,300
1,Postman,5,"Great to see Quisp on the back of the Red, Whi...",300
2,Postman,5,Happy belated National Cereal Day (March 7). W...,300
3,Ernie,5,"I remember having Quangaroos back in the day, ...",300
4,PlatNuMonk,5,(Team Breakfast Member) Since they stopped ma...,300
...,...,...,...,...
93,planno,5,Quisp and Quake were fun. Cereal were so much ...,300
94,Quispy Memories,5,My mom was a health cereal nut when I was grow...,300
95,nana,5,I really loved Quisp cereal and was really dis...,300
96,WJ,5,I loved this cereal as a kid. I actually had t...,300


In [16]:
# Crawl the review page of every cereal in `all_cereals` and accumulate the
# reviews in `comments_df`.  Product ids whose page could not be scraped are
# recorded in `errors` so they can be retried afterwards.
comments_df = DataFrame({
    "user": [],
    "rating": [],
    "comment": [],
    "product_id": []
})
errors = []

for itm in all_cereals.loc[:,"product_id"]:
    sleep(0.5)  # pause between page loads
    try:
        comment_df = scrapePageForComments(itm)
    except Exception:
        # `Exception`, not a bare `except`, so that interrupting the kernel
        # (KeyboardInterrupt) still stops this long-running crawl.
        errors.append(itm)
    else:
        # Only the scrape is guarded: if the append/write below failed
        # inside the `try`, the product would be queued for retry although
        # its rows were already appended, duplicating them later.
        # Products with fewer than 20 reviews are skipped.
        if len(comment_df) >= 20:
            comments_df = concat([comments_df, comment_df], ignore_index=True)
            # Checkpoint after every product so an interrupted crawl keeps
            # what it has collected so far.
            comments_df.to_csv("../data/comments.csv", index=False)

errors

['1480',
 '503',
 '961',
 '1054',
 '1284',
 '1119',
 '1445',
 '1257',
 '14',
 '1686',
 '1668',
 '1667',
 '1648',
 '434',
 '37',
 '1639',
 '38',
 '1366',
 '27',
 '28',
 '1706',
 '1426',
 '30',
 '1602',
 '31',
 '926',
 '53',
 '626',
 '1747',
 '1640',
 '1659',
 '1217',
 '1402',
 '439',
 '59',
 '1643',
 '1329',
 '60',
 '1369',
 '1007',
 '681',
 '64',
 '423',
 '431',
 '1344',
 '1026',
 '1675',
 '71',
 '571',
 '1714',
 '1455',
 '882',
 '440',
 '665',
 '426',
 '79',
 '80',
 '87',
 '1698',
 '1095',
 '89',
 '90',
 '99',
 '1169',
 '1208',
 '896',
 '98',
 '100',
 '1291',
 '1738',
 '101',
 '874',
 '110',
 '769',
 '111',
 '488',
 '918',
 '515',
 '1537',
 '806',
 '137',
 '1167',
 '139',
 '140',
 '141',
 '1735',
 '1290',
 '143',
 '1494',
 '529',
 '612',
 '147',
 '462',
 '546',
 '547',
 '159',
 '1687',
 '1199',
 '1635',
 '171',
 '1459',
 '684',
 '172',
 '1331',
 '166',
 '579',
 '1523',
 '1317',
 '601',
 '965',
 '1187',
 '964',
 '969',
 '967',
 '962',
 '968',
 '780',
 '412',
 '192',
 '1028',
 '1067',
 

In [18]:
# Retry the product ids recorded in `errors` until every page has been
# scraped or MAX_RETRY_PASSES passes have been made.  The bound keeps the
# cell from looping forever; whatever is left in `errors` afterwards still
# failed, and re-running the cell simply continues from there.
MAX_RETRY_PASSES = 10

rec_error_list = []
retry_pass = 0
while errors and retry_pass < MAX_RETRY_PASSES:
    retry_pass += 1
    for itm in errors:
        sleep(0.5)  # pause between page loads
        try:
            comment_df = scrapePageForComments(itm)
        except Exception:
            # `Exception`, not a bare `except`, so that interrupting the
            # kernel (KeyboardInterrupt) still stops the retry loop.
            rec_error_list.append(itm)
        else:
            # Products with fewer than 20 reviews are skipped.
            if len(comment_df) >= 20:
                comments_df = concat([comments_df, comment_df], ignore_index=True)
                # Checkpoint after every product so progress survives an
                # interruption.
                comments_df.to_csv("../data/comments.csv", index=False)
    # The ids that failed in this pass become the input of the next one.
    errors, rec_error_list = rec_error_list, []

errors

[]