In [404]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
import glob

from bs4 import BeautifulSoup

In [405]:
group10 = pd.read_csv('../Data/Target Data/group10_header.csv',
                      sep='\t', low_memory=False)
ip_file_dir = "../Data/Target Data/scraped"
file_list = glob.glob("../Data/Target Data/scraped/*details*")
tcin_completed = [file.split('/')[-1].split('_')[0] for file in file_list]
total_tcin = list(set(pd.Series(tcin_completed).astype('int')))

In [406]:
len(total_tcin)

21611

In [418]:
def get_highlights(soup):
    highlights = []
    try:
        highlight_tag = soup.find('h3', text='Highlights')
        highlights_sib = highlight_tag.find_next()
        highlights_spans = highlights_sib.find_all('span')
        for s in highlights_spans:
            highlights.append(s.text)
    except:
        highlights = np.nan
    return highlights

In [419]:
def get_specifications(soup):
    specifications = []
    try:
        specifications_tag = soup.find('h3', text='Specifications')
        for d in specifications_tag.parent.find_all('div'):
            specifications.append(d.text)
        specifications = [s for s in list(set(specifications))
                          if not s.startswith('Content on this site is for reference purposes only')
                          if not s.startswith('Grocery Disclaimer')]
    except:
        specifications = np.nan
    return specifications

In [420]:
def get_description(soup):
    description_text = ''
    try:
        description_tag = soup.find('h3', text='Description')
        description_sib = description_tag.find_next()
        description_text = description_sib.text
    except:
        pass
    return description_text

In [421]:
def get_serving_info(soup):
    serving_info = []
    try:
        nutrition = soup.find('div', {'data-test':
                                      'productDetailsTabs-nutritionFactsTab'}).div.div.div
        for p in nutrition.find_all('p'):
            serving_info.append(p.text)
    except:
        serving_info = np.nan
    return serving_info

In [429]:
def get_nutrition_info(soup):
    nutrition_info = []
    try:
        nutrition = soup.find('div', {'data-test':
                                      'productDetailsTabs-nutritionFactsTab'}).div.div.div
        nutrition_tags = nutrition.find_all('div',
                                            class_=lambda x: x and x.startswith("h-margin-t-tight"))
        for nutrition_tag in nutrition_tags:
            text_split = nutrition_tag.span.text.split()
            nutrition_info.append(
                ([" ".join(text_split[0:-1]), (text_split[-1])]))
    except:
        nutrition_info = np.nan
    return nutrition_info

In [423]:
def get_ingredients(soup):
    ingredients_text = ''
    try:
        ingredients_tag = soup.find('h4',
                                    text=lambda x: x.startswith('Ingredients'))
        ingredients_sib = ingredients_tag.find_next()
        ingredients_text = ingredients_sib.text
    except:
        pass
    return ingredients_text

In [424]:
def get_allergens(soup):
    allergens_text = ''
    try:
        allergens = soup.find('h4',
                              text=lambda x: x.startswith('Allergens & Warnings'))
        allergens_text = allergens.parent.text
    except:
        pass
    return allergens_text

In [425]:
def get_price(soup):
    product_price = np.nan
    try:
        product_price_tag = soup.find('div', {'data-test': 'product-price'})
        product_price = float(product_price_tag.get_text()[1:])
    except:
        pass
    return product_price


def get_rating(soup):
    rating = np.nan
    try:
        rating_tag = soup.find('span', {'data-test': 'ratings'})
        rating = float(rating_tag.get_text().split()[0])
    except:
        pass
    return rating


def get_n_reviews(soup):
    n_reviews = np.nan
    try:
        n_reviews_tag = soup.find('span', {'data-test': 'ratings'})
        n_reviews = int(n_reviews_tag.get_text().split()[-2])
    except:
        pass
    return n_reviews

In [437]:
op_file_path = os.path.join(file_dir, 'products.csv')
products = []
for i, tcin in enumerate(tqdm(total_tcin)):
    try:
        path = os.path.join(ip_file_dir, str(tcin) + '_details.html')
        with open(path) as f:
            html = f.read()
        details_soup = BeautifulSoup(html)

        product = {}
        product['tcin'] = tcin
        product['price'] = get_price(details_soup)
        product['rating'] = get_rating(details_soup)
        product['n_reviews'] = get_n_reviews(details_soup)
        product['highlights'] = get_highlights(details_soup)
        product['specifications'] = get_specifications(details_soup)
        product['description'] = get_description(details_soup)

        path = os.path.join(ip_file_dir, str(tcin) + '_label_info.html')
        if os.path.isfile(path):
            with open(path) as f:
                html = f.read()
        label_info_soup = BeautifulSoup(html)
        product['serving_info'] = get_serving_info(label_info_soup)
        product['nutrition_info'] = get_nutrition_info(label_info_soup)
        product['ingredients'] = get_ingredients(label_info_soup)
        product['allergens'] = get_allergens(label_info_soup)

        products.append(product)

        if i % 50 == 0:
            products_df = pd.DataFrame(products)
            products_df.to_csv(op_file_path, index=False)
    except:
        print(tcin)
        pass

products_df = pd.DataFrame(products)
products_df.to_csv(op_file_path, index=False)

  0%|          | 0/21611 [00:00<?, ?it/s]

In [438]:
products = pd.read_csv(op_file_path)

In [439]:
products

Unnamed: 0,tcin,price,rating,n_reviews,highlights,specifications,description,serving_info,nutrition_info,ingredients,allergens
0,52297732,49.99,,0.0,"['Features hints of apricot, pear and sweet ro...","['', 'Alcohol base: Barley, Corn, Rye', 'Flavo...",An homage to the legendary Stitzel-Weller dist...,,,,
1,51773444,19.99,4.2,39.0,,"['Origin: Made in the USA or Imported', 'WARNI...",,,,,Allergens & Warnings:CONTAINS SULFITES
2,77332489,7.99,4.7,142.0,['One package of Tyson® Blackened Flavored Unb...,"['Origin: Made in the USA or Imported', 'Count...",Simplify meal time with Tyson® Blackened Flavo...,"['Serving Size: 3 oz', 'Serving Per Container:...","[['Total Fat', '3g'], ['Saturated Fat', '0.5g'...","boneless, skinless chicken breast strips with ...",
3,77332491,8.69,3.0,143.0,['One 20 oz. package of Air Fried Perfectly Cr...,"['Country of Origin: United States', 'TCIN: 77...","Enjoy the crispy, delicious flavor of fried ch...","['Serving Size: 3 oz', 'Serving Per Container:...","[['Total Fat', '4g'], ['Saturated Fat', '1g'],...","boneless, skinless chicken breast with rib mea...",Allergens & Warnings:CONTAINS: WHEAT
4,52297742,22.99,,0.0,"['Gin-infused liqueur with herbal botanicals, ...","['', 'TCIN: 52297742', 'State of Readiness: Re...",The rich amber hue of Pimm's No. 1 Liqueur com...,,,,
...,...,...,...,...,...,...,...,...,...,...,...
21606,82182099,3.69,5.0,1.0,['One (1) 3.5 oz GHIRARDELLI Milk Chocolate Ba...,"['Origin: Made in the USA or Imported', 'UPC: ...",Experience luscious indulgence with each bite ...,"['Serving Size: 25 g', 'Serving Per Container:...","[['Total Fat', '7g'], ['Saturated Fat', '4g'],...","sugar, whole milk powder, corn syrup, cocoa bu...",Allergens & Warnings:May contain tree nuts.
21607,51118037,1.99,2.9,10.0,"['18 Donuts per bag (about 4.5 servings)', 'Se...","['Origin: Made in the USA or Imported', 'Packa...",Need an easy breakfast for your kids? Grab a b...,"['Serving Size: 60 g', 'Serving Per Container:...","[['Calories From Fat', '150Cal'], ['Total Fat'...","sugar, enriched wheat flour (wheat flour, barl...",Allergens & Warnings:ALLERGY INFORMATION: CONT...
21608,81526744,49.99,5.0,2.0,"[""Celebrate her birthday, Mother's Day, or any...","[""The above item details were provided by the ...",This chocolate lovers gift set is an exception...,,,,
21609,53477337,9.99,,0.0,,"['WARNING: Drinking distilled spirits, beer, c...",,,,,
