In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
import glob

from bs4 import BeautifulSoup

In [6]:
# group4 = pd.read_csv('/Users/chahaksethi/Desktop/Target/data/group10/header.csv',
#                       sep='\t', low_memory=False)
ip_file_dir = "../../../Data/scraped/"
file_list = glob.glob("../../../Data/scraped/*")
tcin_completed = [file.split('/')[-1].split('_')[0] for file in file_list]
total_tcin = list(set(pd.Series(tcin_completed).astype('int')))

In [7]:
len(total_tcin)

22860

In [8]:
def get_highlights(soup):
    highlights = []
    try:
        highlight_tag = soup.find('h3', text='Highlights')
        highlights_sib = highlight_tag.find_next()
        highlights_spans = highlights_sib.find_all('span')
        for s in highlights_spans:
            highlights.append(s.text)
    except:
        highlights = np.nan
    return highlights

In [9]:
def get_specifications(soup):
    specifications = []
    try:
        specifications_tag = soup.find('h3', text='Specifications')
        for d in specifications_tag.parent.find_all('div'):
            specifications.append(d.text)
        specifications = [s for s in list(set(specifications))
                          if not s.startswith('Content on this site is for reference purposes only')
                          if not s.startswith('Grocery Disclaimer')]
    except:
        specifications = np.nan
    return specifications

In [10]:
def get_description(soup):
    description_text = ''
    try:
        description_tag = soup.find('h3', text='Description')
        description_sib = description_tag.find_next()
        description_text = description_sib.text
    except:
        pass
    return description_text

In [11]:
def get_serving_info(soup):
    serving_info = []
    try:
        nutrition = soup.find('div', {'data-test':
                                      'productDetailsTabs-nutritionFactsTab'}).div.div.div
        for p in nutrition.find_all('p'):
            serving_info.append(p.text)
    except:
        serving_info = np.nan
    return serving_info

In [12]:
def get_nutrition_info(soup):
    nutrition_info = []
    try:
        nutrition = soup.find('div', {'data-test':
                                      'productDetailsTabs-nutritionFactsTab'}).div.div.div
        nutrition_tags = nutrition.find_all('div',
                                            class_=lambda x: x and x.startswith("h-margin-t-tight"))
        for nutrition_tag in nutrition_tags:
            text_split = nutrition_tag.span.text.split()
            nutrition_info.append(
                ([" ".join(text_split[0:-1]), (text_split[-1])]))
    except:
        nutrition_info = np.nan
    return nutrition_info

In [13]:
def get_ingredients(soup):
    ingredients_text = ''
    try:
        ingredients_tag = soup.find('h4',
                                    text=lambda x: x.startswith('Ingredients'))
        ingredients_sib = ingredients_tag.find_next()
        ingredients_text = ingredients_sib.text
    except:
        pass
    return ingredients_text

In [14]:
def get_allergens(soup):
    allergens_text = ''
    try:
        allergens = soup.find('h4',
                              text=lambda x: x.startswith('Allergens & Warnings'))
        allergens_text = allergens.parent.text
    except:
        pass
    return allergens_text

In [15]:
def get_price(soup):
    product_price = np.nan
    try:
        product_price_tag = soup.find('div', {'data-test': 'product-price'})
        product_price = float(product_price_tag.get_text()[1:])
    except:
        pass
    return product_price


def get_rating(soup):
    rating = np.nan
    try:
        rating_tag = soup.find('span', {'data-test': 'ratings'})
        rating = float(rating_tag.get_text().split()[0])
    except:
        pass
    return rating


def get_n_reviews(soup):
    n_reviews = np.nan
    try:
        n_reviews_tag = soup.find('span', {'data-test': 'ratings'})
        n_reviews = int(n_reviews_tag.get_text().split()[-2])
    except:
        pass
    return n_reviews

In [16]:
op_file_path = os.path.join(ip_file_dir, 'products.csv')
products = []
for i, tcin in enumerate(tqdm(total_tcin)):
    try:
        path = os.path.join(ip_file_dir, str(tcin) + '_details.html')
        with open(path) as f:
            html = f.read()
        details_soup = BeautifulSoup(html)

        product = {}
        product['tcin'] = tcin
        product['price'] = get_price(details_soup)
        product['rating'] = get_rating(details_soup)
        product['n_reviews'] = get_n_reviews(details_soup)
        product['highlights'] = get_highlights(details_soup)
        product['specifications'] = get_specifications(details_soup)
        product['description'] = get_description(details_soup)

        path = os.path.join(ip_file_dir, str(tcin) + '_label_info.html')
        if os.path.isfile(path):
            with open(path) as f:
                html = f.read()
        label_info_soup = BeautifulSoup(html)
        product['serving_info'] = get_serving_info(label_info_soup)
        product['nutrition_info'] = get_nutrition_info(label_info_soup)
        product['ingredients'] = get_ingredients(label_info_soup)
        product['allergens'] = get_allergens(label_info_soup)

        products.append(product)

        if i % 50 == 0:
            products_df = pd.DataFrame(products)
            products_df.to_csv(op_file_path, index=False)
    except:
        print(tcin)
        pass

products_df = pd.DataFrame(products)
products_df.to_csv(op_file_path, index=False)

  0%|          | 0/22860 [00:00<?, ?it/s]

In [17]:
products = pd.read_csv(op_file_path)

In [18]:
products

Unnamed: 0,tcin,price,rating,n_reviews,highlights,specifications,description,serving_info,nutrition_info,ingredients,allergens
0,82444295,303.10,,0.0,['MAXIMIZES SPACE: Sliding pull out freezer ca...,['Dimensions (Overall): 5 inches (H) x 21.6 in...,"Traveling can be stressful, but with the MORry...",,,,
1,84672533,14.99,,0.0,,"['UPC: 192173178836', ""Warranty: No Applicable...",This Freestanding Countertop Paper Towel Holde...,,,,
2,84672534,14.99,,0.0,,"['UPC: 192173178836', ""Warranty: No Applicable...",This Freestanding Countertop Paper Towel Holde...,,,,
3,84803608,10.49,4.9,16.0,"['Nonstick silicone coating', 'Dishwasher Safe...","['TCIN: 84803608', 'UPC: 093674187214', ""The a...",Our premium stainless steel potato masher come...,,,,
4,76021792,49.95,4.9,48.0,"['Glass porcelain finish', 'Oven safe up to 57...",['Dimensions (Overall): 2.6 inches (H) x 8.5 i...,The elegance and functionality of Staub cerami...,,,,
...,...,...,...,...,...,...,...,...,...,...,...
22855,76021625,69.95,4.9,47.0,['Curved bolster encourages cutting with impro...,"['Weight: .15 ounces', 'Dimensions (Overall): ...",ZWILLING Pro is the most user-friendly knife a...,,,,
22856,14024623,29.99,3.4,299.0,['4-slot toaster lets you toast more at the sa...,"['Includes: Instruction Manual', 'ENERGYGUIDE ...",The perfect accompaniment to a hearty breakfas...,,,,
22857,14024624,89.99,3.7,105.0,"['7 toast shade settings', 'Extra-wide bread s...","['Includes: Instruction Manual', 'Care & Clean...",The Oster 4 - Slice Stainless Steel Toaster's ...,,,,
22858,51118048,19.79,4.7,44.0,['Constructed of BPA-free multi-layer material...,"['Number of Pieces: 35', 'Capacity (Volume): 1...",With the FoodSaver Sous Vide Vacuum Sealing Ba...,,,,
