# Grainger - Category Chart 

Code developed to build the category chart for the Fastenal website. The objective is to collect all of the urls for each one of the product categories and subcategories and their names so we can build the hierarchy and also use the urls for the data scraping of all of the part numbers 

## Import all dependencies and start selenium 

We are using selenium and BeautifulSoup for the scraping process

In [1]:
#Finding the location of chromedriver.exe
#https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

/usr/local/bin/chromedriver


In [2]:
# Import your newly installed selenium package
from selenium import webdriver
from splinter import Browser
from selenium.webdriver.common.keys import Keys
import time
time.sleep(3)
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup
import pandas as pd

# Now create an 'instance' of your driver
WebDriver = {'executable_path': 'chromedriver'}
# A new Chrome (or other browser) window should open up
browser = Browser('chrome', **WebDriver, headless=False)
# Defining width and height of the browser
browser.driver.set_window_size(1750, 1250)
#For Mac users - Opening the targer url
#driver.get("https://www.fastenal.com/product/abrasives/coated-and-non-woven-abrasives/fiber-and-sanding-discs/609478?categoryId=609478&level=3&isExpanded=true&productFamilyId=26373&view=2")
#For Windows users - Opening the target url
url ='https://www.grainger.com/category?analytics=nav'
browser.visit(url)

## Initialize searches 

#### All Products

Inspecting the website, we found that the Product category information is in a div inside the class 'col-xs-12 category-list margin--top-0', and it is consistent across all the pages, so we can create a function that gives us access to that div initially 

In [3]:
#activating soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

products = soup.findAll('ul',class_='categories__list')
all_products_urls=[i['href'] for i in (products[0]('a'))]
all_products_urls

['/category/abrasives',
 '/category/adhesives-sealants-and-tape',
 '/category/cleaning-and-janitorial',
 '/category/electrical',
 '/category/electronics-appliances-and-batteries',
 '/category/fasteners',
 '/category/fleet-and-vehicle-maintenance',
 '/category/furniture-hospitality-and-food-service',
 '/category/hvac-and-refrigeration',
 '/category/hardware',
 '/category/hydraulics',
 '/category/lab-supplies',
 '/category/lighting',
 '/category/lubrication',
 '/category/machining',
 '/category/material-handling',
 '/category/motors',
 '/category/office-supplies',
 '/category/outdoor-equipment',
 '/category/paint-equipment-and-supplies',
 '/category/plumbing',
 '/category/pneumatics',
 '/category/power-transmission',
 '/category/pumps',
 '/category/raw-materials',
 '/category/reference-and-learning-supplies',
 '/category/safety',
 '/category/security',
 '/category/test-instruments',
 '/category/tools',
 '/category/welding']

Safety Categories - all Categories

In [None]:
#For Windows users - Opening the target url
url2 ='https://www.grainger.com/category/safety'
browser.visit(url2)

#activating soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [None]:
spans = soup.findAll('span', class_='category-text')
subcategories_list=[span.text.strip() for span in spans]
    
subcategories_list

Gloves and Hand Protection - all subcategories

In [None]:
#For Windows users - Opening the target url
url2 ='https://www.grainger.com/category/safety/gloves-and-hand-protection'
browser.visit(url2)

#activating soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [None]:
#Extract the categories from the span tag
spans = soup.findAll('span', class_='category-text')
#List comprehension to extract the text from the span class
subcategories_list=[span.text.strip() for span in spans] 
#Function to add hypen between words and use it to build the next url
def replace_runs_of_whitespace_with_hyphen(word):
    return '-'.join(word.split())
#List comprehension to insert the hyphen between words
hyphrases = [replace_runs_of_whitespace_with_hyphen(w) for w in subcategories_list]
#List comprehension to build the next url using the new hyphened words
gloves_and_hand_protection=[f'https://www.grainger.com/category/safety/gloves-and-hand-protection/{link}' for link in hyphrases]
gloves_and_hand_protection

Antistatic Gloves - end urls

In [8]:
#For Windows users - Opening the target url
url3 ='https://www.grainger.com/category/safety/gloves-and-hand-protection/antistatic-gloves'
browser.visit(url3)

#activating soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [9]:
#Extract all values from the tr tag
bs=soup.findAll('tr',class_='search-table-view__web-parent-table-row')
#Extract the brand values from the tag
brand=[row.findAll('td')[4].contents[0].strip() for row in bs]
#Extract the sku values from the tag
sku=[row.findAll('button')[0].text.strip() for row in bs]
#Extract all category values from tag
cat=[row.findAll('td')[0].contents[0].strip() for row in bs]

#Function to add hypen between words and use it to build the next url
def replace_runs_of_whitespace_with_hyphen(word):
    return '-'.join(word.split())
#List comprehension to insert the hyphen between words
hyphrases = [replace_runs_of_whitespace_with_hyphen(w) for w in cat]

#List comprehension to create the end url
antistatic_gloves=[f'https://www.grainger.com/product/{i}' for i in sku]
antistatic_gloves

['https://www.grainger.com/product/3JFP3',
 'https://www.grainger.com/product/9WRN3',
 'https://www.grainger.com/product/3JFP4',
 'https://www.grainger.com/product/8CAW1',
 'https://www.grainger.com/product/3NGY2',
 'https://www.grainger.com/product/19L033',
 'https://www.grainger.com/product/19L034',
 'https://www.grainger.com/product/19L035',
 'https://www.grainger.com/product/19L036',
 'https://www.grainger.com/product/19L037',
 'https://www.grainger.com/product/19L038',
 'https://www.grainger.com/product/19L039',
 'https://www.grainger.com/product/19L040',
 'https://www.grainger.com/product/19L041',
 'https://www.grainger.com/product/19L042',
 'https://www.grainger.com/product/19L043',
 'https://www.grainger.com/product/1DPF5',
 'https://www.grainger.com/product/1DPF6',
 'https://www.grainger.com/product/1DPF7',
 'https://www.grainger.com/product/1DPF8',
 'https://www.grainger.com/product/1DPF9',
 'https://www.grainger.com/product/1DPG1']

Extract the detailed sku information

In [None]:
#For Windows users - Opening the target url
url3 ='https://www.grainger.com/product/ANSELL-Antistatic-Gloves-3JFP3'
browser.visit(url3)

#activating soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [None]:
#Extract technical specs
spe=soup.findAll('span',class_='specValue')
itemspec=spe[0].text
material=spe[1].text
size=spe[2].text
lenght=spe[3].text
coating=spe[4].text
coverage=spe[5].text
color=spe[6].text
cuff=spe[7].text
standard=spe[8].text
application=spe[9].text
application

In [21]:
# Retrieve all elements that contain item information from tags

#Opening the target url and activating soup
item_spec_list=[]
material_list=[]
size_list=[]
lenght_list=[]
coating_list=[]
coverage_list=[]
color_list=[]
cuff_list=[]
standard_list=[]
application_list=[]

for x in antistatic_gloves:
    browser.visit(x)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    
       
    try:
        spe=soup.findAll('span',class_='specValue')
        itemspec=spe[0].text
        material=spe[1].text
        size=spe[2].text
        lenght=spe[3].text
        coating=spe[4].text
        coverage=spe[5].text
        color=spe[6].text
        cuff=spe[7].text
        standard=spe[8].text
        application=spe[9].text

    except IndexError:
        print("Table has less values available")

    finally:
        print ("N/A")
    #else:
        lenght_list.append(lenght)
lenght_list

Table has less values available
N/A
Table has less values available
N/A
Table has less values available
N/A
Table has less values available
N/A
Table has less values available
N/A
Table has less values available
N/A
Table has less values available
N/A
Table has less values available
N/A
Table has less values available
N/A
Table has less values available
N/A
N/A
N/A
N/A
N/A
N/A
N/A
N/A
N/A
N/A
N/A
N/A
N/A


['Nitrile',
 'Nitrile',
 'Nitrile',
 'Nitrile',
 'Nitrile',
 '7-11/16"',
 '8-1/16"',
 '8-1/2"',
 '8-7/8"',
 '9-1/4"',
 '7-11/16"',
 '8-1/16"',
 '8-1/2"',
 '8-7/8"',
 '9-1/4"',
 '9-5/8"',
 '20-1/2"',
 '20-1/2"',
 '20-1/2"',
 '20-1/2"',
 '20-1/2"',
 '20-1/2"']

In [None]:
price_list=[]
web_price_list=[]
uom_list=[]
sku_list=[]
definition_list=[]
item_list=[]
mfr_model_list=[]
brand_list=[]
country_of_origin=[]
cat_group_list=[]
unspsc_list=[]
item_link_list=[]
cat_link_list=[]
product_text=[]

# Iterate through all pages
for x in antistatic_gloves:
    #Opening the target url and activating soup
    browser.visit(x)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    
    # Retrieve all elements that contain item information from tags
    articles = soup.find_all('span', class_='gcprice-value')
    skus=soup.find('div',class_='price-replace-wrapper')
    lis=soup.findAll('div',class_='head-container clearfix')
    uoms=soup.find('span',class_='gcprice-unit')
    ases=soup.findAll('a',class_='bread-link')
    web=soup.findAll('span',class_='gcprice-label')
    coun=soup.find('div',class_='countryOfOrigin')
    prod=soup.find('div',class_='copyTextSection textSection')
    
    #Extract the information of each item
    price=articles[0].text.strip()
    p= web[0].text.split()[0:2]
    uom=uoms.text
    sku=skus.attrs['data-sku']
    definition=soup.find('h1').text.strip()
    item=lis[0]('span')[0].text
    mfr_model=lis[0]('span')[2].text
    marca=lis[0]('a')[0].text
    country=coun.text.split()[3]
    cat_group=lis[0]('a')[1].text
    unspsc=lis[0]('span')[5].text
    item_link=lis[0]('a')[1]['href']
    lis=ases[2].text
    product=prod.text.strip()
    
try
    spe=soup.findAll('span',class_='specValue')
    itemspec=spe[0].text
    material=spe[1].text
    size=spe[2].text
    lenght=spe[3].text
    coating=spe[4].text
    coverage=spe[5].text
    color=spe[6].text
    cuff=spe[7].text
    standard=spe[8].text
    application=spe[9].text
    application
    
except IndexError:
    print("Table has less values available")
    
else:
    #Append data extracted to the empty lists
    price_list.append(price)
    web_price_list.append(' '.join(p))
    uom_list.append(uom)
    sku_list.append(sku)
    definition_list.append(definition)
    item_list.append(item)
    mfr_model_list.append(mfr_model)
    brand_list.append(marca)
    country_of_origin.append(country)
    cat_group_list.append(cat_group)
    unspsc_list.append(unspsc)
    item_link_list.append(f'https://www.grainger.com/{item_link}')
    cat_link_list.append(lis)
    product_text.append(product)
 

In [None]:
#Create dataframe for antistatic gloves
df_antistatic_gloves = pd.DataFrame(
    {'SKU': sku_list,
     'Definition': definition_list,
     'Product Description':product_text,
     'Manufacturer Model':mfr_model_list,
     'Brand': brand_list,
     'Category':cat_link_list,
     'Category Group':cat_group_list,
     'UNSPSC':unspsc_list,
     'Price':price_list,
     'Price Type':web_price_list,
     'Country of Origin':country_of_origin,
     'UOM':uom_list,
     'Link':item_link_list
    })
df_antistatic_gloves.to_csv('antistatic_gloves.csv', encoding='utf-8')
df_antistatic_gloves

In [None]:
url = "http://ergast.com/api/f1/"
year = url.join([str(i) + "/qualifying?limit=10000" + "\n" for i in range(1999, 2016)])
print(year)

url = 'http://ergast.com/api/f1/{0}/qualifying?limit=10000'

print('\n'.join(url.format(year) for year in range(2000, 2016)))


Testing blocks

In [None]:
#For Windows users - Opening the target url
urlsku ='https://www.grainger.com/product/HONEYWELL-antistatic-gloves-3JFP3'
browser.visit(urlsku)

#activating soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [None]:
articles = soup.find('span', class_='gcprice-value')
skus=soup.find('div',class_='price-replace-wrapper')
lis=soup.findAll('div',class_='head-container clearfix')
uoms=soup.find('span',class_='gcprice-unit')

price=articles.text.strip()
uom=uoms.text
sku=skus.attrs['data-sku']
definition=soup.find('h1').text.strip()
item=lis[0]('span')[0].text
mfr_model=lis[0]('span')[2].text
brand=lis[0]('a')[0].text
cat_group=lis[0]('a')[1].text
unspsc=lis[0]('span')[5].text
item_link=lis[0]('a')[1]['href']
mfr_model

In [None]:
#Create an empty list that will hold the partial urls
partial_link=[]
#For loop to iterate all the urls and add brand and category
for link in brand:
        #Store the partial link to be created into a variable
        rec_link=f'https://www.grainger.com/product/{link}-antistatic-gloves-'
        #Appending the partial link created to the empty list
        partial_link.append(rec_link)
        #Create an empty list that will hold the full end url
        sku_links=[]
        #For loop to iterate each partial link and add the sku number
        for i in sku:  
            #Store the full link to be created into a variable and adding sku number
            sku_li=f'{rec_link}{i}'
            #Appending the full url to the empty list
            sku_links.append(sku_li)
#Display urls in the list
sku_links

In [None]:
#Extracting all categories
#Set up an empty list
subcategories_list=[]
#Find all the text contained in span tags
spans = soup.findAll('tr', class_='search-table-view__web-parent-table-row')
spans
#spans2=spans[0].attrs['data-sku']
#spans2

In [None]:
#For loop to iterate through all the list
for span in spans:
    #Get each text from each tag, and strip blank spaces
    #subcategories=spans.attrs['data-sku']
    #Append each list to the general list
    subcategories_list.append(span.attrs['data-sku'])
#Display results
subcategories_list

In [None]:
link=f'https://www.grainger.com/product/{first_column}-antistatic-gloves-'
print(link)

In [None]:
browser.visit('https://www.fastenal.com/product/abrasives/coated-and-non-woven-abrasives/fiber-and-sanding-discs/609478;jsessionid=kBb4JktuaBd8+ngJiIno0+qd.d36f637b-91a9-3b2b-9fa6-50af87272845?categoryId=609478&level=3&isExpanded=true&productFamilyId=26373&view=2')
browser.click_link_by_partial_text('99329957')
sleep(5)
html = browser.html
soup= BeautifulSoup(html, 'html.parser')
# print(html)
product_info = soup.find(id="productTable")
print(product_info)