# Grainger - Category Chart 

Code developed to build the category chart for the Grainger website. The objective is to collect the URLs and names of every product category and subcategory so that we can build the category hierarchy and then use those URLs to scrape all of the part numbers.

## Import all dependencies and start selenium 

We are using Selenium (driven through Splinter) and BeautifulSoup for the scraping process.

In [None]:
# Print the path of the chromedriver executable found on the PATH
# (IPython shell escape). Splinter's Chrome driver setup guide:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

In [1]:
# Import your newly installed selenium package
from selenium import webdriver
from splinter import Browser
from selenium.webdriver.common.keys import Keys
import time
time.sleep(3)
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup

# Keyword arguments handed to Splinter's Browser: the chromedriver executable to launch.
WebDriver = {'executable_path': 'chromedriver'}
# Open a Chrome window; headless=False is passed so the window is visible.
browser = Browser('chrome', **WebDriver, headless=False)
# Set the browser window to 1750 x 1250.
browser.driver.set_window_size(1750, 1250)
# Entry point for the crawl: the Grainger category landing page.
url ='https://www.grainger.com/category?analytics=nav'
browser.visit(url)

## Initialize searches 

#### All Products

Inspecting the website, we found that the Product category information is in a div inside the class 'col-xs-12 category-list margin--top-0', and it is consistent across all the pages, so we can create a function that gives us access to that div initially 

In [None]:
# Parse the HTML of the page currently loaded in the browser.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# All <ul class="categories__list"> elements; the links are read from the first one.
products = soup.find_all('ul', class_='categories__list')
all_products_urls = [anchor['href'] for anchor in products[0].find_all('a')]
all_products_urls

Safety Categories - all Categories

In [None]:
# Open the Safety category page in the browser.
url2 ='https://www.grainger.com/category/safety'
browser.visit(url2)

# Parse the loaded page so the following cells can query it through `soup`.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Collect the stripped text of every <span class="category-text"> on the page.
spans = soup.find_all('span', class_='category-text')
subcategories_list = [tag.get_text().strip() for tag in spans]

subcategories_list

Gloves and Hand Protection - all subcategories

In [12]:
# Open the Gloves and Hand Protection subcategory page (rebinds url2).
url2 ='https://www.grainger.com/category/safety/gloves-and-hand-protection'
browser.visit(url2)

# Parse the loaded page so the following cells can query it through `soup`.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [13]:
# Every <span class="category-text"> on the parsed page.
spans = soup.find_all('span', class_='category-text')
# Stripped text of each of those spans, in page order.
subcategories_list = [tag.get_text().strip() for tag in spans]
# Helper used to turn a display name into a hyphen-separated URL fragment.
def replace_runs_of_whitespace_with_hyphen(word):
    """Join the whitespace-separated pieces of *word* with single hyphens.

    Leading and trailing whitespace is dropped, and each run of whitespace
    between pieces becomes exactly one '-'. Letter case is left untouched.
    """
    pieces = word.split()
    return '-'.join(pieces)
# Hyphenate every subcategory name.
hyphrases = list(map(replace_runs_of_whitespace_with_hyphen, subcategories_list))
# Append each hyphenated name to the Gloves and Hand Protection base URL.
# NOTE(review): the names keep their original capitalisation, while the
# hand-typed category URLs elsewhere in this notebook are lower-case —
# confirm the site accepts both forms.
gloves_and_hand_protection = [
    f'https://www.grainger.com/category/safety/gloves-and-hand-protection/{slug}'
    for slug in hyphrases
]
gloves_and_hand_protection

['https://www.grainger.com/category/safety/gloves-and-hand-protection/Antistatic-Gloves',
 'https://www.grainger.com/category/safety/gloves-and-hand-protection/Arc-Flash-Gloves',
 'https://www.grainger.com/category/safety/gloves-and-hand-protection/Chemical-Resistant-Gloves',
 'https://www.grainger.com/category/safety/gloves-and-hand-protection/Coated-Gloves',
 'https://www.grainger.com/category/safety/gloves-and-hand-protection/Cold-Condition-Gloves',
 'https://www.grainger.com/category/safety/gloves-and-hand-protection/Cryogenic-Gloves',
 'https://www.grainger.com/category/safety/gloves-and-hand-protection/Cut-Resistant-Gloves',
 'https://www.grainger.com/category/safety/gloves-and-hand-protection/Disposable-Gloves',
 'https://www.grainger.com/category/safety/gloves-and-hand-protection/Electrical-Glove-Accessories',
 'https://www.grainger.com/category/safety/gloves-and-hand-protection/Electrical-Gloves',
 'https://www.grainger.com/category/safety/gloves-and-hand-protection/Extricatio

Antistatic Gloves - end urls

In [16]:
# Open the Antistatic Gloves product listing page.
url3 ='https://www.grainger.com/category/safety/gloves-and-hand-protection/antistatic-gloves'
browser.visit(url3)

# Parse the loaded page so the following cells can query it through `soup`.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [180]:
# Every <tr class="search-table-view__web-parent-table-row"> on the parsed page.
bs = soup.find_all('tr', class_='search-table-view__web-parent-table-row')
# Brand: first child of the fifth <td> of each row, stripped.
brand = [tr.find_all('td')[4].contents[0].strip() for tr in bs]
# SKU: stripped text of the first <button> of each row.
sku = [tr.find_all('button')[0].get_text().strip() for tr in bs]
# Category: first child of the first <td> of each row, stripped.
cat = [tr.find_all('td')[0].contents[0].strip() for tr in bs]

# Helper used to turn a display name into a hyphen-separated URL fragment.
def replace_runs_of_whitespace_with_hyphen(word):
    """Return *word* with its whitespace-separated pieces joined by '-'.

    Surrounding whitespace is discarded and any run of whitespace between
    pieces collapses to a single hyphen. Letter case is preserved.
    """
    tokens = word.split()
    return '-'.join(tokens)
# Hyphenate every value extracted into `cat`.
hyphrases = list(map(replace_runs_of_whitespace_with_hyphen, cat))

# One product URL per SKU.
partial_link = [f'https://www.grainger.com/product/{code}' for code in sku]


# Product URLs, de-duplicated in a set. The set depends only on `sku`, so it is
# built once instead of once per brand entry; building it outside a loop also
# means `link3` is defined even when `brand` is empty.
link2 = {f'https://www.grainger.com/product/{i}' for i in sku}
# Result keeps the one-element-list-of-a-set shape this cell displays.
link3 = [link2]
link3

[{'https://www.grainger.com/product/19L033',
  'https://www.grainger.com/product/19L034',
  'https://www.grainger.com/product/19L035',
  'https://www.grainger.com/product/19L036',
  'https://www.grainger.com/product/19L037',
  'https://www.grainger.com/product/19L038',
  'https://www.grainger.com/product/19L039',
  'https://www.grainger.com/product/19L040',
  'https://www.grainger.com/product/19L041',
  'https://www.grainger.com/product/19L042',
  'https://www.grainger.com/product/19L043',
  'https://www.grainger.com/product/1DPF5',
  'https://www.grainger.com/product/1DPF6',
  'https://www.grainger.com/product/1DPF7',
  'https://www.grainger.com/product/1DPF8',
  'https://www.grainger.com/product/1DPF9',
  'https://www.grainger.com/product/1DPG1',
  'https://www.grainger.com/product/3JFP3',
  'https://www.grainger.com/product/3JFP4',
  'https://www.grainger.com/product/3NGY2',
  'https://www.grainger.com/product/8CAW1',
  'https://www.grainger.com/product/9WRN3'}]

In [None]:
# Length of every item stored under every key of fourier_dict, flattened.
# NOTE(review): fourier_dict is not defined in the cells visible here.
max_len = []
for key in fourier_dict:
    max_len.extend(len(entry) for entry in fourier_dict[key])

In [None]:
# Length of every item stored under every key of fourier_dict, flattened.
max_len = [
    len(entry)
    for key in fourier_dict
    for entry in fourier_dict[key]
]

In [None]:
# brand, hyphrases (built from cat) and sku each hold one entry per table row,
# so they are paired row by row. Nesting three loops would instead emit every
# brand x category x SKU combination, which is not a real product URL set.
partial_link = [
    f'https://www.grainger.com/product/{b}-{c}-{i}'
    for b, c, i in zip(brand, hyphrases, sku)
]

['https://www.grainger.com/product/HONEYWELL']

Extract the detailed sku information

In [None]:
# Open a single product detail page.
urlsku ='https://www.grainger.com/product/HONEYWELL-antistatic-gloves-3JFP3'
browser.visit(urlsku)

# Parse the loaded page so the following cells can query it through `soup`.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Locate the elements that carry the product details on the parsed page.
articles = soup.find('span', class_='gcprice-value')
skus = soup.find('div', class_='price-replace-wrapper')
lis = soup.find_all('div', class_='head-container clearfix')

# NOTE: `sku` and `brand` are rebound to single strings here; cells that
# expect the per-row lists of the same names must be re-run afterwards.
price = articles.get_text().strip()
sku = skus['data-sku']
definition = soup.find('h1').get_text().strip()

# The remaining fields are read by position from the spans and links of the
# first 'head-container clearfix' div.
head_block = lis[0]
head_spans = head_block.find_all('span')
head_links = head_block.find_all('a')
item = head_spans[0].text
mfr_model = head_spans[2].text
brand = head_links[0].text
cat_group = head_links[1].text
unspsc = head_spans[5].text
item_link = head_links[1]['href']
item_link

In [None]:
# Accumulators for the per-product fields (only price_list is filled below).
price_list=[]
sku_list=[]
definition_list=[]
item_list=[]
mfr_model_list=[]
brand_list=[]
cat_group_list=[]
unspsc_list=[]
item_link_list=[]

# Visit every product URL and record the price shown on its page.
for x in partial_link:
    browser.visit(x)

    # Parse the page that was just loaded.
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # find() returns the first matching tag or None. find_all() returns a
    # ResultSet, which has no .text attribute and would raise AttributeError.
    articles = soup.find('span', class_='gcprice-value')

    # Append None when the page has no price tag so that price_list keeps
    # one entry per URL in partial_link.
    price = articles.text.strip() if articles is not None else None
    price_list.append(price)

price_list

Testing blocks

In [138]:
# URL prefixes (one per table row) and the full product URLs built from them.
# Both lists are created before the loop: resetting sku_links inside the loop
# kept only the last brand's URLs and left the name undefined for empty input.
partial_link = []
sku_links = []
# brand and sku hold one entry per table row, so each SKU is paired with the
# brand from its own row rather than with whichever brand came last.
for brand_name, sku_code in zip(brand, sku):
    # Prefix shared by this brand's antistatic-gloves product pages.
    rec_link = f'https://www.grainger.com/product/{brand_name}-antistatic-gloves-'
    partial_link.append(rec_link)
    # Full product URL: prefix followed by the SKU.
    sku_li = f'{rec_link}{sku_code}'
    sku_links.append(sku_li)
# Display the URLs in the list
sku_links

['https://www.grainger.com/product/HONEYWELL-antistatic-gloves-3JFP3',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-9WRN3',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-3JFP4',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-8CAW1',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-3NGY2',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-19L033',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-19L034',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-19L035',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-19L036',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-19L037',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-19L038',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-19L039',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-19L040',
 'https://www.grainger.com/product/HONEYWELL-antistatic-gloves-19L041

In [None]:
# Start from an empty result list; the next cell fills it.
subcategories_list = []
# Every <tr class="search-table-view__web-parent-table-row"> on the parsed page
# (kept under the name `spans`, which the next cell iterates).
spans = soup.find_all('tr', class_='search-table-view__web-parent-table-row')
spans

In [None]:
# Append the data-sku attribute of every element in `spans`, in order.
subcategories_list.extend(row.attrs['data-sku'] for row in spans)
# Display results
subcategories_list

In [None]:
# Build the URL prefix for an antistatic-gloves product page and print it.
# NOTE(review): `first_column` is not defined in any cell visible here —
# confirm where it is set before running this cell.
link=f'https://www.grainger.com/product/{first_column}-antistatic-gloves-'
print(link)