## Notebook walking through updating the product info text file. This could be automated, but keeping it as a notebook allows for an interactive QA/QC pass over the descriptions to make sure they were scraped correctly.

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [2]:
# Load the existing product catalogue; scraped products are compared against its
# 'Name' column and later appended to it.
data = pd.read_csv('ref_files/product_info.csv')

# The file can carry a leftover, unnamed index column (pandas labels it 'Unnamed: 0')
# from an earlier save. The scraped rows only have Name/Description/Material, so
# keeping that column would fill it with NaN for every new row and write it back out
# on save. Drop any such column up front.
data = data.loc[:, ~data.columns.astype(str).str.startswith('Unnamed')]
data.head()

Unnamed: 0,Name,Description,Material
0,The Waylons in Chestnut,"Built by our buddies at VALLON, The Waylons is...","The Waylons’ frame is made with lightweight, d..."
1,The Sashiko Denim Repair Kit,"Sure, you could hand off your well-worn denim ...",The Sashiko Denim Repair Kit includes an assor...
2,The Camp Candle in Shoreline,The Camp Candle was poured by hand in small ba...,"Made with natural, renewable soy wax. Soy wax ..."
3,The Cotton Hemp Tee in Charcoal Open Road,This exclusive edition of The Cotton Hemp Tee ...,Organic cotton offers all the benefits of the ...
4,The Cotton Hemp Tee in Navy Give to Get,Regenerative agriculture is all about reciproc...,We love organic cotton because it offers all o...


In [3]:
# Collection pages to scrape product data from, all rooted at the store's base URL.
base_url = 'https://www.taylorstitch.com'
_collections_root = base_url + '/collections'

# Defined for reference; not part of url_list below.
other_url = _collections_root + '/2022-summer-sale-archive-bring-back?sorted=best-selling-sales-count'

shirts_url = _collections_root + '/mens-shirts'
bottoms_url = _collections_root + '/mens-bottoms'
knits_url = _collections_root + '/mens-knits'
outerwear_url = _collections_root + '/mens-outerwear'
last_call_url = _collections_root + '/mens-last-call'

# Collections that are actually walked, in scrape order.
url_list = [
    shirts_url,
    bottoms_url,
    knits_url,
    outerwear_url,
    last_call_url,
]

In [None]:
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block indefinitely

# Step 1: collect the product-page URLs linked from every collection page.
product_urls = []  # list of all product pages to get later
seen_urls = set()  # the same product may be linked more than once; fetch it only once

for url in url_list:
    try:
        site = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('could not load', url, '-', err)
        continue

    soup = BeautifulSoup(site.text, 'html.parser')

    products = soup.find_all('ul', {'class': 'product matrix'})
    if not products:
        # no matching <ul> on this page, so there is nothing to index into
        print('no product list found at', url)
        continue

    for link in products[0].find_all('a', href=True):  # a tags hold products here
        product_url = base_url + link['href']
        if product_url not in seen_urls:
            seen_urls.add(product_url)
            product_urls.append(product_url)

# Step 2: look at each product, pull relevant info.
# store all info in lists, which will be converted to pandas df later
product_title = []
product_description = []
product_material = []

for product in product_urls:
    try:
        product_page = requests.get(product, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('could not load', product, '-', err)
        continue
    product_soup = BeautifulSoup(product_page.text, 'html.parser')

    # The title is read from the first <h1>'s data-title attribute; skip pages
    # that have no such heading/attribute instead of aborting the whole run.
    heading = product_soup.find('h1')
    title_info = heading.get('data-title') if heading is not None else None
    if not title_info:
        print('could not find a title at', product)
        continue

    # Skip products already in the catalogue (an existing Name containing this title).
    # regex=False: titles are plain text, and characters such as '(' or '+' must not
    # be interpreted as a pattern. na=False: missing names count as "no match".
    if data['Name'].str.contains(title_info, regex=False, na=False).any():
        continue
    print('adding', title_info)

    description_info = product_soup.find_all('div', {'id': 'collapsible-description'})
    material_info = product_soup.find_all('div', {'id': 'collapsible-material'})

    try:
        description = description_info[0].find('p').text
        material = material_info[0].find('p').text
    except (IndexError, AttributeError):
        # IndexError: the section is missing; AttributeError: it has no <p> inside
        print('could not add info for', title_info)
        continue

    product_title.append(title_info)
    product_description.append(description)
    product_material.append(material)


# One row per newly found product, same column names as the catalogue uses.
all_info = pd.DataFrame(list(zip(product_title,
                                 product_description,
                                 product_material)),
                        columns=['Name', 'Description', 'Material'])

In [None]:
# Keep only the first occurrence of each fully identical (Name, Description, Material) row.
repeated_rows = all_info.duplicated()
all_info = all_info.loc[~repeated_rows]

In [None]:
# Manual QA step: dump every scraped description, separated by blank lines,
# so each one can be read through and checked.
for description_text in all_info['Description']:
    print(description_text, '\n')

In [None]:
# add the new descriptions to the existing file
# ignore_index=True renumbers the rows 0..n-1; otherwise the new rows keep their own
# 0-based labels and the combined frame ends up with duplicate index values.
data = pd.concat([data, all_info], ignore_index=True)

In [None]:
# Sanity check: (row count, column count) of the combined catalogue.
(len(data), len(data.columns))

In [None]:
# and finally save.
# Write back to the same path the catalogue was loaded from at the top of the
# notebook; saving to a different location would leave the loaded file stale, and the
# next run would treat the same products as new again.
# NOTE(review): the original wrote to 'product_info.csv' in the working directory;
# confirm nothing relies on that location.
data.to_csv('ref_files/product_info.csv', index=False, header=True)