In [2]:
import json
import os
import re
import urllib
import urllib.request
from collections import defaultdict

import feedparser
import requests
from bs4 import BeautifulSoup

# Grab Ratings Function
- This function will parse through the individual brand page and retrieve the user-facing ratings given for the three sustainability criteria (Planet, People, Animals) and the cost rating.

In [3]:
#soup.find_all('h3', {'class': 'StyledHeading-sc-1rdh4aw-0 haGwpv'})

def grab_ratings(brand_page_soup):
    """Extract the cost rank and the three criterion ratings from a brand page.

    Parameters
    ----------
    brand_page_soup : parsed brand page
        Any object exposing ``find_all(name, attrs)`` whose results have a
        ``.text`` attribute (e.g. a BeautifulSoup document).

    Returns
    -------
    tuple
        ``(brand_cost_rank, brand_ratings)``.

        * ``brand_cost_rank`` -- number of ``$`` characters found in the
          second 'kkXGYR' span, or ``None`` when that span is absent.
        * ``brand_ratings`` -- mapping with the keys 'plant', 'people' and
          'animals'. Each value is the first digit found in the matching
          'ccIhDL' span divided by the second digit, or ``None`` when the
          span is missing, holds fewer than two digits, or the second digit
          is zero.
    """
    # Raw strings keep the regex escapes from being read as string escapes.
    rate_pat = re.compile(r'\d')
    cost_pat = re.compile(r'[$]')

    brand_ratings = defaultdict(dict)
    brand_metrics = brand_page_soup.find_all('span', {'class': 'StyledText-sc-1sadyjn-0 ccIhDL'})

    # The second span carries the cost; the first one is the aggregated "rating".
    # The cost rank is simply the number of '$' symbols in that span.
    cost_spans = brand_page_soup.find_all('span', {'class': 'StyledText-sc-1sadyjn-0 kkXGYR'})
    if len(cost_spans) > 1:
        brand_cost_rank = len(cost_pat.findall(cost_spans[1].text))
    else:
        brand_cost_rank = None

    # NOTE(review): the first key is spelled 'plant' (presumably "planet");
    # it is kept as-is because it ends up in the saved output -- confirm
    # with downstream consumers before renaming.
    for i, metric in enumerate(['plant', 'people', 'animals']):
        digits = rate_pat.findall(brand_metrics[i].text) if i < len(brand_metrics) else []
        # Two digits are needed (score and maximum) and the maximum must be non-zero.
        if len(digits) >= 2 and int(digits[1]) != 0:
            brand_ratings[metric] = int(digits[0]) / int(digits[1])
        else:
            brand_ratings[metric] = None

    return brand_cost_rank, brand_ratings

# Scraper
- Initiate the request and move down to get a list of all the brand links
- To keep the initial pull as close as possible to our target (t-shirts), we start from the 'tops' category

In [4]:
#MAIN "TOPS"

# Category page listing every brand filed under 'tops'.
tops_site = 'https://directory.goodonyou.eco/categories/tops'
# Request headers; `hdr` is a module-level name so other cells can send the same User-agent.
hdr = {'User-agent': 'Mozilla/5.0'}
req = urllib.request.Request(tops_site, headers=hdr)

# The context manager closes the HTTP response as soon as the page is parsed.
with urllib.request.urlopen(req) as tops_page:
    tops_soup = BeautifulSoup(tops_page, 'html.parser')

# The first <script> tag of the page is read as JSON; the brand list sits
# under props -> pageProps -> category -> brands.
entire_page_meta = json.loads(tops_soup.script.string)
all_brands = entire_page_meta['props']['pageProps']['category']['brands']

# One brand-page URL per brand id (a set, so duplicate ids collapse).
tops_brand_page = {
    'https://directory.goodonyou.eco/brand/' + brand_entry['id']
    for brand_entry in all_brands
}

## Scraping Brand Pages
- Step into the urls grabbed from the front page by creating a new request
- Not all brands listed under 'tops' have actual t-shirts, thus we must check to see that they have the category listed on the page and skip if not
- Record all of the data associated with the brand (url, cost and rating) -- note, other metadata was collected but is not currently used for our recommendation system.

In [None]:
#STEPPING INTO EACH BRAND PAGE
# Result layout:
#   brand id  -> {'link', 'cost', 'rating', plus every field of the brand metadata}
#   brand url -> 'NOT FOUND' when the brand page could not be fetched
t_shirt_brand_dict = {}
for brand_site in tops_brand_page:

    req2 = urllib.request.Request(brand_site, headers=hdr)

    try:
        page2 = urllib.request.urlopen(req2)
    except OSError:
        # urllib's URLError/HTTPError and socket errors are OSError subclasses.
        # Catching only those keeps the best-effort behaviour without also
        # swallowing KeyboardInterrupt or genuine programming errors.
        t_shirt_brand_dict[brand_site] = 'NOT FOUND'
        continue

    # Close the HTTP response once the page has been parsed.
    with page2:
        soup2 = BeautifulSoup(page2, 'html.parser')

    # The first <script> tag of the page is read as JSON; the brand record
    # sits under props -> pageProps -> brand.
    brand = json.loads(soup2.script.string)
    brand_meta = brand['props']['pageProps']['brand']

    # Brands listed under 'tops' do not all sell t-shirts: skip those that
    # do not list the 'T-Shirts' category.
    if not any(cat_i['name'] == 'T-Shirts' for cat_i in brand_meta['categories']):
        continue

    # Parse the ratings once per brand, however many categories matched.
    brand_cost, brand_rate = grab_ratings(soup2)

    brand_record = {'link': brand_site, 'cost': brand_cost, 'rating': brand_rate}
    # Metadata fields are merged in last, so a metadata key named 'link',
    # 'cost' or 'rating' would overwrite the scraped value.
    brand_record.update(brand_meta)
    t_shirt_brand_dict[brand_meta['id']] = brand_record


# Save File

JSON File Structure

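brand id: {link: (http://...), cost: (number of '$' symbols, e.g. 1-4), rating: {plant: (score divided by maximum, e.g. 4/5 = 0.8, or null), people: (same), animals: (same)}, ...every additional brand metadata field, merged in at the same level}

Notes: the Planet rating is stored under the key `plant` (sic). Brand pages that could not be fetched are stored as `brand url: 'NOT FOUND'`.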

In [8]:
# open() creates GOY_brand_data.json when it is missing, but not the 'data'
# directory, so make sure the directory exists first.
os.makedirs('data', exist_ok=True)
with open('data/GOY_brand_data.json', 'w') as f:
    # this places the entire output on one line
    # use json.dump(t_shirt_brand_dict, f, indent=4) to "pretty-print" with four spaces per indent
    json.dump(t_shirt_brand_dict, f)