# What's in this notebook?
- Code for scraping Trader Joes site for all their addresses
- Loading TJ's addresses into a nice readable dataframe
- Code for gather census info on each address' area
- Loading census info into a nice readable data frame

## Scraping TJ's Site

In [23]:
import requests 
from bs4 import BeautifulSoup
import time
url = "https://locations.traderjoes.com"

home = requests.get(url)

In [18]:
# home page gets you to all the links to the locations by state
state_locs = [] # hold all the urls for tj locations by state
soup = BeautifulSoup(home.content, 'html5lib') 
for div in soup.findAll('div', attrs = {'class':'itemlist'}):
    state_locs.append(div.a['href'])

In [24]:
# now we need to go by city within each state to get the tjs locations
locations = []
for state_url in state_locs:
    state = requests.get(state_url)
    soup = BeautifulSoup(state.content, 'html5lib') 
    for div in soup.findAll('div', attrs = {'class': 'itemlist'}):
        locations.append(div.a['href'])
    time.sleep(1)

locations

['https://locations.traderjoes.com/al/birmingham/',
 'https://locations.traderjoes.com/az/gilbert/',
 'https://locations.traderjoes.com/az/glendale/',
 'https://locations.traderjoes.com/az/mesa/',
 'https://locations.traderjoes.com/az/oro-valley/',
 'https://locations.traderjoes.com/az/phoenix/',
 'https://locations.traderjoes.com/az/prescott/',
 'https://locations.traderjoes.com/az/scottsdale/',
 'https://locations.traderjoes.com/az/surprise/',
 'https://locations.traderjoes.com/az/tempe/',
 'https://locations.traderjoes.com/az/tucson/',
 'https://locations.traderjoes.com/ca/agoura-hills/',
 'https://locations.traderjoes.com/ca/alameda/',
 'https://locations.traderjoes.com/ca/aliso-viejo/',
 'https://locations.traderjoes.com/ca/arroyo-grande/',
 'https://locations.traderjoes.com/ca/bakersfield/',
 'https://locations.traderjoes.com/ca/berkeley/',
 'https://locations.traderjoes.com/ca/brea/',
 'https://locations.traderjoes.com/ca/brentwood/',
 'https://locations.traderjoes.com/ca/burban

In [56]:
# now get the addresses for each store in each location in the city
addresses = []
for location in locations:
    loc = requests.get(location)
    soup = BeautifulSoup(loc.content, 'html5lib')
    for loc in soup.findAll('div', attrs = {'class': 'address-left'}):
        address = []
        for x in loc.findAll('span')[1:5]:
            address.append(x.text)
        addresses.append(address)
    time.sleep(.5)

addresses

[['205 Summit Blvd, Suite 100', 'Birmingham', 'AL', '35243'],
 ['1779 E. Williams Field Rd.', 'Gilbert', 'AZ', '85295'],
 ['7720 West Bell Rd', 'Glendale', 'AZ', '85308'],
 ['2050 E Baseline Rd', 'Mesa', 'AZ', '85204'],
 ['7912 N Oracle', 'Oro Valley', 'AZ', '85704'],
 ['4025 E Chandler Blvd', 'Phoenix', 'AZ', '85048'],
 ['4726 East Shea Blvd', 'Phoenix', 'AZ', '85028'],
 ['4821 N 20th St', 'Phoenix', 'AZ', '85016'],
 ['252 N Lee Blvd', 'Prescott', 'AZ', '86303'],
 ['7555 E Frank Lloyd Wright', 'Scottsdale', 'AZ', '85260'],
 ['6202 N Scottsdale Rd', 'Scottsdale', 'AZ', '85253'],
 ['14095 W Grand Ave', 'Surprise', 'AZ', '85374'],
 ['6460 S McClintock Dr', 'Tempe', 'AZ', '85283'],
 ['1101 N Wilmot Rd', 'Tucson', 'AZ', '85712'],
 ['4209 N Campbell Ave', 'Tucson', 'AZ', '85719'],
 ['4766 E Grant Rd', 'Tucson', 'AZ', '85712'],
 ['28941 Canwood St', 'Agoura Hills', 'CA', '91301'],
 ['2217 South Shore Center', 'Alameda', 'CA', '94501'],
 ['26541 Aliso Creek Rd', 'Aliso Viejo', 'CA', '92656'],

## Loading TJ's Info Into a Dataframe

In [60]:
# we should probably turn this into something friendly -- we'll make it a dataframe
import pandas as pd

df = pd.DataFrame(data=addresses, columns=['street', 'city', 'state', 'zip'])

In [61]:
# lets pickle our work so we don't have to do it again 
import pickle
pickle.dump(df, open( "tj-addresses.pickle", "wb" ) )

In [70]:
df

Unnamed: 0,street,city,state,zip
0,"205 Summit Blvd, Suite 100",Birmingham,AL,35243
1,1779 E. Williams Field Rd.,Gilbert,AZ,85295
2,7720 West Bell Rd,Glendale,AZ,85308
3,2050 E Baseline Rd,Mesa,AZ,85204
4,7912 N Oracle,Oro Valley,AZ,85704
5,4025 E Chandler Blvd,Phoenix,AZ,85048
6,4726 East Shea Blvd,Phoenix,AZ,85028
7,4821 N 20th St,Phoenix,AZ,85016
8,252 N Lee Blvd,Prescott,AZ,86303
9,7555 E Frank Lloyd Wright,Scottsdale,AZ,85260


In [76]:
len(df['zip'].unique())

481

## Gathering Census Info

In [78]:
pip install selenium 

Collecting selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K    100% |████████████████████████████████| 911kB 6.5MB/s ta 0:00:011
Installing collected packages: selenium
Successfully installed selenium-3.141.0
Note: you may need to restart the kernel to use updated packages.


In [87]:
from selenium import webdriver

EXE_PATH = r'/Users/hannah/Downloads/chromedriver'
driver = webdriver.Chrome(executable_path=EXE_PATH)
driver.get('https://factfinder.census.gov/faces/nav/jsf/pages/index.xhtml')

search = driver.find_elements_by_id('cfsearchtextboxmain')[0]  

# find_elements will give us the list of all elements with id as subjectInput 
search.send_keys(35243) # zipcode goes here                   
time.sleep(2)
# hit enter twice to advance to the results
search.send_keys(Keys.ENTER)
search.send_keys(Keys.ENTER)

# find the population census info
for element in driver.find_elements_by_tag_name('a'):
    if element.text == 'General Population and Housing Characteristics (Population, Age, Sex, Race, Households and Housing, ...)':
        element.click()
        
driver.find_element_by_id('data')

In [172]:
import requests
from   bs4 import BeautifulSoup

#how to get the population data, as per https://kaijento.github.io/2017/05/14/web-scraping-factfinder.census.gov/

zipcodes = ['11375']

base   = 'https://factfinder.census.gov/'
report = base + 'bkmk/table/1.0/en/DEC/10_DP/DPDP1/8600000US'
render = base + 'tablerestful/tableServices/renderProductData'

with requests.session() as s:
    s.headers['user-agent'] = 'Chrome/76'

    for zipcode in zipcodes:
        s.get(report + zipcode)
        r = s.get(render)
        print(r)
        html = r.json()['ProductData']['productDataTable']
        soup = BeautifulSoup(html, 'html5lib')

<Response [200]>


In [174]:
r.json()

{'ProductData': {'displayID': 'DP-1',
  'displayLabel': 'Profile of General Population and Housing Characteristics: 2010',
  'productDataset': '2010 Demographic Profile Data',
  'eusbreadcrumb': '<div id="pageinstr">\n<span id="pagetitle">Advanced Search</span> - <span id="pagedescription">Search all data in American FactFinder</span>\n</div>\n<div id="steps" class="AS">\n<div class="step completed" onclick="javascript:processTransition(\'datafinder\');"  title="Search all data in American FactFinder">\n<span>1</span> Advanced Search\n</div>\n<div class="step activelaststep" title="Table Viewer">\n<span>2</span> Table Viewer\n</div>\n</div>',
  'currentContext': 'datafinder',
  'breadcrumbTitle': 'Advanced Search',
  'currentContextURI': '/faces/nav/jsf/pages/searchresults.xhtml?refresh=t',
  'backToBreadcrumbTitle': 'Advanced Search',
  'universe': '',
  'tableToolsAvailable': 'true',
  'tableToolsEnabled': 'false',
  'mappable': 'false',
  'statsigSupported': 'false',
  'statsigEnabl

In [199]:
# for x in soup.findAll('table',attrs = {'id': 'data'} ):
#     print(x.text)
print(soup.findAll('table',attrs = {'id': 'data'})[0])

<table class="stat-tbl" id="data"><thead><tr class="h"><th class="metastub left right regular top bottom br-edge" colspan="1" id="pc1" rowspan="1">Subject</th><th class="L0 boxhead label top bottom left regular br-edge" colspan="1" id="c1">Number</th><th class="L0 boxhead label top bottom right regular br-edge" colspan="1" id="c2">Percent</th></tr>
</thead>
<tbody>
<tr class="h"><th class="label L0 regular top left stub br-edge" colspan="1" headers="pc1" id="r1">SEX AND AGE</th><td class="field left top" headers=""> </td><td class="field right top" headers=""> </td></tr>
<tr class="h stripe"><th class="label regular left L1 stub br-edge" colspan="1" headers="pc1" id="r2">Total population</th><td class="field left" headers="c1 r1 r2">68,733</td><td class="field right" headers="c2 r1 r2">100.0</td></tr>
<tr class="h"><th class="label regular L2 left stub br-edge" colspan="1" headers="pc1" id="r3">Under 5 years</th><td class="field left" headers="c1 r1 r2 r3">3,853</td><td class="field ri

In [175]:
base   = 'https://factfinder.census.gov/'
report = base + '/bkmk/table/1.0/en/ACS/17_5YR/S1501/8600000US35243'
render = base + 'tablerestful/tableServices/renderProductData'

zipcodes = ['11375']

with requests.session() as s:
    s.headers['user-agent'] = 'Chrome/76'

    for zipcode in zipcodes:
        s.get(report + zipcode)
        r = s.get(render)
        
        html = r.json()

In [176]:
html

{'Exception': {'userMessageCode': '',
  'userMessage': '',
  'isFatal': 'false',
  'redirectTo': '/faces/nav/jsf/pages/error.xhtml'}}

In [None]:
# okay maybe we'll just try and use the API :/ 

import config

key = config.census_key

