In [1]:
from bs4 import BeautifulSoup
from itertools import chain

import pandas as pd
import requests

In [2]:
url = 'https://locations.traderjoes.com/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [3]:
all_page_links = [link.get('href') for link in soup.findAll('a')]

In [4]:
results = soup.find(id="contentbegin")
states = [link.get('href') for link in results.find_all('a')]

In [5]:
def get_content(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(id='contentbegin')
    links = [link.get('href') for link in results.find_all('a')]
    return links 

cities = [get_content(state) for state in states]

flat_cities = list(chain(*cities))


In [6]:
stores = [get_content(city) for city in flat_cities]

In [7]:
all_stores = list(set(list(chain(*stores))))

In [9]:
def get_info(url): 
    """
    Returns the information of interest for a given Trader Joe's store
    Args:
        url (list): URL of the store
    Returns:
        store_info  (list): Contains the store's name, city, state, zip, landline, 
                            cell phone, and URL
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(id='contentbegin')
    store_name = results.find('div', class_="h1title").get_text() 
    address = results.find('div', class_="addressline").get_text().replace('\t', "").strip().split('\n')
    address = [a.strip() for a in address]
    address = list(filter(None, address))
    del address[2] 
    
    store_info = [store_name] + address +[url]

    return store_info

In [10]:
all_store_info = [get_info(store) for store in all_stores] 

info = ['store_name','street', 'city', 'state', 'zip', 'landline', 'mobile', 'website']

In [11]:
df = pd.DataFrame(all_store_info, columns=info)    

In [12]:
df.count()

store_name    524
street        524
city          524
state         524
zip           524
landline      524
mobile        521
website       521
dtype: int64

In [13]:
df[df.website.isna()]

Unnamed: 0,store_name,street,city,state,zip,landline,mobile,website
104,Trader Joe's South Bend (600),1140 East Howard,South Bend,IN,46617,https://locations.traderjoes.com/in/south-bend...,,
337,Trader Joe's Crestview Hills Grocery (788),2780 Dixie Highway,Crestview Hills,KY,41017,https://locations.traderjoes.com/ky/crestview-...,,
500,Trader Joe's Crestview Hills Wine (789),2785 Dixie Highway,Crestview Hills,KY,41017,https://locations.traderjoes.com/ky/crestview-...,,


In [14]:
df.loc[df.website.isnull(), 'website'] = df['landline']
df.loc[df.mobile.isnull(), "landline"] = None

In [15]:
df.loc[df.mobile.isnull(), ]

Unnamed: 0,store_name,street,city,state,zip,landline,mobile,website
104,Trader Joe's South Bend (600),1140 East Howard,South Bend,IN,46617,,,https://locations.traderjoes.com/in/south-bend...
337,Trader Joe's Crestview Hills Grocery (788),2780 Dixie Highway,Crestview Hills,KY,41017,,,https://locations.traderjoes.com/ky/crestview-...
500,Trader Joe's Crestview Hills Wine (789),2785 Dixie Highway,Crestview Hills,KY,41017,,,https://locations.traderjoes.com/ky/crestview-...


In [16]:
df.count()

store_name    524
street        524
city          524
state         524
zip           524
landline      521
mobile        521
website       524
dtype: int64