# CVS Web Scraper
Author: Stephen Kita

Date: 21 July 2020

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import time
import random

## Initialize Global Variables

In [None]:
# Browser-like User-Agent sent with every request to the store locator.
my_header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

# Each state page lives at base_url + <state name>; multi-word names are hyphenated.
base_url = 'https://www.cvs.com/store-locator/cvs-pharmacy-locations/'

# Result table: one row per (state, town) holding that town's store count.
df_cvs = pd.DataFrame(columns=('state', 'town', 'count'))

# All 50 states plus District-of-Columbia and Puerto-Rico, spelled as URL path segments.
# Alaska and Alabama were missing from this list even though the saved results
# loaded later in the notebook contain rows for both.
states = ["Alaska", "Alabama", "Arkansas", "Arizona", "California", "Colorado", "Connecticut",
          "Delaware", "District-of-Columbia", "Florida", "Georgia", "Hawaii",
          "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
          "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri",
          "Mississippi", "Montana", "Nebraska", "Nevada", "New-Hampshire", "New-Jersey",
          "New-Mexico", "New-York", "North-Carolina", "North-Dakota", "Ohio", "Oklahoma",
          "Oregon", "Pennsylvania", "Puerto-Rico", "Rhode-Island", "South-Carolina",
          "South-Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Vermont",
          "Washington", "West-Virginia", "Wisconsin", "Wyoming"]

## Scrape Website and Save to CSV

In [9]:
# Scrape the per-town store counts from every state page and save them to CSV.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. Building the frame from scratch also makes this cell idempotent,
# so re-running it does not duplicate rows.
scraped_rows = []

for state in states:
    url = base_url + state

    # A timeout keeps one stalled connection from hanging the whole run, and a
    # network error is reported like any other failed state instead of aborting
    # the loop and losing the rows gathered so far.
    try:
        response = requests.get(url, headers=my_header, timeout=30)
    except requests.RequestException:
        response = None

    towns_container = None
    if response is not None and response.ok:
        text = BeautifulSoup(response.text, 'html.parser')
        # find() returns None when the page has no <div class="states">.
        towns_container = text.find('div', 'states')

    if towns_container is not None:
        print(f'Scraping {state}...')
        for town_item in towns_container.select('li'):
            town_seq = town_item.get_text().strip()
            # Raw strings: \w and \d are regex classes, not string escapes.
            # Town name = every run of word characters ending in a lowercase
            # letter, joined with single spaces.
            town = ' '.join(re.findall(r'\w*[a-z]', town_seq))
            # Store count = the first token that ends in digits.
            count_tokens = re.findall(r'\w*\d+', town_seq)
            try:
                count = int(count_tokens[0])
            except (IndexError, ValueError):
                # No parsable number in this entry; report it and move on
                # rather than crash the whole scrape.
                print(f'Skipping unparsable entry for {state}: {town_seq!r}')
                continue
            scraped_rows.append({'state': state, 'town': town, 'count': count})
        print('... complete')
    else:
        print('#'*50)
        print(f'Failed to scrape {state}')
        print('#'*50)

    # Random 5-10 second pause between requests.
    time.sleep(random.randint(5,10))

df_cvs = pd.DataFrame(scraped_rows, columns=['state', 'town', 'count'])
df_cvs.to_csv('out.csv', index=False)
print('Writing to .csv complete!')

Scraping Puerto-Rico...
['Puerto-Rico' 'Arecibo' 1]
['Puerto-Rico' 'Bayamon' 4]
['Puerto-Rico' 'Caguas' 1]
['Puerto-Rico' 'Carolina' 3]
['Puerto-Rico' 'Condado' 1]
['Puerto-Rico' 'Dorado' 1]
['Puerto-Rico' 'Fajardo' 1]
['Puerto-Rico' 'Guaynabo' 2]
['Puerto-Rico' 'Manati' 1]
['Puerto-Rico' 'Ponce' 1]
['Puerto-Rico' 'San Juan' 3]
['Puerto-Rico' 'Toa Baja' 1]
['Puerto-Rico' 'Vega Alta' 1]
['Puerto-Rico' 'Vega Baja' 1]
... complete
Writing to CSV...
... complete


## Load CSV and Perform EDA

In [7]:
import pandas as pd

# Load the saved per-town store counts and preview the first five rows.
# NOTE(review): the scraping cell writes 'out.csv', not 'csv-locs.csv' -
# confirm which file this cell is meant to read.
df_cvs = pd.read_csv(filepath_or_buffer='csv-locs.csv')
df_cvs.head(n=5)

Unnamed: 0,state,town,count
0,Alaska,Anchorage,2
1,Alaska,Wasilla,2
2,Alabama,Alabaster,2
3,Alabama,Albertville,1
4,Alabama,Aliceville,1


In [10]:
# Per-state summary of the town-level store counts: total stores, mean stores
# per town, and the standard deviation across towns (blank for a state with a
# single town, as in the District-of-Columbia row).
(
    df_cvs
    .groupby('state')
    .agg({'count': ['sum', 'mean', 'std']})
)

Unnamed: 0_level_0,count,count,count
Unnamed: 0_level_1,sum,mean,std
state,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Alabama,180,1.8,2.169578
Alaska,4,2.0,0.0
Arizona,197,4.104167,7.594061
Arkansas,23,1.4375,0.629153
California,1175,2.952261,4.255118
Colorado,51,1.758621,1.550465
Connecticut,178,1.79798,1.377515
Delaware,20,1.428571,0.937614
District-of-Columbia,64,64.0,
Florida,874,3.273408,6.658998
