In [2]:
# Imports and constants

import requests as rq
from bs4 import BeautifulSoup as soup
import pandas as pd
import time

# BLS "Top Picks" landing page for survey code `sm`; the anchors scraped from it
# below lead to one page per state (e.g. `...surveymost?sm+01` for Alabama).
bls_state_url = 'https://data.bls.gov/cgi-bin/surveymost?sm'

In [3]:
# Fetch the BLS "Top Picks" landing page that links to every state's series page.
# The timeout stops the cell from hanging forever if the server never answers, and
# raise_for_status() fails here on a 4xx/5xx instead of letting an error page be
# parsed as if it were the real listing.
bls_ro = rq.get(bls_state_url, timeout=30)
bls_ro.raise_for_status()
bls_ro

<Response [200]>

In [4]:
# Parse the downloaded HTML (Python's built-in 'html.parser') so its links can be queried
ro_soup = soup(markup=bls_ro.text, features='html.parser')
ro_soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html lang="en">
<head>
<title>Top Picks (Most Requested Statistics) : U.S. Bureau of Labor Statistics</title>
<!-- ****************************************** Begin META TAGS ********************************************* -->
<!-- START include/global/head.stm -->
<script id="_fed_an_ua_tag" src="https://dap.digitalgov.gov/Universal-Federated-Analytics-Min.js?agency=DOL&amp;subagency=BLS&amp;yt=true"></script>
<script src="/javascripts/jquery-latest.js"></script>
<script src="/javascripts/bls-latest.js"></script>
<script src="/javascripts/jquery-tools.js"></script>
<script src="/javascripts/jquery-migrate-1.2.1.min.js"></script>
<link href="/assets/bootstrap/latest/bootstrap.min.css" rel="stylesheet"/>
<script src="/assets/bootstrap/latest/popper.min.js"></script>
<script src="/assets/bootstrap/latest/bootstrap.min.js"></script>
<link href="/stylesheets/bls_combined.css" rel="styleshe

In [6]:
# Keep just the anchors whose href starts with the surveymost URL: one per state,
# preceded by a single non-state "BLS Popular Series" link.
state_link_selector = 'a[href^="https://data.bls.gov/cgi-bin/surveymost?"]'
state_links = ro_soup.select(state_link_selector)
state_links

[<a aria-label="BLS Popular Series Top Picks" href="https://data.bls.gov/cgi-bin/surveymost?bls">BLS Popular Series</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+01">Alabama</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+02">Alaska</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+04">Arizona</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+05">Arkansas</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+06">California</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+08">Colorado</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+09">Connecticut</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+10">Delaware</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+11">District Of Columbia</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+12">Florida</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+13">Georgia</a>,
 <a href="https://data.bls.gov/cgi-bin/surveymost?sm+15">Hawaii</a>,
 <a href="https://

In [13]:
# Pair every state's URL with its display name as (href, name); the first anchor is skipped
state_link_list = [(anchor.get('href'), anchor.get_text()) for anchor in state_links[1:]]



In [14]:
# Define web-scraper
def _parse_series_lines(listing_text):
    """Turn the text of one state's series listing into a list of row dicts.

    Each kept line is expected to look like ``"<state>, <series> - <seriesID>"``.

    Parameters
    ----------
    listing_text : str
        Text of the listing, one series per line.

    Returns
    -------
    list of dict
        One dict per line with the keys ``series``, ``seriesID``, ``state``
        and ``is_adjusted``.
    """
    records = []

    # The first and last pieces of the split are dropped, as the scraper has always
    # done (presumably the empty strings around the listing - TODO confirm).
    for line in listing_text.split('\n')[1:-1]:
        # Split on the LAST ' - ' so a label that itself contains ' - ' stays whole.
        label, separator, series_id = line.rpartition(' - ')
        if not separator:
            # No ID on this line: keep the text as the label and leave the ID missing.
            label, series_id = line, None

        # Everything before the first ', ' is the state; the remainder is the series name.
        # This is done per line rather than reusing the first line's comma position.
        state, _, series = label.partition(', ')

        records.append({
            'series': series,
            'seriesID': series_id,
            'state': state,
            # Compare lower-case against lower-case; comparing the lowered text with a
            # mixed-case needle can never match and would flag every row as adjusted.
            'is_adjusted': 'not seasonally adjusted' not in series.lower(),
        })

    return records


def state_scraper(state_link_list, delay_seconds=2, timeout=30):
    """Scrape the series names and IDs listed on each state's BLS page.

    Parameters
    ----------
    state_link_list : sequence of (str, str)
        ``(url, state_name)`` pairs, one per state page to fetch.
    delay_seconds : int or float, default 2
        Pause between consecutive requests, to be polite to the BLS server.
        A falsy value disables the pause. No pause follows the final request.
    timeout : int or float, default 30
        Timeout in seconds passed to each HTTP request.

    Returns
    -------
    pandas.DataFrame
        Columns ``series``, ``seriesID``, ``state``, ``is_adjusted`` (``True`` when
        seasonally adjusted, ``False`` when the series name says "Not Seasonally
        Adjusted") and ``survey`` (always ``'CES'``). An empty input returns an
        empty frame with these same columns.

    Raises
    ------
    IndexError
        If a fetched page contains no ``<dt>`` element to read the listing from.

    Any exception raised by the request itself, or by ``raise_for_status()`` on an
    HTTP error status, propagates to the caller.
    """
    records = []
    last_position = len(state_link_list) - 1

    for position, (state_url, state_name) in enumerate(state_link_list):

        # Response from link
        print(f'Initializing "get" request for {state_name} ')
        state_ro = rq.get(state_url, timeout=timeout)
        # Fail loudly on an HTTP error instead of parsing an error page
        state_ro.raise_for_status()
        print(f'State Link for {state_name} Acquired')

        # Soup
        state_soup = soup(state_ro.text, 'html.parser')

        # The first <dt> on the page holds the series names and IDs
        dt_tags = state_soup.find_all('dt')
        if not dt_tags:
            raise IndexError(f'No <dt> series listing found on the page for {state_name}')

        # Collect plain records; the DataFrame is built once after the loop
        records.extend(_parse_series_lines(dt_tags[0].get_text()))
        print(f'{state_name} data added to DataFrame')

        # Be nice to BLS (no need to wait once the last page is done)
        if delay_seconds and position < last_position:
            print(f'Sleeping for {delay_seconds} seconds')
            time.sleep(delay_seconds)

    final_df = pd.DataFrame(records, columns=['series', 'seriesID', 'state', 'is_adjusted'])
    final_df['survey'] = 'CES'
    return final_df
        
    

In [15]:
# Scrape every state page. One request is made per state with a pause in between,
# so this cell takes a while to finish.

result = state_scraper(state_link_list=state_link_list)

Initializing "get" request for Alabama 
State Link for Alabama Acquired
Alabama data added to DataFrame
Sleeping for 2 seconds
Initializing "get" request for Alaska 
State Link for Alaska Acquired
Alaska data added to DataFrame
Sleeping for 2 seconds
Initializing "get" request for Arizona 
State Link for Arizona Acquired
Arizona data added to DataFrame
Sleeping for 2 seconds
Initializing "get" request for Arkansas 
State Link for Arkansas Acquired
Arkansas data added to DataFrame
Sleeping for 2 seconds
Initializing "get" request for California 
State Link for California Acquired
California data added to DataFrame
Sleeping for 2 seconds
Initializing "get" request for Colorado 
State Link for Colorado Acquired
Colorado data added to DataFrame
Sleeping for 2 seconds
Initializing "get" request for Connecticut 
State Link for Connecticut Acquired
Connecticut data added to DataFrame
Sleeping for 2 seconds
Initializing "get" request for Delaware 
State Link for Delaware Acquired
Delaware data

In [16]:
# Write the scraped series table to a CSV file in the current working directory.
# NOTE(review): with pandas defaults the DataFrame index is written as an unnamed
# first column - confirm downstream readers expect that.
result.to_csv(path_or_buf='state_series_dimension.csv')