In [2]:
import pandas as pd
import requests
import datetime
import os
from scrapy import Selector

In [3]:
# os.makedirs('/test')

In [4]:
# Landing page that lists the downloadable QCEW data files
url = 'https://www.bls.gov/cew/downloadable-data-files.htm'

# Download the landing page and wrap its HTML in a scrapy Selector.
# NOTE(review): neither `r` nor `sel` is referenced by the cells visible in
# this notebook; they are kept in case later cells rely on them.
r = requests.get(url)
sel = Selector(text=r.text)

# Build one URL per annual-average "single file" archive.
# range() excludes its stop value, so the newest year requested is two years
# before the current calendar year.
first_year = 1990
stop_year = datetime.date.today().year - 1
sing_files = [
    f'https://data.bls.gov/cew/data/files/{year}/csv/{year}_annual_singlefile.zip'
    for year in range(first_year, stop_year)
]

In [42]:
# Build the state reference table (FIPS code + state name) from the BLS
# area-titles file. It is used later to filter the annual single files.
area_titles = pd.read_csv('https://data.bls.gov/cew/doc/titles/area/area_titles.csv')

# Literal (non-regex) match on the statewide marker. na=False treats a missing
# title as "no match" so the boolean mask can never contain NaN.
is_statewide = area_titles['area_title'].str.contains('-- Statewide', regex=False, na=False)

# Take an explicit copy: the assignments below then modify an independent
# frame instead of a slice of area_titles (avoids SettingWithCopyWarning and
# keeps the raw table intact).
state_list = area_titles.loc[is_statewide].copy()

# Keep only the text before the first " --", i.e. the bare state name.
# .str[0] (rather than expand=True)[0]) also works when no rows matched.
state_list['area_title'] = state_list['area_title'].str.split(' --', n=1).str[0]

# Normalise the code to a zero-padded, five-character string so it can be
# joined against the area_fips column of the data files.
state_list['area_fips'] = (
    state_list['area_fips']
    .astype('int')
    .astype('str')
    .str.zfill(5)
)
state_list

Unnamed: 0,area_fips,area_title
4,1000,Alabama
76,2000,Alaska
129,4000,Arizona
149,5000,Arkansas
229,6000,California
292,8000,Colorado
361,9000,Connecticut
374,10000,Delaware
388,12000,Florida
461,13000,Georgia


In [6]:
# Define list of industries from BLS
industry_list = pd.read_csv('https://data.bls.gov/cew/doc/titles/industry/industry_titles.csv')

# Winnow down to just six-digit values
six_digit = industry_list[industry_list['industry_code'].str.len() == 6]

# Keep only the six-digit rows whose title mentions newspaper publishers
# (the original author notes this as code 511110)
news = six_digit[
    six_digit['industry_title'].str.contains('Newspaper publishers', regex=False)
]

# Rows for the all-industries total, used as the comparison baseline.
# Deliberately NOT named `all`: that would shadow the Python builtin for the
# rest of the kernel session and break any later call to all(...).
all_industries = industry_list[
    industry_list['industry_title'].str.contains('Total, all industries', regex=False)
]

# Combined industry filter: newspaper publishers plus the all-industries total
news_all = pd.concat([news, all_industries])

In [48]:
# Cast the state join key to object dtype before merging with the data files
state_list['area_fips'] = state_list['area_fips'].astype('object')

# One filtered frame per annual file is collected here
dfs = []

for file in sing_files:
    # Read the whole archive in one pass (low_memory=False) so each column
    # gets a single inferred dtype
    raw_year = pd.read_csv(file, low_memory=False)

    # Inner joins act as filters: first keep the selected industries,
    # then keep only the statewide areas
    by_industry = raw_year.merge(news_all, on='industry_code', how='inner')
    qcew = by_industry.merge(state_list, on='area_fips', how='inner')

    dfs.append(qcew)
    print(f'Loaded from: {file}')

Loaded from: https://data.bls.gov/cew/data/files/1990/csv/1990_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/1991/csv/1991_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/1992/csv/1992_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/1993/csv/1993_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/1994/csv/1994_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/1995/csv/1995_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/1996/csv/1996_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/1997/csv/1997_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/1998/csv/1998_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/1999/csv/1999_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/2000/csv/2000_annual_singlefile.zip
Loaded from: https://data.bls.gov/cew/data/files/2001/csv/2001_an

In [50]:
# Stack the per-year frames into one table with a fresh 0..n-1 index;
# sort=False leaves the column order as encountered instead of sorting it
df = pd.concat(
    objs=dfs,
    ignore_index=True,
    sort=False,
)
df.head()

Unnamed: 0,area_fips,own_code,industry_code,agglvl_code,size_code,year,qtr,disclosure_code,annual_avg_estabs,annual_avg_emplvl,...,oty_taxable_annual_wages_chg,oty_taxable_annual_wages_pct_chg,oty_annual_contributions_chg,oty_annual_contributions_pct_chg,oty_annual_avg_wkly_wage_chg,oty_annual_avg_wkly_wage_pct_chg,oty_avg_annual_pay_chg,oty_avg_annual_pay_pct_chg,industry_title,area_title
0,1000,0,10,50,0,1990,A,,86872,1600920,...,-9982526190,-100.0,-143901271,-100.0,18,4.8,875,4.5,"10 Total, all industries",Alabama
1,1000,1,10,51,0,1990,A,,841,65549,...,0,0.0,0,0.0,35,6.2,1752,6.0,"10 Total, all industries",Alabama
2,1000,2,10,51,0,1990,A,,1046,77265,...,0,0.0,0,0.0,22,5.0,1108,4.9,"10 Total, all industries",Alabama
3,1000,3,10,51,0,1990,A,,1341,167017,...,-52230648,-100.0,-430505,-100.0,15,4.5,775,4.4,"10 Total, all industries",Alabama
4,1000,5,10,51,0,1990,A,,83645,1291089,...,-9930295542,-100.0,-143470766,-100.0,17,4.6,835,4.4,"10 Total, all industries",Alabama


In [54]:
# Write the combined table to disk, relative to the working directory.
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False only if no downstream reader depends on that column.
output_csv = 'newspaper_jobs_and_totals_by_state.csv'
df.to_csv(output_csv)