In [1]:
# May need to install these....
# uncomment if unsure
#!pip install wget
#!pip install dtale

In [2]:
import pandas as pd
import numpy as np
import os

import dtale # data Frame visualization
import wget

# Download the data

Download Current employment statistics from [US Bureau of Labor Statics website](https://www.bls.gov/)


Data Dictionary
https://download.bls.gov/pub/time.series/ce/ce.txt



In [3]:
# Check if file has ben downloaded if not download the file
# This could take a while first time downloading the files (over 5 mins)
# Does this qualify as web scraping? :-)
CESFileName = 'resources/AllCESSeries.txt'
SeriesFileName = 'resources/CESSeries.txt'
IndustryFileName = 'resources/CESIndustries.txt'
SuperSectorFileName = 'resources\CESSuperSector.txt'
DataTypeCodesFileName = 'resources\CESDataTypeCodes.txt'

SeriesShortFileName = 'resources\CESSeries_short.csv'

if not os.path.exists(CESFileName) :
    url = "https://download.bls.gov/pub/time.series/ce/ce.data.0.AllCESSeries"
    wget.download(url,CESFileName)
    
    url = "https://download.bls.gov/pub/time.series/ce/ce.series"    
    wget.download(url,SeriesFileName)
    
    url = "https://download.bls.gov/pub/time.series/ce/ce.industry"    
    wget.download(url,IndustryFileName)  

    url = "https://download.bls.gov/pub/time.series/ce/ce.supersector"    
    wget.download(url,SuperSectorFileName)  

    url = "https://download.bls.gov/pub/time.series/ce/ce.datatype"    
    wget.download(url,DataTypeCodesFileName)  
    

allCES =  pd.read_table(CESFileName, sep='\t', header=0,names=['series_id', 'year', 'period', 'value', 'footnote_codes'])    

seriesDF =  pd.read_table(SeriesFileName, sep='\t', header=0,
                          names=['series_id', 'supersector_code','industry_code','data_type_code', 'seasonal', 'series_title', 'footnote_codes','begin_year', 'begin_period', 'end_year', 'end_period'],
                          dtype = {'supersector_code':np.str_,  'industry_code':np.str_,'data_type_code':np.str_})    

industriesDF =  pd.read_table(IndustryFileName, sep='\t', header=0,dtype={'industry_code':np.str_,'naics_code':np.str_})    


SuperSectorDF =  pd.read_table(SuperSectorFileName, sep='\t', header=0,names=['supersector_code','supersector_name'],
                               dtype={'supersector_code':np.str_})    

DataTypeCodesDF =  pd.read_table(DataTypeCodesFileName, sep='\t', header=0,names=['data_type_code','data_type_text'],
                               dtype={'data_type_code':np.str_})    

seriesShortDF =  pd.read_csv(SeriesShortFileName ) 

#Clean up spaces on the series_id
allCES['series_id'] = allCES['series_id'].str.strip()
seriesDF['series_id'] = seriesDF['series_id'].str.strip()


In [4]:
# Split Industry from Series ID
allCES['industry_code'] = allCES['series_id'].str.slice(3,11)

seriesDF.drop(['begin_year', 'begin_period', 'end_year', 'end_period'],1,inplace=True)



In [5]:
# Only interested in 2020
# CES Seasonally Adjusted
condition = allCES['year'].isin([2015,2016,2017,2018,2019,2020]) & allCES['series_id'].str.startswith('CES')
CES2020DF = allCES[condition]
CES2020DF.shape

(804591, 6)

In [6]:
# Get month from period and dop period
# month 13 Annual Average      
# https://download.bls.gov/pub/time.series/ce/ce.period
CES2020DF['Month'] = CES2020DF['period'].str.replace('M','').astype(int)
CES2020DF.drop('period',1,inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [7]:
# Merge with Industry
CES2020DF = pd.merge(CES2020DF,industriesDF,on='industry_code',how='left')
CES2020DF.shape

(804591, 12)

In [8]:
# Merge with Series DF
CES2020DF = pd.merge(CES2020DF,seriesDF,on='series_id',how='left')
CES2020DF.shape


(804591, 18)

In [9]:
# Merge with SupoerSector DF
CES2020DF = pd.merge(CES2020DF,SuperSectorDF,on='supersector_code',how='left')
CES2020DF.shape

(804591, 19)

In [10]:
# Merge with DataTypeCodesDF
CES2020DF = pd.merge(CES2020DF,DataTypeCodesDF,on='data_type_code',how='left')
CES2020DF.shape

(804591, 20)

In [11]:
# data_type_code  = 1 all
# data_type_code  = 10  Women employees
# Assume Men =   all - Women

condition = CES2020DF['data_type_code'].isin(['01','10'])
CES2020DF = CES2020DF[condition]

CES2020DF.shape



(97155, 20)

In [12]:
#CES2020DF.columns

CES2020DF.drop(['seasonal','footnote_codes_x','naics_code','industry_name','display_level','selectable','sort_sequence','industry_code_y','footnote_codes_y','publishing_status'],1,inplace=True)


In [13]:
industries_codes = {"Total Nonfarm":"00000000", "Total Private":"05000000",  "Goods Producing":"06000000", "Service-Providing":"07000000", 
"Private Service Providing":"08000000","Mining and Logging":"10000000","Mining, Logging and Construction":"15000000",
"Construction":"20000000","Manufacturing":"30000000","Durable Goods":"31000000","Non-Durable Goods":"32000000",
"Trade, Transportation, and Utilities":"40000000","Wholesale Trade":"41000000","Retail Trade":"42000000",
"Transportation, Warehousing, and Utilities":"43000000","Information":"50000000","Financial Activities":"55000000",
"Finance and Insurance":"55520000","Real Estate and Rental and Leasing":"55530000","Professional and Business Services":"60000000", 
"Professional, Scientific, and Technical Services":"60540000", "Management of Companies and Enterprises":"60550000", 
"Administrative and Support and Waste Mgt":"60560000", "Education and Health Services":"65000000", 
"Educational Services":"65610000", "Health Care and Social Assistance":"65620000", "Leisure and Hospitality":"70000000",
"Arts, Entertainment, and Recreation":"70710000","Accommodation and Food Services":"70720000", "Other Services":"80000000", 
"Government":"90000000","Federal Government":"90910000","State Government":"90920000","Local Government":"90930000"}

codes =[ v for v in industries_codes.values() ]

condition = CES2020DF['industry_code_x'].isin(codes)
CES2020DF = CES2020DF[condition]
CES2020DF.shape


(3958, 10)

In [14]:
CES2020DF = pd.merge(CES2020DF,seriesShortDF,on='series_id',how='left')
CES2020DF.shape



(3958, 11)

In [15]:
# Browse the data
# uncomment the following lines to browse the data
d = dtale.show(CES2020DF, ignore_duplicate=True)
d.open_browser()

In [16]:
CES2020DF.to_pickle("resources/CES2020.pkl",compression='gzip')

In [17]:
# to load Picke file use the following once pickle has been generated
CES2020DF_2=pd.read_pickle('resources/CES2020.pkl',compression='gzip')