In [1]:
# Interactive backend: figures can be zoomed and resized inside the notebook.
# Best suited to quick tests where you work with the plots interactively.
%matplotlib notebook    

# Alternative static backend: it only draws the images (no zoom / interaction) but works well.
# When using it, consider making the figure size bigger than the default.
#%matplotlib inline

In [2]:
# Ask for SVG figure output to get high quality plots.
# NOTE(review): this option configures the inline backend; with `%matplotlib notebook`
# selected above it presumably has no effect - confirm.
%config InlineBackend.figure_format = 'svg'

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import matplotlib.dates as mdates
import datetime
import math
from scipy import optimize

## GITHUB ACCESS
The following is how you import an object:

This object lets you access files from the GitHub repository of files;
it's where all of the BLS data used in this notebook is kept.

What is GITHUB : https://en.wikipedia.org/wiki/GitHub

Documentation : https://pygithub.readthedocs.io/en/latest/

In [4]:
from github import Github

When you import the github library, an object called g needs to be
created. This object will represent the github object that we will use
to pull data from the github repository. You will be able to pull data
from your own personal repository, or any repository that exists in
github.

This object will be used to access any file or directory within the
github "filesystem".

For this access to occur you need to insert your personal github
username and password. If you don't have one, you would need to create a
github account. Please note that the fields below are exactly what
you would enter when prompted when logging into the github site.

In [5]:
# Create the GitHub client `g` that the following cells rely on.
#
# Credentials are never typed into the notebook. If the GITHUB_TOKEN
# environment variable holds a personal access token it is used (this raises
# the API rate limit); otherwise an anonymous client is created, which is
# sufficient for reading public repositories.
import os

_github_token = os.environ.get("GITHUB_TOKEN")
g = Github(_github_token) if _github_token else Github()

In the following command we create another object called
repo (short for repository of files), and assign it to a specific
repository. Specifically the PHD repository located at:
https://github.com/ehsintegration/yfd-phd-bls-data

In [6]:
# Open the repository, given as "owner/name", through the GitHub client `g`.
repo = g.get_repo("ehsintegration/yfd-phd-bls-data")

In [7]:
# Fetch the entries at the root of the repository, show the whole list once,
# then show each entry on a line of its own.
contents = repo.get_contents("")
print(contents)
for entry in contents:
    print(entry)

[ContentFile(path="DATA"), ContentFile(path="README.md"), ContentFile(path="scripts")]
ContentFile(path="DATA")
ContentFile(path="README.md")
ContentFile(path="scripts")


## Get the list of files
Get the list of files from the directory that we are interested in, the DATA folder

In [8]:
# Fetch the entries of the DATA folder, print the download URL of each one,
# then print how many entries were listed.
contents = repo.get_contents("DATA")
count = 0  # start at zero so the printed figure equals the number of entries
for index in contents:
    url = index.download_url
    print(url)
    count = count + 1
print(count)

https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/FATAL_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_14to15_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_16to19_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_20to24_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_25to34_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_35to44_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_45to54_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_55to64_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_65plus_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_NR_all.csv
https://raw.

## Use HTTP Access to grab the files
Here we are pulling in another object called BS4, which stands for
Beautiful Soup 4, together with urllib.request. Beautiful Soup is normally
used to parse HTML; in the cells below the files are downloaded with
urllib.request and the downloaded bytes are then converted to a string.

In [9]:
import bs4 as bs
import urllib.request

## Create SOC dictionary

In [10]:
# Pull in the list of SOC codes.
# `soc` collects the decoded text of every DATA entry whose download URL
# contains `desired_file`.
soc = []

desired_file = "SOC_all.csv"

# Iterate through all of the DATA folder entries
for index in contents:
    url = index.download_url

    # download_url can be None (e.g. for a sub-directory); skip such entries
    # instead of failing on the substring test, and skip every other file.
    if url is not None and desired_file in url:
        print(url)

        # Read the raw bytes; the `with` block closes the HTTP response
        with urllib.request.urlopen(url) as response:
            source = response.read()

        decoded_data = source.decode("utf-8")
        soc.append(decoded_data)

https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/SOC_all.csv


In [11]:
# Build soc_dict from the first downloaded SOC text.
#   key   : first seven characters of a line with every "-" removed
#   value : list of the remainders (character 9 onward) of the lines sharing that key
soc_dict = {}

# Strip every double quote, then cut the text into lines
unquoted = soc[0].replace('"', "")
soc_array = unquoted.split("\n")

for row in soc_array:
    code = row[:7].replace("-", "")
    description = row[9:]
    if code not in soc_dict:
        soc_dict[code] = []
    soc_dict[code].append(description)

# Uncomment to inspect the result:
#for position, code in enumerate(soc_dict):
#    print(position, code, soc_dict[code])

## Select Generic Database to work with

In [None]:
# Maps a data set name to the list of raw CSV texts downloaded for it;
# it starts empty and is filled by the download cell that follows.
genericDB_files_dict = {}

# Text that a download URL must contain for the file to be fetched.
desired_file_prefix = "NF_LOS"  # a full file name such as "NF_AGE_14to15_all.csv" selects one file

## Create NF_LOS dictionary

In [12]:
# Pull in the data: download every DATA entry whose URL contains
# `desired_file_prefix` and store its decoded text in genericDB_files_dict
# under the data set name.

# Iterate through all of the DATA folder entries
for index in contents:
    url = index.download_url

    # download_url can be None (e.g. for a sub-directory); skip such entries
    # instead of failing on the substring test, and skip every other file.
    if url is not None and desired_file_prefix in url:
        print(url)

        # Data set name = file name up to (not including) the first "_all"
        name = url.split("/")[-1].split("_all")[0]
        print(name)

        # Read the raw bytes; the `with` block closes the HTTP response
        with urllib.request.urlopen(url) as response:
            source = response.read()

        decoded_data = source.decode("utf-8")
        genericDB_files_dict.setdefault(name, []).append(decoded_data)

https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_LOS_1to5yr_all.csv
NF_LOS_1to5yr
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_LOS_3to11mos_all.csv
NF_LOS_3to11mos
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_LOS_5plus_all.csv
NF_LOS_5plus
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_LOS_NR_all.csv
NF_LOS_NR
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_LOS_ltg3mos_all.csv
NF_LOS_ltg3mos


In [13]:
# Parse the downloaded CSV texts into genericDB_dict.
#   key   : "<data set name>_<code>", where <code> is characters 6-11 of the
#           first comma-separated field of a data line
#   value : list with one entry per matching line, each a list of eight integers
genericDB_dict = {}

for db, genericDB in genericDB_files_dict.items():

    # Strip every double quote from the first stored text, then cut it into lines
    genericDB_array = genericDB[0].replace('"', "").split("\n")

    # Line 12 is where the header stops and the data starts
    for row in genericDB_array[12:]:
        line_array = row.split(",")
        key = line_array[0][6:12]
        data = line_array[1:]

        # Lines whose key is shorter than two characters are ignored
        if len(key) <= 1:
            continue

        # First eight value fields: an empty field counts as zero, the rest
        # are converted to integers
        values = [int(data[column] or '0') for column in range(8)]

        genericDB_dict.setdefault(db + "_" + key, []).append(values)

# Show every entry together with its running position
for index, key in enumerate(genericDB_dict):
    print(index, key, genericDB_dict[key])

0 NF_LOS_1to5yr_0XXXXX [[390, 1210, 1130, 760, 410, 540, 190, 270]]
1 NF_LOS_1to5yr_110000 [[7850, 8180, 7680, 7190, 8300, 8430, 7470, 8540]]
2 NF_LOS_1to5yr_111000 [[760, 1730, 1540, 1180, 1210, 1740, 1370, 930]]
3 NF_LOS_1to5yr_111010 [[150, 90, 90, 70, 20, 80, 430, 110]]
4 NF_LOS_1to5yr_111011 [[150, 90, 90, 70, 20, 80, 430, 110]]
5 NF_LOS_1to5yr_111020 [[610, 1640, 1450, 1100, 1190, 1660, 940, 830]]
6 NF_LOS_1to5yr_111021 [[610, 1640, 1450, 1100, 1190, 1660, 940, 830]]
7 NF_LOS_1to5yr_112000 [[210, 350, 660, 480, 290, 670, 240, 460]]
8 NF_LOS_1to5yr_112010 [[20, 20, 20, 40, 40, 0, 0, 20]]
9 NF_LOS_1to5yr_112011 [[20, 20, 20, 40, 40, 0, 0, 20]]
10 NF_LOS_1to5yr_112020 [[170, 320, 620, 350, 240, 630, 170, 430]]
11 NF_LOS_1to5yr_112021 [[80, 60, 120, 60, 60, 60, 60, 60]]
12 NF_LOS_1to5yr_112022 [[90, 260, 500, 290, 180, 570, 110, 370]]
13 NF_LOS_1to5yr_112030 [[0, 0, 20, 100, 20, 40, 50, 0]]
14 NF_LOS_1to5yr_112031 [[0, 0, 20, 100, 20, 40, 50, 0]]
15 NF_LOS_1to5yr_113000 [[1590, 1050,

1454 NF_LOS_3to11mos_272012 [[0, 0, 0, 0, 0, 0, 0, 110]]
1455 NF_LOS_3to11mos_272020 [[430, 470, 1850, 650, 440, 670, 650, 470]]
1456 NF_LOS_3to11mos_272021 [[340, 280, 0, 520, 300, 410, 540, 100]]
1457 NF_LOS_3to11mos_272022 [[80, 140, 80, 80, 140, 220, 110, 370]]
1458 NF_LOS_3to11mos_272023 [[0, 50, 0, 50, 0, 40, 0, 0]]
1459 NF_LOS_3to11mos_272030 [[50, 20, 40, 20, 40, 20, 30, 20]]
1460 NF_LOS_3to11mos_272031 [[50, 20, 40, 20, 40, 20, 30, 20]]
1461 NF_LOS_3to11mos_272090 [[80, 0, 90, 60, 50, 60, 30, 60]]
1462 NF_LOS_3to11mos_272099 [[80, 0, 90, 60, 50, 60, 30, 60]]
1463 NF_LOS_3to11mos_273000 [[30, 120, 20, 90, 50, 30, 100, 160]]
1464 NF_LOS_3to11mos_273020 [[0, 50, 0, 30, 20, 0, 0, 120]]
1465 NF_LOS_3to11mos_273022 [[0, 50, 0, 30, 20, 0, 0, 120]]
1466 NF_LOS_3to11mos_273030 [[0, 0, 0, 0, 0, 0, 0, 20]]
1467 NF_LOS_3to11mos_273031 [[0, 0, 0, 0, 0, 0, 0, 20]]
1468 NF_LOS_3to11mos_273040 [[20, 0, 0, 50, 0, 0, 20, 0]]
1469 NF_LOS_3to11mos_273041 [[20, 0, 0, 20, 0, 0, 0, 0]]
1470 NF_LOS_3

2787 NF_LOS_5plus_333000 [[34910, 34900, 32390, 31100, 28170, 27950, 24660, 25080]]
2788 NF_LOS_5plus_333010 [[12330, 10760, 10660, 10460, 9510, 9070, 8460, 9530]]
2789 NF_LOS_5plus_333011 [[150, 150, 280, 160, 140, 140, 120, 380]]
2790 NF_LOS_5plus_333012 [[12180, 10610, 10380, 10300, 9370, 8930, 8350, 9150]]
2791 NF_LOS_5plus_333020 [[1650, 990, 1730, 920, 1500, 1530, 1140, 960]]
2792 NF_LOS_5plus_333021 [[1650, 990, 1730, 920, 1500, 1530, 1140, 960]]
2793 NF_LOS_5plus_333030 [[70, 130, 100, 50, 90, 20, 0, 70]]
2794 NF_LOS_5plus_333031 [[70, 130, 100, 50, 90, 20, 0, 70]]
2795 NF_LOS_5plus_333040 [[100, 280, 230, 90, 80, 240, 210, 140]]
2796 NF_LOS_5plus_333041 [[100, 280, 230, 90, 80, 240, 210, 140]]
2797 NF_LOS_5plus_333050 [[20750, 22740, 19670, 19570, 16990, 17090, 14840, 14380]]
2798 NF_LOS_5plus_333051 [[20710, 22740, 19630, 19510, 16910, 17070, 14810, 14320]]
2799 NF_LOS_5plus_333052 [[40, 0, 40, 60, 80, 20, 30, 60]]
2800 NF_LOS_5plus_339000 [[3690, 4570, 5250, 4480, 4490, 4200

4287 NF_LOS_ltg3mos_211013 [[0, 0, 0, 0, 0, 70, 0, 20]]
4288 NF_LOS_ltg3mos_211014 [[50, 150, 220, 120, 40, 50, 40, 60]]
4289 NF_LOS_ltg3mos_211015 [[30, 0, 0, 0, 50, 0, 60, 0]]
4290 NF_LOS_ltg3mos_211019 [[50, 100, 110, 40, 80, 160, 170, 100]]
4291 NF_LOS_ltg3mos_211020 [[80, 200, 200, 130, 170, 120, 250, 210]]
4292 NF_LOS_ltg3mos_211021 [[20, 100, 20, 40, 60, 40, 0, 70]]
4293 NF_LOS_ltg3mos_211022 [[0, 0, 20, 0, 0, 20, 0, 20]]
4294 NF_LOS_ltg3mos_211023 [[20, 20, 20, 40, 20, 0, 0, 0]]
4295 NF_LOS_ltg3mos_211029 [[50, 60, 150, 30, 90, 50, 220, 110]]
4296 NF_LOS_ltg3mos_211090 [[100, 360, 190, 220, 190, 240, 190, 250]]
4297 NF_LOS_ltg3mos_211092 [[0, 0, 60, 40, 0, 0, 0, 70]]
4298 NF_LOS_ltg3mos_211093 [[80, 180, 60, 130, 160, 200, 90, 140]]
4299 NF_LOS_ltg3mos_211099 [[0, 180, 50, 50, 30, 30, 60, 40]]
4300 NF_LOS_ltg3mos_230000 [[20, 90, 60, 0, 0, 40, 260, 0]]
4301 NF_LOS_ltg3mos_231000 [[0, 60, 0, 0, 0, 0, 0, 0]]
4302 NF_LOS_ltg3mos_231010 [[0, 20, 0, 0, 0, 0, 0, 0]]
4303 NF_LOS_ltg3m

In [14]:
# Spot check: show the first stored row of values for one data set / code key.
print(genericDB_dict['NF_LOS_1to5yr_119020'][0])

[240, 200, 290, 290, 920, 510, 320, 590]


## Create SOC Level Groups
Create SOC Level Groups for SOC work groups as a dictionary of keys

Level 1 "11XXXX", "12XXXX"....<br>
Level 2 "110XXX", "111XXX", "120XXX"...<br>
Level 3 "1101XX", "1102XX", "1201XX"......<br>

In [15]:
# Group the soc_dict keys by their leading characters:
#   level 1 -> first two characters
#   level 2 -> first three characters
#   level 3 -> first four characters
level1_sockeys = {}
level2_sockeys = {}
level3_sockeys = {}

for key in soc_dict:
    for width, groups in ((2, level1_sockeys), (3, level2_sockeys), (4, level3_sockeys)):
        groups.setdefault(key[:width], []).append(key)

## Create a count of DATABASE data for the different level soc groups

In [19]:
# SOC codes for which at least one database had no entry; a 0 is appended
# to the code's list for every miss.
no_genericDB_data_dict = {}


def _print_level_totals(level_sockeys):
    """Print per-database totals for every group of one SOC level.

    For each non-empty group key of ``level_sockeys`` and each database name
    in ``genericDB_files_dict``, the first stored row of every member code is
    added column by column (running totals for 2011 to 2018) and printed as
    the database name, a tab, the group key and the totals array. Member
    codes without an entry for a database are recorded in
    ``no_genericDB_data_dict``. A blank line is printed after the last group.

    Args:
        level_sockeys: dict mapping a group key to the list of SOC codes
            belonging to that group.
    """
    for index, members in level_sockeys.items():
        # An empty group key is skipped entirely
        if len(index) == 0:
            continue
        for db in genericDB_files_dict:
            totals = np.array([0] * 8)
            for key in members:
                new_key = db + "_" + key
                if new_key in genericDB_dict:
                    totals = totals + np.array(genericDB_dict[new_key][0])
                else:
                    no_genericDB_data_dict.setdefault(key, []).append(0)
            print(db + "\t" + index, totals)
    print("")


# Same report for the three SOC levels, in order
for level_sockeys in (level1_sockeys, level2_sockeys, level3_sockeys):
    _print_level_totals(level_sockeys)

NF_LOS_1to5yr	11 [30750 32090 30280 28320 32880 33420 29450 33450]
NF_LOS_3to11mos	11 [13850 10070 11810  8680 11030  9090 13020 14120]
NF_LOS_5plus	11 [55640 55970 56120 54730 58580 45810 45650 51430]
NF_LOS_NR	11 [ 820  600 3280  430 1580  920  630  810]
NF_LOS_ltg3mos	11 [1740 2090 3120 3690 3460 4260 4310 4050]
NF_LOS_1to5yr	13 [ 8460 10540 11100  9040 10180  8250 10650  9810]
NF_LOS_3to11mos	13 [3480 3390 3940 4590 2570 4260 5380 4130]
NF_LOS_5plus	13 [16510 15830 17910 19070 14140 13730 13440 13780]
NF_LOS_NR	13 [660 370 610 440 300 500 150 220]
NF_LOS_ltg3mos	13 [2450  990 2660 1050 1080 1040 1190 1010]
NF_LOS_1to5yr	15 [1120  930  930  820  850  880  730  880]
NF_LOS_3to11mos	15 [370 270 210 260 570 250 150 160]
NF_LOS_5plus	15 [1780 3820 2370 1890 1420 1370 1380 1840]
NF_LOS_NR	15 [50 20 40 30 70 20 20 20]
NF_LOS_ltg3mos	15 [90 30 70 60 90 90 70 30]
NF_LOS_1to5yr	17 [5080 4610 2590 4580 4780 2460 6760 3830]
NF_LOS_3to11mos	17 [ 950 2070 1470 1890 1040 1720  890 2700]
NF_LOS_5p

NF_LOS_ltg3mos	4330 [550 340 500 770 200 430 810 480]
NF_LOS_1to5yr	4340 [17240 11330  8710 12270 10970 10000 10110 12540]
NF_LOS_3to11mos	4340 [5010 4480 4410 4710 8380 7540 6030 5970]
NF_LOS_5plus	4340 [19120 13970 14390 15210 13150 12640 14260 11620]
NF_LOS_NR	4340 [400 310 310 240 660 440 840 180]
NF_LOS_ltg3mos	4340 [2120 2340 2290 3930 3600 1660 2800 2620]
NF_LOS_1to5yr	4341 [6200 3320 2800 2280 4180 3160 3320 4160]
NF_LOS_3to11mos	4341 [1720 2460 1180 1060 2420 1080 2240 2180]
NF_LOS_5plus	4341 [9660 6960 7580 7680 5200 5500 6100 6920]
NF_LOS_NR	4341 [120  40 180 140  40 220   0 220]
NF_LOS_ltg3mos	4341 [ 480  980  680  580 1040  420 1200  620]
NF_LOS_1to5yr	4350 [20590 22340 16990 20300 18900 17750 18180 24000]
NF_LOS_3to11mos	4350 [ 9780  8850  8860 10810 10660 12870 11380 12570]
NF_LOS_5plus	4350 [26550 29430 35540 31800 26480 27510 21460 23230]
NF_LOS_NR	4350 [680 820 810 880 370 620 600 890]
NF_LOS_ltg3mos	4350 [3960 4550 5170 4590 5420 5740 4890 6600]
NF_LOS_1to5yr	4351 [2

## SOC codes that were not found in the data set

In [17]:
# Show the SOC codes that had no entry in at least one database during the totals pass.
print(no_genericDB_data_dict.keys())

dict_keys(['111030', '111031', '112032', '112033', '113012', '113013', '119070', '119071', '119072', '119120', '119121', '119130', '119131', '119170', '119171', '119179', '112010', '112011', '113110', '113111', '119039', '119040', '119041', '119080', '119081', '119160', '119161', '111010', '111011', '112021', '112030', '113020', '113021', '113050', '113051', '113060', '113061', '113070', '113071', '113120', '113121', '113130', '113131', '119010', '119013', '119031', '131010', '131011', '131074', '131082', '132022', '132023', '132030', '132031', '132040', '132041', '132054', '132082', '131021', '131130', '131131', '132052', '132060', '132061', '132080', '132081', '131022', '131023', '131032', '131050', '131051', '131075', '131080', '131081', '131120', '131121', '131160', '131161', '132010', '132011', '132020', '132050', '132051', '132053', '132070', '132071', '132072', '132090', '132099', '131140', '131141', '151200', '151210', '151211', '151212', '151220', '151221', '151230', '151231',