In [1]:
# Interactive backend: gives zoomable and resizable figures in the notebook. This is the best choice for quick tests where you need to work interactively.
%matplotlib notebook    

# Alternative backend: only draws the images (not interactive / zoomable), but it works well. Consider making the figure size bigger than the default in most cases.
#%matplotlib inline

In [2]:
# Used for high quality plots: asks the inline backend to render figures as SVG.
# NOTE(review): this option is set on InlineBackend, so it presumably has no effect
# while `%matplotlib notebook` is the active backend -- confirm.
%config InlineBackend.figure_format = 'svg'

In [1]:
import datetime
import math
import os
import time

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import optimize

## The following is how you import an object:

This object lets you access files in a GitHub repository, which is
where all of the COVID-19 data is kept.

What is GITHUB : https://en.wikipedia.org/wiki/GitHub

Documentation : https://pygithub.readthedocs.io/en/latest/

In [2]:
from github import Github

## GITHUB ACCESS
When you import the github library, an object called g needs to be
created. This object will represent the github object that we will use
to pull data from the github repository. You will be able to pull data
from your own personal repository, or any repository that exists in
github.

This object will be used to access any file or directory within the
github "filesystem".

For this access to occur you need to insert your personal github
username and password. If you don't have one, you will need to create a
github account. Please note that the fields below are exactly what
you would enter when prompted while logging into the github site.

In [4]:
# Create the GitHub client object `g` that the following cells use.
#
# The cell that calls g.get_repo(...) needs `g` to exist, so it is always
# created here instead of being left commented out. Credentials are never
# hard-coded in the notebook: if the GITHUB_TOKEN environment variable holds
# a personal access token it is used, otherwise the client is created without
# credentials (anonymous access, sufficient for reading public repositories).
g = Github(os.environ.get("GITHUB_TOKEN") or None)

In the following command we create another object called
repo (short for repository of files), and assign it to a specific
repository: the PHD repository located at
https://github.com/ehsintegration/yfd-phd-bls-data

In [5]:
# "owner/name" identifier of the repository opened through the client `g`.
repo_full_name = "ehsintegration/yfd-phd-bls-data"
repo = g.get_repo(repo_full_name)

In [6]:
# Fetch the entries at the top level of the repository, print the whole
# list once, then print every entry on its own line.
contents = repo.get_contents("")
print(contents)
for entry in contents:
    print(entry)

[ContentFile(path="DATA"), ContentFile(path="README.md"), ContentFile(path="scripts")]
ContentFile(path="DATA")
ContentFile(path="README.md")
ContentFile(path="scripts")


## Get List of Files
Get the list of files from the directory that we are interested in,
the DATA folder


In [7]:
# List the download URL of every entry in the DATA folder, then print how
# many entries were listed.
contents = repo.get_contents("DATA")

# Start from zero so the printed number equals the number of URLs printed
# (starting from 1 reported one entry too many).
count = 0
for index in contents:
    url = index.download_url
    print(url)
    count += 1
print(count)

https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/FATAL_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_14to15_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_16to19_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_20to24_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_25to34_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_35to44_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_45to54_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_55to64_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_65plus_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_NR_all.csv
https://raw.

## Use HTTP Access to grab the files
Here we are pulling in another library called BS4, which stands for
Beautiful Soup 4. Beautiful Soup is normally used to parse HTML. In the
cells that follow, however, each file is downloaded with `urllib.request`
and then decoded into a string; Beautiful Soup itself is not called there.

In [8]:
import bs4 as bs
import urllib.request

## Create SOC dictionary

In [9]:
# Pull in the list of SOC industry codes.
# `soc` collects the decoded text of every matching file in `contents`.
soc = []

desired_file = "SOC_all.csv"

# Iterate through all of the URLs
for index in contents:
    url = index.download_url

    # Keep only entries whose download URL contains the desired file name
    # (this is a substring test, not a check of the file extension).
    if desired_file in url:
        print(url)

        # Grab the raw bytes of the file; the `with` block closes the HTTP
        # response as soon as the body has been read.
        with urllib.request.urlopen(url) as response:
            source = response.read()

        decoded_data = source.decode("utf-8")
        soc.append(decoded_data)

https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/SOC_all.csv


In [10]:
# Build soc_dict from the first downloaded SOC text.
#
# Every double quote is stripped and the text is split on newlines. For each
# resulting line the key is its first seven characters with the dashes
# removed, and the stored value is everything from the tenth character on.
soc_dict = {}

soc_text = soc[0].replace("\"", "")
soc_array = soc_text.split("\n")

for entry in soc_array:
    code = entry[:7].replace("-", "")
    remainder = entry[9:]
    # Values are accumulated in a list so repeated keys keep every entry.
    if code not in soc_dict:
        soc_dict[code] = []
    soc_dict[code].append(remainder)
    
#index = 0
#for key in soc_dict:
#    print(index, key, soc_dict[key])
#    index = index + 1

## Select Generic Database to work with

In [None]:
# Empty container; the download cell appends the decoded file text to it.
genericDB = []

# Name of the data file to load into the generic database. The download
# cell selects every URL that contains this name.
desired_file = "FATAL_all.csv"

## Create FATAL dictionary

In [11]:
# Pull in the data: append the decoded text of every matching file in
# `contents` to `genericDB`.
# Iterate through all of the URLs
for index in contents:
    url = index.download_url

    # Keep only entries whose download URL contains the desired file name
    # (this is a substring test, not a check of the file extension).
    if desired_file in url:
        print(url)

        # Grab the raw bytes of the file; the `with` block closes the HTTP
        # response as soon as the body has been read.
        with urllib.request.urlopen(url) as response:
            source = response.read()

        decoded_data = source.decode("utf-8")
        genericDB.append(decoded_data)

https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/FATAL_all.csv


In [12]:
# Build genericDB_dict from the first downloaded generic-database text.
#
# Each kept line contributes one row of eight integers, stored in a list
# under a key taken from characters 6..11 of the line's first field.
genericDB_dict = {}

unquoted_text = genericDB[0].replace("\"", "")
genericDB_array = unquoted_text.split("\n")

# The first 12 lines are skipped (presumably header lines -- TODO confirm).
for record in genericDB_array[12:]:
    fields = record.split(",")
    row_key = fields[0][6:12]

    # Lines whose key is shorter than two characters are ignored entirely.
    if len(row_key) <= 1:
        continue

    values = fields[1:]

    # Take the first eight fields: an empty field counts as zero and every
    # other field is converted to an integer.
    row = [int(values[column] or '0') for column in range(8)]

    # Create (or extend) the dictionary entry for this key.
    genericDB_dict.setdefault(row_key, []).append(row)
    
#index = 0
#for key in genericDB_dict:
#    print(index, key, genericDB_dict[key])
#    index = index + 1

In [13]:
# Show the first row stored under the key '000000'.
first_row = genericDB_dict['000000'][0]
print(first_row)

[4693, 4628, 4585, 4821, 4836, 5190, 5147, 5250]


## Create SOC level Groups
Create SOC Level Groups for SOC work groups as a dictionary of keys

Level 1 "11XXXX", "12XXXX"....<br>
Level 2 "110XXX", "111XXX", "120XXX"...<br>
Level 3 "1101XX", "1102XX", "1201XX"......<br>

In [14]:
# Group every SOC key by its leading 2, 3 and 4 characters.
# Each dictionary maps a prefix to the list of full keys that start with it.
level1_sockeys = {}
level2_sockeys = {}
level3_sockeys = {}

prefix_groups = (
    (2, level1_sockeys),
    (3, level2_sockeys),
    (4, level3_sockeys),
)

for soc_key in soc_dict:
    for width, groups in prefix_groups:
        groups.setdefault(soc_key[:width], []).append(soc_key)
    
#print(level1_sockeys)

## Create a count of DATABASE data for the different level soc groups

In [15]:
# Keys that appear in a SOC level group but have no entry in genericDB_dict.
# A 0 is appended to a key's list each time it is found missing (once per
# level pass).
no_genericDB_data_dict = {}


def _print_level_totals(level_sockeys):
    """Print the column-wise totals of genericDB data for each SOC group.

    Parameters
    ----------
    level_sockeys : dict
        Maps a group prefix to the list of full SOC keys in that group.

    For every non-empty prefix, the first row stored in genericDB_dict for
    each member key is added into an eight-element running total (the
    original comment labels these totals as 2011 to 2018), and the prefix
    is printed together with the totals. Member keys without an entry in
    genericDB_dict are recorded in no_genericDB_data_dict instead.
    """
    for prefix, member_keys in level_sockeys.items():
        # Groups with an empty prefix are skipped.
        if len(prefix) == 0:
            continue

        totals = np.zeros(8, dtype=int)
        for key in member_keys:
            if key in genericDB_dict:
                totals = totals + np.array(genericDB_dict[key][0])
            else:
                no_genericDB_data_dict.setdefault(key, []).append(0)
        print(prefix, totals)


# Same report for each level, separated by a blank line.
_print_level_totals(level1_sockeys)

print("")

_print_level_totals(level2_sockeys)

print("")

_print_level_totals(level3_sockeys)

print(no_genericDB_data_dict.keys())

11 [415 389 376 402 357 353 370 372]
13 [14  9 15 17 12 15 20 22]
15 [0 0 1 0 0 0 0 0]
17 [24 18 22 22 22 28 13 20]
19 [ 4  4 13  3  1  6  3  3]
21 [14 18 11 22 15 13 26  9]
23 [14  8 13  6 11 13 10 11]
25 [20 17  8  8  7 11 11 18]
27 [32 28 35 37 47 43 35 49]
29 [24 12 20 21 28 22 24 27]
31 [0 0 1 3 1 0 0 5]
33 [267 219 241 198 203 278 257 255]
35 [40 42 56 42 40 69 74 76]
37 [268 246 241 246 279 319 322 342]
39 [53 37 31 42 35 45 44 37]
41 [221 206 205 227 212 238 205 223]
43 [57 62 52 63 45 56 68 41]
45 [211 208 197 226 250 247 211 212]
47 [723 820 789 848 889 930 914 949]
49 [331 327 349 384 371 442 394 389]
51 [137 143 138 132 194 161 139 144]
53 [1086 1115 1110 1185 1163 1231 1291 1317]
55 [0 0 0 0 0 0 0 0]

110 [0 0 0 0 0 0 0 0]
111 [31 32 30 25 22 11 24 39]
112 [ 5  9 15  8  7  4 10  7]
113 [20 12 13 24 16 17 18  8]
119 [359 336 318 345 312 321 318 318]
130 [0 0 0 0 0 0 0 0]
131 [ 7  9  8 14  6  7  8 13]
132 [ 7  0  7  3  6  8 12  9]
150 [0 0 0 0 0 0 0 0]
151 [0 0 0 0 0 0 0 0]
