In [1]:
# Interactive backend: gives a zoom-able and resize-able figure inside the notebook.
# This is the best choice for quick tests where you need to work interactively.
%matplotlib notebook    

# Static alternative: this only draws the images (not interactive / zoom-able) but it
# works well. Consider making the figure size bigger than the default in most cases.
#%matplotlib inline

In [2]:
# Used for high quality plots: asks the inline backend to render figures as SVG.
# NOTE(review): this is an InlineBackend setting; with `%matplotlib notebook`
# selected in the first cell it may have no effect - confirm.
%config InlineBackend.figure_format = 'svg'

In [3]:
import datetime
import math
import os
import time

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import optimize

## GITHUB ACCESS
The following is how you import an object:

This object lets you access files from a GitHub repository of files;
it is where all of the BLS data used in this notebook is kept.

What is GITHUB : https://en.wikipedia.org/wiki/GitHub

Documentation : https://pygithub.readthedocs.io/en/latest/

In [4]:
from github import Github

When you import the github library, an object called g needs to be
created. This object will represent the github object that we will use
to pull data from the github repository. You will be able to pull data
from your own personal repository, or any repository that exists in
github.

This object will be used to access any file or directory within the
github "filesystem".

For this access to occur you need to supply your personal GitHub
credentials in the cell below. If you don't have an account, you would
need to create a GitHub account. Please note that GitHub's API no longer
accepts an account password; a personal access token takes its place.

In [5]:
# Create the GitHub client `g` used by the following cells.
#
# The token is read from the GITHUB_TOKEN environment variable so that no
# credential is ever stored in the notebook. GitHub's API no longer accepts an
# account password; use a personal access token instead.
# When GITHUB_TOKEN is not set the client is anonymous, which is enough for
# public repositories but is subject to a much lower rate limit.
g = Github(os.environ.get("GITHUB_TOKEN"))

In the following command we create another object called
repo (short for repository of files), and assign it to a specific
repository. Specifically the PHD repository located at:
https://github.com/ehsintegration/yfd-phd-bls-data

In [6]:
# Full "<owner>/<repository>" name of the repository that holds the data files.
repo_full_name = "ehsintegration/yfd-phd-bls-data"
repo = g.get_repo(repo_full_name)

In [7]:
# List the top level of the repository: first the whole listing as returned,
# then one entry per line.
contents = repo.get_contents("")
print(contents)
for entry in contents:
    print(entry)

[ContentFile(path="DATA"), ContentFile(path="README.md"), ContentFile(path="scripts")]
ContentFile(path="DATA")
ContentFile(path="README.md")
ContentFile(path="scripts")


## Get List of Files
Get the list of files from the directory that we are interested in, the DATA folder

In [8]:
# List the download URL of every entry in the DATA folder, followed by the
# number of entries found.
contents = repo.get_contents("DATA")
for index in contents:
    url = index.download_url
    print(url)

# The previous hand-rolled counter started at 1 and therefore reported one more
# than the number of entries; len() gives the actual count.
count = len(contents)
print(count)

https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/FATAL_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_14to15_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_16to19_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_20to24_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_25to34_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_35to44_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_45to54_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_55to64_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_65plus_all.csv
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_AGE_NR_all.csv
https://raw.

## Use HTTP Access to grab the files
Here we are pulling in another object called BS4, which stands for
Beautiful Soup 4. Beautiful Soup is normally used to parse HTML, so it is
available if a page ever needs parsing. The files themselves are downloaded
with `urllib.request` and then decoded from bytes into a string.

In [9]:
import bs4 as bs
import urllib.request

## Create SOC dictionary

In [10]:
# Pull in the list of SOC codes: download the text of the SOC lookup file
# found in the DATA listing (`contents`) and keep it in `soc`.
soc = []

desired_file = "SOC_all.csv"

# Iterate through all of the entries of the DATA listing
for index in contents:
    url = index.download_url

    # Entries without a download URL (e.g. sub-directories) cannot be fetched,
    # and only URLs containing the desired file name are of interest.
    if url is None or desired_file not in url:
        continue
    print(url)

    # Grab the raw bytes of the file; the context manager closes the HTTP
    # response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        source = response.read()

    decoded_data = source.decode("utf-8")
    soc.append(decoded_data)

https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/SOC_all.csv


In [11]:
# Build soc_dict from the downloaded SOC file.
# Key:   the first seven characters of a row with the dash removed.
# Value: list of the row remainders (everything from character 9 onwards).
soc_dict = {}

# Strip every double quote, then break the text into rows.
soc_text = soc[0].replace("\"", "")
soc_array = soc_text.split("\n")

for row in soc_array:
    code = row[:7].replace("-", "")
    remainder = row[9:]
    if code not in soc_dict:
        soc_dict[code] = []
    soc_dict[code].append(remainder)

# To inspect the result: for i, code in enumerate(soc_dict): print(i, code, soc_dict[code])

## Select Generic Database to work with

In [None]:
# Text that a file's download URL must contain for the file to be loaded.
# The original author's alternative example was "NF_AGE_14to15_all.csv".
desired_file_prefix = "NF_GENDER"

# Database name -> list of downloaded file texts (filled in by the next cell).
genericDB_files_dict = dict()

## Create the dictionary for the selected database files

In [12]:
# Pull in the data: download every DATA file whose URL contains
# `desired_file_prefix` and store its text in genericDB_files_dict under the
# file name with the "_all..." suffix removed.

# Iterate through all of the entries of the DATA listing
for index in contents:
    url = index.download_url

    # Entries without a download URL (e.g. sub-directories) cannot be fetched,
    # and only URLs containing the selected prefix are of interest.
    if url is None or desired_file_prefix not in url:
        continue
    print(url)

    # Last path component up to "_all", e.g. ".../NF_GENDER_F_all.csv" -> "NF_GENDER_F"
    name = url.split("/")[-1].split("_all")[0]
    print(name)

    # Grab the raw bytes of the file; the context manager closes the HTTP
    # response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        source = response.read()

    decoded_data = source.decode("utf-8")
    genericDB_files_dict.setdefault(name, []).append(decoded_data)

https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_GENDER_F_all.csv
NF_GENDER_F
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_GENDER_M_all.csv
NF_GENDER_M
https://raw.githubusercontent.com/ehsintegration/yfd-phd-bls-data/master/DATA/NF_GENDER_NR_all.csv
NF_GENDER_NR


In [13]:
# Parse every downloaded database file into genericDB_dict.
# Key:   "<database name>_<code>", where the code is characters 6..11 of a
#        row's first field.
# Value: list holding, per matching row, a list of eight integers taken from
#        the eight columns that follow the first field.
genericDB_dict = {}

for db, genericDB in genericDB_files_dict.items():

    # Strip every double quote, then break the file text into rows.
    genericDB_array = genericDB[0].replace("\"", "").split("\n")

    # Row 12 is where the header stops and the data starts
    for row in genericDB_array[12:]:
        fields = row.split(",")
        key = fields[0][6:12]
        data = fields[1:]

        # Rows whose code is empty or a single character are ignored.
        if len(key) <= 1:
            continue

        # Empty cells among the first eight columns are treated as zero.
        for column in range(8):
            if data[column] == "":
                data[column] = '0'

        # Convert the first eight columns to integers and store them.
        counts = [int(data[column]) for column in range(8)]
        genericDB_dict.setdefault(db + "_" + key, []).append(counts)

# Print every entry together with its running position.
for position, (key, rows) in enumerate(genericDB_dict.items()):
    print(position, key, rows)

0 NF_GENDER_F_0XXXXX [[1320, 1010, 1120, 930, 390, 760, 270, 290]]
1 NF_GENDER_F_110000 [[14550, 15470, 14530, 13000, 12950, 14560, 13620, 14920]]
2 NF_GENDER_F_111000 [[1950, 2850, 1980, 1640, 1230, 2460, 1590, 1390]]
3 NF_GENDER_F_111010 [[220, 90, 420, 120, 80, 420, 230, 400]]
4 NF_GENDER_F_111011 [[220, 90, 420, 120, 80, 420, 230, 400]]
5 NF_GENDER_F_111020 [[1720, 2750, 1560, 1520, 1140, 2010, 1360, 990]]
6 NF_GENDER_F_111021 [[1720, 2750, 1560, 1520, 1140, 2010, 1360, 990]]
7 NF_GENDER_F_112000 [[700, 800, 990, 590, 830, 790, 380, 590]]
8 NF_GENDER_F_112010 [[30, 90, 30, 50, 0, 0, 30, 20]]
9 NF_GENDER_F_112011 [[30, 90, 30, 50, 0, 0, 30, 20]]
10 NF_GENDER_F_112020 [[640, 600, 840, 420, 790, 750, 300, 540]]
11 NF_GENDER_F_112021 [[210, 140, 160, 80, 210, 140, 160, 250]]
12 NF_GENDER_F_112022 [[430, 450, 670, 340, 580, 600, 140, 290]]
13 NF_GENDER_F_112030 [[30, 110, 130, 130, 40, 30, 60, 30]]
14 NF_GENDER_F_112031 [[30, 110, 130, 130, 40, 30, 60, 30]]
15 NF_GENDER_F_113000 [[2420,

1289 NF_GENDER_M_151132 [[80, 30, 70, 30, 60, 20, 0, 20]]
1290 NF_GENDER_M_151133 [[100, 100, 110, 220, 250, 60, 60, 40]]
1291 NF_GENDER_M_151134 [[0, 0, 0, 0, 0, 0, 0, 20]]
1292 NF_GENDER_M_151140 [[390, 280, 290, 320, 220, 330, 230, 170]]
1293 NF_GENDER_M_151141 [[20, 0, 0, 0, 20, 60, 0, 0]]
1294 NF_GENDER_M_151142 [[350, 200, 150, 240, 160, 230, 180, 110]]
1295 NF_GENDER_M_151143 [[20, 70, 130, 60, 40, 30, 40, 50]]
1296 NF_GENDER_M_151150 [[1070, 1160, 490, 810, 600, 700, 500, 470]]
1297 NF_GENDER_M_151151 [[400, 290, 160, 340, 310, 200, 210, 140]]
1298 NF_GENDER_M_151152 [[670, 870, 330, 470, 290, 500, 290, 330]]
1299 NF_GENDER_M_151190 [[160, 160, 130, 170, 90, 210, 250, 110]]
1300 NF_GENDER_M_151199 [[160, 160, 130, 170, 90, 210, 250, 110]]
1301 NF_GENDER_M_152000 [[70, 380, 50, 40, 80, 40, 30, 40]]
1302 NF_GENDER_M_152030 [[30, 380, 40, 30, 20, 0, 30, 40]]
1303 NF_GENDER_M_152031 [[30, 380, 40, 30, 20, 0, 30, 40]]
1304 NF_GENDER_M_152040 [[0, 0, 0, 0, 50, 40, 0, 0]]
1305 NF_GEND

2622 NF_GENDER_NR_514030 [[0, 0, 60, 0, 0, 0, 0, 0]]
2623 NF_GENDER_NR_514031 [[0, 0, 60, 0, 0, 0, 0, 0]]
2624 NF_GENDER_NR_514040 [[100, 110, 110, 90, 100, 110, 120, 90]]
2625 NF_GENDER_NR_514041 [[100, 110, 110, 90, 100, 110, 120, 90]]
2626 NF_GENDER_NR_514120 [[30, 0, 0, 0, 30, 20, 0, 0]]
2627 NF_GENDER_NR_514121 [[0, 0, 0, 0, 30, 20, 0, 0]]
2628 NF_GENDER_NR_519000 [[0, 0, 40, 40, 50, 100, 0, 90]]
2629 NF_GENDER_NR_519060 [[0, 0, 30, 0, 0, 0, 0, 0]]
2630 NF_GENDER_NR_519061 [[0, 0, 30, 0, 0, 0, 0, 0]]
2631 NF_GENDER_NR_519190 [[0, 0, 0, 40, 50, 70, 0, 80]]
2632 NF_GENDER_NR_519198 [[0, 0, 0, 0, 0, 20, 0, 0]]
2633 NF_GENDER_NR_519199 [[0, 0, 0, 40, 40, 50, 0, 70]]
2634 NF_GENDER_NR_530000 [[1780, 1600, 1660, 2060, 2060, 1930, 1690, 2300]]
2635 NF_GENDER_NR_531000 [[70, 100, 80, 80, 110, 90, 80, 70]]
2636 NF_GENDER_NR_531020 [[70, 100, 80, 80, 90, 90, 70, 60]]
2637 NF_GENDER_NR_531021 [[70, 100, 80, 80, 90, 90, 70, 60]]
2638 NF_GENDER_NR_531030 [[0, 0, 0, 0, 20, 0, 0, 0]]
2639 NF_GEN

In [15]:
# Spot check: show the first stored row of values for one database/code pair.
sample_key = 'NF_GENDER_F_113020'
print(genericDB_dict[sample_key][0])

[100, 60, 110, 110, 140, 90, 60, 100]


## Create SOC Level Groups
Create SOC Level Groups for SOC work groups as a dictionary of keys

Level 1 "11XXXX", "12XXXX"....<br>
Level 2 "110XXX", "111XXX", "120XXX"...<br>
Level 3 "1101XX", "1102XX", "1201XX"......<br>

In [16]:
# Group the SOC keys by their leading characters:
#   level 1 -> first two characters, level 2 -> first three, level 3 -> first four.
# Each dictionary maps a prefix to the list of full keys that start with it.
level1_sockeys, level2_sockeys, level3_sockeys = {}, {}, {}

prefix_groups = ((2, level1_sockeys), (3, level2_sockeys), (4, level3_sockeys))

for soc_code in soc_dict:
    for width, groups in prefix_groups:
        groups.setdefault(soc_code[:width], []).append(soc_code)

## Create a count of DATABASE data for the different level SOC groups

In [17]:
# SOC key -> list with one 0 appended for every (level, database) lookup in
# which that key had no entry in genericDB_dict.
no_genericDB_data_dict = {}


def _print_level_totals(level_sockeys):
    """Print, per SOC group and per database, the element-wise sum of the rows.

    For every non-empty group prefix in ``level_sockeys`` and every database
    name in ``genericDB_files_dict``, the first stored row of each member key
    found in ``genericDB_dict`` is added into an eight-element running total
    (the original comment describes these as totals for 2011 to 2018), and one
    line ``"<db>\\t<prefix> <totals>"`` is printed. Member keys without an entry
    are recorded in ``no_genericDB_data_dict``.

    Parameters
    ----------
    level_sockeys : dict
        Maps a group prefix to the list of SOC keys belonging to that group.
    """
    for prefix, soc_codes in level_sockeys.items():
        # Empty prefixes are skipped (presumably produced by a blank row in
        # the SOC file - confirm).
        if len(prefix) == 0:
            continue
        for db in genericDB_files_dict:
            totals = np.array([0, 0, 0, 0, 0, 0, 0, 0])
            for soc_code in soc_codes:
                new_key = db + "_" + soc_code
                if new_key in genericDB_dict:
                    totals = totals + np.array(genericDB_dict[new_key][0])
                else:
                    no_genericDB_data_dict.setdefault(soc_code, []).append(0)
            print(db + "\t" + prefix, totals)


# The three levels previously used three copy-pasted loops; they share one
# helper now. A blank line separates the output of each level.
for level_sockeys in (level1_sockeys, level2_sockeys, level3_sockeys):
    _print_level_totals(level_sockeys)
    print("")

NF_GENDER_F	11 [56920 60460 56550 51190 51060 57500 53770 58570]
NF_GENDER_M	11 [46350 40630 48440 44940 56760 35930 39660 45450]
NF_GENDER_NR	11 [ 40  80  20   0   0 200   0 140]
NF_GENDER_F	13 [21200 21910 27860 24280 19470 19400 22720 19270]
NF_GENDER_M	13 [10680  9490  8560 10570  9060  8820  8530  9890]
NF_GENDER_NR	13 [ 0  0  0  0  0  0  0 20]
NF_GENDER_F	15 [1390 1670 2080 1210 1280 1010 1100 1730]
NF_GENDER_M	15 [2190 3450 1540 1870 1700 1620 1260 1270]
NF_GENDER_NR	15 [0 0 0 0 0 0 0 0]
NF_GENDER_F	17 [1960 2110 1410 1370 3370 1360 1310 1330]
NF_GENDER_M	17 [12150 12760 12380 14210 11780  9800 12050 12190]
NF_GENDER_NR	17 [0 0 0 0 0 0 0 0]
NF_GENDER_F	19 [4380 5180 4130 5270 4390 3160 4420 4110]
NF_GENDER_M	19 [6530 4340 4470 7040 3800 5340 4180 3700]
NF_GENDER_NR	19 [0 0 0 0 0 0 0 0]
NF_GENDER_F	21 [38860 42130 43170 40830 39870 46000 37720 42840]
NF_GENDER_M	21 [13870 15720 14570 16510 13580 16470 14780 10690]
NF_GENDER_NR	21 [  0   0  40  40 160  40   0 240]
NF_GENDER_F	23 [

## SOC codes that were not found in the data set

In [18]:
# Show the SOC keys that were recorded as missing, i.e. keys for which at least
# one database/level lookup found no entry in genericDB_dict.
print(no_genericDB_data_dict.keys())

dict_keys(['111030', '111031', '112032', '112033', '113012', '113013', '119072', '119130', '119131', '119170', '119171', '119179', '113110', '113111', '119070', '119071', '119120', '119121', '111010', '111011', '112000', '112010', '112011', '112020', '112021', '112022', '112030', '113000', '113010', '113020', '113021', '113030', '113031', '113050', '113051', '113060', '113061', '113070', '113071', '113120', '113121', '113130', '113131', '119010', '119013', '119020', '119021', '119030', '119031', '119032', '119033', '119039', '119040', '119041', '119050', '119051', '119080', '119081', '119110', '119111', '119140', '119141', '119150', '119151', '119160', '119161', '119190', '119199', '131010', '131011', '131074', '131082', '132022', '132023', '132054', '132030', '132031', '132040', '132041', '132071', '132082', '131000', '131020', '131021', '131022', '131023', '131030', '131031', '131032', '131040', '131041', '131050', '131051', '131070', '131071', '131075', '131080', '131081', '131110',