In this code we take the occupation codes in the ASEC 2010-2019 transition networks and find their SOC match using the crosswalk for OCC2010 codes available here:
https://www.census.gov/topics/employment/industry-occupation/guidance/code-lists.html

There are occupations that do not have a match with the SOC codes (some variables below call these codes "naics"). We remove these nodes from the network.

In [101]:
import pandas as pd
import random
import numpy as np
from matplotlib import pylab as plt
import scipy.stats
import copy
from scipy.sparse import linalg as LA
import networkx as nx

In [102]:
# Data directory. It is set here because this cell comes before the cell that
# defines the file names, so `path_data` would otherwise be undefined when the
# notebook is executed from top to bottom.
path_data = "../data/"

# Adjacency matrix of the ASEC 2010-2019 transitions network, read from a
# comma-separated file.
A = np.genfromtxt(path_data + "asec_10_19_avg.csv", delimiter=',')
# Directed graph built from the adjacency matrix.
G = nx.from_numpy_array(A, create_using=nx.DiGraph())

In [105]:
# Check whether the directed graph G is strongly connected.
nx.is_strongly_connected(G)

True

In [106]:
# Number of rows of the adjacency matrix A (presumably one per occupation node — confirm A is square).
len(A)

442

In [107]:
# Directory that holds the input and output data files.
path_data = "../data/"

# Crosswalk file (read below for its "2010 Census Code" and "2010 SOC Code" columns).
file_occ_soc_map = "2010-occ-codes-soc-categories_clean.csv"

# ASEC occupation list (read below for its "Code" and "Label" columns).
file_occ_in_asec = "occ_names_class_asec.csv"

# Presumably ASEC occupation names with employment data; not used in the
# cells visible in this notebook.
file_occ_in_asec_emp = "occ_names_employment_asec.csv"

In [108]:
# --- ASEC occupations (2010 categories) ---
df_asec = pd.read_csv(path_data + file_occ_in_asec)
# ASEC occupation code -> occupation label.
dict_aseccode_name = dict(
    zip(df_asec["Code"], df_asec["Label"])
)

# --- 2010 Census code -> 2010 SOC code crosswalk ---
df_crosswalk = pd.read_csv(path_data + file_occ_soc_map)
# Remove leading/trailing whitespace around the SOC codes before using them.
df_crosswalk["2010 SOC Code"] = df_crosswalk["2010 SOC Code"].str.strip()
# 2010 Census occupation code -> 2010 SOC code.
dict_occ_soc = dict(
    zip(df_crosswalk["2010 Census Code"], df_crosswalk["2010 SOC Code"])
)

In [109]:
# Evaluate the Census -> SOC mapping; the trailing semicolon keeps the notebook from displaying it.
dict_occ_soc;

In [111]:
# Sets of occupation codes used by the cells below. The cell that defined them
# is not present in this notebook.
# NOTE(review): reconstructed from how the sets are used later (ASEC codes are
# looked up in dict_aseccode_name, crosswalk codes in dict_occ_soc) — confirm
# against the original definition.
codes_asec = set(df_asec["Code"])
codes_crosswalk = set(df_crosswalk["2010 Census Code"])

# Spot check: is code 560 one of the ASEC codes?
560 in codes_asec

True

In [112]:
# ASEC codes that do not appear among the crosswalk codes.
codes_asec.difference(codes_crosswalk)

{130,
 560,
 620,
 720,
 730,
 1000,
 1100,
 1960,
 2020,
 2140,
 2150,
 3130,
 3240,
 3410,
 3530,
 3650,
 3950,
 8230,
 9100,
 9830}

In [113]:
# Build candidate mappings for the ASEC codes that are missing from the
# crosswalk. A crosswalk-only code is a candidate for an ASEC-only code when
# both have the same number of digits and agree on everything but the last
# digit (e.g. 130 -> 135, 136, 137).
asec_only = codes_asec.difference(codes_crosswalk)
crosswalk_only = codes_crosswalk.difference(codes_asec)

one_to_many_asec_cros = {}        # ASEC code -> candidate crosswalk codes
one_to_many_asec_naics_many = {}  # ASEC code -> SOC codes of those candidates
for c_asec in asec_only:
    asec_digits = str(c_asec)
    candidates = [
        c_cros
        for c_cros in crosswalk_only
        if len(str(c_cros)) == len(asec_digits)
        and str(c_cros)[:-1] == asec_digits[:-1]
    ]
    one_to_many_asec_cros[c_asec] = candidates
    one_to_many_asec_naics_many[c_asec] = [dict_occ_soc[c_cros] for c_cros in candidates]

In [114]:
# Inspect the candidate crosswalk codes found for each unmatched ASEC code.
one_to_many_asec_cros

{130: [135, 136, 137],
 9100: [],
 8230: [],
 1960: [1965],
 3240: [3245],
 560: [565],
 3130: [],
 3650: [3655],
 3530: [3535],
 1100: [1105, 1106, 1107],
 720: [725, 726],
 3410: [],
 730: [735],
 2140: [2145],
 2020: [2025],
 2150: [],
 9830: [],
 1000: [1005, 1006, 1007],
 620: [],
 3950: [3955]}

In [115]:
# ASEC codes with exactly one candidate SOC code can be mapped automatically:
# keep that single candidate as the code's mapping.
additional_maps = {
    code: mapping[0]
    for code, mapping in one_to_many_asec_naics_many.items()
    if len(mapping) == 1
}

In [117]:
# Print the ASEC codes that have several candidate SOC codes; these are
# resolved manually.
for code, mapping in one_to_many_asec_naics_many.items():
    if len(mapping) <= 1:
        continue
    print(code, mapping)

130 ['11-3111', '11-3121', '11-3131']
1100 ['15-1142', '15-1143', '15-1199']
720 ['13-1121', '13-1131']
1000 ['15-1111', '15-1121', '15-1122']


In [118]:
# Hand-picked SOC code for every ASEC code that has more than one candidate.
manual_maps = {
    130: "11-31XX",   # candidates: 11-3111, 11-3121, 11-3131
    720: "13-1121",   # candidates: 13-1121, 13-1131
    1000: "15-112X",  # candidates: 15-1111, 15-1121, 15-1122
    1100: "15-114X",  # candidates: 15-1142, 15-1143, 15-1199
}

In [119]:
# ASEC codes for which no candidate crosswalk code was found.
no_match = [
    code for code, candidates in one_to_many_asec_cros.items() if candidates == []
]
# Report each unmatched code together with its occupation label.
for code in no_match:
    print(code, dict_aseccode_name[code])

9100 Bus and Ambulance Drivers and Attendants
8230 Bookbinders, Printing Machine Operators, and Job Printers
3130 Registered Nurses
3410 Health Diagnosing and Treating Practitioner Support Technicians
2150 Legal Support Workers, nec
9830 Military, Rank Not Specified
620 Human Resources, Training, and Labor Relations Specialists


In [120]:
# Combine the automatic single-candidate mappings with the manual ones into a
# new dict; manual entries win if a code appears in both.
additional_maps = dict(additional_maps)
additional_maps.update(manual_maps)

In [122]:
# Extend the crosswalk mapping with the additional ASEC -> SOC entries in a
# new dict; the additional entries win if a code appears in both.
dict_occ_soc = dict(dict_occ_soc)
dict_occ_soc.update(additional_maps)

In [123]:
# Attach a SOC code to every ASEC occupation; codes absent from dict_occ_soc are left as NaN.
df_asec["soc_code"] = df_asec["Code"].map(dict_occ_soc)

In [124]:
# Show the ASEC occupations that still have no SOC code.
df_asec[df_asec["soc_code"].isna()]

Unnamed: 0,Code,BroadClassification,Label,soc_code
28,620,BUSINESS OPERATIONS SPECIALISTS,"Human Resources, Training, and Labor Relations...",
97,2150,LEGAL,"Legal Support Workers, nec",
132,3130,HEALTHCARE PRACTITIONERS AND TECHNICAL,Registered Nurses,
147,3410,HEALTHCARE PRACTITIONERS AND TECHNICAL,Health Diagnosing and Treating Practitioner Su...,
373,8230,PRODUCTION,"Bookbinders, Printing Machine Operators, and J...",
417,9100,TRANSPORTATION AND MATERIAL MOVING,Bus and Ambulance Drivers and Attendants,
441,9830,TRANSPORTATION AND MATERIAL MOVING,"Military, Rank Not Specified",


In [126]:
# Total number of rows (ASEC occupations) in df_asec.
len(df_asec)

442

In [125]:
# Number of ASEC occupations left without a SOC code.
df_asec["soc_code"].isna().sum()

7

In [127]:
# Save the ASEC occupation table, now including the "soc_code" column.
# NOTE(review): the row index is written too (to_csv default), so reading this
# file back yields an extra unnamed column — confirm this is intended.
df_asec.to_csv(path_data + "occ_names_class_asec_soc_map.csv")

## The following code might be useful if we decide to remove nodes with no match

In [77]:
# ASEC codes for which no candidate crosswalk code was found.
no_match

[9100, 8230, 3130, 3410, 2150, 9830, 620]