In [215]:
import sys
sys.path.append("./modules")

In [216]:
import d3plusScotty as d3plus
import pandas as pd
import importlib
from tkinter import *
import numpy as np
import networkx as nx
import requests
import itertools

In [23]:
importlib.reload(d3plus)

<module 'd3plusODA' from './modules\\d3plusODA.py'>

In [24]:
def hexcolor(value, range, reverse=False):
    min = 0
    max = abs(range[1]-range[0])
    value -= range[0]

    if value <= int((min+max)/2):
        r = 255
        g = int(255*value/int((min+max)/2))
        b = 0
    else:
        r = int(255*(max-value)/int((min+max)/2))
        g = 255
        b = 0

    if reverse:
        r, g = g, r

    return "#%s%s%s" % tuple([hex(c)[2:].rjust(2, "0") for c in (r, g, b)])

#### Using Natural language Processing for Similarity Scores

In [26]:
import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()

In [27]:
STOP_WORDS = spacy.lang.en.stop_words.STOP_WORDS

In [28]:
df_skills_class = pd.read_excel("./datasets/Australian Skills Classification 12-03-2021.xlsx", 
                                sheet_name='Occupation Descriptions')

In [29]:
df_skills_class.head()

Unnamed: 0,ANZSCO_Code,ANZSCO_Title,ANZSCO_Desc
0,1111,Chief Executives and Managing Directors,Chief Executives and Managing Directors determ...
1,1112,General Managers,"General Managers plan, organise, direct, contr..."
2,1213,Livestock Farmers,"Livestock Farmers plan, organise, control, coo..."
3,1214,Mixed Crop and Livestock Farmers,"Mixed Crop and Livestock Farmers plan, organis..."
4,1321,Corporate Services Managers,Corporate Services Managers organise and direc...


In [31]:
list_combinations = np.array(list(itertools.combinations(df_skills_class.ANZSCO_Code, 2)))

In [32]:
len(list_combinations)

179700

In [33]:
list_combinations

array([[  1111,   1112],
       [  1111,   1213],
       [  1111,   1214],
       ...,
       [254421, 254423],
       [254421, 254425],
       [254423, 254425]])

In [34]:
# Natural language processing using word embedings - Neural Networks
dict_desc_nlp = {k:nlp(d) for k,d in list(zip(list(set(df_skills_class.ANZSCO_Code)), list(set(df_skills_class.ANZSCO_Desc))))}

In [38]:
list_of_similarities = []
for combination in list_combinations:
    list_of_similarities.append(dict_desc_nlp[combination[0]].similarity(dict_desc_nlp[combination[1]]))

In [46]:
list_combinations_sim = np.append(list_combinations, np.reshape(list_of_similarities, (-1, 1)), axis=1)

In [47]:
list_combinations_sim[:3]

array([[1.11100000e+03, 1.11200000e+03, 7.21183404e-01],
       [1.11100000e+03, 1.21300000e+03, 7.19935780e-01],
       [1.11100000e+03, 1.21400000e+03, 7.05420558e-01]])

In [54]:
len(list_combinations_sim)

179700

In [63]:
# Getting only those jobs that have more than 80% of similarities 
list_combinations_sim_filt = list_combinations_sim[(list_combinations_sim[:,2] > 0.8)]

In [65]:
list_combinations_sim_filt

array([[1.11100000e+03, 1.32500000e+03, 8.12333432e-01],
       [1.11100000e+03, 2.24300000e+03, 8.15162869e-01],
       [1.11100000e+03, 2.49200000e+03, 8.29120704e-01],
       ...,
       [2.54417000e+05, 2.54418000e+05, 8.36401604e-01],
       [2.54418000e+05, 2.54423000e+05, 8.02816170e-01],
       [2.54421000e+05, 2.54423000e+05, 8.26112608e-01]])

#### Using ABS API

In [69]:
# Using Census 2016, G58 Occupation by Hours worked by sex -> id="ABS_C16_G58_SA" agencyID="ABS"
# curl -X GET "https://api.data.abs.gov.au/data/ABS_C16_G58_SA/all?detail=full" -H "accept: application/vnd.sdmx.data+json"
requests.get("https://api.data.abs.gov.au/data/ABS%2CABS_C16_G58_SA%2C1.0.0/all?startPeriod=2016&endPeriod=2020&detail=dataonly&dimensionAtObservation=TIME_PERIOD") 

<Response [504]>

#### Visualization

In [193]:
df_viz_elements = pd.DataFrame(list_combinations_sim_filt[:,:2], columns=['NodeFrom', 'NodeTo'], dtype=int)

In [194]:
df_viz_elements['NodeFromValue'] = np.random.randint(10, 100, len(list_combinations_sim_filt))
df_viz_elements['NodeToValue'] = np.random.randint(10, 100, len(list_combinations_sim_filt))

In [195]:
df_viz_elements['EdgeValue'] = list_combinations_sim_filt[:,2]

In [196]:
df_viz_elements = df_viz_elements[['NodeFrom','NodeFromValue','NodeTo','NodeToValue','EdgeValue']]

In [197]:
df_viz_elements.head()

Unnamed: 0,NodeFrom,NodeFromValue,NodeTo,NodeToValue,EdgeValue
0,1111,25,1325,54,0.812333
1,1111,33,2243,22,0.815163
2,1111,44,2492,54,0.829121
3,1111,59,3221,80,0.822058
4,1111,45,3411,86,0.825382


In [198]:
df_viz_elements = df_viz_elements.sort_values(by='EdgeValue', ascending=False)

In [200]:
df_viz_elements = df_viz_elements.groupby(['NodeFrom'], as_index=False).head(5)

In [201]:
df_viz_elements.shape

(2854, 5)

In [202]:
drop_indices = np.random.choice(df_viz_elements.index, 1854, replace=False)
df_viz_elements = df_viz_elements.drop(drop_indices)

In [203]:
df_nodefrom = df_viz_elements[['NodeFrom', 'NodeFromValue']]
df_nodefrom.columns = ['node', 'value']
df_nodeto = df_viz_elements[['NodeTo', 'NodeToValue']]
df_nodeto.columns = ['node', 'value']

In [204]:
df_viz_nodes = pd.concat([df_nodefrom, df_nodeto], axis=0)
df_viz_nodes = df_viz_nodes.groupby(['node'], as_index=False).first()

In [205]:
print(list(df_viz_nodes.node))

[1111, 1112, 1213, 1321, 1323, 1324, 1325, 1331, 1332, 1333, 1336, 1341, 1342, 1343, 1344, 1351, 1413, 1414, 1421, 1491, 1493, 2113, 2114, 2123, 2124, 2221, 2222, 2232, 2243, 2245, 2246, 2253, 2254, 2322, 2325, 2333, 2334, 2341, 2344, 2346, 2347, 2411, 2412, 2413, 2414, 2422, 2491, 2492, 2511, 2521, 2522, 2523, 2525, 2526, 2532, 2533, 2534, 2535, 2541, 2543, 2544, 2611, 2633, 2711, 2712, 2721, 2722, 2724, 2725, 3111, 3113, 3124, 3132, 3211, 3221, 3222, 3241, 3242, 3321, 3322, 3331, 3333, 3334, 3411, 3421, 3422, 3424, 3511, 3512, 3513, 3514, 3611, 3613, 3621, 3622, 3623, 3624, 3911, 3922, 3923, 3933, 3941, 3991, 3994, 3996, 4111, 4113, 4115, 4116, 4221, 4231, 4232, 4234, 4312, 4313, 4314, 4315, 4421, 4511, 4513, 4514, 4517, 4521, 4522, 4523, 4524, 5121, 5211, 5311, 5412, 5512, 5513, 5521, 5522, 5611, 5613, 5614, 5615, 5616, 5912, 5991, 5992, 5993, 5994, 5995, 5996, 5997, 6111, 6112, 6113, 6211, 6214, 6215, 6217, 6311, 6393, 6394, 6395, 7111, 7112, 7113, 7114, 7115, 7116, 7117, 7121, 712

In [206]:
print(list(df_viz_nodes.value))

[41, 87, 42, 39, 23, 45, 57, 49, 36, 47, 67, 87, 20, 98, 96, 65, 74, 64, 48, 63, 33, 95, 92, 58, 43, 67, 83, 19, 46, 95, 92, 88, 31, 69, 55, 25, 64, 23, 11, 33, 62, 82, 84, 46, 41, 99, 87, 13, 57, 84, 30, 73, 30, 14, 23, 34, 17, 23, 30, 65, 59, 18, 31, 88, 73, 86, 95, 81, 76, 99, 86, 88, 38, 72, 97, 27, 81, 36, 10, 39, 11, 99, 52, 12, 16, 41, 91, 60, 67, 35, 13, 56, 43, 32, 70, 81, 84, 16, 81, 60, 89, 99, 73, 91, 59, 14, 91, 33, 65, 81, 65, 76, 22, 11, 11, 25, 56, 74, 56, 28, 48, 98, 29, 56, 17, 28, 18, 65, 22, 35, 70, 32, 57, 11, 53, 57, 97, 11, 25, 22, 60, 77, 70, 12, 52, 60, 84, 18, 91, 41, 88, 91, 38, 51, 50, 10, 67, 64, 95, 90, 36, 88, 98, 70, 14, 35, 42, 54, 13, 81, 74, 63, 50, 42, 72, 10, 21, 40, 27, 75, 93, 73, 84, 36, 54, 95, 29, 60, 51, 63, 50, 51, 86, 34, 11, 17, 72, 65, 84, 41, 69, 30, 74, 50, 66, 58, 66, 50, 37, 24, 67, 89, 94, 85, 86, 33, 33, 99, 90, 27, 24, 65, 28, 33, 41, 98, 42, 43, 99, 89, 71, 16, 11, 14, 19, 73, 70, 95, 91, 75, 50, 19, 54, 47, 86, 70, 74, 28, 43, 45,

In [207]:
print(len(list(df_viz_nodes.node)))

562


In [208]:
min_value = df_viz_nodes.value.max()
max_value = df_viz_nodes.value.min()
df_viz_nodes['color'] = df_viz_nodes['value'].apply(lambda x:hexcolor(x, [min_value, max_value]))

In [209]:
df_viz_nodes.head()

Unnamed: 0,node,value,color
0,1111,41,#ffx15000
1,1112,87,#ffx4500
2,1213,42,#ffx14a00
3,1321,39,#ffx15b00
4,1323,23,#ffx1b800


In [210]:
json_viz_elements = d3plus.build_network(df_viz_elements, 'NodeFrom', 'NodeTo', 'EdgeValue')

In [211]:
df_viz_nodes['node_name'] = df_viz_nodes.node.apply(lambda x:str(df_skills_class[df_skills_class.ANZSCO_Code==x].ANZSCO_Title.values[0]))

In [212]:
df_viz_nodes.head()

Unnamed: 0,node,value,color,node_name
0,1111,41,#ffx15000,Chief Executives and Managing Directors
1,1112,87,#ffx4500,General Managers
2,1213,42,#ffx14a00,Livestock Farmers
3,1321,39,#ffx15b00,Corporate Services Managers
4,1323,23,#ffx1b800,Human Resource Managers


In [213]:
ns_viz = d3plus.NetworkSpace(id="node", presence="M", color="value", name=["node_name"], size='value', 
                             graph_data=json_viz_elements, tooltip=["value"])

In [214]:
ns_viz.draw(df_viz_nodes)
open("./elements_visualization_2.html" , "w+").write(ns_viz.dump_html(df_viz_nodes))

<IPython.core.display.Javascript object>

135809