In [1]:
import spacy 
import json
import re
import pandas as pd

In [51]:
def load_txt(file):
    with open(file,"r",encoding="utf-8") as f:
        data = []
        for line in f:
            line = line.replace("\n","")
            data.append(line)
    return data

def load_data(file):
    with open(file,"r",encoding="utf-8") as f:
        data = json.load(f)
    return data

def save_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

In [52]:
job_dict_malaysia = load_data("data/job_dict_malaysia.json")
job_dict_singapore = load_data("data/test_job_dict_linkedin.json")
job_dict = {}
job_dict.update(job_dict_malaysia)
job_dict.update(job_dict_singapore)

In [53]:
len(job_dict)

3429

In [54]:
def get_separated_roles(job_dict):

    roles_dict = {}
    data_science = []
    data_engineer = []
    machine_learning = []
    analyst = []
    art_int = []
    # left_over= []
    # statistics= []
    # job_dict
    i = 0
    for listing,key in zip(job_dict,list(job_dict.keys())):
        i+=1
        inf = job_dict.get(key)
        role = inf.get('role')
        ds = bool(re.search(".*[Dd]ata\s.*[Ssc]ien.*", role))
        de = bool(re.search(".*[Dd]ata\s.*[Ee]ngine.*", role))
        ml_long = bool(re.search(".*[Mm]achine\s.*[Ll]earn.*",role))
        ml = bool(re.search(".*[Mm][Ll]\s.*", role))
        anal = bool(re.search(".*[Dd]ata\s.*[Aa]nal.*", role))
        ai_long = bool(re.search(".*[Aa]rtificial\s.*[Ii]ntelligence.*", role))
        ai = bool(re.search(".*\s[Aa][Ii]\s.*", role))
        # stat = bool(re.search(".*[Sst]at.*\s.*[Pp]rogram.*", role))
        if(ds):
            data_science.append(listing)
        elif(de):
            data_engineer.append(listing)
        elif(ml_long or ml):
            machine_learning.append(listing)
        elif(anal):
            analyst.append(listing)
        elif(ai or ai_long):
            art_int.append(listing)
        # elif(stat):
        #     statistics.append(listing)
        # else:
        #     left_over.append(listing)
    roles_dict = {"Data Analyst":analyst,"Machine Learning":machine_learning+art_int,"Data Scientist":data_science,"Data Engineer":data_engineer}
    return roles_dict

def get_descriptions(roles__dict,job_dict):
    
    
    roles_description_dict = {}
    for key in roles_dict.keys():
        descriptions = []
        for link in roles_dict[key]:
            
            job = job_dict.get(link)
            des = str(job.get('description')).replace('\n'," ")
            if(des!='None'):
                descriptions.append(des)
        roles_description_dict[key] = descriptions
    # print(roles_description_dict)
    return roles_description_dict

In [55]:
roles_dict = {}
roles_description_dict = {}
roles_dict = get_separated_roles(job_dict)
roles_description_dict = get_descriptions(roles_dict,job_dict)

In [56]:
nlp_model = spacy.load("Modelling/final_models_v2/model-best/")


programming_languages_dict = {}
tool_dict = {}
education_dict = {}
library_dict = {}

for key in roles_description_dict.keys():
    print(key)
    programming_languages = []
    tool = []
    education = []
    library = []
    for description in roles_description_dict[key]:
        # print(description)
        doc = nlp_model(description)
        pl = []
        tl = []
        lib = []
        edu = []       
        for ent in doc.ents:
            if(ent.label_=="PROGLANG"):
                pl.append(ent.text)
            elif(ent.label_=="TOOL"):
                tl.append(ent.text)
            elif(ent.label_=="LIBRARY"):
                lib.append(ent.text)
            else:
                edu.append(ent.text)
        programming_languages.append(pl)
        tool.append(tl)
        library.append(lib)
        education.append(edu)
        
        # break
    programming_languages_dict[key] = programming_languages
    tool_dict[key] = tool
    education_dict[key] = education
    library_dict[key] = library
    # break
        
        


Data Analyst
Machine Learning
Data Scientist
Data Engineer


In [57]:
new_list = []
def filter_function(element,category):
    data = []
    # print(element)
    # category=="PL"
    if(category=="PL"):
        data = list(map(lambda x:x.lower(),load_txt("data/programming_languages.txt")))
    elif(category=="TL"):
        data = list(map(lambda x:x.lower(),load_txt("data/dt_tools.txt")))
    elif(category=="LIB"):
        data = list(map(lambda x:x.lower(),load_txt("data/libraries&packages.txt")))
    else:
        data = list(map(lambda x:x.lower(),load_txt("data/education.txt")))
        
    if element in data:
        # print("Enter")
        return True
        # return False
    else: 
        return False
        # return True
    
# Check if there is any leftover elements which wasn't recorded in data txt
# distinct_list = list(set(map(lambda x:x.lower(), education_dict['Machine Learning'])))
# not_in_list = list(filter(lambda seq: filter_function(seq, "EDU"),distinct_list))
# print(not_in_list)

In [99]:
## Clean Dictionary
def clean_dict(words_dict,category):
    
    for key in words_dict.keys():
        cleaned_lists = []
        for array in words_dict[key]:
            distinct_word = list(set(map(lambda x : x.lower(), array)))
            filtered_distinct_word = list(filter(lambda seq: filter_function(seq, category),distinct_word))
            cleaned_lists.append(filtered_distinct_word)
        words_dict[key] = cleaned_lists
        
    return words_dict

cleaned_programming_language_dict = clean_dict(programming_languages_dict,"PL")
cleaned_tool_dict = clean_dict(tool_dict,"TL")
cleaned_education_dict = clean_dict(education_dict,"EDU")
cleaned_library_dict = clean_dict(library_dict,"LIB")

In [194]:
## Get item for column Words
def get_word_column(items_dict):
    word_list = []
    for key in items_dict.keys():
        for array in items_dict[key]:
            word_list.extend(array)
    distinct_words = list(set(map(lambda x : x.lower(), word_list)))
    # filtered_distinct_word = list(filter(lambda seq: filter_function(seq, category),distinct_word))
    return distinct_words

def init_column(df):
    roles = ['DS_count','DA_count','DE_count','ML_count']
    for role in roles:
        df[role] = 0
    return df

pl_distinct_words_list = get_word_column(cleaned_programming_language_dict)
tl_distinct_words_list = get_word_column(cleaned_tool_dict)
lib_distinct_words_list = get_word_column(cleaned_library_dict)
edu_distinct_words_list = get_word_column(cleaned_education_dict)

pl_df = pd.DataFrame(pl_distinct_words_list,columns = ["Words"])
pl_df = init_column(pl_df)
tl_df = pd.DataFrame(tl_distinct_words_list,columns = ["Words"])
tl_df = init_column(tl_df)
lib_df = pd.DataFrame(lib_distinct_words_list,columns = ["Words"])
lib_df = init_column(lib_df)
edu_df = pd.DataFrame(edu_distinct_words_list,columns = ["Words"])
edu_df = init_column(edu_df)

In [195]:
pl_df

Unnamed: 0,Words,DS_count,DA_count,DE_count,ML_count
0,powershell,0,0,0,0
1,sql,0,0,0,0
2,sas,0,0,0,0
3,asp,0,0,0,0
4,python,0,0,0,0
5,bash,0,0,0,0
6,dax,0,0,0,0
7,powerhouse,0,0,0,0
8,c++,0,0,0,0
9,perl,0,0,0,0


In [196]:
def update_word_count(word_dict,df):
    for key in word_dict.keys():
        if(key=="Data Scientist"):
            column = "DS_count"
        elif(key=="Data Analyst"):
            column = "DA_count"
        elif(key=="Data Engineer"):
            column = "DE_count"
        else:
            column = "ML_count"
        count_list = [0]*len(df)
        for arr in word_dict[key]:
            for word in arr:
                index = df[df["Words"]==word]['Words'].index[0]
                # print(word, index)
                count_list[index]+=1
        df[column] = count_list
            

In [197]:
update_word_count(cleaned_programming_language_dict,pl_df)
update_word_count(cleaned_tool_dict,tl_df)
update_word_count(cleaned_education_dict,edu_df)
update_word_count(cleaned_library_dict,lib_df)
print(edu_df)

       Words  DS_count  DA_count  DE_count  ML_count
0  doctorate        15         2         7         1
1   bachelor        59        86        62         9
2     master        35        39        38         7
3        phd         3         0         1         0
4        bsc         0         0         1         0


In [198]:
edu_df.iloc[3] = edu_df.iloc[0]+edu_df.iloc[3]
edu_df.loc[3,"Words"] = "phd"
edu_df.iloc[1] = edu_df.iloc[1]+edu_df.iloc[4]
edu_df.loc[1,"Words"] = "bachelor"
edu_df.drop(index=[0,4],inplace=True)

In [199]:
pl_df.to_csv("data/word_count_data/programming_languages.csv")
tl_df.to_csv("data/word_count_data/tools.csv")
lib_df.to_csv("data/word_count_data/library.csv")
edu_df.to_csv("data/word_count_data/education.csv")

In [7]:
pl_data = pd.read_csv("data/word_count_data/tools.csv")
pl_data

Unnamed: 0.1,Unnamed: 0,Words,DS_count,DA_count,DE_count,ML_count
0,0,azure,6,12,25,2
1,1,sas,7,10,2,0
2,2,powerbi,6,8,8,0
3,3,excel,22,44,9,2
4,4,postgressql,1,0,1,0
5,5,mysql,6,6,6,1
6,6,spark,20,6,33,5
7,7,qlik,2,1,2,0
8,8,postgresql,0,2,6,0
9,9,aws,13,14,25,3


In [8]:
import plotly.express as px
pl_data_asc = pl_data.sort_values(by=['DS_count'],ascending=False)
fig = px.bar(pl_data_asc, x='Words', y='DS_count',title = "Data Scientist Programming Languages")
fig.show()