# Preprocess Collected Data

In [2]:
import pickle
import re
import nltk
from nltk import WordNetLemmatizer 
import numpy as np

import spacy 
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json

In [3]:
# load saved data 
def load(filename):
    """Unpickle and return the object stored at ``./data/<filename>.p``.

    Args:
        filename: Base name of the pickle file, without directory or the
            ``.p`` extension.

    Returns:
        Whatever object the pickle file contains.
    """
    pickle_path = f'./data/{filename}.p'
    # NOTE: pickle can execute arbitrary code while loading; only use this on
    # files that were produced by a trusted source.
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)

def save(final_file, filename="job_dict"):
    """Pickle ``final_file`` to ``./data/<filename>.p``.

    Args:
        final_file: The object to serialize.
        filename: Base name of the target file, without directory or the
            ``.p`` extension. Defaults to ``"job_dict"``, so calling
            ``save(obj)`` writes ``./data/job_dict.p``.
    """
    with open(f'./data/{filename}.p', 'wb') as fp:
        pickle.dump(final_file, file=fp, protocol=pickle.HIGHEST_PROTOCOL)

def load_data(file):
    """Parse the UTF-8 encoded JSON document at ``file`` and return it.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded JSON value.
    """
    with open(file, "r", encoding="utf-8") as handle:
        return json.load(handle)

def load_txt(file):
    """Read a UTF-8 text file and return its lines without newline characters.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order. Blank
        lines are kept as empty strings.
    """
    with open(file, "r", encoding="utf-8") as handle:
        return [raw_line.replace("\n", "") for raw_line in handle]

def save_data(file, data):
    """Serialize ``data`` as JSON (4-space indent) into ``file``.

    Args:
        file: Destination path; the file is opened for writing as UTF-8.
        data: The value to serialize with ``json.dump``.
    """
    with open(file, "w", encoding="utf-8") as out:
        json.dump(data, out, indent=4)

In [4]:
def get_all_roles_links(job_dict):
    """Return the keys of ``job_dict`` whose role title matches a known category.

    Each listing's ``'role'`` string is tested against the category patterns
    below in a fixed priority order (data science, data engineering, machine
    learning, data analysis, artificial intelligence); the first match decides
    the category. Listings that match nothing, or that have no string
    ``'role'``, are left out.

    Args:
        job_dict: Mapping of listing key -> listing record, where each record
            is a dict that may contain a ``'role'`` string.

    Returns:
        A list of listing keys, grouped by category in this order: analyst,
        machine learning, data science, artificial intelligence, data
        engineering. Within a category, keys keep ``job_dict`` order.
    """
    # Raw strings keep "\s" a regex escape instead of an invalid string
    # escape; compiling once hoists the pattern lookup out of the loop.
    # Order matters: it is the classification priority.
    # NOTE(review): the "ML" pattern needs trailing whitespace and the "AI"
    # pattern needs whitespace on both sides, so a title that starts with
    # "AI" or ends with "ML" is not matched — confirm that is intended.
    category_patterns = (
        ("data_science", (re.compile(r".*[Dd]ata\s.*[Ssc]ien.*"),)),
        ("data_engineer", (re.compile(r".*[Dd]ata\s.*[Ee]ngine.*"),)),
        ("machine_learning", (re.compile(r".*[Mm]achine\s.*[Ll]earn.*"),
                              re.compile(r".*[Mm][Ll]\s.*"))),
        ("analyst", (re.compile(r".*[Dd]ata\s.*[Aa]nal.*"),)),
        ("art_int", (re.compile(r".*\s[Aa][Ii]\s.*"),
                     re.compile(r".*[Aa]rtificial\s.*[Ii]ntelligence.*"))),
    )
    matched = {category: [] for category, _ in category_patterns}

    for listing, info in job_dict.items():
        role = info.get('role') if isinstance(info, dict) else None
        if not isinstance(role, str):
            # No usable title to classify; skip instead of crashing in re.
            continue
        for category, regexes in category_patterns:
            if any(regex.search(role) for regex in regexes):
                matched[category].append(listing)
                break

    return (matched["analyst"] + matched["machine_learning"]
            + matched["data_science"] + matched["art_int"]
            + matched["data_engineer"])

In [5]:


def get_roles(job_links, job_dict=None):
    """Return the ``'role'`` value of every listing named in ``job_links``.

    Args:
        job_links: Iterable of keys identifying listings in ``job_dict``.
        job_dict: Mapping of listing key -> listing record (a dict). When
            omitted, the module-level ``job_dict`` variable is used, so
            existing ``get_roles(job_links)`` calls keep working.

    Returns:
        A list with ``record.get('role')`` for each key, in the order of
        ``job_links``.

    Raises:
        NameError: If ``job_dict`` is not passed and no module-level
            ``job_dict`` exists.
    """
    if job_dict is None:
        # Backward-compatible fallback to the notebook-level global.
        try:
            job_dict = globals()["job_dict"]
        except KeyError:
            raise NameError("name 'job_dict' is not defined") from None

    return [job_dict.get(link).get('role') for link in job_links]

def get_descriptions(job_links, job_dict=None):
    """Return the description of every listing in ``job_links`` as one line.

    Each ``'description'`` value is converted with ``str`` and its newline
    characters are replaced by spaces. A listing without a description
    therefore yields the string ``'None'``.

    Args:
        job_links: Iterable of keys identifying listings in ``job_dict``.
        job_dict: Mapping of listing key -> listing record (a dict). When
            omitted, the module-level ``job_dict`` variable is used, so
            existing ``get_descriptions(job_links)`` calls keep working.

    Returns:
        A list of single-line description strings, in the order of
        ``job_links``.

    Raises:
        NameError: If ``job_dict`` is not passed and no module-level
            ``job_dict`` exists.
    """
    if job_dict is None:
        # Backward-compatible fallback to the notebook-level global.
        try:
            job_dict = globals()["job_dict"]
        except KeyError:
            raise NameError("name 'job_dict' is not defined") from None

    return [
        str(job_dict.get(link).get('description')).replace('\n', " ")
        for link in job_links
    ]

def clean_descriptions(descriptions):
    """Return ``descriptions`` without the entries equal to the string ``'None'``.

    Args:
        descriptions: Iterable of description values.

    Returns:
        A new list holding every entry that is not ``'None'``, in the
        original order.
    """
    return [text for text in descriptions if text != 'None']

In [20]:
# Load both scraped sources and merge them into a single mapping.
dict_1 = load_data('data/job_street.json')
dict_2 = load_data('data/linkedin.json')
# Start from the first source, then overlay the second: on duplicate keys
# the entry from dict_2 replaces the one from dict_1.
job_dict_malaysia = dict(dict_1)
job_dict_malaysia.update(dict_2)

# load test data
# job_dict= load_data("data/test_job_dict_linkedin.json")
# job_links = get_all_roles_links(job_dict)
# roles = get_roles(job_links)
# role_descriptions = get_descriptions(job_links)
# role_descriptions = clean_descriptions(role_descriptions)
# save_data("data/test_description.txt",role_descriptions)

In [21]:
# Write job_dict_malaysia to disk as JSON.
save_data("data/job_dict_malaysia.json",job_dict_malaysia)

In [8]:
        
def generate_better_data(file):
    """Load terms from ``file`` and append lowercase variants of the longer ones.

    Args:
        file: Path of a text file with one term per line (read with
            ``load_txt``).

    Returns:
        A list with every term from the file, followed by the lowercased
        form of each term that is longer than two characters.
    """
    data = load_txt(file)
    # The length check is per term: very short terms are not given a
    # lowercase variant.
    lowered = [item.lower() for item in data if len(item) > 2]
    return data + lowered

def create_patterns(file,type,patterns):
    """Append one labelled pattern per term in ``file`` to ``patterns``.

    Args:
        file: Path passed to ``generate_better_data`` to obtain the terms.
        type: Label stored under the ``"label"`` key of every new pattern.
        patterns: List that receives the new pattern dicts; it is modified
            in place.

    Returns:
        The same ``patterns`` list that was passed in.
    """
    for term in generate_better_data(file):
        patterns.append({"label": type, "pattern": term})
    return patterns

def generate_rules(patterns, output_dir="ner_ruler"):
    """Build a blank English pipeline with an entity ruler and save it to disk.

    Args:
        patterns: Pattern dicts handed to the ruler's ``add_patterns``.
        output_dir: Directory the pipeline is written to with
            ``nlp.to_disk``. Defaults to ``"ner_ruler"``.
    """
    nlp = English()
    # add_pipe both creates the ruler and attaches it to the pipeline, so no
    # separate EntityRuler(...) construction is needed.
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    nlp.to_disk(output_dir)
    
def test_model(model,text):
    """Run ``model`` on ``text`` and return the text of each entity it finds.

    Args:
        model: Callable that takes a string and returns an object exposing
            an ``ents`` iterable whose items have a ``text`` attribute.
        text: The string to analyse.

    Returns:
        A list with the ``text`` of every item in ``doc.ents``, in order.
    """
    # Use the pipeline that was passed in rather than a module-level one.
    doc = model(text)
    return [ent.text for ent in doc.ents]
    

In [None]:
# Write role_descriptions to disk; save_data serializes as JSON even though
# the target has a .txt extension.
# NOTE(review): no uncommented code visible in this notebook assigns
# role_descriptions — confirm where it is defined before running this cell.
save_data('data/description.txt',role_descriptions)

# Generate Entity Rules 

In [10]:
# create_patterns appends to the list it receives and returns that same list,
# so each call below adds one more category of patterns to `patterns`.
patterns = []
patterns1 = create_patterns("./data/programming_languages.txt","PROGLANG",patterns)
patterns2 = create_patterns("./data/dt_tools.txt","TOOL",patterns1)
patterns3 = create_patterns("./data/libraries&packages.txt","LIBRARY",patterns2)
patterns4 = create_patterns("./data/education.txt","EDUCATION",patterns3)
# Write the ruler pipeline to the "ner_ruler" directory, then load it back.
generate_rules(patterns4)
nlp = spacy.load("ner_ruler")

In [11]:
# load_data parses the file as JSON; take the entry at index 2 as sample text.
text = load_data("./data/test_description.txt")[2]
# Collect the entity strings the loaded pipeline finds in the sample.
r = test_model(nlp,text)