Import Packages

In [None]:
import pyodbc
import re
import json
import spacy

import numpy as np
import pandas as pd
import tensorflow as tf

from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity

from collections import defaultdict
from gensim import corpora, models, similarities
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from gensim.models.word2vec import Word2Vec

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import gensim.downloader as api
from gensim.models import doc2vec

from googletrans import Translator
from deep_translator import GoogleTranslator

from FlaskApp.transform import *

from job_model import JobModel
from app_model import AppModel

import logging
# Configure the root logger: each record shows timestamp, level name and
# message, and records at INFO level and above are emitted.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import multiprocessing

import nltk

In [None]:
# Indonesian stop words from two sources: Sastrawi's built-in list and the
# NLTK stop-word corpus.
sastrawi_stopwords = StopWordRemoverFactory().get_stop_words()
nltk_stopwords = nltk.corpus.stopwords.words('indonesian')

# NLTK entries come first, followed by Sastrawi's; words present in both
# lists appear twice.
combined_stopwords = [*nltk_stopwords, *sastrawi_stopwords]

Load Constants

In [None]:
# translator = Translator(service_urls=['translate.googleapis.com'])
# Sastrawi stemmer instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def _load_first_column(path):
    """Return the first column of the delimited text file at *path* as a list.

    NOTE(review): with its default arguments ``pd.read_csv`` consumes the
    first line of the file as a header, so that line is not part of the
    returned values — confirm the data files start with a header row.
    """
    return [row[0] for row in pd.read_csv(path).values]


def _load_json(path):
    """Parse and return the JSON document stored at *path*.

    The file is decoded as UTF-8 explicitly so that loading does not depend
    on the platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


# Word lists stored one entry per line.
rootwords = _load_first_column('data/rootwords.txt')
stopwords = _load_first_column('data/stopwords.csv')

# Replacement dictionaries passed to change_words() by later cells.
slangwords = _load_json('data/slangwords.json')
englishwords = _load_json('data/englishwords.json')
slangjobs = _load_json('data/slangjobs.json')
job_slangwords_phase1 = _load_json('data/job_slangwords_phase1.json')
job_slangwords_phase2 = _load_json('data/job_slangwords_phase2.json')

# Stop words that are specific to job titles and job descriptions.
job_stopwords = _load_first_column('data/job_stopwords.txt')

# Number of CPU cores reported by the operating system.
cores = multiprocessing.cpu_count()

Function Definitions

In [None]:
def remove_stopwords(stopwords: list, text: str) -> str:
    """Return *text* without the whitespace-separated tokens listed in *stopwords*.

    Args:
        stopwords: Tokens to drop. Matching is exact and case-sensitive.
        text: Input string; it is tokenised with ``str.split()``.

    Returns:
        The surviving tokens joined by single spaces, or an empty string
        when no token survives.
    """
    # Build the lookup once: set membership is O(1), whereas scanning the
    # list again for every token costs O(len(stopwords)) per token.
    blocked = set(stopwords)
    return ' '.join(token for token in text.split() if token not in blocked)

Notebook Settings

In [None]:
# Lift pandas' display limits on the number of columns and on column width,
# so rendered DataFrames are not truncated along either.
for _display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

DB Connection

In [None]:
# Database connection settings.
# NOTE(review): the credentials are hard-coded in source — consider loading
# them from the environment or a secrets store instead.
user, password = 'huda', 'Vancha12'
host, port = '127.0.0.1', 1433
database = 'HRSystemDB'


def get_connection():
    """Create a SQLAlchemy engine for the configured SQL Server database.

    Reads the module-level ``user``, ``password``, ``host``, ``port`` and
    ``database`` settings.

    Returns:
        The engine that ``create_engine`` builds for the ``mssql+pyodbc`` URL.
    """
    from urllib.parse import quote

    # Percent-encode the credentials so that reserved characters such as
    # '@', ':' or '/' in the user name or password cannot corrupt the URL.
    credentials = f"{quote(user, safe='')}:{quote(password, safe='')}"
    return create_engine(
        url=f"mssql+pyodbc://{credentials}@{host}:{port}/{database}?driver=SQL Server",
    )

# Build the engine once; the table-loading cell runs its queries through it.
engine = get_connection()
# NOTE(review): this connection is opened but neither used nor closed in the
# code visible here — confirm it is needed.
conn = engine.connect()

Load Tables

In [None]:
# Job postings joined with their lookup tables (function position, education
# level, city, province and major).
# NOTE(review): every join is a RIGHT JOIN, so lookup rows without a matching
# job presumably come back with NULL job columns — confirm this is intended.
_job_query = """
    SELECT Job.JobID, Job.UsiaMax, Job.SalaryMin, Job.SalaryMax, City.Name AS CityName, Province.Name AS ProvinceName, EducationLevel.EducationLevelName, Major.MajorName, Job.DriverLicenseType, Job.UsingGlasses, Job.Gender, Job.MaritalStatus, Job.JobTitle, FunctionPosition.FunctionPositionName, Job.Description, Job.Requirement
    FROM (((((Job
    RIGHT JOIN FunctionPosition ON Job.FunctionPositionID = FunctionPosition.FunctionPositionID)
    RIGHT JOIN EducationLevel ON Job.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN City ON Job.CityID = City.CityID)
    RIGHT JOIN Province ON Job.ProvinceID = Province.ProvinceID)
    RIGHT JOIN Major ON Job.MajorID = Major.MajorID)
    """
df_job = pd.DataFrame(engine.execute(_job_query))

# Work-experience entries of the applicants.
# NOTE(review): no Pipeline column is selected, so the LEFT JOIN can only
# affect how many times each experience row is returned — confirm it is needed.
_experience_query = """
    SELECT ApplicantExperience.ApplicantID, ApplicantExperience.DateFrom, ApplicantExperience.DateTo, ApplicantExperience.Position, ApplicantExperience.JobDescription
    FROM (ApplicantExperience
    LEFT JOIN Pipeline ON ApplicantExperience.ApplicantID = Pipeline.ApplicantID)
    """
df_applicant_experience = pd.DataFrame(engine.execute(_experience_query))

In [None]:
# Column groups of the job table, by value type.
job_str = [
    'CityName', 'ProvinceName', 'EducationLevelName', 'MajorName',
    'DriverLicenseType', 'Gender', 'MaritalStatus', 'JobTitle',
    'FunctionPositionName', 'Description', 'Requirement',
]
job_num = ['UsiaMax', 'SalaryMin', 'SalaryMax']
job_bol = ['UsingGlasses']

# General clean-up: index the frame by job id and replace missing values
# with empty strings.
df_job.set_index(['JobID'], inplace=True)
df_job.fillna('', inplace=True)

# Text columns: lower-case every value, then blank out cells whose whole
# value is the string 'none'.
_lowered_text = df_job[job_str].applymap(str.lower)
df_job[job_str] = _lowered_text
df_job.replace('none', '', inplace=True)

In [None]:
# --- Applicant experience -------------------------------------------------
df_applicant_experience = df_applicant_experience.fillna('')


def _within_supported_years(value):
    """Pass *value* through filter_date with the bounds 1980 and 2023."""
    return filter_date(value, 1980, 2023)


# Parse both date columns: pick_date first, then filter_date, then convert
# the result to pandas datetimes.
for _date_column in ('DateFrom', 'DateTo'):
    df_applicant_experience[_date_column] = pd.to_datetime(
        df_applicant_experience[_date_column].map(pick_date).apply(_within_supported_years)
    )

# Keep only the rows where both dates are present after parsing.
_has_both_dates = (
    df_applicant_experience['DateFrom'].notna()
    & df_applicant_experience['DateTo'].notna()
)
df_applicant_experience = df_applicant_experience[_has_both_dates]

# Duration of each experience entry.
# NOTE(review): the helper is named substract_months while the column is
# called YearsOfExperience — confirm the unit.
df_applicant_experience['YearsOfExperience'] = substract_months(
    df_applicant_experience.DateFrom, df_applicant_experience.DateTo
)

# One row per applicant: texts are concatenated in DateFrom order and the
# durations are summed; the date columns are not kept.
df_applicant_experience = (
    df_applicant_experience
    .sort_values('DateFrom')
    .groupby(['ApplicantID'])
    .agg({
        'DateFrom': 'last',
        'DateTo': 'last',
        'JobDescription': ' '.join,
        'Position': ' '.join,
        'YearsOfExperience': 'sum',
    })
    .drop(columns=['DateFrom', 'DateTo'])
)

# Discard applicants whose summed duration is zero.
_nonzero_experience = df_applicant_experience['YearsOfExperience'] != 0
df_applicant_experience = df_applicant_experience[_nonzero_experience]

In [None]:
# Narrow both frames down to their text columns.
_job_text_columns = ['JobTitle', 'Description', 'Requirement']
_experience_text_columns = ['Position', 'JobDescription']

df_job = df_job[_job_text_columns]
df_applicant_experience = df_applicant_experience[_experience_text_columns]

In [None]:
# Merge description and requirement into one space-separated text column,
# then drop the two source columns.
df_job = df_job.assign(
    DescriptionRequirement=df_job['Description'] + ' ' + df_job['Requirement']
).drop(columns=['Description', 'Requirement'])

In [None]:
# Lower-case every cell of both frames.
df_job = df_job.applymap(str.lower)
df_applicant_experience = df_applicant_experience.applymap(str.lower)

# Run clean_text over the long-text column of each frame.
df_job['DescriptionRequirement'] = df_job['DescriptionRequirement'].map(clean_text)
df_applicant_experience['JobDescription'] = (
    df_applicant_experience['JobDescription'].map(clean_text)
)

In [None]:
# Strip the job-specific stop words from the combined description text,
# then preview the first 20 rows.
df_job['DescriptionRequirement'] = df_job['DescriptionRequirement'].map(
    lambda text: remove_stopwords(job_stopwords, text)
)
df_job.head(20)

In [None]:
# df_job.to_csv('data/lihat_job.csv', sep=';')
# df_applicant_experience.to_csv('data/lihat_experience.csv', sep=';')

In [None]:
# Give the experience columns the same names as df_job's columns so that the
# two frames can be stacked.
df_applicant_experience.rename(
    columns=dict(Position='JobTitle', JobDescription='DescriptionRequirement'),
    inplace=True,
)

In [None]:
# Stack job postings and applicant experience into a single training frame
# with a fresh 0..n-1 index.
train_data = pd.concat([df_job, df_applicant_experience], ignore_index=True)


In [None]:
def _dedupe_title_words(title):
    """Drop repeated words from *title*, keeping first occurrences in order.

    A plain ``set`` also removes duplicates, but its iteration order for
    strings changes between interpreter runs (hash randomisation), which
    made the resulting titles — and the Doc2Vec tags built from them —
    non-reproducible. ``dict.fromkeys`` preserves insertion order.
    """
    return ' '.join(dict.fromkeys(title.split()))


# Normalise the job titles of the training frame:
#   1. lower-case, strip parenthesised numbers, stand-alone symbols and
#      surplus whitespace (helpers from FlaskApp.transform);
#   2. remove repeated words;
#   3. apply the phase-1 replacements, drop job stop words, run clean_text,
#      then apply the phase-2 replacements.
train_data.JobTitle = (
    train_data.JobTitle
    .map(str.lower)
    .map(remove_parenthesesnumber)
    .map(remove_standalonesymbols)
    .map(remove_morespace)
    .map(str.strip)
    .map(_dedupe_title_words)
    .apply(lambda x: change_words(job_slangwords_phase1, x))
    .apply(lambda x: remove_stopwords(job_stopwords, x))
    .map(clean_text)
    .apply(lambda x: change_words(job_slangwords_phase2, x))
)

# Membership is tested once per token, so look tokens up in a set instead of
# scanning the combined stop-word list every time.
_combined_stopword_set = set(combined_stopwords)

# Clean the description text, apply the slang and English replacement
# dictionaries, then drop the stop words.
train_data.DescriptionRequirement = (
    train_data.DescriptionRequirement
    .map(clean_text)
    .apply(lambda x: change_words(slangwords, x))
    .apply(lambda x: change_words(englishwords, x))
    .apply(lambda x: ' '.join([i for i in x.split() if i not in _combined_stopword_set]))
)

In [None]:
# Display the preprocessed training frame.
train_data

In [None]:
# Tokenise every description on whitespace: `dataset` is the array of token
# lists, `data` holds the same token lists in a plain Python list.
dataset = train_data.DescriptionRequirement.map(str.split).values
data = list(dataset)

def tagged_document(list_of_listwords, tags=None):
    """Yield one ``doc2vec.TaggedDocument`` per token list.

    Args:
        list_of_listwords: Iterable of token lists, one per document.
        tags: Optional iterable of tags paired with the documents in order.
            When omitted, each document is tagged with its position
            (0, 1, 2, ...).

    Yields:
        ``TaggedDocument(words, [tag])`` for every document.
    """
    if tags is None:
        tagged_pairs = enumerate(list_of_listwords)
    else:
        tagged_pairs = zip(tags, list_of_listwords)

    for tag, words in tagged_pairs:
        yield doc2vec.TaggedDocument(words, [tag])

# One TaggedDocument per training row, tagged with that row's cleaned job title.
data_train = list(tagged_document(data, tags=train_data.JobTitle.values))


In [None]:
# Doc2Vec hyper-parameters.
# NOTE(review): `workers` is left at the library default; the `cores` value
# computed earlier is not passed — confirm this is intended.
_d2v_params = dict(vector_size=120, min_count=10, epochs=60, window=3)

d2v_model = doc2vec.Doc2Vec(**_d2v_params)
d2v_model.build_vocab(data_train)
d2v_model.train(
    data_train,
    total_examples=d2v_model.corpus_count,
    epochs=d2v_model.epochs,
)

In [None]:
# Export the learned word vectors as two line-aligned TSV files: the vector
# components (tab-separated) in one file and the matching word in the other.
d2v_words = d2v_model.wv.index_to_key
d2v_vectors = d2v_model.wv.vectors

with open('data/d2v_vectors.tsv', 'w', encoding='utf-8') as f:
    # Pair each word with its vector row so exactly one line is written per word.
    for _, vector in zip(d2v_words, d2v_vectors):
        f.write('\t'.join(map(str, vector)) + '\n')

with open('data/d2v_metadata.tsv', 'w', encoding='utf-8') as f:
    f.writelines(f'{word}\n' for word in d2v_words)

In [None]:
# Display the job frame as it stands before the title normalisation below.
df_job

In [None]:
# The experience columns were renamed in place to 'JobTitle' /
# 'DescriptionRequirement' before the training frame was assembled, but this
# cell and the following ones address them as 'Position' / 'JobDescription'.
# Restore the original names first; rename() ignores labels that are absent,
# so this is a no-op when the frame already uses the original names.
df_applicant_experience.rename(
    columns={'JobTitle': 'Position', 'DescriptionRequirement': 'JobDescription'},
    inplace=True,
)


def _normalize_job_title(titles):
    """Return the Series *titles* with every job title normalised.

    Steps, in order: lower-case; strip parenthesised numbers, stand-alone
    symbols and surplus whitespace (helpers from FlaskApp.transform); remove
    repeated words; apply the phase-1 replacements; drop job stop words; run
    clean_text; apply the phase-2 replacements.
    """
    return (
        titles
        .map(str.lower)
        .map(remove_parenthesesnumber)
        .map(remove_standalonesymbols)
        .map(remove_morespace)
        .map(str.strip)
        # dict.fromkeys removes duplicates while keeping word order; a set
        # would yield an order that changes between interpreter runs.
        .map(lambda title: ' '.join(dict.fromkeys(title.split())))
        .apply(lambda title: change_words(job_slangwords_phase1, title))
        .apply(lambda title: remove_stopwords(job_stopwords, title))
        .map(clean_text)
        .apply(lambda title: change_words(job_slangwords_phase2, title))
    )


df_job.JobTitle = _normalize_job_title(df_job.JobTitle)
df_applicant_experience.Position = _normalize_job_title(df_applicant_experience.Position)

In [None]:
# NOTE(review): the training text was filtered with `combined_stopwords`,
# whereas these frames are filtered with `stopwords` — confirm that the
# difference is intentional.
# Membership is tested once per token, so use a set instead of the list.
_stopword_set = set(stopwords)


def _prepare_description(texts):
    """Return the Series *texts* cleaned for similarity scoring.

    Steps, in order: clean_text, slang replacements, English replacements,
    stop-word removal.
    """
    return (
        texts
        .map(clean_text)
        .apply(lambda text: change_words(slangwords, text))
        .apply(lambda text: change_words(englishwords, text))
        .apply(lambda text: ' '.join([tok for tok in text.split() if tok not in _stopword_set]))
    )


df_job.DescriptionRequirement = _prepare_description(df_job.DescriptionRequirement)
df_applicant_experience.JobDescription = _prepare_description(
    df_applicant_experience.JobDescription
)

In [None]:
# Index label (ApplicantID) of the applicant compared against every job posting.
index = 26133

# Tokens of that applicant's experience text; the same list is used for
# every comparison.
_applicant_tokens = df_applicant_experience.JobDescription.loc[index].split()

# Score each job description against the applicant, then list the jobs whose
# title contains 'accountant', most similar first.
df_view = df_job.copy()
df_view['similarity'] = [
    d2v_model.similarity_unseen_docs(job_text.split(), _applicant_tokens)
    for job_text in df_job.DescriptionRequirement
]
df_view[df_view.JobTitle.str.contains('accountant')].sort_values('similarity', ascending=False)

In [None]:
# Show the selected applicant's position text and experience text.
df_applicant_experience.Position.loc[index], df_applicant_experience.JobDescription.loc[index]

In [None]:
def _token_count(text):
    """Return the number of whitespace-separated tokens in *text*."""
    return len(text.split())


# Length, in tokens, of each applicant's experience text.
df_applicant_experience['count'] = df_applicant_experience['JobDescription'].map(_token_count)

In [None]:
# Show the 20 applicants with the longest experience text (by token count).
df_applicant_experience.sort_values('count', ascending=False).head(20)

In [None]:
# List the job postings whose title contains 'accountant'.
df_job[df_job.JobTitle.str.contains('accountant')]