Import Packages

In [158]:
import pyodbc
import re
import json
import spacy

import numpy as np
import pandas as pd
import tensorflow as tf

from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity

from collections import defaultdict
from gensim import corpora, models, similarities
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from gensim.models.word2vec import Word2Vec

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import gensim.downloader as api
from gensim.models import doc2vec

from googletrans import Translator
from deep_translator import GoogleTranslator

from FlaskApp.transform import *

from job_model import JobModel
from app_model import AppModel

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import multiprocessing

import nltk

In [159]:
nltk_stopwords = nltk.corpus.stopwords.words('indonesian')
sastrawi_stopwords = StopWordRemoverFactory().get_stop_words()
combined_stopwords = nltk_stopwords + sastrawi_stopwords

Load Constant

In [177]:
# translator = Translator(service_urls=['translate.googleapis.com'])
factory = StemmerFactory()
stemmer = factory.create_stemmer()

rootwords = [i[0] for i in pd.read_csv('data/rootwords.txt').values]
stopwords = [i[0] for i in pd.read_csv('data/stopwords.csv').values]

with open('data/slangwords.json', 'r') as file:
    slangwords = json.load(file)

with open('data/englishwords.json', 'r') as file:
    englishwords = json.load(file)

with open('data/slangjobs.json', 'r') as file:
    slangjobs = json.load(file)

with open('data/job_slangwords_phase1.json', 'r') as file:
    job_slangwords_phase1 = json.load(file)

with open('data/job_slangwords_phase2.json', 'r') as file:
    job_slangwords_phase2 = json.load(file)

job_stopwords = [i[0] for i in pd.read_csv('data/job_stopwords.txt').values]

cores = multiprocessing.cpu_count()

Function Definition

In [178]:
def remove_stopwords(stopwords: list, text: str):
    list_text = text.split()

    new_text = []
    for word in list_text:
        if word not in stopwords:
            new_text.append(word)
    return ' '.join(new_text)

Notebook Settings

In [179]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

DB Connection

In [180]:
user = 'huda'
password = 'Vancha12'
host = '127.0.0.1'
port = 1433
database = 'HRSystemDB'


def get_connection():         
    return create_engine(
        url=f"mssql+pyodbc://{user}:{password}@{host}:{port}/{database}?driver=SQL Server",
    )

engine = get_connection()
conn = engine.connect()

Load Tables

In [181]:
df_job = pd.DataFrame(engine.execute(
    """
    SELECT Job.JobID, Job.UsiaMax, Job.SalaryMin, Job.SalaryMax, City.Name AS CityName, Province.Name AS ProvinceName, EducationLevel.EducationLevelName, Major.MajorName, Job.DriverLicenseType, Job.UsingGlasses, Job.Gender, Job.MaritalStatus, Job.JobTitle, FunctionPosition.FunctionPositionName, Job.Description, Job.Requirement
    FROM (((((Job
    RIGHT JOIN FunctionPosition ON Job.FunctionPositionID = FunctionPosition.FunctionPositionID)
    RIGHT JOIN EducationLevel ON Job.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN City ON Job.CityID = City.CityID)
    RIGHT JOIN Province ON Job.ProvinceID = Province.ProvinceID)
    RIGHT JOIN Major ON Job.MajorID = Major.MajorID)
    """
))

df_applicant_experience = pd.DataFrame(engine.execute(
    f"""
    SELECT ApplicantExperience.ApplicantID, ApplicantExperience.DateFrom, ApplicantExperience.DateTo, ApplicantExperience.Position, ApplicantExperience.JobDescription
    FROM (ApplicantExperience
    LEFT JOIN Pipeline ON ApplicantExperience.ApplicantID = Pipeline.ApplicantID)
    """
))

In [182]:
job_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'JobTitle', 'FunctionPositionName', 'Description', 'Requirement']
job_num = ['UsiaMax', 'SalaryMin', 'SalaryMax']
job_bol = ['UsingGlasses']

'''general'''
df_job.set_index(['JobID'], inplace=True)
df_job.fillna('', inplace=True)

'''str'''
df_job[job_str] = df_job[job_str].applymap(str.lower)
df_job.replace('none', '', inplace=True)

In [183]:
'''experience'''
df_applicant_experience = df_applicant_experience.fillna('')

# datetime column
df_applicant_experience.DateFrom = pd.to_datetime(
    df_applicant_experience.DateFrom.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)
df_applicant_experience.DateTo = pd.to_datetime(
    df_applicant_experience.DateTo.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)
df_applicant_experience = df_applicant_experience[~(df_applicant_experience.DateFrom.isna()) & ~(df_applicant_experience.DateTo.isna())]

# add YearsOfExperience column
df_applicant_experience['YearsOfExperience'] = substract_months(
    df_applicant_experience.DateFrom, df_applicant_experience.DateTo
)
df_applicant_experience = df_applicant_experience.sort_values('DateFrom').groupby(['ApplicantID']).agg({
    'DateFrom': 'last',
    'DateTo': 'last',
    'JobDescription': ' '.join,
    'Position': ' '.join,
    'YearsOfExperience': 'sum',
})
df_applicant_experience.drop(columns=['DateFrom', 'DateTo'], inplace=True)

df_applicant_experience = df_applicant_experience[df_applicant_experience.YearsOfExperience != 0]

In [184]:
df_job = df_job[['JobTitle', 'Description', 'Requirement']]
df_applicant_experience = df_applicant_experience[['Position', 'JobDescription']]

In [185]:
df_job['DescriptionRequirement'] = df_job.Description + ' ' + df_job.Requirement
df_job.drop(columns=['Description', 'Requirement'], inplace=True)

In [186]:
df_applicant_experience = df_applicant_experience.applymap(str.lower)
df_job = df_job.applymap(str.lower)

df_job.DescriptionRequirement = df_job.DescriptionRequirement.map(clean_text)
df_applicant_experience.JobDescription = df_applicant_experience.JobDescription.map(clean_text)

  return BeautifulSoup(text, features='lxml').get_text(separator=' ')


In [187]:
df_job.DescriptionRequirement = df_job.DescriptionRequirement.apply(lambda x: ' '.join([i for i in x.split() if i not in job_stopwords]))
df_job.head(20)

Unnamed: 0_level_0,JobTitle,DescriptionRequirement
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,night audit,under asst front office manager gunawangsa hotel merr surabaya with year minimum experiences same position has experiences with power pro hotel system preferable domicile surabaya surrounding areas good personality attitude passionate loyal willing become key player dynamic team able work under pressure
2,book keeper,becoming book keeper gunawangsa hotel merr bachelor degree accounting with years minimum experiences same position has experiences with power pro hotel system preferable domicile surabaya surrounding areas good personality attitude passionate loyal willing become key player dynamic team able work under pressure
3,it android programmer,developing android application their integration with back end services accordance with user requirements analyzing optimizing application code efficiency reability performance usia maksimal tahun minimal pendidikan it atau sistem informasi menguasai pemprograman khusunya tentang android asp net mvc dll fresh graduate wellcome siap kerja team ulet kreatif dan rajin
4,sekretaris legal,melakukan tugas kesekretariatan menggunakan terminologi prosedur dan dokumen hukum menyiapkan dokumen dan korespondensi hukum seperti surat panggilan keluhan isyarat dan panggilan dari pengadilan menyiapkan dan memproses dokumen dan surat surat hukum seperti surat panggilan panggilan dari pengadilan keluhan permohonan banding mosi dan perjanjian praperadilan mengorganisir dan memelihara perpustakaan hukum dokumen dan file kasus membuat jadwal dan janji pendidikan minimal hukum usia maksimal tahun pengalaman kerja min thn pengalaman membuat konsep dan koreksi surat surat perjanjian kontrak kerjasama dan yang sejenisnya siap kerja under pressure penampilan menarik mampu berbahasa inggris aktif dan pasif penempatan surabaya
5,sekretaris direksi,bantu handle pekerjaan atasan membuat surat menyurat mengatur jadwal atasan usia maksimal tahun pendidikan min semua jurusan jurusan sekretaris lebih disukai pengalaman kerja minimal tahun dibidang yang sama lebih disukai bisa bahasa inggris aktif bisa bahasa mandarin berpenampilan menarik sopan rapi inisiatif tinggi dan komunikatif mampu bekerja secara multitasking memiliki time task management yang baik mampu bekerja dengan target dan deadline pekerja keras dan tahan terhadap tekanan memiliki disiplin dan etika yang tinggi
6,marketing executive,mencari para calon klien yang prospektif mengembangkan dan merealisasikan rencana penjualan dan tergetnya melakukan penawaran harga dan negosiasi dengan klien dll usia maksimal tahun pendidikan minimal semua jurusan memiliki pengalaman bekerja minimal tahun dibidang yang sama memiliki networking yang luas memiliki kendaraan sendiri lebih diutamakan memiliki sim lebih disukai penempatan surabaya
7,senior estimator,membuat perhitungan rencana anggaran biaya rab membuat perhitungan harga upah kerja progress kerja membuat time schedule usia maksimal tahun pendidikan teknik sipil pengalaman estimator drafter min tahun memiliki pengetahuan mengenai konstruksi bilboard dll mampu membuat rab rap dan analisa harga dapat bekerja secara team atau individu tanggung jawab kreatif dan inisiatif
8,web programmer,membuat aplikasi sesuai dengan kebutuhan perusahaan termasuk existing aplikasi mewujudkan desain web menjadi system yang berfungsi dengan baik usia maksimal tahun pendidikan teknik informatika ilmu komputer pengalaman minimal tahun dalam membangun aplikasi web menguasai pemograman php html my sql dll penempatan surabaya
10,driver,merawat kebersihan mobil kantor antar dan jemput karyawan saat ada keperluan kantor laki laki usia maks tahun pendidikan min sma smk sederajat mempunyai sim berpengalaman dibidang yang sama min tahun hafal rute jalan kota surabaya displin jujur dan cekatan mampu berkoordinasi dan berkomunikasi baik dengan tim berpenampilan menarik
11,source &aplicant jobs,description testing job untuk pekerjaan aplicant dan source requirement testing job untuk pekerjaan aplicant dan source


In [188]:
# df_job.to_csv('data/lihat_job.csv', sep=';')
# df_applicant_experience.to_csv('data/lihat_experience.csv', sep=';')

In [189]:
df_applicant_experience.rename(columns={'Position': 'JobTitle', 'JobDescription': 'DescriptionRequirement'}, inplace=True)

In [190]:
train_data = pd.concat([df_job, df_applicant_experience]).reset_index(drop=True)


In [191]:
train_data.JobTitle = train_data.JobTitle.map(
    str.lower
).map(
    remove_parenthesesnumber
).map(
    remove_standalonesymbols
).map(
    remove_morespace
).map(
    str.strip
).apply(
    lambda x: ' '.join(list(set(x.split())))
).apply(
    lambda x: change_words(job_slangwords_phase1, x)
).apply(
    lambda x: remove_stopwords(job_stopwords, x)
).map(
    clean_text
).apply(
    lambda x: change_words(job_slangwords_phase2, x)
)

train_data.DescriptionRequirement = train_data.DescriptionRequirement.map(
    clean_text
).apply(
    lambda x: change_words(slangwords, x)
).apply(
    lambda x: change_words(englishwords, x)
).apply(
    lambda x: ' '.join([i for i in x.split() if i not in combined_stopwords])
)

  return BeautifulSoup(text, features='lxml').get_text(separator=' ')


In [192]:
train_data

Unnamed: 0,JobTitle,DescriptionRequirement
0,auditor night,asisten kantor pengelola hotel surabaya minimal pengalaman posisi memiliki pengalaman kekuatan pro hotel sistem tinggal surabaya daerah bagus kepribadian sikap penuh semangat setia bersedia kunci pemain dinamis tim tekanan
1,book keeper,buku penjaga hotel sarjana derajat akuntansi bertahun-tahun minimal pengalaman posisi memiliki pengalaman kekuatan pro hotel sistem tinggal surabaya daerah bagus kepribadian sikap penuh semangat setia bersedia kunci pemain dinamis tim tekanan
2,it android programmer,mengembangkan android aplikasi milik integrasi jasa sesuai pengguna persyaratan menganalisa mengoptimalkan aplikasi kode efisiensi keandalan pertunjukan usia maksimal minimal pendidikan sistem informasi menguasai pemrograman android asp bersih mvc segar lulus selamat kerja tim ulet kreatif rajin
3,legal secretary,tugas kesekretariatan terminologi prosedur dokumen hukum dokumen korespondensi hukum surat panggilan keluhan isyarat panggilan pengadilan memproses dokumen surat surat hukum surat panggilan panggilan pengadilan keluhan permohonan banding mosi perjanjian praperadilan mengatur memelihara perpustakaan hukum dokumen berkas jadwal janji pendidikan minimal hukum usia maksimal pengalaman kerja minimal pengalaman konsep koreksi surat surat perjanjian kontrak kerja sejenisnya kerja tekanan penampilan menarik berbahasa inggris aktif pasif pemindahan surabaya
4,secretary director,bantu menangani pekerjaan atasan surat menyurat mengatur jadwal atasan usia maksimal pendidikan minimal jurusan jurusan sekretaris disukai pengalaman kerja minimal dibidang disukai bahasa inggris aktif bahasa mandarin berpenampilan menarik sopan rapi inisiatif komunikatif multitasking memiliki tugas pengelolaan target tenggat pekerja keras tahan tekanan memiliki disiplin etika
...,...,...
5425,graphic design,desain katalog produk promosi desain sosial media desain desain promosi desain sosial media desain situs web resmi desain merchandise desain label
5426,acquisition manager development business,acquiring klien properti owner cooperate travelio partners mengelola memajukan klien properti rented pelanggan cooperate bangunan pengelolaan penanganan guest pelanggan issue terkait properti pemantauan menganalisa langsung properti harga memeriksa mempertahankan kecepatan allotment promosi lain.
5427,accountant staff,pegangan transaksi harian menangani kas tugas cek kelengkapan tagihan
5428,affair general staff,inventarisasi set perusahaan proses biaya urusan pengurus asuransi gedung kendaraan pengurus rawat set perusahaan pengurus perizinan perusahaan pengurus pengadaan barang satuan kendaraan pengurus pajak reklame kendaraan inventaris perusahaan pengurus badan penyelenggara jaminan sosial karyawan laporan absensi karyawan sistem laporan uang makan karyawan


In [193]:
dataset = train_data.DescriptionRequirement.map(str.split).values

data = []
for word in dataset:
    data.append(word)

def tagged_document(list_of_listwords, tags=None):
    if tags is not None:
        for x, listwords in zip(tags, list_of_listwords):
            yield doc2vec.TaggedDocument(listwords, [x])
    else:
        for x, listwords in enumerate(list_of_listwords):
            yield doc2vec.TaggedDocument(listwords, [x])

data_train = list(tagged_document(data, train_data.JobTitle.values))


In [194]:
d2v_model = doc2vec.Doc2Vec(vector_size=120, min_count=10, epochs=60, window=3)
d2v_model.build_vocab(data_train)
d2v_model.train(data_train, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

2023-04-19 16:44:53,714 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d120,n5,w3,mc10,s0.001,t3>', 'datetime': '2023-04-19T16:44:53.714905', 'gensim': '4.3.1', 'python': '3.10.0 | packaged by conda-forge | (default, Nov 10 2021, 13:20:59) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2023-04-19 16:44:53,714 : INFO : collecting all words and their counts
2023-04-19 16:44:53,714 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-04-19 16:44:53,785 : INFO : collected 16477 word types and 3505 unique tags from a corpus of 5430 examples and 399981 words
2023-04-19 16:44:53,785 : INFO : Creating a fresh vocabulary
2023-04-19 16:44:53,787 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=10 retains 3671 unique words (22.28% of original 16477, drops 12806)', 'datetime': '2023-04-19T16:44:53.787539', 'gensim': '4.3.1', 'python': '3.10.0 | packaged by conda-forge | (default, Nov 10 2021, 

In [195]:
d2v_words = d2v_model.wv.index_to_key
d2v_vectors = d2v_model.wv.vectors

with open('data/d2v_vectors.tsv', 'w', encoding='utf-8') as f:
    for i, word in enumerate(d2v_words):
        vector_str = '\t'.join([str(num) for num in d2v_vectors[i]])
        f.write(f'{vector_str}\n')

with open('data/d2v_metadata.tsv', 'w', encoding='utf-8') as f:
    for word in d2v_words:
        f.write(f'{word}\n')

In [71]:
df_job.JobTitle = df_job.JobTitle.map(
    str.lower
).map(
    remove_parenthesesnumber
).map(
    remove_standalonesymbols
).map(
    remove_morespace
).map(
    str.strip
).apply(
    lambda x: ' '.join(list(set(x.split())))
).apply(
    lambda x: change_words(job_slangwords_phase1, x)
).apply(
    lambda x: remove_stopwords(job_stopwords, x)
).map(
    clean_text
).apply(
    lambda x: change_words(job_slangwords_phase2, x)
)

df_applicant_experience.Position = df_applicant_experience.Position.map(
    str.lower
).map(
    remove_parenthesesnumber
).map(
    remove_standalonesymbols
).map(
    remove_morespace
).map(
    str.strip
).apply(
    lambda x: ' '.join(list(set(x.split())))
).apply(
    lambda x: change_words(job_slangwords_phase1, x)
).apply(
    lambda x: remove_stopwords(job_stopwords, x)
).map(
    clean_text
).apply(
    lambda x: change_words(job_slangwords_phase2, x)
)

  return BeautifulSoup(text, features='lxml').get_text(separator=' ')


In [68]:
df_job.DescriptionRequirement = df_job.DescriptionRequirement.map(
    clean_text
).apply(
    lambda x: change_words(slangwords, x)
).apply(
    lambda x: change_words(englishwords, x)
).apply(
    lambda x: ' '.join([i for i in x.split() if i not in stopwords])
)

df_applicant_experience.JobDescription = df_applicant_experience.JobDescription.map(
    clean_text
).apply(
    lambda x: change_words(slangwords, x)
).apply(
    lambda x: change_words(englishwords, x)
).apply(
    lambda x: ' '.join([i for i in x.split() if i not in stopwords])
)

In [100]:
index = 26133

df_view = df_job.copy()
df_view['similarity'] = [d2v_model.similarity_unseen_docs(i.split(), df_applicant_experience.JobDescription.loc[index].split()) for i in df_job.DescriptionRequirement]
df_view[df_view.JobTitle.str.contains('accountant')].sort_values('similarity', ascending=False)

Unnamed: 0_level_0,JobTitle,DescriptionRequirement,similarity
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1192,book department keeper accountant,buku penjaga sarjana derajat akuntansi bertahu...,0.304947
58,chief accountant,ketua akuntansi pengalaman keramahan industri ...,0.291761
1195,cost department control accountant,biaya kontrol sarjana derajat akuntansi memili...,0.278316
1364,accountant administration,terbiasa laporan keuangan usia maksimal pendid...,0.277437
3069,chief accountant,usia maksimal pendidikan minimal akuntansi pen...,0.236814
...,...,...,...
3086,accountant finance,menyelesaikan tagihan pembayaran kontrol akun ...,0.059237
1188,accountant supervisor,tanggung pendataan keuangan memasukkan jurnal ...,0.058298
3087,accountant finance,menyelesaikan tagihan pembayaran kontrol akun ...,0.027206
55,accountant finance,menerima pajak daerah pajak izin cek persyarat...,0.004860


In [102]:
df_applicant_experience.Position.loc[index], df_applicant_experience.JobDescription.loc[index]

('sales consultant staff specialist stand promotion univesitas boy pendididikan event surabaya game project ekonomi jawab supervisor supervisor pekan penanggung event jurusan team pic olahraga reseller negeri',
 'persuades pelanggan penjualan promosi display items wholesale commodity distributor visits retail establishments departemen stores taverns supermarkets clubs persuade pelanggan display items memajukan sale produk delivers promosi items posters glasses napkins samples produk arranges display items pelanggan establishment mengambil penjualan memesan pelanggan persuades pelanggan penjualan promosi display items wholesale commodity distributor visits retail establishments departemen stores taverns supermarkets clubs persuade pelanggan display items memajukan sale produk delivers promosi items posters glasses napkins samples produk arranges display items pelanggan establishment mengambil penjualan memesan pelanggan persuades pelanggan penjualan promosi display items wholesale commo

In [93]:
df_applicant_experience['count'] = df_applicant_experience.JobDescription.apply(lambda x: len(x.split()))

In [96]:
df_applicant_experience.sort_values('count', ascending=False).head(20)

Unnamed: 0_level_0,Position,JobDescription,count
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20835,dan auditor procurment staff,jobdesk intern audit procurment perseroan terb...,2964
36900,site project relationship sales officer mep ma...,tanggung menjaga mengembangkan hadiah pelangga...,2670
31674,agent staff center math private school recepti...,guru bidang studi matematika sma kelas ix prog...,2556
10501,leader it support project officer supervisor e...,perseroan terbatas singa langit bergerak bidan...,2335
18409,staff officer accountant finance treasury,pekerjaan tugas tinjauan verify faktur charges...,2172
35471,ticketing ux content writter xi clothing super...,pekerjaan lepas freelance rincian tugas merawa...,1845
27090,staff assistant supervisor manager engineer,kurikulum vitae kapal nusantara start berkedud...,1768
29549,sales consultant center specialist marketing a...,successfully presented indonesia posisi perspe...,1598
26133,sales consultant staff specialist stand promot...,persuades pelanggan penjualan promosi display ...,1504
36001,legal project hrbp officer human resources dev...,detail pekerjaan cakupan penyusunan employees ...,1430


In [79]:
df_job[df_job.JobTitle.str.contains('accountant')]

Unnamed: 0_level_0,JobTitle,DescriptionRequirement
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1
36,accountant senior auditor,membahas menilai kebaikan pelaksanaan pengenda...
44,accountant secretary,menyusun laporan keuangan menyusun laporan paj...
47,accountant senior,utang sistem prosedur akuntansi keuangan bersa...
55,accountant finance,menerima pajak daerah pajak izin cek persyarat...
58,chief accountant,ketua akuntansi pengalaman keramahan industri ...
...,...,...
3076,accountant senior,jurnal laporan keuangan mengawasi transaksi ke...
3077,accountant senior,jurnal laporan keuangan mengawasi transaksi ke...
3086,accountant finance,menyelesaikan tagihan pembayaran kontrol akun ...
3087,accountant finance,menyelesaikan tagihan pembayaran kontrol akun ...
