In [1]:
import pyodbc
import re
import json

import numpy as np
import pandas as pd
import tensorflow as tf

from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity

from collections import defaultdict
from gensim import corpora, models, similarities
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from gensim.models.word2vec import Word2Vec

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# from googletrans import Translator
# from deep_translator import GoogleTranslator

from FlaskApp.transform import *

from job_model import JobModel
from app_model import AppModel

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# translator = Translator(service_urls=['translate.googleapis.com'])
factory = StemmerFactory()
stemmer = factory.create_stemmer()

rootwords = [i[0] for i in pd.read_csv('data/rootwords.txt').values]
stopwords = [i[0] for i in pd.read_csv('data/stopwords.csv').values]

with open('data/slangwords.json', 'r') as file:
    slangwords = json.load(file)

with open('data/englishwords.json', 'r') as file:
    englishwords = json.load(file)

with open('data/slangjobs.json', 'r') as file:
    slangjobs = json.load(file)

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

In [4]:
user = 'huda'
password = 'Vancha12'
host = '127.0.0.1'
port = 1433
database = 'HRSystemDB'


def get_connection():         
    return create_engine(
        url=f"mssql+pyodbc://{user}:{password}@{host}:{port}/{database}?driver=SQL Server",
    )

engine = get_connection()
conn = engine.connect()

In [5]:
applicant_id = 31790

df_job = pd.DataFrame(engine.execute(
    """
    SELECT Job.JobID, Job.UsiaMax, Job.SalaryMin, Job.SalaryMax, City.Name AS CityName, Province.Name AS ProvinceName, EducationLevel.EducationLevelName, Major.MajorName, Job.DriverLicenseType, Job.UsingGlasses, Job.Gender, Job.MaritalStatus, Job.JobTitle, FunctionPosition.FunctionPositionName, Job.Description, Job.Requirement
    FROM (((((Job
    RIGHT JOIN FunctionPosition ON Job.FunctionPositionID = FunctionPosition.FunctionPositionID)
    RIGHT JOIN EducationLevel ON Job.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN City ON Job.CityID = City.CityID)
    RIGHT JOIN Province ON Job.ProvinceID = Province.ProvinceID)
    RIGHT JOIN Major ON Job.MajorID = Major.MajorID)
    """
))

df_function = pd.DataFrame(engine.execute(
    """
    SELECT FunctionPositionID, FunctionPositionName
    FROM FunctionPosition
    """
))

df_education = pd.DataFrame(engine.execute(
    """
    SELECT EducationLevelID, EducationLevelName
    FROM EducationLevel
    """
))

df_city = pd.DataFrame(engine.execute(
    """
    SELECT CityID, Name AS CityName
    FROM City
    """
))

df_province = pd.DataFrame(engine.execute(
    """
    SELECT ProvinceID, Name AS ProvinceName
    FROM Province
    """
))

df_major = pd.DataFrame(engine.execute(
    """
    SELECT MajorID, MajorName
    FROM Major
    """
))

df_applicant = pd.DataFrame(engine.execute(
    f"""
    SELECT Applicant.ApplicantID, Applicant.Dob, Applicant.ExpectedSalary, City.Name AS CityName, Province.Name AS ProvinceName, Applicant.DriverLicenseType, Applicant.IsUsingGlasses, Applicant.Gender, Applicant.MaritalStatus, Applicant.Strengthness
    FROM (((Applicant
    RIGHT JOIN City ON Applicant.CurrentAddressCityID = City.CityID)
    RIGHT JOIN Province ON Applicant.CurrentAddressProvinceID = Province.ProvinceID)
    LEFT JOIN Pipeline ON Applicant.ApplicantID = Pipeline.ApplicantID)
    """
))

df_applicant_education = pd.DataFrame(engine.execute(
    f"""
    SELECT ApplicantEducation.ApplicantID, ApplicantEducation.DateStart, ApplicantEducation.DateEnd, EducationLevel.EducationLevelName, Major.MajorName
    FROM (((ApplicantEducation
    RIGHT JOIN EducationLevel ON ApplicantEducation.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN Major ON ApplicantEducation.MajorID = Major.MajorID)
    LEFT JOIN Pipeline ON ApplicantEducation.ApplicantID = Pipeline.ApplicantID)
    """
))

df_applicant_experience = pd.DataFrame(engine.execute(
    f"""
    SELECT ApplicantExperience.ApplicantID, ApplicantExperience.DateFrom, ApplicantExperience.DateTo, ApplicantExperience.Position, ApplicantExperience.JobDescription
    FROM (ApplicantExperience
    LEFT JOIN Pipeline ON ApplicantExperience.ApplicantID = Pipeline.ApplicantID)
    """
))

In [6]:
job_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'JobTitle', 'FunctionPositionName', 'Description', 'Requirement']
job_num = ['UsiaMax', 'SalaryMin', 'SalaryMax']
job_bol = ['UsingGlasses']

'''general'''
df_job.set_index(['JobID'], inplace=True)
df_job.fillna('', inplace=True)

'''str'''
df_job[job_str] = df_job[job_str].applymap(str.lower)
df_job.replace('none', '', inplace=True)

In [7]:
'''applicant'''
df_applicant = df_applicant.dropna(subset=['ApplicantID'])
df_applicant.ApplicantID = df_applicant.ApplicantID.astype(int)
df_applicant = df_applicant.drop_duplicates()
df_applicant = df_applicant.fillna('')
# age column
df_applicant['Age'] = pd.to_datetime(
    df_applicant.Dob.map(pick_date).apply(lambda x: filter_date(x, 1958, 2006))
).map(get_age)

df_applicant.drop(columns=['Dob'], inplace=True)

df_applicant.Age = df_applicant.Age.fillna(0).astype(int)
'''education'''
df_applicant_education = df_applicant_education.fillna('')
# datetime column
df_applicant_education.DateStart = pd.to_datetime(
    df_applicant_education.DateStart.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)
df_applicant_education.DateEnd = pd.to_datetime(
    df_applicant_education.DateEnd.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)
df_applicant_education = df_applicant_education[~(df_applicant_education.DateStart.isna()) & ~(df_applicant_education.DateEnd.isna())]
df_applicant_education = df_applicant_education.sort_values('DateStart').groupby(['ApplicantID']).agg('last')
df_applicant_education.drop(columns=['DateStart', 'DateEnd'], inplace=True)
'''experience'''
df_applicant_experience = df_applicant_experience.fillna('')

# datetime column
df_applicant_experience.DateFrom = pd.to_datetime(
    df_applicant_experience.DateFrom.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)
df_applicant_experience.DateTo = pd.to_datetime(
    df_applicant_experience.DateTo.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)
df_applicant_experience = df_applicant_experience[~(df_applicant_experience.DateFrom.isna()) & ~(df_applicant_experience.DateTo.isna())]

# add YearsOfExperience column
df_applicant_experience['YearsOfExperience'] = substract_months(
    df_applicant_experience.DateFrom, df_applicant_experience.DateTo
)
df_applicant_experience = df_applicant_experience.sort_values('DateFrom').groupby(['ApplicantID']).agg({
    'DateFrom': 'last',
    'DateTo': 'last',
    'JobDescription': ' '.join,
    'Position': ' '.join,
    'YearsOfExperience': 'sum',
})
df_applicant_experience.drop(columns=['DateFrom', 'DateTo'], inplace=True)
'''merge'''
df_applicant = pd.merge(df_applicant, df_applicant_experience, on=['ApplicantID'])
df_applicant = pd.merge(df_applicant, df_applicant_education, on=['ApplicantID'])


'''PRE-PROCESSING APPLICANT'''
df_applicant.set_index(['ApplicantID'], inplace=True)

app_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'Position', 'JobDescription', 'Strengthness']
app_num = ['Age', 'ExpectedSalary', 'YearsOfExperience']
app_bol = ['IsUsingGlasses']

df_applicant = df_applicant[app_num + app_bol + app_str]

'''str'''
df_applicant[app_str] = df_applicant[app_str].applymap(str.lower)


'''bool'''
df_applicant.IsUsingGlasses = df_applicant.IsUsingGlasses.astype(str).map(str.lower)

In [8]:
df_job = df_job[['JobTitle', 'Description', 'Requirement']]
df_applicant = df_applicant[['Position', 'JobDescription', 'Strengthness']]

In [9]:
'''cleansing'''
df_job.Description = df_job.Description.map(clean_text)
df_job.Requirement = df_job.Requirement.map(clean_text)
df_job.JobTitle = df_job.JobTitle.map(remove_insideparentheses).map(clean_text)

df_applicant.JobDescription = df_applicant.JobDescription.map(clean_text)
df_applicant.Strengthness = df_applicant.Strengthness.map(clean_text)
df_applicant.Position = df_applicant.Position.map(remove_insideparentheses).map(clean_text)



In [10]:
'''cleansing'''
df_job.Description = df_job.Description.apply(lambda x: change_words(slangwords, x))
df_job.Requirement = df_job.Requirement.apply(lambda x: change_words(slangwords, x))
df_job.JobTitle = df_job.JobTitle.apply(lambda x: change_words(slangwords, x))

df_applicant.JobDescription = df_applicant.JobDescription.apply(lambda x: change_words(slangwords, x))
df_applicant.Strengthness = df_applicant.Strengthness.apply(lambda x: change_words(slangwords, x))
df_applicant.Position = df_applicant.Position.apply(lambda x: change_words(slangwords, x))

In [11]:
'''cleansing'''
df_job.Description = df_job.Description.apply(lambda x: change_words(englishwords, x))
df_job.Requirement = df_job.Requirement.apply(lambda x: change_words(englishwords, x))
df_job.JobTitle = df_job.JobTitle.apply(lambda x: change_words(englishwords, x))

df_applicant.JobDescription = df_applicant.JobDescription.apply(lambda x: change_words(englishwords, x))
df_applicant.Strengthness = df_applicant.Strengthness.apply(lambda x: change_words(englishwords, x))
df_applicant.Position = df_applicant.Position.apply(lambda x: change_words(englishwords, x))

In [90]:
'''cleansing'''
df_job.Description = df_job.Description.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))
df_job.Requirement = df_job.Requirement.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))
df_job.JobTitle = df_job.JobTitle.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))

df_applicant.JobDescription = df_applicant.JobDescription.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))
df_applicant.Strengthness = df_applicant.Strengthness.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))
df_applicant.Position = df_applicant.Position.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))

In [None]:
'''cleansing'''
df_job.Description = df_job.Description.apply(lambda x: stemmer_words(stemmer, x))
# df_job.Requirement = df_job.Requirement.apply(lambda x: stemmer_words(stemmer, x))
# df_job.JobTitle = df_job.JobTitle.apply(lambda x: stemmer_words(stemmer, x))

df_applicant.JobDescription = df_applicant.JobDescription.apply(lambda x: stemmer_words(stemmer, x))
# df_applicant.Strengthness = df_applicant.Strengthness.apply(lambda x: stemmer_words(stemmer, x))
# df_applicant.Position = df_applicant.Position.apply(lambda x: stemmer_words(stemmer, x))

In [98]:
df_job

Unnamed: 0_level_0,JobTitle,Description,Requirement
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,malam audit,asisten kantor kelola surabaya,minimal alam posisi milik alam kuat pro sistem...
2,buku jaga,buku jaga,sarjana derajat akuntansi tahun minimal alam p...
3,android programmer,kembang android aplikasi milik integrasi jasa ...,usia maksimal minimal didik sistem informasi k...
4,sekretaris hukum,tugas sekretariat terminologi prosedur dokumen...,didik minimal hukum usia maksimal alam kerja m...
5,sekretaris direktur,bantu tangan kerja atas surat surat atur jadwa...,usia maksimal didik minimal jurus jurus sekret...
...,...,...,...
3090,kelola,pimpin motivator pegawai kelola operasional ha...,usia maksimal didik minimal jurus alam minimal...
3091,pasar staf,,
3092,kelola uang akuntansi,kuasa brevet lapor pajak audit lapor uang anal...,usia maksimal didik jurus akuntansi alam minim...
3093,urus,erti paham atur atur laku serta proses urus ur...,pria wanita usia maksimal didik minimal jurus ...


In [12]:
import gensim.downloader as api
from gensim.models import doc2vec

dataset = list(df_job.Description.apply(lambda x: x.split()).values) + list(df_job.Requirement.apply(lambda x: x.split()).values) + list(df_applicant.JobDescription.apply(lambda x: x.split()).values)
data =[]
for w in dataset:
    data.append(w)

def tagged_document(list_of_ListOfWords):
    for x, ListOfWords in enumerate(list_of_ListOfWords):
        yield doc2vec.TaggedDocument(ListOfWords, [x])

data_train = list(tagged_document(data))


d2v_model = doc2vec.Doc2Vec(vector_size=120, min_count=10, epochs=30, window=2)

d2v_model.build_vocab(data_train)
d2v_model.train(data_train, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

2023-04-19 05:30:01,284 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d120,n5,w2,mc10,s0.001,t3>', 'datetime': '2023-04-19T05:30:01.284552', 'gensim': '4.3.0', 'python': '3.11.0 | packaged by Anaconda, Inc. | (main, Mar  1 2023, 18:18:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
2023-04-19 05:30:01,285 : INFO : collecting all words and their counts
2023-04-19 05:30:01,286 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-04-19 05:30:01,456 : INFO : collected 15238 word types and 5645 unique tags from a corpus of 5645 examples and 512213 words
2023-04-19 05:30:01,457 : INFO : Creating a fresh vocabulary
2023-04-19 05:30:01,479 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=10 retains 3614 unique words (23.72% of original 15238, drops 11624)', 'datetime': '2023-04-19T05:30:01.479554', 'gensim': '4.3.0', 'python': '3.11.0 | packaged by Anaconda, Inc. | (main, Mar  1 2023, 

In [75]:
# clean text
d2v_model.similarity_unseen_docs(df_job.Description.loc[1607].split(), df_applicant.JobDescription.loc[39348].split())

-0.024087548

In [147]:
# clean text > slangwords
d2v_model.similarity_unseen_docs(df_job.Description.loc[1607].split(), df_applicant.JobDescription.loc[39348].split())

0.48006448

In [13]:
# clean text > slangwords > englishwords
d2v_model.similarity_unseen_docs(df_job.Description.loc[1607].split(), df_applicant.JobDescription.loc[39348].split())

0.37745318

In [92]:
# clean text > slangwords > englishwords > stopwords
d2v_model.similarity_unseen_docs(df_job.Description.loc[1607].split(), df_applicant.JobDescription.loc[39348].split())

-0.1910725

In [100]:
# clean text > slangwords > englishwords > stopwords > stemm
d2v_model.similarity_unseen_docs(df_job.Description.loc[1607].split(), df_applicant.JobDescription.loc[39348].split())

-0.049069412

In [164]:
sorted(list(d2v_model.wv.key_to_index))

['-ku',
 '-nya',
 '9',
 'abk',
 'above',
 'absen',
 'absence',
 'absensi',
 'absent',
 'acara',
 'acces',
 'access',
 'accomodation',
 'accompanying',
 'according',
 'accountant',
 'accruing',
 'accuracy',
 'accurately',
 'achieved',
 'achievement',
 'achievements',
 'achieving',
 'acquisition',
 'acting',
 'actions',
 'activation',
 'actively',
 'actual',
 'acuan',
 'ada',
 'adalah',
 'adanya',
 'adapun',
 'add',
 'additional',
 'adjust',
 'administer',
 'administered',
 'administering',
 'administrasi',
 'administratif',
 'administrator',
 'adobe',
 'advance',
 'advisor',
 'advokasi',
 'advokat',
 'against',
 'agama',
 'agar',
 'agen',
 'agendas',
 'agensi',
 'agensty',
 'agreed',
 'agreements',
 'agunan',
 'agung',
 'agustus',
 'ahli',
 'ahu',
 'air',
 'ajb',
 'ajukan',
 'akad',
 'akan',
 'akhir',
 'aki',
 'akomodasi',
 'akrab',
 'akses',
 'akta',
 'akte',
 'aktif',
 'aktiva',
 'aktivasi',
 'aktivitas',
 'akun',
 'akuntansi',
 'akurat',
 'alam',
 'alamat',
 'alarm',
 'alasan',
 'ala

In [111]:
df_job.Description.loc[1607], df_applicant.JobDescription.loc[39348]

('kembang aplikasi ios integrasi layan layan guna analis optimal kode aplikasi efisiensi andal kerja',
 'riset validation mengadakan domain riset cbl challenge berdasarkan belajar kerangka mengatur them into solution konsep aplikasi perkembangan mengembangkan aplikasi technically cepat pemrograman bahasa supported ios kerangka jasa pengguna usability pengujian mengatur pengujian scenarios find prospective users aplikasi pengujian process improvements berdasarkan evaluasi results proyek presentasi hadiah aplikasi hasil peserta mentors apple reviewers')

In [94]:
df_applicant[df_applicant.JobDescription.str.contains(' android ')]

Unnamed: 0_level_0,Position,JobDescription,Strengthness
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21,jawa situs web pengembang android pengembang,cakupan responsibility menganalisa mengembangk...,
3367,mendukung mendukung mendukung mendukung menduk...,bertugas mengisi berisi situs web intern klien...,belajar motto dipelajari hidup
18385,staf,penyelesaian jaringan perangkat keras mengemba...,cepat belajar ramah
21102,magang staf magang staf magang staf,proyek perangkat android pkbl lapangan perangk...,rajin suka belajar keras
22239,teknisi android permainan pengembang,merawat instalasi jaringan komputer penarikan ...,sabar teliti grusa grusu
26272,penjualan pemasaran staf freelance programmer,penjualan pemasaran menganalisis pasar penjual...,analisis kuat teliti mudah bersosialisasi adap...
35606,internship,situs web berdasarkan android devepment proyek...,hardworker multitasking


In [95]:
df_job[df_job.Description.str.contains(' android ')]

Unnamed: 0_level_0,JobTitle,Description,Requirement
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,android programmer,mengembangkan android aplikasi milik integrasi...,usia maksimal minimal pendidikan sistem inform...
156,android programmer,mengembangkan aplikasi android integrasinya la...,usia maksimal minimal pendidikan sistem inform...
2736,android programmer,mengembangkan aplikasi android integrasinya la...,usia maksimal minimal pendidikan sistem inform...
2738,android programmer,mengembangkan aplikasi android integrasinya la...,usia maksimal minimal pendidikan sistem inform...
2848,android programmer,mengembangkan aplikasi android integrasinya la...,usia maksimal minimal pendidikan sistem inform...
2996,android programmer,mengembangkan aplikasi android integrasinya la...,usia maksimal minimal pendidikan sistem inform...


In [None]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [36]:
model = models.Word2Vec(min_count=10, window=2, vector_size=360, sample=6e-5,  alpha=0.03,  min_alpha=0.0007,  negative=20, workers=cores-1)

model.build_vocab(df_job.Description.map(str.split).values, progress_per=10000)
model.train(df_job.Description.map(str.split).values, total_examples=model.corpus_count, epochs=90, report_delay=1)

2023-04-18 22:16:56,186 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=360, alpha=0.03>', 'datetime': '2023-04-18T22:16:56.186588', 'gensim': '4.3.0', 'python': '3.11.0 | packaged by Anaconda, Inc. | (main, Mar  1 2023, 18:18:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
2023-04-18 22:16:56,196 : INFO : collecting all words and their counts
2023-04-18 22:16:56,197 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-18 22:16:56,214 : INFO : collected 1732 word types from a corpus of 38772 raw words and 1008 sentences
2023-04-18 22:16:56,215 : INFO : Creating a fresh vocabulary
2023-04-18 22:16:56,226 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 734 unique words (42.38% of original 1732, drops 998)', 'datetime': '2023-04-18T22:16:56.225519', 'gensim': '4.3.0', 'python': '3.11.0 | packaged by Anaconda, Inc. | (main, Mar  1 2023, 18:18:21) [MSC v.1916 64

(730101, 3489480)

In [38]:
'hukum' in list(model.wv.index_to_key)

True

In [41]:
model.wv.most_similar('program')

[('utamanya', 0.8720858693122864),
 ('mengimplementasikan', 0.8421112298965454),
 ('pengoperasian', 0.7789627313613892),
 ('prima', 0.7574916481971741),
 ('disyaratkan', 0.7399214506149292),
 ('memenuhi', 0.7397032380104065),
 ('baku', 0.7390581369400024),
 ('peralatan', 0.7315595746040344),
 ('keselamatan', 0.7274753451347351),
 ('seminggu', 0.7268062233924866)]