In [2]:
import pyodbc
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

from sqlalchemy import create_engine
from datetime import datetime, timedelta

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from googletrans import Translator

from transform_copy import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eats\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

In [4]:
user = 'huda'
password = 'Vancha12'
host = '127.0.0.1'
port = 1433
database = 'HRSystemDB'


def get_connection():         
    return create_engine(
        url=f"mssql+pyodbc://{user}:{password}@{host}:{port}/{database}?driver=SQL Server",
    )

engine = get_connection()
conn = engine.connect()

In [5]:
df_job = pd.DataFrame(engine.execute(
    """
    SELECT Job.JobID, Job.JobTitle, FunctionPosition.FunctionPositionName, EducationLevel.EducationLevelName, City.Name AS CityName, Province.Name AS ProvinceName, Major.MajorName, Job.Description, Job.Requirement
    FROM (((((Job
    RIGHT JOIN FunctionPosition ON Job.FunctionPositionID = FunctionPosition.FunctionPositionID)
    RIGHT JOIN EducationLevel ON Job.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN City ON Job.CityID = City.CityID)
    RIGHT JOIN Province ON Job.ProvinceID = Province.ProvinceID)
    RIGHT JOIN Major ON Job.MajorID = Major.MajorID)
    WHERE JobStatus='Publish'
    """
))

df_applicant = pd.DataFrame(engine.execute(
    """
    SELECT Applicant.ApplicantID, Applicant.Dob, Applicant.Strengthness, Applicant.Weaknesses, City.Name AS CityName, Province.Name AS ProvinceName
    FROM ((Applicant
    RIGHT JOIN City ON Applicant.CurrentAddressCityID = City.CityID)
    RIGHT JOIN Province ON Applicant.CurrentAddressProvinceID = Province.ProvinceID)
    WHERE ApplicantID=33513
    """
))

df_applicant_education = pd.DataFrame(engine.execute(
    """
    SELECT ApplicantEducation.ApplicantID, ApplicantEducation.DateStart, ApplicantEducation.DateEnd, EducationLevel.EducationLevelName, Major.MajorName
    FROM ((ApplicantEducation
    RIGHT JOIN EducationLevel ON ApplicantEducation.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN Major ON ApplicantEducation.MajorID = Major.MajorID)
    WHERE ApplicantID=33513
    """
))

df_applicant_experience = pd.DataFrame(engine.execute(
    """
    SELECT ApplicantExperience.ApplicantID, ApplicantExperience.DateFrom, ApplicantExperience.DateTo, ApplicantExperience.Industry, ApplicantExperience.JobDescription, ApplicantExperience.Position
    FROM ApplicantExperience
    WHERE ApplicantID=33513
    """
))

Applicant

In [6]:
df_applicant = df_applicant.drop_duplicates()
df_applicant = df_applicant.fillna('')
df_applicant_education = df_applicant_education.fillna('')
df_applicant_experience = df_applicant_experience.fillna('')

'''applicant'''
df_applicant['Age'] = pd.to_datetime(
    df_applicant.Dob.map(pick_date).apply(lambda x: filter_date(x, 1958, 2006))
).map(get_age)

df_applicant.drop(columns=['Dob'], inplace=True)

df_applicant.Age = df_applicant.Age.fillna(0).astype(int)

'''education'''
df_applicant_education.DateStart = pd.to_datetime(
    df_applicant_education.DateStart.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)

df_applicant_education.DateEnd = pd.to_datetime(
    df_applicant_education.DateEnd.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)

df_applicant_education = df_applicant_education[~(df_applicant_education.DateStart.isna()) & ~(df_applicant_education.DateEnd.isna())]
df_applicant_education = df_applicant_education.sort_values('DateStart').groupby(['ApplicantID']).agg('last')

df_applicant_education.drop(columns=['DateStart', 'DateEnd'], inplace=True)

'''experience'''
df_applicant_experience.DateFrom = pd.to_datetime(
    df_applicant_experience.DateFrom.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)

df_applicant_experience.DateTo = pd.to_datetime(
    df_applicant_experience.DateTo.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)

df_applicant_experience = df_applicant_experience[~(df_applicant_experience.DateFrom.isna()) & ~(df_applicant_experience.DateTo.isna())]

# add YearsOfExperience column
df_applicant_experience['YearsOfExperience'] = substract_months(
    df_applicant_experience.DateFrom, df_applicant_experience.DateTo
)

df_applicant_experience = df_applicant_experience.sort_values('DateFrom').groupby(['ApplicantID']).agg({
    'DateFrom': 'last',
    'DateTo': 'last',
    'Industry': ' '.join,
    'JobDescription': ' '.join,
    'Position': ' '.join,
    'YearsOfExperience': 'sum',
})

df_applicant_experience.drop(columns=['DateFrom', 'DateTo'], inplace=True)

'''merge'''
df_applicant = pd.merge(df_applicant, df_applicant_experience, on=['ApplicantID'])
df_applicant = pd.merge(df_applicant, df_applicant_education, on=['ApplicantID'])

'''remove weaknesses'''
df_applicant = df_applicant.drop(columns=['Weaknesses'])

'''preprocessing'''
df_applicant.set_index(['ApplicantID'], inplace=True)

df_applicant[df_applicant.select_dtypes(object).columns] = df_applicant[df_applicant.select_dtypes(object).columns].applymap(str.lower)

for col in ['Strengthness', 'JobDescription', 'Industry', 'Position']:
    df_applicant[col] = df_applicant[col].map(clean_text)

'''translate'''
translator = Translator(service_urls=['translate.googleapis.com'])
for col in ['Strengthness', 'Industry', 'JobDescription', 'Position']:
    df_applicant[col] = df_applicant[col].apply(lambda x: translator.translate(x, dest='id').text)


In [7]:
df_applicant

Unnamed: 0_level_0,Strengthness,CityName,ProvinceName,Age,Industry,JobDescription,Position,YearsOfExperience,EducationLevelName,MajorName
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
33513,kerja tim aktif yang berani dan rendah hati,surabaya,jawa timur,25,industri makanan industri kulit,merencanakan imolement memantau keseluruhan st...,manajemen rantai suplai chieff spg admin kontr...,1,d3,semua jurusan 2


Job

In [8]:
df_job.set_index(['JobID'], inplace=True)
df_job.fillna('', inplace=True)

df_job = df_job.applymap(str.lower)
df_job.EducationLevelName = df_job.EducationLevelName.replace('none', '')

translator = Translator(service_urls=['translate.googleapis.com'])

df_job.JobTitle = df_job.JobTitle.apply(lambda x: translator.translate(x, dest='id').text.lower())
df_job.FunctionPositionName = df_job.FunctionPositionName.apply(lambda x: translator.translate(x, dest='id').text.lower())
df_job.MajorName = df_job.MajorName.apply(lambda x: translator.translate(x, dest='id').text.lower())

df_job.Description = df_job.Description.map(clean_text).apply(lambda x: translator.translate(x, dest='id').text.lower())
df_job.Requirement = df_job.Requirement.map(clean_text).apply(lambda x: translator.translate(x, dest='id').text.lower())

In [9]:
df_job.head(2)

Unnamed: 0_level_0,JobTitle,FunctionPositionName,EducationLevelName,CityName,ProvinceName,MajorName,Description,Requirement
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2775,sekretaris direksi,sekretaris,,surabaya,jawa timur,sekretaris,melakukan aktivitas kesekretariatan perusahaan...,usia maksimal 35 tahun pendidikan minimal s1 j...
2785,staff desain,grafis desain,s1,jakarta,dki jakarta,desain grafis,membuat desain yang menarik untuk kebutuhan pe...,usia max 30 tahun pendidikan minimal s1 desain...


In [10]:
import tensorflow_datasets as tfds

ratings = tfds.load("movielens/100k-ratings", split="train")



In [62]:
ds_job = tf.data.Dataset.from_tensor_slices(dict(df_job))


'''encoding'''
job_title = tf.keras.layers.StringLookup()
job_title.adapt(ds_job.map(lambda x: x['JobTitle']))

job_education = tf.keras.layers.StringLookup()
job_education.adapt(ds_job.map(lambda x: x['EducationLevelName']))

job_description = tf.keras.layers.TextVectorization(output_sequence_length=300)
job_description.adapt(ds_job.map(lambda x: x['Description']))

job_requirement = tf.keras.layers.TextVectorization()
job_requirement.adapt(ds_job.map(lambda x: x['Requirement']))


'''embedding'''
job_title_embedding = tf.keras.layers.Embedding(
    input_dim=job_title.vocabulary_size(),
    output_dim=32
)

job_education_embedding = tf.keras.layers.Embedding(
    input_dim=job_education.vocabulary_size(),
    output_dim=32
)

job_description_embedding = tf.keras.layers.Embedding(
    input_dim=job_description.vocabulary_size(),
    output_dim=32
)

job_requirement_embedding = tf.keras.layers.Embedding(
    input_dim=job_requirement.vocabulary_size(),
    output_dim=32
)


'''model'''
job_title_model = tf.keras.Sequential([job_title, job_title_embedding])
job_education_model = tf.keras.Sequential([job_education, job_education_embedding])
job_description_model = tf.keras.Sequential([job_description, job_description_embedding])
job_requirement_model = tf.keras.Sequential([job_requirement, job_requirement_embedding])

In [63]:
ds_applicant = tf.data.Dataset.from_tensor_slices(dict(df_applicant))


'''enconding'''
app_position = tf.keras.layers.StringLookup()
app_position.adapt(ds_applicant.map(lambda x: x['Position']))

app_education = tf.keras.layers.StringLookup()
app_education.adapt(ds_applicant.map(lambda x: x['EducationLevelName']))

app_description = tf.keras.layers.TextVectorization(output_sequence_length=300)
app_description.adapt(ds_applicant.map(lambda x: x['JobDescription']))

app_strengthness = tf.keras.layers.TextVectorization()
app_strengthness.adapt(ds_applicant.map(lambda x: x['Strengthness']))


'''embedding'''
app_position_embedding = tf.keras.layers.Embedding(
    input_dim=app_position.vocabulary_size(),
    output_dim=32
)

app_education_embedding = tf.keras.layers.Embedding(
    input_dim=app_education.vocabulary_size(),
    output_dim=32
)

app_description_embedding = tf.keras.layers.Embedding(
    input_dim=app_description.vocabulary_size(),
    output_dim=32
)

app_strengthness_embedding = tf.keras.layers.Embedding(
    input_dim=app_strengthness.vocabulary_size(),
    output_dim=32
)


'''model'''
app_position_model = tf.keras.Sequential([app_position, app_position_embedding])
app_education_model = tf.keras.Sequential([app_education, app_education_embedding])
app_description_model = tf.keras.Sequential([app_description, app_description_embedding])
app_strengthness_model = tf.keras.Sequential([app_strengthness, app_strengthness_embedding])

In [64]:
cosine_sim = tf.keras.losses.CosineSimilarity(axis=1)

In [66]:
jobdesc = job_description_model(df_job.Description.values)
appdesc = app_description_model(df_applicant.JobDescription.values)

In [67]:
jobdesc.shape, appdesc.shape

(TensorShape([42, 300, 32]), TensorShape([1, 300, 32]))

In [128]:
similarity = []
for i in jobdesc:
    similarity.append(cosine_sim(i, appdesc[0]))

In [129]:
similarity = [float(i)*100 for i in similarity]

In [130]:
df_job['Similarity'] = similarity

In [131]:
df_applicant

Unnamed: 0_level_0,Strengthness,CityName,ProvinceName,Age,Industry,JobDescription,Position,YearsOfExperience,EducationLevelName,MajorName
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
33513,kerja tim aktif yang berani dan rendah hati,surabaya,jawa timur,25,industri makanan industri kulit,merencanakan imolement memantau keseluruhan st...,manajemen rantai suplai chieff spg admin kontr...,1,d3,semua jurusan 2


In [138]:
df_job.sort_values(by=['Similarity'], ascending=False).head(5)

Unnamed: 0_level_0,JobTitle,FunctionPositionName,EducationLevelName,CityName,ProvinceName,MajorName,Description,Requirement,Similarity
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3034,marketing communication (marcomm),komunikasi pemasaran,s1,jakarta,dki jakarta,semua jurusan,menerapkan rencana promosi mengatur acara prom...,kandidat harus memiliki setidaknya gelar sarja...,21.210168
2966,supervisor pajak,akuntansi,s1,jakarta,dki jakarta,akuntansi,usia maksimal 35 tahun pendidikan minimal s1 a...,mengusai sistem accounting menguasai system la...,21.095556
2933,akuntansi pengawas,akuntansi,s1,jakarta,dki jakarta,akuntansi,mengontrol laporan konstruksi baru mengontrol ...,max 30 tahun minimal pendidikan s1 ekonomi dan...,20.982513
2981,penelitian & pengembangan staf,pengembangan bisnis,s1,jakarta,dki jakarta,semua jurusan,melakukan negosiasi dengan pemilik lahan swast...,laki laki atau perempuan usia maksimal 35 tahu...,20.819232
3088,pengembangan bisnis senior/spv,pengembangan bisnis,d3,jakarta,dki jakarta,semua jurusan,melakukan negosiasi dengan pemilik lahan swast...,skill requirement wanita 20 40 thn penampilan ...,20.819232


In [14]:
metrics = tfrs.metrics.FactorizedTopK(
    candidates=ds_job.batch(128).map(lambda x: x['JobTitle'])
)

In [15]:
task = tfrs.tasks.Retrieval(
    metrics=metrics
)

In [16]:
# ds_job.element_spec

# for row in ds_job.batch(2).map(lambda x: x['JobTitle']).take(1):
#     print(job_title(row))

# job_title.get_vocabulary()

# job_title(["bisnis", "auditor anak kepemimpinan dan bisa melakukan auditor"])

In [17]:
model = tf.keras.models.Sequential()

model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(job_title)

model.predict(df_job.JobTitle.values)



array([[ 1],
       [10],
       [26],
       [27],
       [40],
       [34],
       [19],
       [ 8],
       [25],
       [ 3],
       [13],
       [18],
       [20],
       [14],
       [ 1],
       [33],
       [ 6],
       [28],
       [36],
       [24],
       [38],
       [ 1],
       [37],
       [16],
       [21],
       [ 9],
       [31],
       [15],
       [17],
       [11],
       [39],
       [ 2],
       [32],
       [ 4],
       [ 7],
       [23],
       [29],
       [35],
       [12],
       [30],
       [ 5],
       [22]], dtype=int64)