In [1]:
import getpass
import os
import re
from urllib.parse import quote

import numpy as np
import pandas as pd
import pyodbc
import tensorflow as tf

from sqlalchemy import create_engine
from sqlalchemy import text as sql_text
from sklearn.metrics.pairwise import cosine_similarity

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from googletrans import Translator

from FlaskApp.transform import *

from job_model import JobModel
from app_model import AppModel

In [2]:
# Notebook display settings: never hide columns when a frame is rendered,
# and cap the rendered width of each cell at 50 characters.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 50

In [3]:
# Connection settings.
# Non-secret values fall back to the defaults this notebook used before, so an
# unconfigured environment still targets the same server and database.
# The password is deliberately NOT stored in the notebook: it is read from the
# HRSYSTEM_DB_PASSWORD environment variable, or prompted for when that is unset.
# NOTE(review): a password was previously committed in this cell — rotate it.
user = os.environ.get('HRSYSTEM_DB_USER', 'huda')
password = os.environ.get('HRSYSTEM_DB_PASSWORD') or getpass.getpass('HRSystemDB password: ')
host = os.environ.get('HRSYSTEM_DB_HOST', '127.0.0.1')
port = int(os.environ.get('HRSYSTEM_DB_PORT', 1433))
database = os.environ.get('HRSYSTEM_DB_NAME', 'HRSystemDB')


def get_connection():
    """Create the SQLAlchemy engine for the SQL Server database.

    Reads the module-level ``user``, ``password``, ``host``, ``port`` and
    ``database`` settings and connects through the ``mssql+pyodbc`` dialect
    using the ODBC driver named "SQL Server".

    Returns:
        The engine returned by ``sqlalchemy.create_engine``.
    """
    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot be mistaken for URL delimiters.
    safe_user = quote(user, safe='')
    safe_password = quote(password, safe='')
    return create_engine(
        # '+' in the query string decodes to a space, i.e. driver "SQL Server".
        url=f"mssql+pyodbc://{safe_user}:{safe_password}@{host}:{port}/{database}?driver=SQL+Server",
    )

engine = get_connection()
conn = engine.connect()

In [28]:
# NOTE(review): applicant_id is not referenced by any query in this cell —
# presumably it is consumed elsewhere; confirm or remove.
applicant_id = 31790


def _fetch_frame(sql):
    """Run one SELECT statement and return its rows as a DataFrame.

    A connection is opened for the statement and released when the ``with``
    block exits. Column names are taken from the result itself, so a query
    that returns no rows still yields a frame with the expected columns.

    Args:
        sql: The SQL text to execute (no bound parameters are supplied).

    Returns:
        pandas.DataFrame with one column per selected expression.
    """
    with engine.connect() as connection:
        result = connection.execute(sql_text(sql))
        columns = list(result.keys())
        # Plain tuples keep pandas' row handling independent of the row class
        # the installed SQLAlchemy version returns.
        rows = [tuple(row) for row in result]
    return pd.DataFrame(rows, columns=columns)


# --- lookup tables ---
df_function = _fetch_frame(
    """
    SELECT FunctionPositionID, FunctionPositionName
    FROM FunctionPosition
    """
)

df_education = _fetch_frame(
    """
    SELECT EducationLevelID, EducationLevelName
    FROM EducationLevel
    """
)

df_city = _fetch_frame(
    """
    SELECT CityID, Name AS CityName
    FROM City
    """
)

df_province = _fetch_frame(
    """
    SELECT ProvinceID, Name AS ProvinceName
    FROM Province
    """
)

df_major = _fetch_frame(
    """
    SELECT MajorID, MajorName
    FROM Major
    """
)

# --- applicant tables, all restricted to pipeline rows with StageID = 11 ---
# One row per (applicant, pipeline/job) match, with city and province names.
df_applicant = _fetch_frame(
    """
    SELECT Applicant.ApplicantID, Applicant.Dob, Applicant.ExpectedSalary, City.Name AS CityName, Province.Name AS ProvinceName, Applicant.DriverLicenseType, Applicant.IsUsingGlasses, Applicant.Gender, Applicant.MaritalStatus, Applicant.Strengthness, Job.JobTitle
    FROM ((((Applicant
    RIGHT JOIN City ON Applicant.CurrentAddressCityID = City.CityID)
    RIGHT JOIN Province ON Applicant.CurrentAddressProvinceID = Province.ProvinceID)
    LEFT JOIN Pipeline ON Applicant.ApplicantID = Pipeline.ApplicantID)
    LEFT JOIN Job ON Pipeline.JobID = Job.JobID)
    WHERE StageID=11
    """
)

df_applicant_education = _fetch_frame(
    """
    SELECT ApplicantEducation.ApplicantID, ApplicantEducation.DateStart, ApplicantEducation.DateEnd, EducationLevel.EducationLevelName, Major.MajorName
    FROM (((ApplicantEducation
    RIGHT JOIN EducationLevel ON ApplicantEducation.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN Major ON ApplicantEducation.MajorID = Major.MajorID)
    LEFT JOIN Pipeline ON ApplicantEducation.ApplicantID = Pipeline.ApplicantID)
    WHERE StageID=11
    """
)

df_applicant_experience = _fetch_frame(
    """
    SELECT ApplicantExperience.ApplicantID, ApplicantExperience.DateFrom, ApplicantExperience.DateTo, ApplicantExperience.Position, ApplicantExperience.JobDescription
    FROM (ApplicantExperience
    LEFT JOIN Pipeline ON ApplicantExperience.ApplicantID = Pipeline.ApplicantID)
    WHERE StageID=11
    """
)

In [34]:
# How many loaded applicant rows there are per job title.
df_applicant['JobTitle'].value_counts()

ENGINEERING STAFF             22
LEGAL STAFF                   18
DRIVER                        15
PROJECT OFFICER               14
ACCOUNTING STAFF              11
                              ..
SITE MANAGER                   1
Technical Support              1
ADMIN GA                       1
ADMINISTRASI                   1
TEKNISI (MAINTENANCE) BALI     1
Name: JobTitle, Length: 232, dtype: int64

In [30]:
# applicant: drop exact duplicate rows, blank out missing values, and replace
# the Dob column with an integer Age column.
df_applicant = df_applicant.drop_duplicates().fillna('')

# Dob values go through pick_date, then filter_date with the 1958-2006 bounds,
# before being parsed; get_age turns each parsed date into an age.
filtered_dob = df_applicant['Dob'].map(pick_date).apply(lambda raw: filter_date(raw, 1958, 2006))
df_applicant['Age'] = pd.to_datetime(filtered_dob).map(get_age)

df_applicant = df_applicant.drop(columns=['Dob'])

# Rows without a usable age end up with Age == 0.
df_applicant['Age'] = df_applicant['Age'].fillna(0).astype(int)

In [11]:
# Preview of the education frame. The saved output is indexed by ApplicantID
# and has no date columns, i.e. it was rendered after the education clean-up
# cell that follows (execution count 11 vs 10).
df_applicant_education.head(n=3)

Unnamed: 0_level_0,EducationLevelName,MajorName
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1
14,S1,HUKUM
128,S1,PSIKOLOGI
522,S2,HUKUM


In [10]:
# education: parse the date columns, discard rows without both dates, then keep
# the last row per applicant after sorting by DateStart.
df_applicant_education = df_applicant_education.fillna('')

for _date_col in ('DateStart', 'DateEnd'):
    # pick_date, then filter_date with the 1980-2023 bounds, then parse.
    df_applicant_education[_date_col] = pd.to_datetime(
        df_applicant_education[_date_col]
        .map(pick_date)
        .apply(lambda raw: filter_date(raw, 1980, 2023))
    )

df_applicant_education = (
    df_applicant_education
    .dropna(subset=['DateStart', 'DateEnd'])
    .sort_values('DateStart')
    .groupby(['ApplicantID'])
    .last()
    # The dates were only needed for filtering and ordering.
    .drop(columns=['DateStart', 'DateEnd'])
)

In [14]:
# Preview of the experience frame. The saved output already contains the
# aggregated YearsOfExperience column, i.e. it was rendered after the
# experience clean-up cell that follows (execution count 14 vs 13).
df_applicant_experience.head(n=3)

Unnamed: 0_level_0,JobDescription,Position,YearsOfExperience
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
128,"<p>mempromosikan sebuah event dan exhibition,&...",Freelance Crew Freelance Marketing,3
775,"<p style=""language:id;margin-top:0pt;margin-bo...",Legal Staff,1
841,"<p class=""MsoListParagraphCxSpFirst"" style=""ma...",Personalia dan Konselor Staff Rekruitment HRD ...,4


In [13]:
# experience: one row per applicant with all positions / job descriptions
# concatenated (oldest first) and the per-entry durations summed.

# The experience query LEFT JOINs Pipeline without selecting any Pipeline
# column, so an applicant matching several pipeline rows gets each experience
# row repeated verbatim. Drop the exact repeats first; otherwise the text is
# concatenated more than once and YearsOfExperience is over-counted.
df_applicant_experience = df_applicant_experience.drop_duplicates().fillna('')

for _date_col in ('DateFrom', 'DateTo'):
    # pick_date, then filter_date with the 1980-2023 bounds, then parse.
    df_applicant_experience[_date_col] = pd.to_datetime(
        df_applicant_experience[_date_col]
        .map(pick_date)
        .apply(lambda raw: filter_date(raw, 1980, 2023))
    )

# Keep only entries where both dates survived parsing. The explicit copy makes
# the column assignment below write to an independent frame rather than to a
# slice of the unfiltered one (avoids SettingWithCopyWarning).
df_applicant_experience = df_applicant_experience.dropna(subset=['DateFrom', 'DateTo']).copy()

# add YearsOfExperience column
# NOTE(review): the helper is named substract_months while the column is
# labelled years — confirm the unit it returns.
df_applicant_experience['YearsOfExperience'] = substract_months(
    df_applicant_experience.DateFrom, df_applicant_experience.DateTo
)

# Sorting by DateFrom fixes the order in which the text fields are joined.
# The date columns are not aggregated, so they are absent from the result.
df_applicant_experience = df_applicant_experience.sort_values('DateFrom').groupby(['ApplicantID']).agg({
    'JobDescription': ' '.join,
    'Position': ' '.join,
    'YearsOfExperience': 'sum',
})

In [17]:
# Preview of the applicant frame. The saved output includes the experience and
# education columns, i.e. it was rendered after the merge cell that follows
# (execution count 17 vs 16).
df_applicant.head(n=3)

Unnamed: 0,ApplicantID,ExpectedSalary,CityName,ProvinceName,DriverLicenseType,IsUsingGlasses,Gender,MaritalStatus,Strengthness,Age,JobDescription,Position,YearsOfExperience,EducationLevelName,MajorName
0,33513,3500000.0,SURABAYA,JAWA TIMUR,C,False,Female,Single,"Humble, brave, Active, Teamwork",25,"<p>- Plan, imolement, monitor the overall of s...",SUPPLY CHAIN MANAGEMENT & CHIEFF SPG ADMIN QUA...,1,D3,SEMUA JURUSAN 2
1,31861,15000000.0,SURABAYA,JAWA TIMUR,A,False,Male,Married,"Loyal, honest, persistent",51,"<p>Omset , sales, profit, tax, maintenance, dl...",Asisten direksi and operasional Direksi,8,S1,MANAJEMEN
2,31891,4500000.0,SURABAYA,JAWA TIMUR,A,True,Male,Single,"Rapi, teratur, disiplin, multitasking, dan ber...",30,"<p>Mengenalkan product LED, sound, lighting, d...",Product and Sales Event Production Product and...,2,S1,PSIKOLOGI


In [16]:
# merge: attach the per-applicant experience and education rows.
# Both joins use pandas' default inner join on ApplicantID, so applicants
# missing from either frame are not carried forward.
df_applicant = (
    df_applicant
    .merge(df_applicant_experience, on='ApplicantID')
    .merge(df_applicant_education, on='ApplicantID')
)

In [None]:
# Sastrawi stemmer, built through its factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# googletrans client pointed at the translate.googleapis.com endpoint.
translator = Translator(service_urls=['translate.googleapis.com'])

In [None]:
# NOTE(review): `df_job`, `slangwords` and `stopwords` are not defined in the
# cells visible here — presumably `df_job` is loaded in another cell and the
# word lists come from the `FlaskApp.transform` star import; confirm before
# relying on Restart & Run All.


def _translate_column(series):
    """Pass every value of ``series`` through ``translate_teks`` with the shared translator.

    Args:
        series: pandas Series of text values.

    Returns:
        A new Series holding whatever ``translate_teks`` returns per value.
    """
    return series.apply(lambda value: translate_teks(translator, value))


def _normalize_text_column(series):
    """Apply the shared free-text pipeline to every value of ``series``.

    Steps, in order: ``clean_text``, ``maintain_alphabet``, ``remove_single``,
    ``remove_morespace``, ``translate_teks``, ``stemmer_words``, split on single
    spaces, ``change_slangwords``, drop tokens found in ``stopwords``, re-join
    with single spaces.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series of space-joined token strings.
    """
    return (
        series
        .map(clean_text)
        .map(maintain_alphabet)
        .map(remove_single)
        .map(remove_morespace)
        .apply(lambda value: translate_teks(translator, value))
        .apply(lambda value: stemmer_words(stemmer, value))
        .apply(lambda value: value.split(' '))
        .apply(lambda tokens: change_slangwords(slangwords, tokens))
        .apply(lambda tokens: [token for token in tokens if token not in stopwords])
        .apply(lambda tokens: ' '.join(tokens))
    )


# ----- PRE-PROCESSING APPLICANT -----
df_applicant = df_applicant.set_index(['ApplicantID'])

app_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'Position', 'JobDescription', 'Strengthness']
app_num = ['Age', 'ExpectedSalary']
app_bol = ['IsUsingGlasses']

# Keep only the model features, in numeric / boolean / string order. The copy
# makes the assignments below write to an independent frame rather than to a
# slice of the merged one (avoids SettingWithCopyWarning).
df_applicant = df_applicant[app_num + app_bol + app_str].copy()

# str
df_applicant[app_str] = df_applicant[app_str].applymap(str.lower)

for _col in ('JobDescription', 'Strengthness'):
    df_applicant[_col] = _normalize_text_column(df_applicant[_col])

for _col in ('Position', 'MajorName'):
    df_applicant[_col] = _translate_column(df_applicant[_col])

# Concatenate JobDescription and Strengthness into one textual feature.
df_applicant['JobDescription'] = df_applicant['JobDescription'].str.cat(
    df_applicant['Strengthness'], sep=' '
)
df_applicant = (
    df_applicant
    .rename(columns={'JobDescription': 'DescriptionStrengthness'})
    .drop(columns=['Strengthness'])
)

# bool -> lowercase string ('true' / 'false' for real booleans)
df_applicant['IsUsingGlasses'] = df_applicant['IsUsingGlasses'].astype(str).map(str.lower)


# ----- PRE-PROCESSING JOB -----
job_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'JobTitle', 'FunctionPositionName', 'Description', 'Requirement']
job_num = ['UsiaMax', 'SalaryMin', 'SalaryMax']
job_bol = ['UsingGlasses']

# general
# df_job is modified in place, as before, so any other reference to the same
# frame keeps seeing the processed data.
df_job.set_index(['JobID'], inplace=True)
df_job.fillna('', inplace=True)

# str
df_job[job_str] = df_job[job_str].applymap(str.lower)
# Cells that are exactly 'none' after lower-casing are treated as empty.
df_job.replace('none', '', inplace=True)

for _col in ('JobTitle', 'FunctionPositionName', 'MajorName'):
    df_job[_col] = _translate_column(df_job[_col])

for _col in ('Description', 'Requirement'):
    df_job[_col] = _normalize_text_column(df_job[_col])

# Concatenate JobTitle and FunctionPositionName into one textual feature.
df_job['JobTitle'] = df_job['JobTitle'].str.cat(
    df_job['FunctionPositionName'], sep=' '
)
df_job.rename(columns={'JobTitle': 'JobTitlePosition'}, inplace=True)
df_job.drop(columns=['FunctionPositionName'], inplace=True)

# Concatenate Description and Requirement into one textual feature.
df_job['Description'] = df_job['Description'].str.cat(
    df_job['Requirement'], sep=' '
)
df_job.rename(columns={'Description': 'DescriptionRequirement'}, inplace=True)
df_job.drop(columns=['Requirement'], inplace=True)

# int: blanks left by fillna('') become 0 before the integer cast
df_job[job_num] = df_job[job_num].replace('', 0).astype(int)

# Mean of SalaryMin and SalaryMax (integer division); means below 1,000,000
# are reset to 0.
# NOTE(review): when only one of the two bounds is filled in, the other is 0
# here and the mean is halved — confirm that this is intended.
df_job['SalaryMin'] = (df_job['SalaryMax'] + df_job['SalaryMin']) // 2
df_job.rename(columns={'SalaryMin': 'SalaryMean'}, inplace=True)
df_job['SalaryMean'] = df_job['SalaryMean'].apply(lambda mean: 0 if mean < 1_000_000 else mean)
df_job.drop(columns=['SalaryMax'], inplace=True)

# bool -> lowercase string
df_job['UsingGlasses'] = df_job['UsingGlasses'].astype(str).map(str.lower)


# ----- lookup tables used as vocabularies -----
df_function['FunctionPositionName'] = (
    df_function['FunctionPositionName']
    .map(str.lower)
    .apply(function_replacement)
    .map(remove_insideparentheses)
    .map(remove_morespace)
    .map(str.strip)
)
df_education['EducationLevelName'] = df_education['EducationLevelName'].map(str.lower)
df_city['CityName'] = df_city['CityName'].map(str.lower)
df_province['ProvinceName'] = df_province['ProvinceName'].map(str.lower)
df_major['MajorName'] = df_major['MajorName'].map(str.lower)

In [None]:
# JobModel receives the processed job frame plus the five lookup tables;
# AppModel receives that job model together with the processed applicant frame.
job_model = JobModel(
    df_job,
    df_function,
    df_education,
    df_city,
    df_province,
    df_major,
)
app_model = AppModel(job_model, df_applicant)