Import Requirements

In [19]:
import os
import re
from datetime import datetime, timedelta
from getpass import getpass
from urllib.parse import quote

import pyodbc

import numpy as np
import pandas as pd

from sqlalchemy import create_engine

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from transformx import remove_html, maintain_alpha, remove_single, remove_morespace, remove_enumerate, clean_text, remove_insideparentheses, remove_standalonesymbols, stopwords_remover

from eris import ErisRecommender

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eats\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Function Definition

In [None]:
def remove_stopwords(stopwords, text):
    """Return `text` with every stopword token removed.

    The text is split on single spaces, tokens that appear in `stopwords`
    are dropped, and the remaining tokens are joined back with single spaces.

    Args:
        stopwords: iterable of words to remove (matched exactly, case-sensitive).
        text: the string to filter.

    Returns:
        The filtered string.
    """
    # Build the lookup once; membership tests against a set are O(1).
    blocked = frozenset(stopwords)

    # Filter into a new sequence instead of calling list.remove() while
    # iterating: removing during iteration shifts the remaining items and
    # makes the loop skip the token right after each removed one.
    return ' '.join(token for token in text.split(' ') if token not in blocked)

Notebook Settings

In [20]:
# Display preferences for rendered DataFrames: no cap on the number of
# columns shown, and cell text truncated at 50 characters.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(option_name, option_value)

Connection

In [21]:
# Connection settings are read from the environment so that the database
# password is never stored in the notebook. Non-secret settings keep their
# previous values as defaults; the password is prompted for when the
# HR_DB_PASSWORD variable is not set.
user = os.environ.get('HR_DB_USER', 'huda')
password = os.environ.get('HR_DB_PASSWORD') or getpass('Database password: ')
host = os.environ.get('HR_DB_HOST', '127.0.0.1')
port = int(os.environ.get('HR_DB_PORT', '1433'))
database = os.environ.get('HR_DB_NAME', 'HRSystemDB')


def get_connection():
    """Create a SQLAlchemy engine for the configured SQL Server database.

    Reads the module-level `user`, `password`, `host`, `port` and `database`
    settings and builds an `mssql+pyodbc` URL from them.

    Returns:
        The engine returned by `create_engine`.
    """
    # Percent-encode the credentials: characters such as '@', ':' or '/'
    # inside a user name or password would otherwise be parsed as URL syntax.
    return create_engine(
        url=f"mssql+pyodbc://{quote(user, safe='')}:{quote(password, safe='')}@{host}:{port}/{database}?driver=SQL Server",
    )

engine = get_connection()
conn = engine.connect()

In [23]:
def _query_to_frame(sql):
    """Run `sql` through the module-level `engine` and wrap the result in a DataFrame."""
    return pd.DataFrame(engine.execute(sql))


# Applicant tables
df_applicant = _query_to_frame('SELECT ApplicantID, DiseaseHistory, Dob, ExpectedSalary, IsAbleToWorkRemote, CityID, ProvinceID, DriverLicenseType, Gender, IsUsingGlasses, Height, MaritalStatus, Nationality, Strengthness, Weaknesses, TypeOfVehicle FROM Applicant')
df_applicant_education = _query_to_frame('SELECT ApplicantEducationID, ApplicantID, DateStart, DateEnd, EducationInstituteName, Score, EducationLevelID, MajorID, Degree FROM ApplicantEducation')
df_applicant_experience = _query_to_frame('SELECT ApplicantExperienceID, ApplicantID, DateFrom, DateTo, Industry, CompanyName, JobDescription, Position, Salary FROM ApplicantExperience')
df_applicant_document = _query_to_frame('SELECT ApplicantID, DocumentName FROM ApplicantDocument')
df_applicant_certificate = _query_to_frame('SELECT ApplicantID, Description FROM ApplicantCertificate')

# Recruitment pipeline tables
df_pipeline = _query_to_frame('SELECT PipelineID, ApplicantID, JobID, StageID FROM Pipeline')
df_stage = _query_to_frame('SELECT StageID, Label FROM Stage')

# Job table and the lookup tables it is joined with
df_job = _query_to_frame('SELECT * FROM Job')
df_function_position = _query_to_frame('SELECT FunctionPositionID, FunctionPositionName FROM FunctionPosition')
df_department = _query_to_frame('SELECT DepartmentID, Name AS DepartmentName FROM Department')
df_city = _query_to_frame('SELECT CityID, Name AS CityName FROM City')
df_province = _query_to_frame('SELECT ProvinceID, Name AS ProvinceName FROM Province')
df_major = _query_to_frame('SELECT MajorID, MajorName FROM Major')
df_education_level = _query_to_frame('SELECT EducationLevelID, EducationLevelName FROM EducationLevel')
df_company = _query_to_frame('SELECT CompanyID, Name AS CompanyName FROM Company')

# df_job = df_job[['JobID', 'Description', 'EducationLevelID', 'FunctionPositionID', 'DepartmentID', 'JobTitle', 'Requirement', 'CityID', 'ProvinceID', 'MajorID']]

In [None]:
# df_job.head(3)

In [26]:
# Job preprocessing: fill missing text, join the lookup tables, clean every
# text column, strip stopwords, build one text document per job and fit the
# recommender on it.

# --- fillna ---
# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute-accessed column, which is chained assignment and is not
# guaranteed to modify df_job.
df_job['Description'] = df_job['Description'].fillna('')
df_job['Requirement'] = df_job['Requirement'].fillna('')

# --- merge ---
# Default (inner) merges: a job without a matching row in a lookup table is dropped.
job_merged = pd.merge(df_job, df_education_level, on=['EducationLevelID'])
job_merged = pd.merge(job_merged, df_city, on=['CityID'])
job_merged = pd.merge(job_merged, df_province, on=['ProvinceID'])
job_merged = pd.merge(job_merged, df_function_position, on=['FunctionPositionID'])
job_merged = pd.merge(job_merged, df_department, on=['DepartmentID'])
job_merged = pd.merge(job_merged, df_major, on=['MajorID'])

# Drop jobs whose title contains 'test' or '123', then keep only the columns
# used below. The explicit column selection also discards the *ID columns.
# .copy() makes the result an independent frame so the column assignments
# that follow do not write into a slice of the merged frame.
is_dummy_job = job_merged.JobTitle.map(str.lower).str.contains('test|123')
job_merged = job_merged.loc[
    ~is_dummy_job,
    ['JobID', 'JobTitle', 'FunctionPositionName', 'EducationLevelName', 'CityName', 'ProvinceName', 'Description', 'Requirement', 'MajorName'],
].copy()

# --- cleansing ---
job_merged['Description'] = job_merged.Description.map(clean_text)

job_merged['Requirement'] = (
    job_merged.Requirement
    .map(remove_html)
    .map(remove_enumerate)
    .map(maintain_alpha)
    .map(remove_single)
    .map(remove_morespace)
    .map(str.strip)
    .map(str.lower)
)

job_merged['JobTitle'] = (
    job_merged.JobTitle
    .map(str.lower)
    .map(remove_insideparentheses)
    .map(remove_standalonesymbols)
    .map(remove_morespace)
)

job_merged['EducationLevelName'] = job_merged.EducationLevelName.replace('None', '').map(str.lower)

job_merged['CityName'] = job_merged.CityName.map(str.lower)
job_merged['ProvinceName'] = job_merged.ProvinceName.map(str.lower)

job_merged['FunctionPositionName'] = (
    job_merged.FunctionPositionName
    .map(remove_standalonesymbols)
    .apply(lambda x: re.sub(r'[\(\)0-9]', '', x))  # raw string: same pattern, no invalid-escape warning
    .map(remove_morespace)
    .map(str.strip)
    .map(str.lower)
)

job_merged['MajorName'] = job_merged.MajorName.map(str.lower)

# --- stopwords ---
sastrawi_stopwords = StopWordRemoverFactory().get_stop_words()
nltk_stopwords_in = stopwords.words('indonesian')
nltk_stopwords_en = stopwords.words('english')
user_stopwords = ['perusahaan', 'sesuai', 'become', 'becoming', 'gunawangsa', 'hotel', 'merr']

# sorted() keeps this a list (as before) but makes its order reproducible across runs.
stopwords_in = sorted(set(sastrawi_stopwords + nltk_stopwords_in + nltk_stopwords_en + user_stopwords))

# --- stopword removal ---
job_stopword_lookup = frozenset(stopwords_in)


def _drop_job_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `job_stopword_lookup`."""
    return ' '.join(word for word in text.split() if word not in job_stopword_lookup)


# Whole-token filtering replaces the previous '(' + ' | '.join(...) + ')' regex.
# In that pattern the first alternative had no leading space and the last had
# no trailing space, so they also matched the end/start of ordinary words and
# cut characters out of them; the stopwords were not regex-escaped either.
job_merged['Description'] = job_merged.Description.map(_drop_job_stopwords).map(remove_morespace).map(str.strip)
job_merged['Requirement'] = job_merged.Requirement.map(_drop_job_stopwords).map(remove_morespace).map(str.strip)

# --- concatenation ---
# One space-separated document per job, in the same column order as before.
job_text_columns = ['FunctionPositionName', 'EducationLevelName', 'MajorName', 'CityName', 'ProvinceName', 'Description', 'Requirement']

job_train = pd.DataFrame([], index=job_merged.index)
job_train['Text'] = job_merged.JobTitle.str.cat(
    [job_merged[column] for column in job_text_columns],
    sep=' ',
)

# --- recommender ---
# `TfidfVectorizer(vocabulary=)` was a syntax error; with no fixed vocabulary
# to pass, the vectorizer learns its vocabulary from the job texts.
encoder = TfidfVectorizer()
bank = encoder.fit_transform(job_train.Text)

eris = ErisRecommender(job_merged, job_train.copy(), 'Text')
eris.fit()
eris.recommend('android')

Applicant

In [10]:
# Applicant preprocessing: load the applicant CSV extracts, join them with the
# lookup tables, turn every attribute into text, strip stopwords, build one
# text document per applicant row and score the first applicant against all jobs.
# Relies on df_education_level, df_major, stopwords_in, job_train and
# job_merged created by earlier cells.

# --- load ---
df_applicant = pd.read_csv('data/cdf_applicant.csv')
df_applicant_experience = pd.read_csv('data/cdf_applicant_experience.csv').fillna('')
df_applicant_education = pd.read_csv('data/cdf_applicant_education.csv')

df_city = pd.read_csv('data/df_city.csv')
df_province = pd.read_csv('data/df_province.csv')

# --- merge ---
df_applicant_education = pd.merge(df_applicant_education, df_education_level, on=['EducationLevelID'])
df_applicant_education = pd.merge(df_applicant_education, df_major, on=['MajorID'])
df_applicant_education = df_applicant_education[['ApplicantID', 'EducationLevelName', 'MajorName']]

df_applicant = df_applicant[['ApplicantID', 'Age', 'CityID', 'ProvinceID', 'Strengthness', 'Weaknesses']]
df_applicant_experience = df_applicant_experience[['ApplicantID', 'Industry', 'JobDescription', 'Position', 'YearsOfExperience']]

app_merged = pd.merge(df_applicant, df_applicant_experience, on=['ApplicantID'])
app_merged = pd.merge(app_merged, df_city, on=['CityID'])
app_merged = pd.merge(app_merged, df_province, on=['ProvinceID'])
app_merged = pd.merge(app_merged, df_applicant_education, on=['ApplicantID'])
app_merged = app_merged.drop(columns=['CityID', 'ProvinceID'])

# --- convert attributes to text features ---
def _number_phrase(prefix, value):
    """Return '<prefix> <value> tahun', or '' when `value` is missing, empty or zero."""
    if value == '' or pd.isna(value) or value == 0:
        return ''
    return prefix + ' ' + str(value) + ' tahun'


# Missing values used to produce phrases such as 'usia nan tahun'; they now
# yield an empty string, the same as a zero value.
app_merged['Age'] = app_merged.Age.apply(lambda x: _number_phrase('usia', x))
app_merged['YearsOfExperience'] = app_merged.YearsOfExperience.apply(lambda x: _number_phrase('pengalaman', x))
app_merged['EducationLevelName'] = app_merged.EducationLevelName.apply(lambda x: 'lulusan ' + x.lower())
app_merged['MajorName'] = app_merged.MajorName.map(str.lower)

# read_csv loads empty cells as NaN; fill them so the text cleaners only see
# strings (the experience frame is filled the same way when it is loaded).
# NOTE(review): assumes clean_text expects a str - confirm against transformx.
app_merged['Strengthness'] = app_merged.Strengthness.fillna('').map(clean_text)
app_merged['Weaknesses'] = app_merged.Weaknesses.fillna('').map(clean_text)

# --- stopword removal ---
app_stopword_lookup = frozenset(stopwords_in)


def _drop_applicant_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `app_stopword_lookup`."""
    return ' '.join(word for word in text.split() if word not in app_stopword_lookup)


# Whole-token filtering replaces the previous '(' + ' | '.join(...) + ')' regex,
# whose first and last alternatives were missing a surrounding space and
# therefore also removed characters from inside ordinary words.
for text_column in ('Strengthness', 'Weaknesses', 'JobDescription'):
    app_merged[text_column] = (
        app_merged[text_column]
        .map(_drop_applicant_stopwords)
        .map(remove_morespace)
        .map(str.strip)
    )

# --- concatenate ---
# One space-separated document per row, in the same column order as before.
app_text_columns = ['EducationLevelName', 'MajorName', 'CityName', 'ProvinceName', 'Age', 'YearsOfExperience', 'Strengthness', 'Weaknesses', 'JobDescription']

app_train = pd.DataFrame([], index=app_merged.index)
app_train['Text'] = app_merged.Position.str.cat(
    [app_merged[column] for column in app_text_columns],
    sep=' ',
)

# --- recommender ---
app_encoder = TfidfVectorizer()
app_bank = app_encoder.fit_transform(app_train.Text)

job_encoder = TfidfVectorizer()
job_bank = job_encoder.fit_transform(job_train.Text)

# Encode the first applicant row with the job vocabulary and compare it with
# every job. iloc[0] selects by position, so it does not depend on the index
# containing the label 0.
codes = job_encoder.transform([app_train.Text.iloc[0]])
dist = cosine_similarity(codes, job_bank).T

# dist has shape (n_jobs, 1); flatten it to one similarity value per job row.
job_merged['Similarity'] = dist.ravel()

Nilai similarity antara applicant dan job masih perlu ditingkatkan. Pada hasil di atas, nilai similarity-nya hanya sekitar 0.2 (20%) meskipun track record applicant sangat sesuai dengan job tersebut; normalnya nilai ini bisa mencapai 75% ke atas. Untuk mencapai hal tersebut diperlukan cleansing teks yang lebih akurat.

In [47]:
# TODO: Done, tinggal menambahkan improvement!