In [221]:
import pyodbc
import re
import json
import spacy

import numpy as np
import pandas as pd
import tensorflow as tf

from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity

from collections import defaultdict
from gensim import corpora, models, similarities
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from gensim.models.word2vec import Word2Vec

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import gensim.downloader as api
from gensim.models import doc2vec

from googletrans import Translator
from deep_translator import GoogleTranslator

from FlaskApp.transform import *

from job_model import JobModel
from app_model import AppModel

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import multiprocessing

In [222]:
# translator = Translator(service_urls=['translate.googleapis.com'])
factory = StemmerFactory()
stemmer = factory.create_stemmer()

rootwords = [i[0] for i in pd.read_csv('data/rootwords.txt').values]
stopwords = [i[0] for i in pd.read_csv('data/stopwords.csv').values]

with open('data/slangwords.json', 'r') as file:
    slangwords = json.load(file)

with open('data/englishwords.json', 'r') as file:
    englishwords = json.load(file)

with open('data/slangjobs.json', 'r') as file:
    slangjobs = json.load(file)

cores = multiprocessing.cpu_count()

In [139]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

In [140]:
user = 'huda'
password = 'Vancha12'
host = '127.0.0.1'
port = 1433
database = 'HRSystemDB'


def get_connection():         
    return create_engine(
        url=f"mssql+pyodbc://{user}:{password}@{host}:{port}/{database}?driver=SQL Server",
    )

engine = get_connection()
conn = engine.connect()

In [205]:
applicant_id = 31790

df_job = pd.DataFrame(engine.execute(
    """
    SELECT Job.JobID, Job.UsiaMax, Job.SalaryMin, Job.SalaryMax, City.Name AS CityName, Province.Name AS ProvinceName, EducationLevel.EducationLevelName, Major.MajorName, Job.DriverLicenseType, Job.UsingGlasses, Job.Gender, Job.MaritalStatus, Job.JobTitle, FunctionPosition.FunctionPositionName, Job.Description, Job.Requirement
    FROM (((((Job
    RIGHT JOIN FunctionPosition ON Job.FunctionPositionID = FunctionPosition.FunctionPositionID)
    RIGHT JOIN EducationLevel ON Job.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN City ON Job.CityID = City.CityID)
    RIGHT JOIN Province ON Job.ProvinceID = Province.ProvinceID)
    RIGHT JOIN Major ON Job.MajorID = Major.MajorID)
    """
))

df_function = pd.DataFrame(engine.execute(
    """
    SELECT FunctionPositionID, FunctionPositionName
    FROM FunctionPosition
    """
))

df_education = pd.DataFrame(engine.execute(
    """
    SELECT EducationLevelID, EducationLevelName
    FROM EducationLevel
    """
))

df_city = pd.DataFrame(engine.execute(
    """
    SELECT CityID, Name AS CityName
    FROM City
    """
))

df_province = pd.DataFrame(engine.execute(
    """
    SELECT ProvinceID, Name AS ProvinceName
    FROM Province
    """
))

df_major = pd.DataFrame(engine.execute(
    """
    SELECT MajorID, MajorName
    FROM Major
    """
))

df_applicant = pd.DataFrame(engine.execute(
    f"""
    SELECT Applicant.ApplicantID, Applicant.Dob, Applicant.ExpectedSalary, City.Name AS CityName, Province.Name AS ProvinceName, Applicant.DriverLicenseType, Applicant.IsUsingGlasses, Applicant.Gender, Applicant.MaritalStatus, Applicant.Strengthness
    FROM (((Applicant
    RIGHT JOIN City ON Applicant.CurrentAddressCityID = City.CityID)
    RIGHT JOIN Province ON Applicant.CurrentAddressProvinceID = Province.ProvinceID)
    LEFT JOIN Pipeline ON Applicant.ApplicantID = Pipeline.ApplicantID)
    """
))

df_applicant_education = pd.DataFrame(engine.execute(
    f"""
    SELECT ApplicantEducation.ApplicantID, ApplicantEducation.DateStart, ApplicantEducation.DateEnd, EducationLevel.EducationLevelName, Major.MajorName
    FROM (((ApplicantEducation
    RIGHT JOIN EducationLevel ON ApplicantEducation.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN Major ON ApplicantEducation.MajorID = Major.MajorID)
    LEFT JOIN Pipeline ON ApplicantEducation.ApplicantID = Pipeline.ApplicantID)
    """
))

df_applicant_experience = pd.DataFrame(engine.execute(
    f"""
    SELECT ApplicantExperience.ApplicantID, ApplicantExperience.DateFrom, ApplicantExperience.DateTo, ApplicantExperience.Position, ApplicantExperience.JobDescription
    FROM (ApplicantExperience
    LEFT JOIN Pipeline ON ApplicantExperience.ApplicantID = Pipeline.ApplicantID)
    """
))

In [207]:
job_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'JobTitle', 'FunctionPositionName', 'Description', 'Requirement']
job_num = ['UsiaMax', 'SalaryMin', 'SalaryMax']
job_bol = ['UsingGlasses']

'''general'''
df_job.set_index(['JobID'], inplace=True)
df_job.fillna('', inplace=True)

'''str'''
df_job[job_str] = df_job[job_str].applymap(str.lower)
df_job.replace('none', '', inplace=True)

In [208]:
'''applicant'''
df_applicant = df_applicant.dropna(subset=['ApplicantID'])
df_applicant.ApplicantID = df_applicant.ApplicantID.astype(int)
df_applicant = df_applicant.drop_duplicates()
df_applicant = df_applicant.fillna('')
# age column
df_applicant['Age'] = pd.to_datetime(
    df_applicant.Dob.map(pick_date).apply(lambda x: filter_date(x, 1958, 2006))
).map(get_age)

df_applicant.drop(columns=['Dob'], inplace=True)

df_applicant.Age = df_applicant.Age.fillna(0).astype(int)

In [209]:
'''education'''
df_applicant_education = df_applicant_education.fillna('')
# datetime column
df_applicant_education.DateStart = pd.to_datetime(
    df_applicant_education.DateStart.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)
df_applicant_education.DateEnd = pd.to_datetime(
    df_applicant_education.DateEnd.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)
df_applicant_education = df_applicant_education[~(df_applicant_education.DateStart.isna()) & ~(df_applicant_education.DateEnd.isna())]
df_applicant_education = df_applicant_education.sort_values('DateStart').groupby(['ApplicantID']).agg('last')
df_applicant_education.drop(columns=['DateStart', 'DateEnd'], inplace=True)

In [210]:
'''experience'''
df_applicant_experience = df_applicant_experience.fillna('')

# datetime column
df_applicant_experience.DateFrom = pd.to_datetime(
    df_applicant_experience.DateFrom.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)
df_applicant_experience.DateTo = pd.to_datetime(
    df_applicant_experience.DateTo.map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
)
df_applicant_experience = df_applicant_experience[~(df_applicant_experience.DateFrom.isna()) & ~(df_applicant_experience.DateTo.isna())]

# add YearsOfExperience column
df_applicant_experience['YearsOfExperience'] = substract_months(
    df_applicant_experience.DateFrom, df_applicant_experience.DateTo
)
df_applicant_experience = df_applicant_experience.sort_values('DateFrom').groupby(['ApplicantID']).agg({
    'DateFrom': 'last',
    'DateTo': 'last',
    'JobDescription': ' '.join,
    'Position': ' '.join,
    'YearsOfExperience': 'sum',
})
df_applicant_experience.drop(columns=['DateFrom', 'DateTo'], inplace=True)

df_applicant_experience = df_applicant_experience[df_applicant_experience.YearsOfExperience != 0]

In [211]:
'''merge'''
df_applicant = pd.merge(df_applicant, df_applicant_experience, on=['ApplicantID'])
df_applicant = pd.merge(df_applicant, df_applicant_education, on=['ApplicantID'])

'''PRE-PROCESSING APPLICANT'''
df_applicant.set_index(['ApplicantID'], inplace=True)

app_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'Position', 'JobDescription', 'Strengthness']
app_num = ['Age', 'ExpectedSalary', 'YearsOfExperience']
app_bol = ['IsUsingGlasses']

df_applicant = df_applicant[app_num + app_bol + app_str]

'''str'''
df_applicant[app_str] = df_applicant[app_str].applymap(str.lower)


'''bool'''
df_applicant.IsUsingGlasses = df_applicant.IsUsingGlasses.astype(str).map(str.lower)

In [334]:
def remove_stopwords(stopwords: list, text: str):
    list_text = text.split()

    new_text = []
    for word in list_text:
        if word not in stopwords:
            new_text.append(word)
    return ' '.join(new_text)

In [351]:
job_slangwords1 = {
    "r&d": "research development",
    "f&a": "finance accountant",
    "pod&wms": "proof delivery warehouse management system",
    "f&b": "food beverage",
    "r&a": "research analyst",
    "m&e": "mechanical engineer",
    "p&ga": "procurement general affair",
}
job_slangwords2 = {
    "affairs": "affair",
    "hr": "human resources development",
    "hrd": "human resources development",
    "intern": "internship",
    "asisten": "assistant",
    "teknisi": "technician",
    "freelance": "freelancer",
    "ahli": "specialist",
    "akuntansi": "accountant",
    "analis": "analyst",
    "anggota": "member",
    "asistant": "assistant",
    "asistant": "assistant",
    "assistan": "assistant",
    "assisten": "assistant",
    "audit": "auditor",
    "banking": "bank",
    "billing": "bill",
    "bm": "building maintenance",
    "bussiness": "business",
    "cabang": "branch",
    "ceo": "chief executive officer",
    "cmo": "chief marketing officer",
    "consultan": "consultant",
    "coordinator": "supervisor",
    "cs": "customer service",
    "departemen": "department",
    "dept": "department",
    "desain": "design",
    "designer": "design",
    "direktur": "director",
    "divisi": "division",
    "dokumen": "document",
    "dosen": "lecturer",
    "fb": "food beverage",
    "financial": "finance",
    "ga": "general affair",
    "csr": "corporate social responsibility",
    "gudang": "warehouse",
    "gm": "genelar manager",
    "guru": "teacher",
    "hr": "human resources development",
    "hrd": "human resources development",
    "hrga": "human resources development general affair",
    "hukum": "legal",
    "humas": "public relation",
    "ijin": "permission",
    "informasi": "information",
    "infrastruktur": "infrastructure",
    "intern": "internship",
    "intership": "internship",
    "jurnalis": "journalist",
    "kantor": "office",
    "karyawan": "staff",
    "kepala": "chief",
    "kerja": "work",
    "keuangan": "finance",
    "komputer": "computer",
    "konsultan": "consultant",
    "kontrak": "contract",
    "koordinator": "supervisor",
    "kredit": "credit",
    "kurir": "courier",
    "produksi": "production",
    "teknik": "technician",
    "marcomm": "marketing communication",
    "pajak": "tax",
    "lapangan": "field",
    "qc": "quality control",
    "paralegal": "legal",
    "personalia": "personal",
    "pengawas": "supervisor",
    "asisstant": "assistant",
    "penjualan": "sales",
    "mekanik": "mechanical",
    "mesin": "engineer",
    "pengajar": "teacher",
    "services": "service",
    "genelar": "general",
    "manajemen": "management",
    "resepsionis": "receptionist",
    "qa": "quality assurance",
    "toko": "store",
    "assitant": "assistant",
    "kordinator": "supervisor",
    "operations": "operation",
    "sekertaris": "secretary",
    "pelayanan": "service",
    "pegawai": "staff",
    "jaringan": "network",

    "ga": "general affair",
    "spv": "supervisor",
    "asst": "assistant",
    "arsitek": "architect",
    "lokasi": "location",
    "perencana": "planner",
    "proyek": "project",
    "sekretaris": "secretary",
    "direksi": "director",
    "administrasi": "administration",
    "aplicant": "applicant",
    "execuitve": "executive",
    "acc": "account",
    "accont": "account",
    "grafis": "graphic",
    "desainer": "design",
    "magang": "internship",
    "admin": "administration",
    "operasional": "operational",
    "umum": "general",
    "tehnisi": "technician",
    "me": "mechanical engineer",
    "technican": "technician",
    "acara": "event",
    "koordinator": "supervisor",
    "payble": "payable",
    "accounting": "accountant",
    "staf": "staff",
    "accouting": "accountant",
    "salesfinance": "sales finance",
    "asistaint": "asistant",
    "architech": "architect",
    "supervisior": "supervisor",
    "adimin": "admin",
    "engineering": "engineer",
    "kasir": "cashier",
    "fotografer": "photographer",
    "academiccontributing": "academic contributing",
    "acccount": "account",
    "acccounting": "accountant",
    "accoumting": "accountant",
    "accountantaccounting": "accountant",
    "accountingsupervisor": "accountant supervisor",
    "accountung": "accountant",
    "acount": "account",
    "acounting": "accountant",
    "acquasition": "acquisition",
    "admim": "administration",
    "admininstrasi": "administration",
    "administation": "administration",
    "administative": "administration",
    "administator": "administration",
    "administrasiaccounting": "accountant administration",
    "administrasiiklan": "advertising administration",
    "administrasion": "administration",
    "administrasistaff": "administration staff",
    "administrasito": "administration",
    "administrasu": "administration",
    "administratif": "administration",
    "administrationaccounting": "accountant administration",
    "administrative": "administration",
    "administrator": "administration",
    "administratorproperty": "property administration",
    "administtrator": "administration",
    "adminitrasi": "administration",
    "adminitration": "administration",
    "adminstrative": "administration",
    "advanced": "advance",
    "advidor": "advisor",
    "advocate": "advocat",
    "advokad": "advocat",
    "advokasi": "advocat",
    "advokat": "advocat",
    "afair": "affair",
    "affaif": "affair",
    "affail": "affair",
    "affakr": "affair",
    "agency": "agent",
    "agen": "agent",
    "agendaris": "planner",
    "ahi": "specialist",
    "akademikstaff": "academic staff",
    "coordinator": "supervisor",
    "akuntaing": "accountant",
    "akuntan": "accountant",
    "akuntansi": "accountant",
    "akuntansistaff": "accountant staff",
    "akunting": "accountant",
    "akuntingback": "accountant",
    "analis": "analyst",
    "analisis": "analyst",
    "analist": "analyst",
    "analys": "analyst",
    "analysis": "analyst",
    "analystedp": "analyst edp",
    "analysts": "analyst",
    "analyze": "analyst",
    "anggotaketua": "member",
    "announcermanager": "announcer manager",
    "apartemen": "apartment",
    "aplikasi": "application",
    "apotek": "supervisor",
    "app": "supervisor",
    "apprentice": "supervisor",
    "apprentices": "supervisor",
    "apps": "supervisor",
    "aquisition": "supervisor",
    "aquistion": "supervisor",
    "architectural": "supervisor",
    "architecture": "supervisor",
}

job_stopwords = ['dan', 'and', 'of', 'at', 'bagian', 'di', 'for', 'in', 'on', 'to', 'sebagai', 'the']

In [373]:
job_title = pd.concat([df_job.JobTitle, df_applicant_experience.Position]).reset_index(drop=True)

job_title = job_title.map(
    str.lower
).map(
    remove_parenthesesnumber
).map(
    remove_standalonesymbols
).map(
    remove_morespace
).map(
    str.strip
).apply(
    lambda x: ' '.join(list(set(x.split())))
).apply(
    lambda x: change_words(job_slangwords1, x)
).apply(
    lambda x: remove_stopwords(job_stopwords, x)
).map(
    clean_text
).apply(
    lambda x: change_words(job_slangwords2, x)
)



In [439]:
w2v_model = models.Word2Vec(min_count=2, window=2, vector_size=120, sample=6e-5,  alpha=0.03,  min_alpha=0.0007,  negative=20, workers=cores-1)

w2v_model.build_vocab(job_title.map(str.split).values, progress_per=10000)
w2v_model.train(job_title.map(str.split).values, total_examples=w2v_model.corpus_count, epochs=9, report_delay=1)

2023-04-19 14:02:05,340 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=120, alpha=0.03>', 'datetime': '2023-04-19T14:02:05.340751', 'gensim': '4.3.1', 'python': '3.10.0 | packaged by conda-forge | (default, Nov 10 2021, 13:20:59) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2023-04-19 14:02:05,343 : INFO : collecting all words and their counts
2023-04-19 14:02:05,344 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-19 14:02:05,348 : INFO : collected 1902 word types from a corpus of 19451 raw words and 5430 sentences
2023-04-19 14:02:05,348 : INFO : Creating a fresh vocabulary
2023-04-19 14:02:05,352 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 803 unique words (42.22% of original 1902, drops 1099)', 'datetime': '2023-04-19T14:02:05.352713', 'gensim': '4.3.1', 'python': '3.10.0 | packaged by conda-forge | (default, Nov 10 2021, 13:20:59) [MSC v.1916 64

(30622, 175059)

In [421]:
(w2v_model.wv.similarity('ios', 'java') + w2v_model.wv.similarity('developer', 'programmer'))/2

0.9945213794708252

In [440]:
w2v_model.wv.n_similarity('research analyst'.split(), 'cleaning'.split())

0.9791732

In [400]:
w2v_model.wv.similarity('advocat', 'waiter')

0.99722224

In [411]:
cosine_similarity(w2v_model.wv['advocat'].reshape(1, -1), w2v_model.wv['waiter'].reshape(1, -1))

array([[0.9972223]], dtype=float32)

In [441]:
words = w2v_model.wv.index_to_key
vectors = w2v_model.wv.vectors

d2v_words = d2v_model.wv.index_to_key
d2v_vectors = d2v_model.wv.vectors

In [442]:
with open('data/vectors.tsv', 'w', encoding='utf-8') as f:
    for i, word in enumerate(words):
        vector_str = '\t'.join([str(num) for num in vectors[i]])
        f.write(f'{vector_str}\n')

with open('data/metadata.tsv', 'w', encoding='utf-8') as f:
    for word in words:
        f.write(f'{word}\n')

In [407]:
with open('data/d2v_vectors.tsv', 'w', encoding='utf-8') as f:
    for i, word in enumerate(d2v_words):
        vector_str = '\t'.join([str(num) for num in d2v_vectors[i]])
        f.write(f'{vector_str}\n')

with open('data/d2v_metadata.tsv', 'w', encoding='utf-8') as f:
    for word in d2v_words:
        f.write(f'{word}\n')

In [None]:
for i,j in zip(pd.Series(' '.join(job_title.values).split()).value_counts().index, pd.Series(' '.join(job_title.values).split()).value_counts()):
    print(j, i)

In [None]:
corpus = list(set(' '.join(job_title.values).split()))
corpus.sort()

for i in corpus:
    print(i)

In [343]:
for i in sorted(set(job_title.values)):
    print(i)


ac maintenance
ac maintenance operational general manager technician waiter
ac staff technician mechanical engineer technician
access network engineer
account executive
account executive frontliner
account payable
account receivable
account service customer executive
account staff payable
accountant
accountant asisstant finance manager
accountant assistant internship
accountant assistant lawyer
accountant cashier
accountant finance chief manager tax
accountant finance manager account secretary administration
accountant finance staff
accountant junior auditor asistant architect freelancer supervisor project
accountant staff
accountant staff general
accountant staff junior auditor
accountant supervisor finance pjs kasie trainee
accountant support assistant staff sales finance it manager tax administration
accountant taxation staff
acquisition user volunteer
act leader
activator frontliner
adh head administration
adjunct teacher internship
admin supervisor marketing communication team ev

In [342]:
for i in sorted(w2v_model.wv.index_to_key):
    print(i)

academic
account
accountant
acquisition
adm
administration
advisor
advocat
affair
agent
analyst
android
announcer
ar
architect
area
artist
as
asisstant
ass
asset
assistant
assitant
associate
assurance
atm
attendant
auditor
back
bali
bancassurance
bank
banquet
bar
barista
beverage
bill
biro
bod
branch
brand
building
business
buyer
call
capital
captain
card
care
cash
cashier
casual
center
centre
channel
chief
civil
claim
cleaning
clerk
collection
commercial
communication
community
compliance
computer
construction
consultant
consumer
contact
content
contract
control
controller
cook
corporate
cost
counter
courier
creative
creator
credit
crew
cum
customer
data
database
department
deputy
design
desk
desktop
developer
development
digital
director
distribution
division
doc
document
drafter
driver
editor
edp
education
electrical
engineer
english
entertainment
entry
estimator
event
executive
fa
fasilitator
field
finance
food
foodcourt
foreman
freelancer
front
frontliner
funding
genelar
general
g

In [9]:
df_job = df_job[['JobTitle', 'Description', 'Requirement']]
df_applicant = df_applicant[['Position', 'JobDescription', 'Strengthness']]

Clean Text

In [10]:
'''cleansing'''
df_job.Description = df_job.Description.map(clean_text)
df_job.Requirement = df_job.Requirement.map(clean_text)
df_job.JobTitle = df_job.JobTitle.map(remove_insideparentheses).map(clean_text)

df_applicant.JobDescription = df_applicant.JobDescription.map(clean_text)
df_applicant.Strengthness = df_applicant.Strengthness.map(clean_text)
df_applicant.Position = df_applicant.Position.map(remove_insideparentheses).map(clean_text)



Slangwords

In [10]:
'''cleansing'''
df_job.Description = df_job.Description.apply(lambda x: change_words(slangwords, x))
df_job.Requirement = df_job.Requirement.apply(lambda x: change_words(slangwords, x))
df_job.JobTitle = df_job.JobTitle.apply(lambda x: change_words(slangwords, x))

df_applicant.JobDescription = df_applicant.JobDescription.apply(lambda x: change_words(slangwords, x))
df_applicant.Strengthness = df_applicant.Strengthness.apply(lambda x: change_words(slangwords, x))
df_applicant.Position = df_applicant.Position.apply(lambda x: change_words(slangwords, x))

Englishwords

In [11]:
'''cleansing'''
df_job.Description = df_job.Description.apply(lambda x: change_words(englishwords, x))
df_job.Requirement = df_job.Requirement.apply(lambda x: change_words(englishwords, x))
df_job.JobTitle = df_job.JobTitle.apply(lambda x: change_words(englishwords, x))

df_applicant.JobDescription = df_applicant.JobDescription.apply(lambda x: change_words(englishwords, x))
df_applicant.Strengthness = df_applicant.Strengthness.apply(lambda x: change_words(englishwords, x))
df_applicant.Position = df_applicant.Position.apply(lambda x: change_words(englishwords, x))

Stopwords

In [90]:
'''cleansing'''
df_job.Description = df_job.Description.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))
df_job.Requirement = df_job.Requirement.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))
df_job.JobTitle = df_job.JobTitle.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))

df_applicant.JobDescription = df_applicant.JobDescription.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))
df_applicant.Strengthness = df_applicant.Strengthness.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))
df_applicant.Position = df_applicant.Position.apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords]))

Stemming

In [None]:
'''cleansing'''
df_job.Description = df_job.Description.apply(lambda x: stemmer_words(stemmer, x))
# df_job.Requirement = df_job.Requirement.apply(lambda x: stemmer_words(stemmer, x))
# df_job.JobTitle = df_job.JobTitle.apply(lambda x: stemmer_words(stemmer, x))

df_applicant.JobDescription = df_applicant.JobDescription.apply(lambda x: stemmer_words(stemmer, x))
# df_applicant.Strengthness = df_applicant.Strengthness.apply(lambda x: stemmer_words(stemmer, x))
# df_applicant.Position = df_applicant.Position.apply(lambda x: stemmer_words(stemmer, x))

In [89]:
df_job

Unnamed: 0_level_0,JobTitle,Description,Requirement
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,night audit,under asst front office manager at gunawangsa ...,with year minimum experiences in the same posi...
2,book keeper,becoming book keeper for gunawangsa hotel merr,bachelor degree of accounting with years minim...
3,it android programmer,developing of android application and their in...,usia maksimal tahun minimal pendidikan it atau...
4,sekretaris legal,melakukan tugas kesekretariatan menggunakan te...,pendidikan minimal hukum usia maksimal tahun p...
5,sekretaris direksi,bantu handle pekerjaan atasan membuat surat me...,usia maksimal tahun pendidikan min semua jurus...
...,...,...,...
3090,general manager hotel,memimpin perusahaan dan menjadi motivator bagi...,usia maksimal tahun pendidikan minimal segala ...
3091,marketing staff,,
3092,manager finance accounting,menguasi brevet dan pelaporan pajak dapat memb...,usia maksimal tahun pendidikan jurusan akuntan...
3093,general affair,mengerti dan memahami peraturan perundangan ya...,pria wanita usia maksimal tahun pendidikan min...


In [46]:
def df_to_tokenize(df_column: pd.Series):
    return list(df_column.apply(lambda x: x.split()).values)

In [401]:
# dataset = list(df_job.Description.apply(lambda x: x.split()).values) + list(df_job.Requirement.apply(lambda x: x.split()).values) + list(df_job.JobTitle.apply(lambda x: x.split()).values) + list(df_applicant.JobDescription.apply(lambda x: x.split()).values) + list(df_applicant.Strengthness.apply(lambda x: x.split()).values) + list(df_applicant.Position.apply(lambda x: x.split()).values)
dataset = job_title.map(str.split).values
data =[]
for w in dataset:
    data.append(w)

def tagged_document(list_of_ListOfWords):
    for x, ListOfWords in enumerate(list_of_ListOfWords):
        yield doc2vec.TaggedDocument(ListOfWords, [x])

data_train = list(tagged_document(data))


d2v_model = doc2vec.Doc2Vec(vector_size=120, min_count=10, epochs=30, window=2)

d2v_model.build_vocab(data_train)
d2v_model.train(data_train, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

2023-04-19 13:35:53,824 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d120,n5,w2,mc10,s0.001,t3>', 'datetime': '2023-04-19T13:35:53.824569', 'gensim': '4.3.1', 'python': '3.10.0 | packaged by conda-forge | (default, Nov 10 2021, 13:20:59) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2023-04-19 13:35:53,825 : INFO : collecting all words and their counts
2023-04-19 13:35:53,825 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-04-19 13:35:53,832 : INFO : collected 1902 word types and 5430 unique tags from a corpus of 5430 examples and 19451 words
2023-04-19 13:35:53,832 : INFO : Creating a fresh vocabulary
2023-04-19 13:35:53,834 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=10 retains 201 unique words (10.57% of original 1902, drops 1701)', 'datetime': '2023-04-19T13:35:53.834544', 'gensim': '4.3.1', 'python': '3.10.0 | packaged by conda-forge | (default, Nov 10 2021, 13:20

In [403]:
# clean text
d2v_model.similarity_unseen_docs(df_job.JobTitle.loc[1607].split(), df_applicant.Position.loc[39348].split())

0.9960553

In [147]:
# clean text > slangwords
d2v_model.similarity_unseen_docs(df_job.Description.loc[1607].split(), df_applicant.JobDescription.loc[39348].split())

0.48006448

In [13]:
# clean text > slangwords > englishwords
d2v_model.similarity_unseen_docs(df_job.Description.loc[1607].split(), df_applicant.JobDescription.loc[39348].split())

0.37745318

In [92]:
# clean text > slangwords > englishwords > stopwords
d2v_model.similarity_unseen_docs(df_job.Description.loc[1607].split(), df_applicant.JobDescription.loc[39348].split())

-0.1910725

In [100]:
# clean text > slangwords > englishwords > stopwords > stemm
d2v_model.similarity_unseen_docs(df_job.Description.loc[1607].split(), df_applicant.JobDescription.loc[39348].split())

-0.049069412

In [63]:
for i in sorted(list(d2v_model.wv.key_to_index)):
    print(i)

acara
account
accountant
accounting
adm
admin
administrasi
administration
administrative
administrator
advisor
advokat
affair
affairs
agent
ahli
akta
akuntansi
analis
analyst
and
anggota
ar
architect
area
arsitek
asisstant
asistant
asisten
ass
asset
assistant
assisten
assitant
associate
asst
assurance
at
audit
auditor
back
bagian
banking
barista
bartender
beverage
branch
building
business
bussiness
buyer
call
capital
card
care
cash
cashier
casual
center
ceo
chief
civil
claim
cleaning
collection
communication
computer
consultan
consultant
contact
content
control
controller
cook
coordinator
corporate
cost
creative
creator
credit
crew
cs
csr
customer
dan
data
departement
department
deputy
desain
desainer
design
designer
desk
developer
development
di
digital
director
direksi
direktur
divisi
division
document
dokumen
dosen
drafter
driver
editor
edp
education
electrical
engineer
engineering
english
entry
estimator
event
executive
field
finance
financial
food
foodcourt
for
freelance
front
fro

In [404]:
df_job.Description.loc[1607], df_applicant.JobDescription.loc[39348]

('<ul><li>mengembangkan aplikasi ios dan integrasinya dengan layanan back-and sesuai dengan layanan pengguna</li><li>menganalisis dan mengoptimalkan kode aplikasi untuk efisiensi, reabilitas dan kinerja</li></ul>',
 '<div class="page" title="page 1"><div class="section"><div class="layoutarea"><div class="column"><p><span style="font-size: 11pt; font-family: arial; font-weight: 700; color: rgb(3, 12, 15);">research and problem validation</span></p><p><span style="font-size: 11pt; font-family: arialmt; color: rgb(3, 12, 15);">conduct general and domain research using cbl (challenge based learning) framework and arrange them into solution concepts.</span></p><p><span style="font-size: 11pt; font-family: arial; font-weight: 700; color: rgb(3, 12, 15);">application development</span></p><p><span style="font-size: 11pt; font-family: arialmt; color: rgb(3, 12, 15);">developing application technically using swift programming language and supported by ios framework and services.</span></p><p><

In [405]:
df_job.JobTitle.loc[1607], df_applicant.Position.loc[39348]

('ios developer', 'ios developer')

In [94]:
df_applicant[df_applicant.JobDescription.str.contains(' android ')]

Unnamed: 0_level_0,Position,JobDescription,Strengthness
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21,jawa situs web pengembang android pengembang,cakupan responsibility menganalisa mengembangk...,
3367,mendukung mendukung mendukung mendukung menduk...,bertugas mengisi berisi situs web intern klien...,belajar motto dipelajari hidup
18385,staf,penyelesaian jaringan perangkat keras mengemba...,cepat belajar ramah
21102,magang staf magang staf magang staf,proyek perangkat android pkbl lapangan perangk...,rajin suka belajar keras
22239,teknisi android permainan pengembang,merawat instalasi jaringan komputer penarikan ...,sabar teliti grusa grusu
26272,penjualan pemasaran staf freelance programmer,penjualan pemasaran menganalisis pasar penjual...,analisis kuat teliti mudah bersosialisasi adap...
35606,internship,situs web berdasarkan android devepment proyek...,hardworker multitasking


In [95]:
df_job[df_job.Description.str.contains(' android ')]

Unnamed: 0_level_0,JobTitle,Description,Requirement
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,android programmer,mengembangkan android aplikasi milik integrasi...,usia maksimal minimal pendidikan sistem inform...
156,android programmer,mengembangkan aplikasi android integrasinya la...,usia maksimal minimal pendidikan sistem inform...
2736,android programmer,mengembangkan aplikasi android integrasinya la...,usia maksimal minimal pendidikan sistem inform...
2738,android programmer,mengembangkan aplikasi android integrasinya la...,usia maksimal minimal pendidikan sistem inform...
2848,android programmer,mengembangkan aplikasi android integrasinya la...,usia maksimal minimal pendidikan sistem inform...
2996,android programmer,mengembangkan aplikasi android integrasinya la...,usia maksimal minimal pendidikan sistem inform...
