In [2]:
import os
import re
from getpass import getpass
from urllib.parse import quote

import numpy as np
import pandas as pd
import pyodbc
import tensorflow as tf

from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from googletrans import Translator

from FlaskApp.transform import *

from job_model import JobModel
from app_model import AppModel

In [None]:
# Shared helpers for the text pre-processing cells further down.
# Translation client restricted to the translate.googleapis.com endpoint;
# it is passed to translate_teks() wherever text is translated.
translator = Translator(service_urls=['translate.googleapis.com'])
# Sastrawi stemmer; it is passed to stemmer_words() in the text pipelines.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [3]:
# Display settings: show every column of a DataFrame, but cut each cell off at 50 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

In [4]:
# Connection settings for the HR SQL Server database.
# Each value can be overridden with an environment variable; the non-secret
# settings keep their previous values as defaults.
user = os.environ.get('HRSYSTEM_DB_USER', 'huda')
# The password is never stored in the notebook: it is read from the environment,
# and the user is prompted for it when the variable is not set.
password = os.environ.get('HRSYSTEM_DB_PASSWORD') or getpass('HRSystemDB password: ')
host = os.environ.get('HRSYSTEM_DB_HOST', '127.0.0.1')
port = int(os.environ.get('HRSYSTEM_DB_PORT', 1433))
database = os.environ.get('HRSYSTEM_DB_NAME', 'HRSystemDB')


def get_connection():
    """Create the SQLAlchemy engine for the configured SQL Server database.

    The URL is built from the module-level ``user``, ``password``, ``host``,
    ``port`` and ``database`` settings and uses the ``SQL Server`` ODBC driver
    through pyodbc.

    Returns:
        The engine returned by ``sqlalchemy.create_engine``.
    """
    # Percent-encode the credentials and the driver name so that characters such
    # as '@', ':' or '/' inside a password cannot corrupt the URL.
    return create_engine(
        url=(
            f"mssql+pyodbc://{quote(user, safe='')}:{quote(password, safe='')}"
            f"@{host}:{port}/{database}?driver={quote('SQL Server', safe='')}"
        ),
    )

engine = get_connection()
# NOTE(review): `conn` is not used by any cell visible in this notebook; it is kept
# for compatibility, but it stays open until the kernel stops unless closed explicitly.
conn = engine.connect()

In [5]:
# Working notes (a bare string, echoed as the cell output): which job column is
# compared with which applicant column, grouped by feature type.
# NOTE(review): "JobTile" below presumably refers to the Job.JobTitle column
# selected by the job query in the next cell.
'''
JOB>>>MATCH<<<APPLICANT

> numerical
UsiaMax == Dob
SalaryMean == ExpectedSalary

> categorical
CityName == CurrentCityName
ProvinceName == CurrentProvinceName
EducationLevelName
MajorName
DriverLicenseType
IsUsingGlasses
Gender
MaritalStatus

> textual
JobTile, FunctionPositionName == Position
Description, Requirement == JobDescription, Strengthness
'''

'\nJOB>>>MATCH<<<APPLICANT\n\n> numerical\nUsiaMax == Dob\nSalaryMean == ExpectedSalary\n\n> categorical\nCityName == CurrentCityName\nProvinceName == CurrentProvinceName\nEducationLevelName\nMajorName\nDriverLicenseType\nIsUsingGlasses\nGender\nMaritalStatus\n\n> textual\nJobTile, FunctionPositionName == Position\nDescription, Requirement == JobDescription, Strengthness\n'

In [6]:
applicant_id = 31790  # NOTE(review): not referenced by any query in this cell

# Applicants are loaded only when they have a Pipeline row at this stage.
STAGE_ID = 9


def _read_table(sql):
    """Run one SELECT statement against ``engine`` and return the rows as a DataFrame.

    ``pd.read_sql_query`` names the columns from the result-set metadata, so the
    frame keeps its column names even when the query returns no rows (building a
    DataFrame from the bare row list does not). ``coerce_float=False`` leaves
    non-float numeric values as the driver returned them.
    """
    return pd.read_sql_query(sql, engine, coerce_float=False)


# Published jobs with their lookup names resolved.
# NOTE(review): JobStatus is presumably a Job column; if so, rows that exist only on
# the right-hand side of these RIGHT JOINs are removed by the WHERE clause, which makes
# the joins behave like INNER JOINs (a job without a city, major, ... is dropped).
# Confirm that this is intended.
df_job = _read_table(
    """
    SELECT Job.JobID, Job.UsiaMax, Job.SalaryMin, Job.SalaryMax,
           City.Name AS CityName, Province.Name AS ProvinceName,
           EducationLevel.EducationLevelName, Major.MajorName,
           Job.DriverLicenseType, Job.UsingGlasses, Job.Gender, Job.MaritalStatus,
           Job.JobTitle, FunctionPosition.FunctionPositionName,
           Job.Description, Job.Requirement
    FROM (((((Job
    RIGHT JOIN FunctionPosition ON Job.FunctionPositionID = FunctionPosition.FunctionPositionID)
    RIGHT JOIN EducationLevel ON Job.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN City ON Job.CityID = City.CityID)
    RIGHT JOIN Province ON Job.ProvinceID = Province.ProvinceID)
    RIGHT JOIN Major ON Job.MajorID = Major.MajorID)
    WHERE JobStatus='Publish'
    """
)

# Lookup tables, later used as vocabularies by the job model.
df_function = _read_table(
    """
    SELECT FunctionPositionID, FunctionPositionName
    FROM FunctionPosition
    """
)

df_education = _read_table(
    """
    SELECT EducationLevelID, EducationLevelName
    FROM EducationLevel
    """
)

df_city = _read_table(
    """
    SELECT CityID, Name AS CityName
    FROM City
    """
)

df_province = _read_table(
    """
    SELECT ProvinceID, Name AS ProvinceName
    FROM Province
    """
)

df_major = _read_table(
    """
    SELECT MajorID, MajorName
    FROM Major
    """
)

# The three applicant queries keep an applicant when at least one Pipeline row with
# the selected stage exists. EXISTS is used instead of joining Pipeline because a join
# repeats every applicant/education/experience row once per matching Pipeline row,
# which would inflate the experience aggregation done later in the notebook.
df_applicant = _read_table(
    f"""
    SELECT Applicant.ApplicantID, Applicant.Dob, Applicant.ExpectedSalary,
           City.Name AS CityName, Province.Name AS ProvinceName,
           Applicant.DriverLicenseType, Applicant.IsUsingGlasses, Applicant.Gender,
           Applicant.MaritalStatus, Applicant.Strengthness
    FROM ((Applicant
    RIGHT JOIN City ON Applicant.CurrentAddressCityID = City.CityID)
    RIGHT JOIN Province ON Applicant.CurrentAddressProvinceID = Province.ProvinceID)
    WHERE EXISTS (
        SELECT 1 FROM Pipeline
        WHERE Pipeline.ApplicantID = Applicant.ApplicantID AND Pipeline.StageID = {STAGE_ID}
    )
    """
)

df_applicant_education = _read_table(
    f"""
    SELECT ApplicantEducation.ApplicantID, ApplicantEducation.DateStart, ApplicantEducation.DateEnd,
           EducationLevel.EducationLevelName, Major.MajorName
    FROM ((ApplicantEducation
    RIGHT JOIN EducationLevel ON ApplicantEducation.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN Major ON ApplicantEducation.MajorID = Major.MajorID)
    WHERE EXISTS (
        SELECT 1 FROM Pipeline
        WHERE Pipeline.ApplicantID = ApplicantEducation.ApplicantID AND Pipeline.StageID = {STAGE_ID}
    )
    """
)

df_applicant_experience = _read_table(
    f"""
    SELECT ApplicantExperience.ApplicantID, ApplicantExperience.DateFrom, ApplicantExperience.DateTo,
           ApplicantExperience.Position, ApplicantExperience.JobDescription
    FROM ApplicantExperience
    WHERE EXISTS (
        SELECT 1 FROM Pipeline
        WHERE Pipeline.ApplicantID = ApplicantExperience.ApplicantID AND Pipeline.StageID = {STAGE_ID}
    )
    """
)

In [9]:
# Preview of the applicant frame.
# NOTE(review): the saved output already shows the Age column and no Dob column,
# i.e. it was produced after the applicant-cleaning cell that appears further down.
df_applicant.head(3)

Unnamed: 0,ApplicantID,ExpectedSalary,CityName,ProvinceName,DriverLicenseType,IsUsingGlasses,Gender,MaritalStatus,Strengthness,Age
0,33513,3500000.0,SURABAYA,JAWA TIMUR,C,False,Female,Single,"Humble, brave, Active, Teamwork",25
1,31690,4500000.0,SURABAYA,JAWA TIMUR,C,False,Male,Single,"Jujur, mampu bekerja secara tim, bertanggung j...",27
3,31861,15000000.0,SURABAYA,JAWA TIMUR,A,False,Male,Married,"Loyal, honest, persistent",51


In [19]:
# Stop-word list used by the text pipelines: the first column of
# data/stopwords.csv followed by extra terms maintained by hand.
csv_stopwords = pd.read_csv('data/stopwords.csv').astype(str).iloc[:, 0].tolist()

extra_stopwords = [
    'usia', 'maksimal', 'didik', 'minimal', 'laki', 'perempuan', 'alam', 'milik',
    'sd', 'smp', 'sma', 's1', 's2', 's3', 'd1', 'd2', 'd3', 'd4',
    'kuasa', 'awas', 'kumpul', 'pria', 'jurus', 'sedia', 'sarjana', 'diploma',
    'magister', 'bidang', 'kandidat', 'skill', 'oriented', 'tampil', 'tarik',
    'good', 'up', 'paham', 'intermediate', 'baca', 'gambar', 'badan',
    'laku', 'laku', 'kena', 'shooting', 'meni', 'buat', 'ipk', 'lulus', 'suka',
    'atu', 'hadap', 'penuh', 'jam', 'tara', 'non', 'warna', 'warni', 'smk',
    'shift', 'familiar', 'laksana', 'gelar', 'sitac', 'tipu', 'nya', 'ii', 'iii',
    'salam', 'to', 'wwm', 'kece', 'kesiap', 'seluru', 'angkat', 'usaha', 'upaya',
    'masuk', 'kait', 'syarat', 'susun', 'awat', 'masuk', 'tugas',
]

stopwords = csv_stopwords + extra_stopwords

# Token replacement table handed to change_slangwords() in the text pipelines.
# Keys are abbreviations, misspellings, English terms or inflected forms;
# values are the replacement tokens.
# NOTE(review): the pipelines call change_slangwords() before the stop-word filter,
# and several replacement values ('sedia', 'maksimal', 'minimal', 'milik', 'buat',
# 'angkat') are also in the stop-word list - presumably those tokens end up removed.
slangwords = {
    'max': 'maksimal',
    'div': 'divisi',
    'tenant': 'sewa',
    'hokum': 'hukum',
    'branding': 'promosi',
    'stok': 'sedia',
    'termasukaplikasikomputer': 'komputer',
    'telekomunikais': 'telekomunikasi',
    'perinci': 'rinci',
    'risalah': 'surat',
    'mereview': 'ulas',
    'klien': 'langgan',
    'thn': 'tahun',
    'th': 'tahun',
    'min': 'minimal',
    'miniman': 'minimal',
    'jurnalid': 'jurnalis',
    'mayob': 'myob',
    'paja': 'pajak',
    'perban': 'bank',
    'maksimum': 'maksimal',
    'minimum': 'minimal',
    'bhs': 'bahasa',
    'pengorganisasia': 'organisasi',
    'analitis': 'analisis',
    'analisa': 'analisis',
    'memimiliki': 'milik',
    'manejer': 'manajer',
    'menejer': 'manajer',
    'bussines': 'bisnis',
    'development': 'kembang',

    'order': 'pesan',
    'driver': 'supir',
    'mengkoordinir': 'koordinasi',
    'mensupport': 'dukung',
    'padapenyediaan': 'sedia',
    'file': 'berkas',
    'mesan': 'pesan',
    'base': 'dasar',
    'maximum': 'maksimal',
    'accounting': 'akuntansi',
    'system': 'sistem',
    'building': 'bangun',
    'maintenance': 'pelihara',
    'team': 'tim',
    'deadline': 'tenggat',
    'vehicle': 'kendara',
    'networking': 'jaring',
    'installation': 'instalasi',
    'trouble': 'masalah',
    'presentation': 'presentasi',
    'customer': 'langgan',
    'satisfaction': 'puas',
    'planning': 'rencana',
    'tax': 'pajak',
    'mobile': 'seluler',
    'fast': 'cepat',
    'solving': 'pecah',
    'decision': 'putus',
    'making': 'buat',
    'marketing': 'pasar',
    'communication': 'komunikasi',
    'negotiation': 'negosiasi',
    'problem': 'masalah',
    'leasing': 'sewa',
    'service': 'layan',
    'leadership': 'pimpin',
    'manager': 'manajemen',
    'auditing': 'periksa',
    'auditor': 'periksa',
    'audit': 'periksa',
    'apartment': 'apartemen',
    'engineering': 'teknik',
    'chief': 'pimpin',
    'general': 'umum',
    'research': 'teliti',
    'trend': 'cenderung',
    'cash': 'uang',
    'flow': 'alir',
    'payment': 'bayar',
    'supplier': 'pasok',
    'finance': 'uang',
    'journal': 'jurnal',
    'standing': 'duduk',
    'charge': 'biaya',
    'cleaning': 'bersih',
    'lift': 'angkat',
    'adjustment': 'atur',
    'correction': 'koreksi',
    'schedule': 'jadwal',
    'cust': 'langgan',


    'memposting': 'unggah',
    'utilitas': 'guna',
    'ms': 'microsoft',
    'perbankan': 'bank',
    'spv': 'supervisor',
    'komunikatif': 'komunikasi',
    'perencanaan': 'rencana',
    'operasionalisasi': 'operasional',
    'diatahkan': 'arah',
}

In [8]:
# --- applicant ---
# Remove repeated rows and replace missing values with empty strings.
df_applicant = df_applicant.drop_duplicates().fillna('')

# Parse the date of birth, keep only years accepted by filter_date(x, 1958, 2006),
# and convert the result to an age with get_age; unknown ages become 0.
birth_dates = pd.to_datetime(
    df_applicant['Dob'].map(pick_date).apply(lambda x: filter_date(x, 1958, 2006))
)
df_applicant['Age'] = birth_dates.map(get_age)

# Dob is replaced by Age from here on (re-running this cell fails because Dob is gone).
df_applicant = df_applicant.drop(columns=['Dob'])
df_applicant['Age'] = df_applicant['Age'].fillna(0).astype(int)

In [46]:
# Preview of the applicant education frame.
# NOTE(review): the saved output is indexed by ApplicantID and has no date columns,
# i.e. it shows the frame after the education-cleaning cell that follows.
df_applicant_education.head(3)

Unnamed: 0_level_0,EducationLevelName,MajorName
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1
14,S1,HUKUM
128,S1,PSIKOLOGI
522,S2,HUKUM


In [45]:
# --- education ---
education = df_applicant_education.fillna('')

# Parse both dates; values rejected by filter_date(x, 1980, 2023) or by the parser become NaT.
for date_col in ('DateStart', 'DateEnd'):
    education[date_col] = pd.to_datetime(
        education[date_col].map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
    )

# Keep only rows where both dates are known.
education = education.dropna(subset=['DateStart', 'DateEnd'])

# One row per applicant: the entry with the latest DateStart; the dates are then dropped.
df_applicant_education = (
    education
    .sort_values('DateStart')
    .groupby(['ApplicantID'])
    .agg('last')
    .drop(columns=['DateStart', 'DateEnd'])
)

In [48]:
# Preview of the applicant experience frame.
# NOTE(review): the saved output already contains YearsOfExperience and is indexed by
# ApplicantID, i.e. it shows the frame after the experience-cleaning cell that follows.
df_applicant_experience.head(3)

Unnamed: 0_level_0,JobDescription,Position,YearsOfExperience
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
128,"<p>mempromosikan sebuah event dan exhibition,&...",Freelance Crew Freelance Marketing,3
775,"<p style=""language:id;margin-top:0pt;margin-bo...",Legal Staff,1
841,"<p class=""MsoListParagraphCxSpFirst"" style=""ma...",Personalia dan Konselor Staff Rekruitment HRD ...,4


In [47]:
# --- experience ---
df_applicant_experience = df_applicant_experience.fillna('')

# Parse both dates; values rejected by filter_date(x, 1980, 2023) or by the parser become NaT.
for date_col in ('DateFrom', 'DateTo'):
    df_applicant_experience[date_col] = pd.to_datetime(
        df_applicant_experience[date_col].map(pick_date).apply(lambda x: filter_date(x, 1980, 2023))
    )

# Keep only rows where both dates are known. The explicit copy makes the column
# assignment below write to an independent frame instead of a filtered view
# (which triggers pandas' SettingWithCopyWarning).
df_applicant_experience = df_applicant_experience.dropna(subset=['DateFrom', 'DateTo']).copy()

# add YearsOfExperience column
# NOTE(review): the helper is named substract_months while the column is called
# YearsOfExperience - confirm which unit it returns.
df_applicant_experience['YearsOfExperience'] = substract_months(
    df_applicant_experience.DateFrom, df_applicant_experience.DateTo
)

# One row per applicant: texts are concatenated in DateFrom order and the durations
# are summed. The date columns are not needed afterwards, so they are not aggregated.
# (Re-running this cell fails because DateFrom/DateTo no longer exist.)
df_applicant_experience = (
    df_applicant_experience
    .sort_values('DateFrom')
    .groupby(['ApplicantID'])
    .agg({
        'JobDescription': ' '.join,
        'Position': ' '.join,
        'YearsOfExperience': 'sum',
    })
)

In [49]:
'''merge'''
df_applicant = pd.merge(df_applicant, df_applicant_experience, on=['ApplicantID'])
df_applicant = pd.merge(df_applicant, df_applicant_education, on=['ApplicantID'])

In [50]:
# Spot-check one applicant row after the experience and education merge.
df_applicant.head(1)

Unnamed: 0,ApplicantID,ExpectedSalary,CityName,ProvinceName,DriverLicenseType,IsUsingGlasses,Gender,MaritalStatus,Strengthness,Age,JobDescription,Position,YearsOfExperience,EducationLevelName,MajorName
0,33513,3500000.0,SURABAYA,JAWA TIMUR,C,False,Female,Single,"Humble, brave, Active, Teamwork",25,"<p>- Plan, imolement, monitor the overall of s...",SUPPLY CHAIN MANAGEMENT & CHIEFF SPG ADMIN QUA...,1,D3,SEMUA JURUSAN 2


In [16]:
def _normalize_text(texts):
    """Clean, translate, stem and filter a Series of free-text values.

    Each value goes through, in order: clean_text, maintain_alphabet,
    remove_single, remove_morespace, translate_teks, stemmer_words, a split on
    single spaces, change_slangwords, stop-word removal, and a re-join with
    single spaces.

    Args:
        texts: pandas Series of strings.

    Returns:
        pandas Series of normalised strings with the same index as ``texts``.
    """
    stopword_set = set(stopwords)  # set membership instead of scanning the list per token

    def _normalize_one(text):
        for step in (clean_text, maintain_alphabet, remove_single, remove_morespace):
            text = step(text)
        text = translate_teks(translator, text)
        text = stemmer_words(stemmer, text)
        tokens = change_slangwords(slangwords, text.split(' '))
        return ' '.join(token for token in tokens if token not in stopword_set)

    return texts.map(_normalize_one)


def _translate(texts):
    """Translate every value of a Series with translate_teks and the shared translator."""
    return texts.apply(lambda value: translate_teks(translator, value))


# ---------------- PRE-PROCESSING APPLICANT ----------------
df_applicant = df_applicant.set_index(['ApplicantID'])

app_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'Position', 'JobDescription', 'Strengthness']
app_num = ['Age', 'ExpectedSalary']
app_bol = ['IsUsingGlasses']

# .copy() so the assignments below write to an independent frame, not to a column slice.
df_applicant = df_applicant[app_num + app_bol + app_str].copy()

# str: lower-case every text column, normalise the free text, translate the short labels
df_applicant[app_str] = df_applicant[app_str].applymap(str.lower)

for text_col in ('JobDescription', 'Strengthness'):
    df_applicant[text_col] = _normalize_text(df_applicant[text_col])

for label_col in ('Position', 'MajorName'):
    df_applicant[label_col] = _translate(df_applicant[label_col])

# Concatenate JobDescription and Strengthness into one textual feature.
# Two empty texts produce the single-space string ' '.
df_applicant['JobDescription'] = df_applicant['JobDescription'].str.cat(
    df_applicant['Strengthness'], sep=' '
)
df_applicant = (
    df_applicant
    .rename(columns={'JobDescription': 'DescriptionStrengthness'})
    .drop(columns=['Strengthness'])
)

# bool: stored as the lower-case strings produced by str(), e.g. 'true' / 'false'
df_applicant['IsUsingGlasses'] = df_applicant['IsUsingGlasses'].astype(str).str.lower()


# ---------------- PRE-PROCESSING JOB ----------------
job_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'JobTitle', 'FunctionPositionName', 'Description', 'Requirement']
job_num = ['UsiaMax', 'SalaryMin', 'SalaryMax']
job_bol = ['UsingGlasses']

# general: index by JobID and replace missing values with empty strings
df_job = df_job.set_index(['JobID']).fillna('')

# str
df_job[job_str] = df_job[job_str].applymap(str.lower)
df_job = df_job.replace('none', '')

for label_col in ('JobTitle', 'FunctionPositionName', 'MajorName'):
    df_job[label_col] = _translate(df_job[label_col])

for text_col in ('Description', 'Requirement'):
    df_job[text_col] = _normalize_text(df_job[text_col])

# concatenate JobTitle and FunctionPositionName to be textual feature together
df_job['JobTitle'] = df_job['JobTitle'].str.cat(df_job['FunctionPositionName'], sep=' ')
df_job = (
    df_job
    .rename(columns={'JobTitle': 'JobTitlePosition'})
    .drop(columns=['FunctionPositionName'])
)

# concatenate Description and Requirement to be textual feature together
df_job['Description'] = df_job['Description'].str.cat(df_job['Requirement'], sep=' ')
df_job = (
    df_job
    .rename(columns={'Description': 'DescriptionRequirement'})
    .drop(columns=['Requirement'])
)

# int: blanks (former missing values) count as 0
df_job[job_num] = df_job[job_num].replace('', 0).astype(int)

# get mean from SalaryMin and SalaryMax; results below 1,000,000 are reset to 0
# NOTE(review): when only one of the two bounds is filled in, the other counts as 0,
# so the "mean" is half of the known bound - confirm this is acceptable.
df_job['SalaryMin'] = (df_job['SalaryMax'] + df_job['SalaryMin']) // 2
df_job = df_job.rename(columns={'SalaryMin': 'SalaryMean'})
df_job['SalaryMean'] = df_job['SalaryMean'].where(df_job['SalaryMean'] >= 1_000_000, 0)
df_job = df_job.drop(columns=['SalaryMax'])

# bool
df_job['UsingGlasses'] = df_job['UsingGlasses'].astype(str).str.lower()


# load table for vocabulary: lower-case every lookup name; function names are
# additionally passed through function_replacement and stripped of parenthesised
# parts and surplus whitespace
df_function['FunctionPositionName'] = (
    df_function['FunctionPositionName']
    .map(str.lower)
    .map(function_replacement)
    .map(remove_insideparentheses)
    .map(remove_morespace)
    .map(str.strip)
)
df_education['EducationLevelName'] = df_education['EducationLevelName'].map(str.lower)
df_city['CityName'] = df_city['CityName'].map(str.lower)
df_province['ProvinceName'] = df_province['ProvinceName'].map(str.lower)
df_major['MajorName'] = df_major['MajorName'].map(str.lower)



Translating plan imolement monitor th ...
Translating omset sales profit tax ma ...
Translating mengenalkan product led s ...
Translating menerima dokumen tagihan  ...
Translating daily administrative of t ...
Translating repaired and maintenance  ...
Translating menyusun laporan keuangan ...
Translating menerima komplaint atau k ...
Translating managing food and beverag ...
Translating membuat progres harian me ...
Translating kasir stock opname pengan ...
Translating selama magang saya banyak ...
Translating menghitung volume pekerja ...
Translating sebagai teknisi komputer  ...
Translating menghitung dan mentransfe ...
Translating tugas dan pokok pejerjaan ...
Translating to manage roadway project ...
Translating menjaling hubungan baik d ...
Translating bertanggungjawab akan des ...
Translating pelunasan piutang cust ca ...
Translating membuat laporan harian mi ...
Translating merencanakan bangunan mul ...
Translating membuat koten foto dan vi ...
Translating menghandle customer vi

In [17]:
# Preview of the pre-processed applicant frame (lower-cased, translated, texts merged).
df_applicant.head(3)

Unnamed: 0_level_0,Age,ExpectedSalary,IsUsingGlasses,CityName,ProvinceName,EducationLevelName,MajorName,DriverLicenseType,Gender,MaritalStatus,Position,DescriptionStrengthness
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
33513,25,3500000.0,False,surabaya,jawa timur,d3,semua jurusan 2,c,female,single,manajemen rantai pasokan & kontrol kualitas ke...,rencana imolement pantau strategi rantai pasok...
31861,51,15000000.0,False,surabaya,jawa timur,s1,manajemen,a,male,married,asisten direktur dan operasional direktur,omset laba jual pajak pelihara omset laba jual...
31891,30,4500000.0,True,surabaya,jawa timur,s1,psikologi,a,male,single,produksi produk dan acara penjualan produk dan...,kenal product led sound lighting handle event ...


In [18]:
# Province names, hard-coded.
# NOTE(review): no cell visible in this notebook reads this list.
province = [
    "Aceh",
    "Sumatera Utara",
    "Sumatera Barat",
    "Riau",
    "Kepulauan Riau",
    "Jambi",
    "Bengkulu",
    "Sumatera Selatan",
    "Bangka Belitung",
    "Lampung",
    "DKI Jakarta",
    "Banten",
    "Jawa Barat",
    "Jawa Tengah",
    "Yogyakarta",
    "Jawa Timur",
    "Bali",
    "Nusa Tenggara Barat",
    "Nusa Tenggara Timur",
    "Kalimantan Barat",
    "Kalimantan Tengah",
    "Kalimantan Selatan",
    "Kalimantan Timur",
    "Kalimantan Utara",
    "Sulawesi Utara",
    "Gorontalo",
    "Sulawesi Tengah",
    "Sulawesi Barat",
    "Sulawesi Selatan",
    "Sulawesi Tenggara",
    "Maluku",
    "Maluku Utara",
    "Papua Barat",
    "Papua",
]

In [39]:
# Inspect job 3091: in the saved output its DescriptionRequirement is blank,
# which is the kind of row the filter in the next cell removes.
df_job.loc[3091]

UsiaMax                                                    0
SalaryMean                                           4150000
CityName                                            surabaya
ProvinceName                                      jawa timur
EducationLevelName                                          
MajorName                                      semua jurusan
DriverLicenseType                                           
UsingGlasses                                                
Gender                                                      
MaritalStatus                                               
JobTitlePosition          staf pemasaran eksekutif pemasaran
DescriptionRequirement                                      
Name: 3091, dtype: object

In [44]:
# Drop jobs without any description/requirement text. Two empty texts joined with a
# space give ' ', but comparing against that exact string is brittle; stripping first
# also catches '' and longer whitespace-only values. The copy keeps later column
# assignments on df_job from writing to a filtered view.
df_job = df_job[df_job.DescriptionRequirement.str.strip() != ''].copy()

In [46]:
# List applicants whose combined description/strength text is blank (' ').
# NOTE(review): unlike the jobs in the previous cell, these rows are only displayed,
# not removed, so they still reach AppModel.
df_applicant[df_applicant.DescriptionStrengthness == ' ']

Unnamed: 0_level_0,Age,ExpectedSalary,IsUsingGlasses,CityName,ProvinceName,EducationLevelName,MajorName,DriverLicenseType,Gender,MaritalStatus,Position,DescriptionStrengthness
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
31698,26,0.0,False,surabaya,jawa timur,s1,semua jurusan 2,,female,single,pendaftaran dan administrasi pasien,
28763,29,4500000.0,False,jakarta,dki jakarta,d3,teknik informatika,a,male,single,kasir,
28015,27,5000000.0,False,jakarta,dki jakarta,sma,sma segala jurusan,a,male,single,layanan teller,
27439,43,5500000.0,True,bandung,jawa barat,s1,akuntansi,c,male,married,pemantauan kredit dan pelaporan hubungan penyewa,
38334,24,5000000.0,False,tangerang,banten,s1,teknik informatika,c,female,single,entri data admin senior,
29571,25,0.0,False,jakarta timur,dki jakarta,s1,semua jurusan 2,a,male,single,magang mahasiswa magang,
28913,25,3800000.0,False,jakarta utara,dki jakarta,smk,manajemen,c,male,single,staf administrasi staf administrasi,


In [47]:
# Build the job model from the pre-processed jobs plus the five lookup tables,
# then the applicant model from the job model and the pre-processed applicants.
# The cells below read job_model.job_vector, app_model.app_vector and app_model.df_app.
# NOTE(review): presumably AppModel reuses the vocabulary fitted by JobModel - confirm in app_model.py.
job_model = JobModel(df_job, df_function, df_education, df_city, df_province, df_major)
app_model = AppModel(job_model, df_applicant)



In [48]:
# Sanity check of the two feature matrices; the saved output shows the same number
# of columns (1273) for both, which cosine_similarity requires.
app_model.app_vector.shape, job_model.job_vector.shape

((361, 1273), (40, 1273))

In [49]:
# Applicant IDs held by the applicant model; they become the keys of dict_similarity below.
app_model.df_app.index

Int64Index([33513, 31861, 31891, 31790, 31797, 31698, 31700, 31604, 31379,
            29821,
            ...
            35778, 35187, 33631, 38226, 27292, 29243, 39827, 31567, 30254,
            31968],
           dtype='int64', name='ApplicantID', length=361)

In [50]:
# One entry per applicant ID, in the order of app_model.df_app; the values are filled
# in by the similarity loop below. dict.fromkeys replaces the explicit loop, whose
# variable `id` shadowed the builtin id() for the rest of the notebook.
dict_similarity = dict.fromkeys(app_model.df_app.index)

In [51]:
# Same shape check as a few cells above (duplicate cell).
app_model.app_vector.shape, job_model.job_vector.shape

((361, 1273), (40, 1273))

In [52]:
# For every applicant, score all jobs by cosine similarity and store the job table
# sorted from best to worst match.
# The pairing relies on dict_similarity's key order matching the row order of
# app_model.app_vector, and on df_job's row order matching job_model.job_vector.
for key, vec in zip(dict_similarity, app_model.app_vector):
    scores = cosine_similarity(vec.reshape(1, -1), job_model.job_vector)[0]
    dict_similarity[key] = (
        df_job
        .assign(similarity=scores)
        .sort_values(by='similarity', ascending=False)
    )

In [53]:
# Applicant IDs for which a ranking was computed.
dict_similarity.keys()

dict_keys([33513, 31861, 31891, 31790, 31797, 31698, 31700, 31604, 31379, 29821, 29824, 29838, 29840, 29844, 29194, 29226, 29240, 29242, 28879, 28892, 28882, 29808, 29864, 29935, 29942, 30035, 30237, 30741, 27068, 27253, 27317, 27355, 27524, 27696, 27726, 27729, 27837, 27849, 28683, 28808, 128, 775, 841, 1001, 1178, 1619, 1700, 1847, 1958, 1998, 2054, 3936, 3939, 3979, 4046, 4156, 4400, 4895, 7026, 6121, 8027, 8997, 9189, 9215, 10736, 11118, 12167, 12179, 12374, 12953, 16413, 16629, 16764, 18424, 18866, 14197, 19622, 20267, 19905, 21311, 21576, 22216, 22023, 22027, 23522, 23787, 23881, 26133, 26389, 26436, 26822, 26842, 26951, 26492, 23877, 25510, 23061, 23136, 21928, 22550, 22700, 22771, 22826, 21757, 19166, 16795, 12609, 11326, 10305, 10634, 9235, 8557, 6613, 5362, 5742, 5867, 28763, 28638, 28152, 28157, 28440, 28473, 28496, 28085, 28015, 28058, 27659, 27674, 27515, 27494, 31020, 31046, 30920, 31006, 30814, 30358, 30236, 30302, 30315, 30086, 29815, 29707, 29164, 29008, 28925, 28837, 

In [61]:
# Profile of applicant 29240, to compare against the ranking shown in the next cell.
df_applicant.loc[29240]

Age                                                                        0
ExpectedSalary                                                     3500000.0
IsUsingGlasses                                                         false
CityName                                                            surabaya
ProvinceName                                                      jawa timur
EducationLevelName                                                        d3
MajorName                                                       teknik sipil
DriverLicenseType                                                          c
Gender                                                                female
MaritalStatus                                                         single
Position                                                              magang
DescriptionStrengthness    magang manajemen konstruksi ecek jadwal proyek...
Name: 29240, dtype: object

In [60]:
# Jobs ranked for applicant 29240, highest cosine similarity first.
dict_similarity[29240]

Unnamed: 0_level_0,UsiaMax,SalaryMean,CityName,ProvinceName,EducationLevelName,MajorName,DriverLicenseType,UsingGlasses,Gender,MaritalStatus,JobTitlePosition,DescriptionRequirement,similarity
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3066,0,0,jakarta,dki jakarta,s1,teknik sipil,,False,,,petugas proyek petugas proyek,survei titik lokasi bangun konstruksi rencana ...,0.231394
2963,0,0,jakarta,dki jakarta,smk,teknik sipil,,False,,,qc lapangan operasi umum,qc hasil kerja subkon progress pek lapang qc u...,0.186663
3035,0,0,jakarta,dki jakarta,s1,semua jurusan,,False,,,guru bahasa mandarin (laoshi) guru,kerja anak lapor ajar administrasi manajemen k...,0.154044
3033,0,0,jakarta,dki jakarta,s1,semua jurusan,,False,,,tempat penitipan anak guru guru,lingkung ajar aman area main tidur ganti pakai...,0.103635
2984,0,0,jakarta,dki jakarta,d3,semua jurusan,,False,,,urusan umum staf urusan umum,administrasi berkas berkas check list supir ja...,0.092814
3055,0,0,jakarta,dki jakarta,d3,semua jurusan,,False,,,staf bm-saya tukang listrik perawatan,giat aset bangun periodik stock bahan bakar so...,0.074768
2994,0,0,jakarta,dki jakarta,d3,semua jurusan 2,,False,,,spv bm-me tukang listrik perawatan,giat aset bangun periodik stock bahan bakar so...,0.074768
3093,0,4650000,surabaya,jawa timur,,semua jurusan 2,,False,,,urusan umum (ga) urusan umum,erti atur undang proses urus urus perijinan bu...,0.066203
2933,0,0,jakarta,dki jakarta,s1,akuntansi,,False,,,akuntansi pengawas akuntansi,kontrol lapor konstruksi kontrol lapor beli ba...,0.065073
3069,0,5250000,surabaya,jawa timur,s1,akuntansi,,False,,,kepala akuntansi kepala akuntansi,akuntansi kerja supervisor akuntansi teliti or...,0.063637


In [14]:
# Replace missing values with empty strings.
# NOTE(review): the job pre-processing cell already applies the same fillna('') to df_job.
df_job.fillna('', inplace=True)

In [20]:
# The job pre-processing cell renames Description to DescriptionRequirement (already
# cleaned, translated and stemmed), so on a top-to-bottom run there is no raw
# Description column left and this cell used to fail with AttributeError.
# Clean the raw column when it still exists; otherwise reuse the normalised text so
# the experiments below have a Description column to work on.
if 'Description' in df_job.columns:
    df_job['Description'] = (
        df_job['Description']
        .map(clean_text)
        .map(maintain_alphabet)
        .map(remove_single)
        .map(remove_morespace)
        .apply(lambda x: translate_teks(translator, x))
        .apply(lambda x: stemmer_words(stemmer, x))
        .apply(lambda x: x.split(' '))
        .apply(lambda x: change_slangwords(slangwords, x))
        .apply(lambda x: [i for i in x if i not in stopwords])
        .apply(lambda x: ' '.join(x))
    )
else:
    # assign() returns a new frame, so nothing is written into a filtered view of df_job
    df_job = df_job.assign(Description=df_job['DescriptionRequirement'])

Translating melakukan aktivitas kesek ...
Translating melakukan greeting kepada ...
Translating mengelola dan memantau op ...
Translating mengontrol laporan konstr ...
Translating memahami dan mampu memper ...
Translating usia maksimal tahun pendi ...
Translating melakukan negosiasi denga ...
Translating melakukan administrasi be ...
Translating membuat laporan keuangan  ...
Translating melakukan kegiatan perawa ...
Translating membantu dalam menyusun r ...
Translating bertanggungjawab pada kes ...
Translating performing all executive  ...
Translating menyusun surat somasi pen ...
Translating menyediakan lingkungan be ...
Translating implement promotion plan  ...
Translating able to work children age ...
Translating membuat jalannya sistem d ...
Translating melakukan koordinasi deng ...
Translating achieves the sales and bu ...
Translating melakukan kegiatan perawa ...
Translating melakukan survei titik lo ...
Translating membuat legal opinion dal ...
Translating usia maksimal tahun pe

In [54]:
# The applicant pre-processing cell renames JobDescription to DescriptionStrengthness
# (already cleaned, translated and stemmed), so on a top-to-bottom run there is no raw
# JobDescription column left and this cell used to fail with AttributeError.
# Clean the raw column when it still exists; otherwise reuse the normalised text so
# the experiments below have a JobDescription column to query with.
if 'JobDescription' in df_applicant.columns:
    df_applicant['JobDescription'] = (
        df_applicant['JobDescription']
        .map(clean_text)
        .map(maintain_alphabet)
        .map(remove_single)
        .map(remove_morespace)
        .apply(lambda x: translate_teks(translator, x))
        .apply(lambda x: stemmer_words(stemmer, x))
        .apply(lambda x: x.split(' '))
        .apply(lambda x: change_slangwords(slangwords, x))
        .apply(lambda x: [i for i in x if i not in stopwords])
        .apply(lambda x: ' '.join(x))
    )
else:
    df_applicant = df_applicant.assign(JobDescription=df_applicant['DescriptionStrengthness'])



Translating plan imolement monitor th ...
Translating omset sales profit tax ma ...
Translating mengenalkan product led s ...
Translating menerima dokumen tagihan  ...
Translating daily administrative of t ...
Translating repaired and maintenance  ...
Translating menyusun laporan keuangan ...
Translating menerima komplaint atau k ...
Translating managing food and beverag ...
Translating membuat progres harian me ...
Translating kasir stock opname pengan ...
Translating selama magang saya banyak ...
Translating menghitung volume pekerja ...
Translating sebagai teknisi komputer  ...
Translating menghitung dan mentransfe ...
Translating tugas dan pokok pejerjaan ...
Translating to manage roadway project ...
Translating menjaling hubungan baik d ...
Translating bertanggungjawab akan des ...
Translating pelunasan piutang cust ca ...
Translating membuat laporan harian mi ...
Translating merencanakan bangunan mul ...
Translating membuat koten foto dan vi ...
Translating menghandle customer vi

In [26]:
# Turn each description into a list of tokens for the gensim experiments below.
# Values that are already lists are left alone, so re-running the cell no longer
# fails with "'list' object has no attribute 'split'".
df_job['Description'] = df_job['Description'].apply(
    lambda x: x.split(' ') if isinstance(x, str) else x
)

In [126]:
from collections import defaultdict
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex

In [27]:
# Count how many times each token occurs across all tokenised job descriptions.
frequency = defaultdict(int)

for doc_tokens in df_job.Description.values:
    for term in doc_tokens:
        frequency[term] = frequency[term] + 1

In [29]:
# Keep only tokens that occur more than once across all job descriptions.
processed_corpus = [[token for token in text if frequency[token] > 1] for text in df_job.Description.values]

In [36]:
# gensim dictionary (token -> integer id) built from the filtered job corpus.
dictionary = corpora.Dictionary(processed_corpus)

In [64]:
# Bag-of-words representation of every job document: a list of (token id, count) pairs.
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [66]:
# df_applicant.JobDescription.values[2].split(' ')

In [67]:
# dictionary.doc2bow(df_applicant.JobDescription.values[2].split(' '))

In [68]:
# TF-IDF model fitted on the job bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)

In [134]:
# TF-IDF weighted view of the job corpus.
# NOTE(review): later visible cells recompute tfidf[bow_corpus] instead of reading this variable.
corpus_tfidf = tfidf[bow_corpus]

In [69]:
# TF-IDF weights of the first applicant's description, as (token id, weight) pairs
# over the job dictionary.
tfidf[dictionary.doc2bow(df_applicant.JobDescription.values[0].split(' '))]

[(3, 0.40589904017902434),
 (5, 0.2659146649026402),
 (14, 0.07806451404859151),
 (29, 0.14312330255257263),
 (35, 0.20294952008951217),
 (41, 0.10812720873347662),
 (42, 0.11563724659262947),
 (46, 0.10812720873347662),
 (48, 0.5255514752529035),
 (62, 0.26277573762645173),
 (63, 0.1329573324513201),
 (71, 0.20294952008951217),
 (72, 0.32260195516339124),
 (103, 0.20294952008951217),
 (126, 0.16795342627041615),
 (134, 0.11563724659262947),
 (224, 0.26277573762645173)]

240

In [125]:
# Cosine-similarity index over the TF-IDF weighted job documents.
# num_features must be the vocabulary size. Token ids start at 0, so the previous
# max(dictionary.token2id.values()) was one too small and left the highest token id
# outside the index; len(dictionary) is the correct size.
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))

In [81]:
# Query the index with the first applicant's description:
# tokenise -> bag-of-words -> TF-IDF -> one similarity score per job document.
query_bow = dictionary.doc2bow(df_applicant.JobDescription.values[0].split(' '))
sims = index[tfidf[query_bow]]

In [82]:
# Print (document position, score) pairs from the most to the least similar job.
ranked_jobs = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
for document_number, score in ranked_jobs:
    print(document_number, score)

6 0.3197701
30 0.24638744
5 0.20985234
4 0.13979308
33 0.11689468
19 0.11647218
0 0.11550988
8 0.093095064
35 0.093095064
39 0.0838868
25 0.08277217
27 0.08277217
31 0.074033074
36 0.06851476
20 0.066940285
14 0.0629413
22 0.059795912
10 0.055047236
24 0.05066949
37 0.050073773
17 0.045263108
40 0.043937907
12 0.04015802
11 0.035114087
23 0.035114087
28 0.03345056
34 0.032244336
16 0.024616037
3 0.024598265
41 0.020230494
18 0.016244106
15 0.015820041
2 0.005325924
1 0.0
7 0.0
9 0.0
13 0.0
21 0.0
26 0.0
29 0.0
32 0.0
38 0.0


In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [103]:
# scikit-learn TF-IDF baseline: re-join each token list into a string and fit the
# vectorizer on all job descriptions; `bank` is the resulting job matrix.
vectorizer = TfidfVectorizer()
job_documents = df_job.Description.map(' '.join).values
bank = vectorizer.fit_transform(job_documents)

In [123]:
# Cosine similarity of the first applicant's description against every job, sorted
# ascending. np.sort returns only the scores, so the job each score belongs to is lost.
np.sort(cosine_similarity(vectorizer.transform([df_applicant.JobDescription.values[0]]), bank)[0])

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.01074197,
       0.02164985, 0.02597597, 0.02854001, 0.03061196, 0.03391413,
       0.03580383, 0.04478356, 0.04613208, 0.04792123, 0.05087761,
       0.05563123, 0.05563123, 0.05582026, 0.06263841, 0.0675622 ,
       0.06880838, 0.07231975, 0.07263057, 0.07609725, 0.07686092,
       0.08458751, 0.09325582, 0.09744606, 0.09744606, 0.11047734,
       0.11047734, 0.11762975, 0.13938208, 0.15274651, 0.15864219,
       0.19290029, 0.20341144])

In [34]:
# Show the filtered job corpus as plain strings (tokens re-joined with spaces).
[' '.join(i) for i in processed_corpus]

['aktivitas lancar giat direktur hukum operasional kembang data simpan dokumen komunikasi bijak internal komunikasi bijak eksternal koordinasi bahan bahan rapat rapat lapor giat',
 'desain butuh kerja',
 'langgan terima pesan langgan langgan pesan langgan terima kritik saran langgan lapor bahan jual',
 'kelola pantau operasional layan operasional layan prosedur langgan terima pesan langgan langgan terima kritik saran langgan lapor bahan jual',
 'kontrol lapor konstruksi kontrol lapor beli barang kontrol lapor barang prosedur operasional sistem disiplin lapor uang periksa internal tangan pajak',
 'jual beli ikat bank proses kerjasama hukum jual beli lengkap kredit bank koordinasi staf legal kerja jalan dukung kelola dokumen khusus hubung janji kerjasama kontrak hukum ulas kontrak hukum janji kerjasama dokumen legal hubung proyek batas surat surat isi dokumen tinjau surat janji pantau validitas selesai sengketa hukum undang undang undang atur tanah hukum pajak atur hubung bangun proyek p

In [22]:
# Show the job descriptions.
# NOTE(review): the saved output contains plain strings, i.e. it was produced before
# the cell that splits Description into token lists was run.
df_job.Description.values

array(['aktivitas sekretariat lancar agenda giat direktur rektor aspek hukum operasional kembang data dasar simpan dokumen asli komunikasi bijak internal komunikasi bijak eksternal koordinasi bahan bahan rapat komisaris rapat pegang saham lapor giat sekertaris',
       'desain butuh kerja arah',
       'langgan terima pesan langgan langgan pesan langgan terima kritik saran langgan lapor bahan jual',
       'kelola pantau operasional layan operasional layan solusi prosedur greeting langgan terima pesan langgan langgan terima kritik saran langgan lapor bahan jual',
       'kontrol lapor konstruksi kontrol lapor beli barang kontrol lapor barang prosedur operasional sistem acc disiplin lapor uang periksa internal tangan pajak',
       'jual beli ikat bank proses kerjasama hukum notaris ppat jual beli lengkap akad kredit bank kreditur koordinasi staf legal kerja jalan dukung kelola dokumen khusus hubung janji kerjasama kontrak hukum ulas kontrak hukum janji kerjasama dokumen legal hubung pr