In [28]:
import re
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

from transform import remove_morespace, txt_tolist, totext_age, totext_iq, remove_html, maintain_alpha, remove_single, clean_text

from eris import ErisRecommender

In [2]:
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', None)

In [3]:
'''masih gagal'''
def totext_salary(salmin, salmax):
    # salary = []
    # for sal in range(salmin, salmax + 1, 500_000):
    #     salary.append('SALARY' + str(sal))
    # return ' '.join(salary)
    salary = []
    for smin, smax, in zip(salmin.values, salmax.values):
        saltxt = []

        if smin == 0 and smax == 0:
            saltxt.append(str(0))
        for sal in range(smin, smax + 1, 100_000):
            saltxt.append(str(sal))

        saltxt = ' '.join(saltxt)
    else:
        salary.append(saltxt)
    return salary

Job

In [4]:
# read
cdf_applicant = pd.read_csv('data/cdf_applicant.csv')
cdf_applicant_experience = pd.read_csv('data/cdf_applicant_experience.csv')
cdf_pipeline = pd.read_csv('data/cdf_pipeline.csv')
cdf_stage = pd.read_csv('data/cdf_stage.csv')

cdf_job = pd.read_csv('data/cdf_job.csv')
cdf_job.drop(columns=['EducationLevelID', 'JobStatus', 'HiredQuota', 'AdsStatisticID', 'CompanyID', 'SkipTest', 'DbName', 'UsingGlasses', 'IsSpecificEducationLevel', 'IsSpesificDriverLicense', 'IsSpecificAge', 'IsSpecificGender', 'IsSpecificIQ', 'IsSpecificMarital'], inplace=True)

df_function_position = pd.read_csv('data/df_function_position.csv')
df_city = pd.read_csv('data/df_city.csv')
df_province = pd.read_csv('data/df_province.csv')
df_major = pd.read_csv('data/df_major.csv')
df_department = pd.read_csv('data/df_department.csv')

# merge
df_job = cdf_job.copy()

df_job = pd.merge(df_job, df_function_position, on=['FunctionPositionID'])
df_job.drop(columns=['FunctionPositionID'], inplace=True)

df_job = pd.merge(df_job, df_city, on=['CityID'])
df_job.drop(columns=['CityID'], inplace=True)

df_job = pd.merge(df_job, df_province, on=['ProvinceID'])
df_job.drop(columns=['ProvinceID'], inplace=True)

df_job = pd.merge(df_job, df_major, on=['MajorID'])
df_job.drop(columns=['MajorID'], inplace=True)

df_job = pd.merge(df_job, df_department, on=['DepartmentID'])
df_job.drop(columns=['DepartmentID'], inplace=True)

# transform
df_job.fillna('', inplace=True)
df_job = df_job[~df_job.JobTitle.str.contains('test')]

df_job[['SalaryMax', 'SalaryMin', 'UsiaMax', 'IQMin']] = df_job[['SalaryMax', 'SalaryMin', 'UsiaMax', 'IQMin']].fillna(0).replace('', 0).astype(int)

df_job[df_job.select_dtypes(object).columns] = df_job[df_job.select_dtypes(object).columns].applymap(str.lower)

df_job.UsiaMax = df_job.UsiaMax.apply(lambda x: totext_age(usiamax=x) if x != 0 else totext_age())
df_job.IQMin = df_job.IQMin.apply(lambda x: totext_iq(iqmin=x) if x != 0 else totext_iq())
df_job.DriverLicenseType = df_job.DriverLicenseType.apply(lambda x: x + 'driver' if x != str(0) else '')
df_job.Gender = df_job.Gender.apply(lambda x: '' if x == 0 else x)
df_job.MaritalStatus = df_job.MaritalStatus.apply(lambda x: '' if x == 0 else x)
df_job.FunctionPositionName = df_job.FunctionPositionName.apply(lambda x: re.sub('[^a-zA-Z\s]', ' ', x))

stopwords_DepartmentName = ['media', 'gunawangsa', 'warna', 'warni', 'group', 'operasional', 'jakarta', 'dan', 'and', 'surabaya', 'anang', 'karaoke', 'testdepartment', 'testcompany', 'hotel', 'merr', 'manyar', 'indonesia', 'inovasi']
df_job.DepartmentName = df_job.DepartmentName.apply(lambda x: x.split('pt')[0] if 'pt' in x else x)
df_job.DepartmentName = df_job.DepartmentName.apply(lambda x: re.sub('(' + '|'.join(stopwords_DepartmentName) + ')', '', x))
df_job.DepartmentName = df_job.DepartmentName.apply(lambda x: remove_morespace(x).strip())

df_job.drop(columns=['Gender', 'MaritalStatus', 'SalaryMax', 'SalaryMin'], inplace=True)

job_train = df_job[['JobID', 'JobTitle']]

coltex_df = df_job.select_dtypes(object).columns
df_job[coltex_df] = df_job[coltex_df].astype(str).applymap(str.lower)

df_job.set_index(['JobID'], inplace=True)

job_train = pd.DataFrame([], index=df_job.index)
job_train['Text'] = df_job[df_job.columns].agg(lambda x: ' '.join(x.values), axis=1)

Applicant

In [98]:
# read
cdf_applicant = pd.read_csv('data/cdf_applicant.csv')
cdf_applicant_experience = pd.read_csv('data/cdf_applicant_experience.csv')
cdf_pipeline = pd.read_csv('data/cdf_pipeline.csv')
cdf_stage = pd.read_csv('data/cdf_stage.csv')

df_city = pd.read_csv('data/df_city.csv')
df_province = pd.read_csv('data/df_province.csv')

# merge
df_app = pd.merge(cdf_applicant, cdf_applicant_experience, on=['ApplicantID'])
df_app = pd.merge(df_app, cdf_pipeline, on=['ApplicantID']).drop(columns=['PipelineID', 'JobID'])
df_app = pd.merge(df_app, cdf_stage, on=['StageID']).drop(columns=['StageID'])
df_app = pd.merge(df_app, df_city, on=['CityID']).drop(columns=['CityID'])
df_app = pd.merge(df_app, df_province, on=['ProvinceID']).drop(columns=['ProvinceID'])

# # transform
# col_txt = df_app.select_dtypes(object).columns
# df_app[col_txt] = df_app[col_txt].astype(str).applymap(str.lower)

# df_app.DiseaseHistory = df_app.DiseaseHistory.replace('0', '')
# df_app.Age = df_app.Age.apply(lambda x: 'u' + str(x) if x != 0 else '')
# df_app.DriverLicenseType = df_app.DriverLicenseType.apply(lambda x: x + 'driver' if x != '0' else '')
# df_app.IsUsingGlasses = df_app.IsUsingGlasses.apply(lambda x: 'glasses' if x == 'true' else 'noglasses')
# df_app.MaritalStatus = df_app.MaritalStatus.apply(lambda x: '' if x == '0' else x)
# df_app.Nationality = df_app.Nationality.apply(lambda x: '' if x == '0' else x)
# df_app.Strengthness = df_app.Strengthness.map(maintain_alpha).map(remove_morespace)
# df_app.Weaknesses = df_app.Weaknesses.map(maintain_alpha).map(remove_morespace)
# df_app.TypeOfVehicle = df_app.TypeOfVehicle.apply(lambda x: '' if x == '0' else x)
# df_app.drop(columns=['Height'], inplace=True)
# df_app.Salary = df_app.Salary.apply(lambda x: round(x, -5))
# df_app.YearsOfExperience = df_app.YearsOfExperience.apply(lambda x: str(x) + 'yearsexperience')
# df_app = df_app.astype(str)
# df_app = df_app.set_index(['ApplicantID'])

# app_train = pd.DataFrame([], index=df_app.index)
# app_train['Text'] = df_app[df_app.columns].agg(lambda x: ' '.join(x.values), axis=1)
# app_train.Text = app_train.Text.map(remove_morespace).map(str.strip)

In [121]:
hired_and_rejected = [int(i) for i in hired_and_rejected]

In [107]:
df_app.set_index('ApplicantID', inplace=True)

In [124]:
df_app.columns

Index(['DiseaseHistory', 'Age', 'DriverLicenseType', 'Gender',
       'IsUsingGlasses', 'MaritalStatus', 'Nationality', 'Strengthness',
       'Weaknesses', 'TypeOfVehicle', 'Height', 'ExpectedSalary', 'Industry',
       'CompanyName', 'JobDescription', 'Position', 'Salary',
       'YearsOfExperience', 'Label', 'CityName', 'ProvinceName'],
      dtype='object')

In [130]:
df_app.loc[hired_and_rejected][['Age', 'DriverLicenseType', 'Gender', 'IsUsingGlasses', 'MaritalStatus', 'Nationality', 'JobDescription', 'Position', 'Label']]

Unnamed: 0_level_0,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,JobDescription,Position,Label
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
13,0,c,male,false,single,indonesia,,it,Rejected
13,0,c,male,false,single,indonesia,,it,Hired
128,0,c,male,false,single,indonesia,mempromosikan se...,freelance crew f...,Rejected
128,0,c,male,false,single,indonesia,mempromosikan se...,freelance crew f...,Rejected
128,0,c,male,false,single,indonesia,mempromosikan se...,freelance crew f...,Hired
...,...,...,...,...,...,...,...,...,...
39387,35,c,male,true,single,indonesia,monitoring absen...,industrial relat...,Rejected
39387,35,c,male,true,single,indonesia,monitoring absen...,industrial relat...,Rejected
39387,35,c,male,true,single,indonesia,monitoring absen...,industrial relat...,Hired
20963,31,c,female,false,married,indonesia,tax ppnrekap pen...,admin staff acco...,Rejected


In [119]:
df_app.index

Int64Index([    3,    10,    13,    21,    25,    29,    62,    69,    70,
               81,
            ...
            33490, 33490, 33490, 33200, 39690, 39690, 39640, 35586, 35586,
            39812],
           dtype='int64', name='ApplicantID', length=6566)

Build Model

In [6]:
stopwords_in = txt_tolist('tala-stopwords-indonesia.txt')

There are 758 data.


In [14]:
df_app.head(3)

Unnamed: 0_level_0,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,ExpectedSalary,Industry,CompanyName,JobDescription,Position,Salary,YearsOfExperience,Label,CityName,ProvinceName
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3,,u29,cdriver,male,noglasses,single,indonesia,pekerja keras te...,pelupa tidak cep...,motor,4000000,telekomunikasi r...,pt icool interna...,maintenance deal...,sales executive ...,3700000,1yearsexperience,rejected,surabaya,jawa timur
10,,u29,cdriver,male,noglasses,single,indonesia,konsisten mampu ...,terlalu taat den...,motor,3700000,event organizer ...,victory event or...,bertugas sebagai...,marketing admini...,3200000,2yearsexperience,rejected,surabaya,jawa timur
13,,,cdriver,male,noglasses,single,indonesia,swot,swot,motor,0,rudi,mr,,it,0,0yearsexperience,rejected,surabaya,jawa timur


In [59]:
encoder = TfidfVectorizer()
bank = encoder.fit_transform(app_train.Text)
code = encoder.transform(job_train.Text)
dist = cosine_similarity(code, bank)[0]*100

Data dengan duplicates Rejected & Hired<br>
ApplicantID = [12166]

In [62]:
df_app['Similarity'] = dist

In [92]:
hired = df_app[['Gender', 'Strengthness', 'Weaknesses', 'ExpectedSalary', 'Industry', 'JobDescription', 'Position', 'Salary', 'Label', 'Similarity']][(df_app.index.duplicated(False)) & (df_app.Label == 'hired')].index

rejected = df_app[['Gender', 'Strengthness', 'Weaknesses', 'ExpectedSalary', 'Industry', 'JobDescription', 'Position', 'Salary', 'Label', 'Similarity']][(df_app.index.duplicated(False)) & (df_app.Label == 'rejected')].index

In [95]:
hired_and_rejected = [i for i in hired if i in rejected]

In [97]:
df_app.loc[hired_and_rejected]

Unnamed: 0_level_0,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,ExpectedSalary,Industry,CompanyName,JobDescription,Position,Salary,YearsOfExperience,Label,CityName,ProvinceName,Similarity
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13,,,cdriver,male,noglasses,single,indonesia,swot,swot,motor,0,rudi,mr,,it,0,0yearsexperience,rejected,surabaya,jawa timur,1.444724
13,,,cdriver,male,noglasses,single,indonesia,swot,swot,motor,0,rudi,mr,,it,0,0yearsexperience,hired,surabaya,jawa timur,1.421133
128,amandel,,cdriver,male,noglasses,single,indonesia,negosiasi mobili...,bahasa inggris s...,motor,3600000,jasa general kon...,pt dyandra promo...,mempromosikan se...,freelance crew f...,2800000,3yearsexperience,rejected,surabaya,jawa timur,0.549599
128,amandel,,cdriver,male,noglasses,single,indonesia,negosiasi mobili...,bahasa inggris s...,motor,3600000,jasa general kon...,pt dyandra promo...,mempromosikan se...,freelance crew f...,2800000,3yearsexperience,rejected,surabaya,jawa timur,0.549599
128,amandel,,cdriver,male,noglasses,single,indonesia,negosiasi mobili...,bahasa inggris s...,motor,3600000,jasa general kon...,pt dyandra promo...,mempromosikan se...,freelance crew f...,2800000,3yearsexperience,hired,surabaya,jawa timur,0.548272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39387,,u35,cdriver,male,glasses,single,indonesia,leadership analy...,my age is going ...,motor,7000000,cement productio...,pt semen bosowa ...,monitoring absen...,industrial relat...,3000000,2yearsexperience,rejected,yogyakarta,di yogyakarta,1.729575
39387,,u35,cdriver,male,glasses,single,indonesia,leadership analy...,my age is going ...,motor,7000000,cement productio...,pt semen bosowa ...,monitoring absen...,industrial relat...,3000000,2yearsexperience,rejected,yogyakarta,di yogyakarta,1.729575
39387,,u35,cdriver,male,glasses,single,indonesia,leadership analy...,my age is going ...,motor,7000000,cement productio...,pt semen bosowa ...,monitoring absen...,industrial relat...,3000000,2yearsexperience,hired,yogyakarta,di yogyakarta,1.723683
20963,,u31,cdriver,female,noglasses,married,indonesia,,,,4000000,perdagangan hosp...,pt ponti mitra s...,tax ppnrekap pen...,admin staff acco...,3100000,4yearsexperience,rejected,pontianak,kalimantan barat,2.424063


In [67]:
df_app[['Gender', 'Strengthness', 'Weaknesses', 'ExpectedSalary', 'Industry', 'JobDescription', 'Position', 'Salary', 'Label', 'Similarity']].sort_values(by='Similarity', ascending=False).head(5)

Unnamed: 0_level_0,Gender,Strengthness,Weaknesses,ExpectedSalary,Industry,JobDescription,Position,Salary,Label,Similarity
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10,male,konsisten mampu ...,terlalu taat den...,3700000,event organizer ...,bertugas sebagai...,marketing admini...,3200000,rejected,17.738386
12166,male,tegas jujur loya...,kasian jika phk ...,14000000,highrise buildin...,high implementat...,hrga manager hrg...,9600000,rejected,16.639338
12166,male,tegas jujur loya...,kasian jika phk ...,14000000,highrise buildin...,high implementat...,hrga manager hrg...,9600000,hired,16.636087
12166,male,tegas jujur loya...,kasian jika phk ...,14000000,highrise buildin...,high implementat...,hrga manager hrg...,9600000,hired,16.636087
12356,male,persistence trus...,impatient,18000000,hospitality hosp...,direct report to...,sales sales mana...,14300000,rejected,16.10965


In [60]:
dist.shape

(6566,)

In [58]:
dist.shape, df_job.shape, app_train.shape

((925,), (923, 11), (6566, 1))

In [15]:
w = 'maintenance'

job_train = pd.DataFrame(job_train.Text.append(pd.Series(w), ignore_index=True), columns=['Text'])
# eris = ErisRecommender(df_job, job_train, 'Text')
eris = ErisRecommender(df_job.copy().reset_index(), job_train.copy().reset_index(), 'Text')
eris.fit()
eris.recommend(w)

Index yang ada 40


  job_train = pd.DataFrame(job_train.Text.append(pd.Series(w), ignore_index=True), columns=['Text'])


Unnamed: 0,index,JobTitle,Description,Requirement,Similarity
40,40,senior accounting,membuat jalannya...,usia maksimal ta...,99
53,53,chief accounting,membuat jalannya...,usia maksimal ta...,98
546,546,chief accounting,usia maksimal ta...,membuat jalannya...,97
56,56,chief finance ac...,membuat jalannya...,usia maksimal ta...,97
55,55,chief accounting,membuat jalannya...,usia maksimal ta...,97
...,...,...,...,...,...
456,456,marketing superv...,supervise all ma...,candidate must p...,1
504,504,staff legal kontrak,to prepare draft...,candidate must p...,1
831,831,staff legal kontrak,to prepare draft...,candidate must p...,1
506,506,staff legal kontrak,to prepare draft...,candidate must p...,1


In [11]:
tes = TfidfVectorizer()
tesx = tes.fit_transform(['python scala golang', 'java javascript python golang css html scala', 'scala golang java', 'scala'])
cosine_similarity(tesx[0], tesx[1]), cosine_similarity(tesx[0], tesx[2]), cosine_similarity(tesx[3], tesx[0])

(array([[0.51413975]]), array([[0.52233738]]), array([[0.4574528]]))

In [12]:
tes.get_feature_names_out()

array(['css', 'golang', 'html', 'java', 'javascript', 'python', 'scala'],
      dtype=object)