In [22]:
import re
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

from transform import remove_morespace

In [200]:
pd.set_option('display.max_colwidth', 10)
pd.set_option('display.max_columns', None)

In [23]:
cdf_applicant = pd.read_csv('data/cdf_applicant.csv')
cdf_applicant_experience = pd.read_csv('data/cdf_applicant_experience.csv')
cdf_pipeline = pd.read_csv('data/cdf_pipeline.csv')
cdf_stage = pd.read_csv('data/cdf_stage.csv')
cdf_job = pd.read_csv('data/cdf_job.csv')

df = cdf_job[['JobID', 'Description']].dropna(how='any')

In [30]:
bow = CountVectorizer()
bank = bow.fit_transform(df.Description)

idx = 0
content = df.loc[idx, 'Description']

code = bow.transform([content])

dist = cosine_distances(code, bank)
rec_idx = dist.argsort()[0, 1:11]

df.loc[rec_idx]

Unnamed: 0,JobID,Description
46,57,become asst food beverages manager at gunawang...
271,1298,become asst chief engineering at gunawangsa ho...
272,1299,become asst chief accounting at gunawangsa hot...
133,153,become sales executive at gunawangsa hotel merr
807,2942,become daily worker at gunawangsa hotel merr
802,2937,become order taker at gunawangsa hotel merr
801,2936,become housekeeping supervisor at gunawangsa h...
16,22,becoming engineering staff at hotel gunawangsa...
734,2861,become kitchen staff at gunawangsa hotel merr
375,1411,become banquet attendant at gunawangsa hotel merr


In [31]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.DataFrame(columns=["ID","DESCRIPTION"], data=np.matrix([[10,"Cancel ASN WMS Cancel ASN"],
                                                                [11,"MAXPREDO Validation is corect"],
                                                                [12,"Move to QC"],
                                                                [13,"Cancel ASN WMS Cancel ASN"],
                                                                [14,"MAXPREDO Validation is right"],
                                                                [15,"Verify files are sent every hours for this interface from Optima"],
                                                                [16,"MAXPREDO Validation are correct"],
                                                                [17,"Move to QC"],
                                                                [18,"Verify files are not sent"]
                                                                ]))

corpus = list(df["DESCRIPTION"].values)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

threshold = 0.4

for x in range(0,X.shape[0]):
  for y in range(x,X.shape[0]):
    if(x!=y):
      if(cosine_similarity(X[x],X[y])>threshold):
        print(df["ID"][x],":",corpus[x])
        print(df["ID"][y],":",corpus[y])
        print("Cosine similarity:",cosine_similarity(X[x],X[y]))
        print()

10 : Cancel ASN WMS Cancel ASN
13 : Cancel ASN WMS Cancel ASN
Cosine similarity: [[1.]]

11 : MAXPREDO Validation is corect
14 : MAXPREDO Validation is right
Cosine similarity: [[0.64183024]]

12 : Move to QC
17 : Move to QC
Cosine similarity: [[1.]]

15 : Verify files are sent every hours for this interface from Optima
18 : Verify files are not sent
Cosine similarity: [[0.44897995]]



##### Content-Based

In [366]:
def totext_age(usiamin=15, usiamax=65):
    age = []
    for usia in range(usiamin, usiamax + 1):
        age.append('U' + str(usia))
    return ' '.join(age)

def totext_iq(iqmin=80, iqmax=200):
    iq = []
    for q in range(iqmin, iqmax + 1):
        iq.append('IQ' + str(q))
    return ' '.join(iq)

def totext_salary(salmin, salmax):
    # salary = []
    # for sal in range(salmin, salmax + 1, 500_000):
    #     salary.append('SALARY' + str(sal))
    # return ' '.join(salary)
    salary = []
    for smin, smax, in zip(salmin.values, salmax.values):
        saltxt = []

        if smin == 0 and smax == 0:
            saltxt.append(str(0))
        for sal in range(smin, smax + 1, 100_000):
            saltxt.append(str(sal))

        saltxt = ' '.join(saltxt)
    else:
        salary.append(saltxt)
    return salary
    

def totext_dl(dl):
    if dl == '':
        return ''
    elif dl == '0':
        return ''
    else:
        return 'DRIVER' + str(dl)

In [367]:
# for i,j in zip(df.SalaryMin.values, df.SalaryMax.values):
#     print('i, j', i, j)
#     for sal in range(i, j+1, 100000):
#         print(sal, end=' ')
#     print()

In [368]:
cdf_applicant = pd.read_csv('data/cdf_applicant.csv')
cdf_applicant_experience = pd.read_csv('data/cdf_applicant_experience.csv')
cdf_pipeline = pd.read_csv('data/cdf_pipeline.csv')
cdf_stage = pd.read_csv('data/cdf_stage.csv')

cdf_job = pd.read_csv('data/cdf_job.csv')
cdf_job.drop(columns=['IsSpecificEducationLevel', 'IsSpesificDriverLicense', 'IsSpecificAge', 'IsSpecificGender', 'IsSpecificIQ', 'IsSpecificMarital'], inplace=True)

df_function_position = pd.read_csv('data/df_function_position.csv')
df_function_position.drop(columns=['Unnamed: 0'], inplace=True)

df_city = pd.read_csv('data/df_city.csv')
df_city.drop(columns=['Unnamed: 0'], inplace=True)
df_city.rename(columns={'Name': 'City'}, inplace=True)

df_province = pd.read_csv('data/df_province.csv')
df_province.drop(columns=['Unnamed: 0'], inplace=True)
df_city.rename(columns={'Name': 'Province'}, inplace=True)

df_major = pd.read_csv('data/df_major.csv')
df_major.drop(columns=['Unnamed: 0'], inplace=True)

In [369]:
df = cdf_job[['JobID', 'Description', 'FunctionPositionID', 'JobTitle', 'Requirement', 'SalaryMax', 'SalaryMin', 'CityID', 'ProvinceID', 'MajorID', 'UsiaMax', 'IQMin', 'DriverLicenseType', 'Gender', 'MaritalStatus']]

df = pd.merge(df, df_function_position, on=['FunctionPositionID'])
df.drop(columns=['FunctionPositionID'], inplace=True)

df = pd.merge(df, df_city, on=['CityID'])
df.drop(columns=['CityID'], inplace=True)

df = pd.merge(df, df_province, on=['ProvinceID'])
df.drop(columns=['ProvinceID'], inplace=True)

df = pd.merge(df, df_major, on=['MajorID'])
df.drop(columns=['MajorID'], inplace=True)

Fillna

In [370]:
df.fillna('', inplace=True)

Drop

In [371]:
df = df[~df.JobTitle.str.contains('test')]

In [372]:
df.head(2)

Unnamed: 0,JobID,Description,JobTitle,Requirement,SalaryMax,SalaryMin,UsiaMax,IQMin,DriverLicenseType,Gender,MaritalStatus,FunctionPositionName,City,Name,MajorName
0,1,under asst front office manager at gunawangsa ...,night audit,with year minimum experiences in the same posi...,3600000,3300000,0.0,0,0,0,0,NIGHT AUDIT,SURABAYA,JAWA TIMUR,PERHOTELAN
2,2801,menjalin dan membina hubungan baik dengan semu...,tenant relation,usia maksimal tahunpendidikan semua jurusanpen...,0,0,0.0,0,0,0,0,NIGHT AUDIT,SURABAYA,JAWA TIMUR,PERHOTELAN


In [373]:
df[['SalaryMax', 'SalaryMin', 'UsiaMax', 'IQMin']] = df[['SalaryMax', 'SalaryMin', 'UsiaMax', 'IQMin']].fillna(0).replace('', 0).astype(int)

In [374]:
df[df.select_dtypes(object).columns] = df[df.select_dtypes(object).columns].applymap(str.upper)

In [375]:
df.UsiaMax = df.UsiaMax.apply(lambda x: totext_age(usiamax=x) if x != 0 else totext_age())

In [376]:
df.IQMin = df.IQMin.apply(lambda x: totext_iq(iqmin=x) if x != 0 else totext_iq())

In [377]:
df.DriverLicenseType.unique()

array(['0', 'A'], dtype=object)

In [378]:
df.DriverLicenseType = df.DriverLicenseType.apply(lambda x: totext_dl(x) if x != str(0) else '')

In [379]:
df.Gender = df.Gender.apply(lambda x: '' if x == 0 else x)

In [380]:
df.MaritalStatus = df.MaritalStatus.apply(lambda x: '' if x == 0 else x)

In [381]:
df.FunctionPositionName = df.FunctionPositionName.apply(lambda x: re.sub('[^a-zA-Z\s]', ' ', x))

In [382]:
df.head(2)

Unnamed: 0,JobID,Description,JobTitle,Requirement,SalaryMax,SalaryMin,UsiaMax,IQMin,DriverLicenseType,Gender,MaritalStatus,FunctionPositionName,City,Name,MajorName
0,1,UNDER ASST FRONT OFFICE MANAGER AT GUNAWANGSA ...,NIGHT AUDIT,WITH YEAR MINIMUM EXPERIENCES IN THE SAME POSI...,3600000,3300000,U15 U16 U17 U18 U19 U20 U21 U22 U23 U24 U25 U2...,IQ80 IQ81 IQ82 IQ83 IQ84 IQ85 IQ86 IQ87 IQ88 I...,,,,NIGHT AUDIT,SURABAYA,JAWA TIMUR,PERHOTELAN
2,2801,MENJALIN DAN MEMBINA HUBUNGAN BAIK DENGAN SEMU...,TENANT RELATION,USIA MAKSIMAL TAHUNPENDIDIKAN SEMUA JURUSANPEN...,0,0,U15 U16 U17 U18 U19 U20 U21 U22 U23 U24 U25 U2...,IQ80 IQ81 IQ82 IQ83 IQ84 IQ85 IQ86 IQ87 IQ88 I...,,,,NIGHT AUDIT,SURABAYA,JAWA TIMUR,PERHOTELAN


In [383]:
df.isna().sum()

JobID                   0
Description             0
JobTitle                0
Requirement             0
SalaryMax               0
SalaryMin               0
UsiaMax                 0
IQMin                   0
DriverLicenseType       0
Gender                  0
MaritalStatus           0
FunctionPositionName    0
City                    0
Name                    0
MajorName               0
dtype: int64

In [384]:
df.columns

Index(['JobID', 'Description', 'JobTitle', 'Requirement', 'SalaryMax',
       'SalaryMin', 'UsiaMax', 'IQMin', 'DriverLicenseType', 'Gender',
       'MaritalStatus', 'FunctionPositionName', 'City', 'Name', 'MajorName'],
      dtype='object')

In [401]:
df_jobtitle = df.JobTitle
df['Text'] = df.Description + df.JobTitle + df.Requirement + df.UsiaMax + df.IQMin + df.DriverLicenseType + df.FunctionPositionName + df.City + df.Name + df.MajorName
df.Text = df.Text.map(str.lower)
df.JobTitle = df.JobTitle.map(str.lower)
df_train = df[['JobID', 'JobTitle', 'Text']]

In [402]:
df_train.head(2)

Unnamed: 0,JobID,JobTitle,Text
0,1,night audit,under asst front office manager at gunawangsa ...
2,2801,tenant relation,menjalin dan membina hubungan baik dengan semu...


In [425]:
class ErisRecommender:
    def __init__(self, df, col):
        self.df = df
        self.col = col
        self.encoder = None
        self.bank = None
    
    def fit(self):
        self.encoder = TfidfVectorizer()
        self.bank = self.encoder.fit_transform(self.df[self.col])

    def recommend(self, keyword, top=10):
        # content = df.loc[idx, self.col]
        idx = self.df[self.col][self.df[self.col].str.contains(keyword)].index[0]
        content = self.df.loc[idx, self.col]
        # print('Keyword match "' + content + '" content.')
        code = self.encoder.transform([content])

        dist = cosine_similarity(code, self.bank)
        self.df['Similarity'] = dist[0]

        # rec_idx = dist.argsort()[0, 1:top + 1]
        # return self.df.loc[rec_idx]
        
        # return self.df[(self.df.Similarity > .5) & (self.df.Similarity != 1)].sort_values(by=['Similarity'], ascending=False)
        return self.df.sort_values(by='Similarity', ascending=False)


In [426]:
df_train.head()

Unnamed: 0,JobID,JobTitle,Text,Similarity
0,1,night audit,under asst front office manager at gunawangsa ...,0.141482
2,2801,it android programmer,menjalin dan membina hubungan baik dengan semu...,0.053319
5,3086,marketing executive,complete bills payments control accounts recei...,0.113978
6,3087,senior estimator,complete bills payments control accounts recei...,0.113978
7,2750,web programmer,membantu kelancaran pelaksanaan tugas pimpinan...,0.114963


In [427]:
eris = ErisRecommender(df_train, 'Text')
eris.fit()

In [428]:
eris.df[eris.df.JobID == 3]

Unnamed: 0,JobID,JobTitle,Text,Similarity
388,3,admin leasing,developing of android application and their in...,1.0


In [429]:
df[df.JobID == 3]

Unnamed: 0,JobID,Description,JobTitle,Requirement,SalaryMax,SalaryMin,UsiaMax,IQMin,DriverLicenseType,Gender,MaritalStatus,FunctionPositionName,City,Name,MajorName,Text
388,3,DEVELOPING OF ANDROID APPLICATION AND THEIR IN...,it android programmer,USIA MAKSIMAL TAHUN MINIMAL PENDIDIKAN IT ATAU...,5000000,4000000,U15 U16 U17 U18 U19 U20 U21 U22 U23 U24 U25 U2...,IQ80 IQ81 IQ82 IQ83 IQ84 IQ85 IQ86 IQ87 IQ88 I...,,,,IT ANDROID PROGRAMMER,SURABAYA,JAWA TIMUR,TEKNIK INFORMATIKA,developing of android application and their in...


In [430]:
eris.recommend('programmer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Similarity'] = dist[0]


Unnamed: 0,JobID,JobTitle,Text,Similarity
388,3,admin leasing,developing of android application and their in...,1.000000
389,156,sosial media,mengembangkan aplikasi android dan integrasiny...,0.664288
390,2736,chief engineering,mengembangkan aplikasi android dan integrasiny...,0.493283
464,2738,staff outlet,mengembangkan aplikasi android dan integrasiny...,0.483664
393,2996,supervisor ga,mengembangkan aplikasi android dan integrasiny...,0.457252
...,...,...,...,...
847,3068,android programmer,usia minimal tahun pendidikan minimal segala j...,0.042299
816,1274,engineering staff,technical responsibilities menyiapkan dan mela...,0.042287
825,1488,marcomm mall,technical responsibilities menyiapkan dan mela...,0.042094
589,1319,driver,technical responsibilities menyiapkan dan mela...,0.041280


In [51]:
df = cdf_job[['JobID', 'JobTitle']].dropna(how='any')

stopwords = [
    'become', 'for', 'hotel', 'merr', 'at', 'gunawangsa', 'the', 'in', 'on', 'of', 'their', 'and', 'with'
] + [
    'dan', 'untuk'
]
df.JobTitle = df.JobTitle.apply(lambda x: ' '.join([y for y in x.split(' ') if y not in stopwords])).map(remove_morespace).map(str.strip)

In [52]:
eris = ErisRecommender(df, 'JobTitle')
eris.fit()

In [55]:
eris.recommend('legal')

Keyword match "sekretaris legal" content.


Unnamed: 0,JobID,JobTitle,Similarity
719,2841,sekretaris,0.749965
450,1507,sekretaris,0.749965
352,1384,sekretaris,0.749965
327,1358,sekretaris,0.749965
420,1472,sekretaris,0.749965
...,...,...,...
238,1264,sekretaris accounting,0.546369
198,1223,sekretaris accounting,0.546369
835,2976,sekretaris accounting,0.546369
34,44,sekretaris accounting,0.546369
