In [232]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transform import remove_html, maintain_alpha, remove_single, remove_morespace

In [233]:
pd.set_option('display.max_colwidth', 10)
pd.set_option('display.max_columns', None)

In [234]:
df_job = pd.read_csv('data/cdf_job.csv')
df_applicant = pd.read_csv('data/cdf_applicant.csv')
df_applicant_experience = pd.read_csv('data/cdf_applicant_experience.csv')

df_function_position = pd.read_csv('data/df_function_position.csv', index_col=['FunctionPositionID'])
df_job = pd.merge(df_job, df_function_position, on=['FunctionPositionID'])

df_job = df_job[['JobID', 'Description', 'JobTitle', 'Requirement', 'FunctionPositionName']]

df_applicant = df_applicant[['ApplicantID', 'Strengthness', 'Weaknesses']]
df_applicant_experience = df_applicant_experience[['ApplicantID', 'Industry', 'JobDescription', 'Position']]

df_applicant = pd.merge(df_applicant, df_applicant_experience, on=['ApplicantID'])

In [235]:
df_applicant.head(2)

Unnamed: 0,ApplicantID,Strengthness,Weaknesses,Industry,JobDescription,Position
0,1,0,0,jasa k...,pembua...,staff ...
1,3,pekerj...,pelupa...,teleko...,mainte...,sales ...


In [236]:
df_job.set_index('JobID', inplace=True)
df_job = df_job.astype(str)

df_applicant.set_index('ApplicantID', inplace=True)
df_applicant = df_applicant.astype(str)

In [237]:
df_job = df_job.applymap(str.lower).applymap(remove_html).applymap(maintain_alpha).applymap(remove_single).applymap(remove_morespace).applymap(str.strip)
df_applicant = df_applicant.applymap(str.lower).applymap(remove_html).applymap(maintain_alpha).applymap(remove_single).applymap(remove_morespace).applymap(str.strip)



  return BeautifulSoup(text).get_text()


In [238]:
df_applicant.head(3)

Unnamed: 0_level_0,Strengthness,Weaknesses,Industry,JobDescription,Position
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,jasa k...,pembua...,staff ...
3,pekerj...,pelupa...,teleko...,mainte...,sales ...
10,konsis...,terlal...,event ...,bertug...,market...


In [181]:
df_job['JobID'] = df_job.index
df_job['Text'] = df_job.Description + df_job.JobTitle + df_job.Requirement + df_job.FunctionPositionName
df_job.drop(columns=['Description', 'JobTitle', 'Requirement', 'FunctionPositionName'], inplace=True)

df_applicant['ApplicantID'] = df_applicant.index
df_applicant['History'] = df_applicant.Strengthness + df_applicant.Weaknesses + df_applicant.Industry + df_applicant.JobDescription + df_applicant.Position
df_applicant.drop(columns=['Strengthness', 'Weaknesses', 'Industry', 'JobDescription', 'Position'])

Unnamed: 0_level_0,ApplicantID,History
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,jasa k...
3,3,pekerj...
10,10,konsis...
13,13,swotsw...
18,18,pemeri...
...,...,...
44921,44921,furnit...
44922,44922,pt har...
44925,44925,keuang...
44928,44928,indust...


In [182]:
df_applicant.drop(columns=['ApplicantID'], inplace=True)

In [183]:
df_job.drop(columns=['JobID'], inplace=True)

In [184]:
from sklearn.cluster import KMeans
import re

In [185]:
df_job.Text = df_job.Text.apply(lambda x: re.sub('yang|tahun|membuat|di|dengan|dan|kepada|secara|at|and|to|in|become|untuk|di|years|as|for|be|with|of|the|in|memiliki|mimal|jawab|sur|all|reled|let|or|au|melakukan|mengerjakan|pada|shootg|apliki|admistri|hil|gunakan|ngkel|targetnyamelakukan|samamemiliki|prospektifmengembangkan|lebih|networkg|marketg|able', '', x))

In [186]:
tfidf_vectorizer = TfidfVectorizer()

job_tfidf = tfidf_vectorizer.fit_transform((df_job.Text))
user_tfidf = tfidf_vectorizer.transform(df_applicant.History)

kmeans = KMeans(random_state=42)
kmeans.fit(job_tfidf)

clusters = kmeans.labels_



In [208]:
from sklearn.metrics.pairwise import linear_kernel

In [210]:
cosine_sim = linear_kernel(job_tfidf, job_tfidf)

In [212]:
original = pd.read_csv('data/cdf_job.csv')

In [219]:
original.head(2)

Unnamed: 0,JobID,Description,EducationLevelID,FunctionPositionID,JobStatus,JobTitle,Requirement,SalaryMax,SalaryMin,HiredQuota,DepartmentID,CityID,CompanyID,ProvinceID,SkipTest,MajorID,DbName,UsiaMax,UsingGlasses,IQMin,IsSpecificEducationLevel,DriverLicenseType,IsSpesificDriverLicense,Gender,IsSpecificAge,IsSpecificGender,IsSpecificIQ,IsSpecificMarital,MaritalStatus,AdsStatisticID
0,1,under ...,4,1,Close,night ...,with y...,3600000,3300000,1,1,1,7,1,True,1,WarnaW...,0.0,False,0,False,0,False,0,False,False,False,False,0,0
1,2,becomi...,2,2,Close,book k...,bachel...,4250000,3500000,1,2,1,7,1,True,2,WarnaW...,0.0,False,0,False,0,False,0,False,False,False,False,0,0


In [221]:
indices

JobTitle
night audit                     0
book keeper                     1
it android programmer           2
sekretaris legal                3
sekretaris direksi              4
                             ... 
general manager hotel         921
marketing staff               922
manager finance accounting    923
general affair ga             924
social media officer          925
Length: 926, dtype: int64

In [214]:
indices = pd.Series(original.index, index=original['JobTitle']).drop_duplicates()

In [225]:
def get_recommendations_cb(title, cosine_sim=cosine_sim):
    index = indices.str.contains(title)
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    print(movie_indices)
    return original['JobTitle'].iloc[movie_indices]

In [187]:
df = pd.DataFrame(job_tfidf.todense()).groupby(clusters).mean()

In [188]:
terms = tfidf_vectorizer.get_feature_names_out()

In [189]:
for i,r in df.iterrows():
    print('cluster', i)
    print(','.join([terms[t] for t in np.argsort(r)[-100:]]))

cluster 0
kegian,pelaksanaan,apartment,memrikan,manager,penagihan,hubungan,charge,legusia,fance,tempo,juh,setiap,voice,utility,calon,rtanggung,keluhan,barang,out,program,menjalankan,pertanggung,penawaranadm,big,setelah,segala,perubahan,meetg,menyusun,dokumen,kerja,peralan,la,star,sebagai,sewa,gunawangsa,perusahaan,jadwal,keselaman,chief,lapan,relion,property,faktur,dap,membantu,sesuai,pengalaman,jurusan,unit,koni,target,maksimal,dari,gedung,akuntansi,accountg,baik,staff,proses,bank,penkan,admistri,karyawan,memonir,ada,pekerjaan,skill,serta,khusus,sipl,penempan,kecil,engeerg,pajak,komunikif,iented,penyewa,isiif,keuangan,tugkan,prosedur,leg,presention,tanggung,detail,jujur,mempunyai,oleh,penghuni,lanya,departemen,memtikan,mimal,menyiapkan,pembayaran,tenant,semua
cluster 1
jurusanpengalaman,maksimal,la,membantu,mengkonikan,management,pengadaan,target,keuangan,menarik,tug,mengenai,penunjang,persiapan,mampu,kebutuhan,et,menyur,jurusan,reksiusia,manager,mengevalui,eksternal,time,sekretaris,p

In [190]:
kmeans.predict(user_tfidf[0, :])

array([7])

In [191]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import surprise
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader, Dataset
from surprise import KNNBasic

In [192]:
df_applicant['ApplicantID'] = df_applicant.index

In [193]:
df_pipeline = pd.read_csv('data/cdf_pipeline.csv')
df_stage = pd.read_csv('data/cdf_stage.csv')

df_stage = pd.merge(df_pipeline, df_stage, on=['StageID'])

In [194]:
df_stage = df_stage[['ApplicantID', 'StageID']]

In [195]:
df_applicant.index.names = ['Index']

In [197]:
df_applicant = pd.merge(df_applicant, df_stage, on=['ApplicantID'])

In [199]:
df_applicant = df_applicant[['History', 'ApplicantID', 'StageID']]

In [200]:
reader = Reader()
data = Dataset.load_from_df(df_applicant, reader)

# Train & Test
trainset, testset = train_test_split(data, test_size=0.20, random_state=50)

In [202]:
algo_svd = SVD()
prediction_mf = algo_svd.fit(trainset).test(testset)

In [206]:
recom_svd = algo_svd.predict(uid='data warehouse', iid='AWMjT0WguC1rwyj_rFh3')
recom_svd

Prediction(uid='data warehouse', iid='AWMjT0WguC1rwyj_rFh3', r_ui=None, est=5, details={'was_impossible': False})

In [245]:
df_applicant.head(2)

Unnamed: 0_level_0,Strengthness,Weaknesses,Industry,JobDescription,Position
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,jasa k...,pembua...,staff ...
3,pekerj...,pelupa...,teleko...,mainte...,sales ...


In [296]:
tes = df_job.JobTitle

In [307]:
for i in tes:
    print(i)

night audit
internal audit staff
staff
internal audit
test
manager marketing
staff hrd
tenant relation
admin support business development
leasing apartemen
marketing staff
admin support research development
leasing apartment
test data hilang
test job
finance accounting
finance accounting
book keeper
book keeper accounting department
it android programmer
it support
it android programmer
admin it
it android programmer
it android programmer
network dan it security specialist
android programmer
android programmer
sekretaris legal
sekretaris legal
sekretaris legal pajak ijin
sekretaris direksi
executive secretary
senior sekretaris
sekretaris direksi
sekretaris
sekretaris lokasi
sekretaris lokasi
senior sekretaris
sekretaris direksi
sekretaris lokasi
sekretaris
sekretaris direksi
sekretaris
sekretaris
sekretaris direksi
sekretaris
sekretaris
sekretaris direksi
sekretaris direksi
sekretaris direksi
sekretaris
sekretaris
sekretaris direksi
sekretaris direksi
sekretaris direksi
sekretaris dire

In [316]:
df_job.JobTitle.value_counts()

legal staff              35
driver                   27
engineering staff        26
sekretaris               18
sekretaris direksi       17
                         ..
sous chef                 1
kitchen                   1
internal audit staff      1
waiter waitress resto     1
surveyor lapangan         1
Name: JobTitle, Length: 372, dtype: int64

In [313]:
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(tes.drop_duplicates().values)

pairwise_similarity = matrix * matrix.T

arr = pairwise_similarity.toarray()
np.fill_diagonal(arr, 0)

input_doc = 'estimator'

input_idx = list(tes.values).index(input_doc)

result_idx = np.nanargmax(arr[input_idx])

list(tes.values)[result_idx]

'night audit'

In [303]:
arr

array([[       nan, 0.44131453, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.44131453,        nan, 0.2546966 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.2546966 ,        nan, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ...,        nan, 0.50447859,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.50447859,        nan,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
               nan]])

Unnamed: 0_level_0,Description,JobTitle,Requirement,FunctionPositionName
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
