In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

In [16]:
pd.set_option('display.max_colwidth', 10)
pd.set_option('display.max_columns', None)

In [45]:
cdf_applicant = pd.read_csv('data/cdf_applicant.csv')
cdf_applicant_experience = pd.read_csv('data/cdf_applicant_experience.csv')
cdf_pipeline = pd.read_csv('data/cdf_pipeline.csv')
cdf_stage = pd.read_csv('data/cdf_stage.csv')
cdf_job = pd.read_csv('data/cdf_job.csv')

In [46]:
df = cdf_job[['JobID', 'Description']].dropna(how='any')

In [47]:
bow = CountVectorizer()
bank = bow.fit_transform(df.Description)

In [48]:
idx = 0
content = df.loc[idx, 'Description']

In [49]:
content

'under asst front office manager at gunawangsa hotel merr surabaya'

In [50]:
code = bow.transform([content])

In [51]:
code.todense()

matrix([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [52]:
from sklearn.metrics.pairwise import cosine_distances

In [53]:
dist = cosine_distances(code, bank)

In [56]:
df.shape

(924, 2)

In [79]:
dist[0].shape

(924,)

In [11]:
rec_idx = dist.argsort()[0, 1:11]

In [12]:
df.loc[rec_idx]

Unnamed: 0,JobID,Description
46,57,become asst food beverages manager at gunawang...
271,1298,become asst chief engineering at gunawangsa ho...
272,1299,become asst chief accounting at gunawangsa hot...
133,153,become sales executive at gunawangsa hotel merr
807,2942,become daily worker at gunawangsa hotel merr
802,2937,become order taker at gunawangsa hotel merr
801,2936,become housekeeping supervisor at gunawangsa h...
16,22,becoming engineering staff at hotel gunawangsa...
734,2861,become kitchen staff at gunawangsa hotel merr
375,1411,become banquet attendant at gunawangsa hotel merr


In [133]:
class ErisRecommender:
    def __init__(self, df, col):
        self.df = df
        self.col = col
        self.encoder = None
        self.bank = None
    
    def fit(self):
        self.encoder = CountVectorizer()
        self.bank = self.encoder.fit_transform(self.df[self.col])

    def recommend(self, keyword, top=10):
        # content = df.loc[idx, self.col]
        idx = df[self.col][df[self.col].str.contains(keyword)].index[0]
        content = df.loc[idx, self.col]
        print('Keyword match "' + content + '" content.')
        code = self.encoder.transform([content])

        dist = cosine_distances(code, self.bank)
        self.df['Similarity'] = dist[0]

        # rec_idx = dist.argsort()[0, 1:top + 1]
        # return self.df.loc[rec_idx]
        
        return self.df[(self.df.Similarity < .3) & (self.df.Similarity != 0)].sort_values(by=['Similarity'], ascending=True)


In [134]:
import re

from transform import remove_morespace

In [135]:
df = cdf_job[['JobID', 'Description']].dropna(how='any')

In [136]:
stopwords = ['become', 'for', 'hotel', 'merr', 'at', 'gunawangsa', 'the', 'in', 'on', 'of', 'their', 'and', 'with'] + ['dan', 'untuk']
df.Description = df.Description.apply(lambda x: ' '.join([y for y in x.split(' ') if y not in stopwords])).map(remove_morespace).map(str.strip)

In [137]:
eris = ErisRecommender(df, 'Description')
eris.fit()

In [138]:
eris.recommend('sekretaris')

Keyword match "bertanggung jawab melakukan support atas segala kebutuhan marketingbertanggung jawab menerima telpon masuk baik dari klien baru ataupun klien existingmembuat penawaran harga klienmengirimkan penawaran harga kepada klienmelakukan follow up kepada klienmengelola segala kebutuhan promosi komunikasi baik hard maupun softcopymembuat materi keperluan marketing misalnya email blast korespondensi dengan klien dllmelakukan support dalam administrasi tender melakukan pengklasifikasian menjaga data datamemasukan data data ke sistem erp menjalin komunikasi yang baik dengan klien mempertahankan klien yang telah adamelakukan koordinasi dengan divisi lain yang terkait dengan kegiatan marketing misalnya maintenancemelakukan pelaporan mengenai produk yang telah disewa digunakan oleh klien misalnya laporan foto billboardmengerjakan pekerjaan marketing administrasi kesekretarisan lainnyamembuat laporan harianmengirimkan penawaran perpanjangan harga" content.


Unnamed: 0,JobID,Description,Similarity
13,17,bertanggung jawab melakukan support atas segal...,4.440892e-16
199,1224,bertanggung jawab melakukan support atas segal...,4.440892e-16
302,1329,bertanggung jawab melakukan support atas segal...,4.440892e-16
561,1639,bertanggung jawab melakukan support atas segal...,4.440892e-16
574,1656,bertanggung jawab melakukan support atas segal...,4.440892e-16
660,2769,bertanggung jawab melakukan support atas segal...,4.440892e-16
752,2880,bertanggung jawab melakukan support atas segal...,4.440892e-16
498,1566,description bertanggung jawab melakukan suppor...,0.002296214


In [16]:
df.loc[271, 'Description']

'become asst chief engineering at gunawangsa hotel merr'