In [199]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

In [200]:
# Notebook display settings: truncate wide text cells to 10 characters and
# never hide columns when a frame is rendered.
pd.set_option(
    'display.max_colwidth', 10,
    'display.max_columns', None,
)

In [201]:
# Load every raw CSV extract; one DataFrame per file, read in the listed order.
(
    cdf_applicant,
    cdf_applicant_experience,
    cdf_pipeline,
    cdf_stage,
    cdf_job,
) = map(
    pd.read_csv,
    (
        'data/cdf_applicant.csv',
        'data/cdf_applicant_experience.csv',
        'data/cdf_pipeline.csv',
        'data/cdf_stage.csv',
        'data/cdf_job.csv',
    ),
)

In [202]:
# Keep only the job id and its description, discarding rows where either
# value is missing. The surviving rows keep their original index labels.
df = cdf_job.loc[:, ['JobID', 'Description']].dropna()

In [203]:
# Learn a token-count vocabulary from all descriptions; `bank` holds one
# encoded row per row of `df`, in the same order.
bow = CountVectorizer()
bank = bow.fit_transform(df['Description'])

In [204]:
# Use the row labelled 0 as the query document.
idx = 0
content = df.at[idx, 'Description']

In [205]:
# Display the description text that is used as the query below.
content

'under asst front office manager at gunawangsa hotel merr surabaya'

In [206]:
# Encode the query with the already-fitted vectoriser; transform() takes an
# iterable of documents, hence the single-element list.
code = bow.transform([content])

In [207]:
# Densify the encoded query so it can be inspected in the cell output.
code.todense()

matrix([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [208]:
from sklearn.metrics.pairwise import cosine_distances

In [209]:
# Cosine distance from the single query row to every row of `bank`;
# the result has one row (the query) and one column per encoded description.
dist = cosine_distances(code, bank)

In [210]:
# Sanity check: number of rows in `df`, to compare with the distance vector below.
df.shape

(924, 2)

In [211]:
# Sanity check: one distance per row of `df`.
dist[0].shape

(924,)

In [212]:
# Positions (not index labels) of the 10 closest rows, skipping the first
# sorted entry, which is presumably the query row itself at distance 0.
rec_idx = dist[0].argsort()[1:11]

In [213]:
# `rec_idx` comes from argsort, so it holds row *positions* into `bank`/`df`.
# `df` keeps its original index labels after dropna(), which means labels and
# positions can differ; select by position so the right rows are shown.
df.iloc[rec_idx]

Unnamed: 0,JobID,Description
46,57,become...
271,1298,become...
272,1299,become...
133,153,become...
807,2942,become...
802,2937,become...
801,2936,become...
16,22,becomi...
734,2861,become...
375,1411,become...


In [214]:
class ErisRecommender:
    """Content-based recommender over one text column of a DataFrame.

    The column is encoded as token-count vectors with ``CountVectorizer``;
    recommendations are the rows whose cosine distance to a keyword-matched
    row falls below a fixed cut-off.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the rows to recommend from. It is stored by reference
        and is never modified by this class.
    col : str
        Name of the text column in ``df`` to vectorise and search.
    """

    def __init__(self, df, col):
        self.df = df
        self.col = col
        # Both are populated by fit(); None means "not fitted yet".
        self.encoder = None
        self.bank = None

    def fit(self):
        """Fit the vectoriser on ``self.df[self.col]`` and cache the encoded rows."""
        self.encoder = CountVectorizer()
        self.bank = self.encoder.fit_transform(self.df[self.col])

    def recommend(self, keyword, top=10):
        """Return rows similar to the first row whose text contains ``keyword``.

        Parameters
        ----------
        keyword : str
            Pattern passed to ``Series.str.contains`` on the configured
            column (so it is interpreted as a regular expression, which is
            that method's default). The first matching row is the query.
        top : int, default 10
            Currently not applied; accepted for backward compatibility.
            Every row under the distance cut-off is returned.

        Returns
        -------
        pandas.DataFrame
            A copy of the stored frame restricted to rows with
            ``0 < Similarity < 0.5``, sorted ascending by ``Similarity``.
            ``Similarity`` holds the cosine *distance* to the query row
            (smaller means closer), and ``JobTitle`` is filled from the
            notebook-level ``cdf_job`` frame.

        Raises
        ------
        AttributeError
            If ``fit()`` has not been called yet.
        IndexError
            If no row contains ``keyword``.
        """
        if self.encoder is None or self.bank is None:
            raise AttributeError(
                'ErisRecommender.fit() must be called before recommend()'
            )

        # Search the frame this recommender was built on, not whatever the
        # notebook-level name `df` happens to point at.
        texts = self.df[self.col]
        matches = texts[texts.str.contains(keyword)]
        if matches.empty:
            raise IndexError(
                'no row in column %r contains keyword %r' % (self.col, keyword)
            )
        content = matches.iloc[0]
        print('Keyword match "' + content + '" content.')

        code = self.encoder.transform([content])
        dist = cosine_distances(code, self.bank)

        # Work on a copy: writing into self.df would leak columns into the
        # caller's frame and, when col == 'JobTitle', overwrite the very text
        # the encoder was fitted on, breaking later calls.
        result = self.df.copy()
        result['Similarity'] = dist[0]

        # NOTE: depends on a notebook-level `cdf_job` frame; values are aligned
        # to `result` by index label.
        result['JobTitle'] = cdf_job.JobTitle.dropna()

        # Distance 0 is excluded so exact duplicates of the query (including
        # the query row itself) are not recommended.
        is_close = (result.Similarity < .5) & (result.Similarity != 0)
        return result[is_close].sort_values(by=['Similarity'], ascending=True)


In [215]:
import re

from transform import remove_morespace

In [216]:
# Rebuild `df` around the job title instead of the description, again
# dropping rows where either value is missing (index labels are preserved).
df = cdf_job.loc[:, ['JobID', 'JobTitle']].dropna()

In [218]:
# Words stripped from job titles before vectorising (English terms first,
# then the two Indonesian ones).
stopwords = [
    'become', 'for', 'hotel', 'merr', 'at', 'gunawangsa', 'the', 'in', 'on',
    'of', 'their', 'and', 'with',
    'dan', 'untuk',
]
# Drop stopwords token by token, then normalise spacing and trim the ends.
df['JobTitle'] = (
    df['JobTitle']
    .apply(lambda title: ' '.join(word for word in title.split(' ') if word not in stopwords))
    .map(remove_morespace)
    .map(str.strip)
)

In [219]:
# Build the recommender over the cleaned job titles and fit its vocabulary.
eris = ErisRecommender(df=df, col='JobTitle')
eris.fit()

In [224]:
# Use the first title containing "programmer" as the query and display the
# rows within the recommender's distance cut-off.
eris.recommend('programmer')

Keyword match "it android programmer" content.


Unnamed: 0,JobID,JobTitle,Similarity
725,2848,androi...,0.183503
847,2996,androi...,0.183503
872,3036,it pro...,0.183503


In [16]:
# By this point `df` has been rebound to the JobID/JobTitle frame and no
# longer has a 'Description' column, so read the text from `cdf_job`, whose
# index labels `df` shares.
cdf_job.loc[271, 'Description']

'become asst chief engineering at gunawangsa hotel merr'