In [7]:
import json
import os
import re
from collections import defaultdict
from urllib.parse import quote

import pyodbc

import numpy as np
import pandas as pd
import tensorflow as tf

from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity

from gensim import corpora, models, similarities
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from gensim.models.word2vec import Word2Vec

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from googletrans import Translator
from deep_translator import GoogleTranslator

from FlaskApp.transform import *

from job_model import JobModel
from app_model import AppModel

import logging
# Log records carry a timestamp and level prefix; INFO and above are emitted.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [55]:
# Shared NLP helpers: the Google Translate client and the Sastrawi stemmer.
translator = Translator(service_urls=['translate.googleapis.com'])
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# First column of the root-word file as a plain Python list.
# NOTE(review): read_csv treats the first line as a header by default, so the
# first line of rootwords.txt does not end up in this list — confirm the file
# really starts with a header row.
rootwords = [i[0] for i in pd.read_csv('data/rootwords.txt').values]

# JSON is UTF-8 by specification. The encoding is passed explicitly so that
# loading does not depend on the platform's default locale encoding.
with open('data/slangwords.json', 'r', encoding='utf-8') as file:
    slangwords = json.load(file)

with open('data/englishwords.json', 'r', encoding='utf-8') as file:
    englishwords = json.load(file)

In [9]:
# Notebook display settings: no limit on the number of columns shown,
# and cell text truncated at 50 characters.
display_options = {
    'display.max_columns': None,
    'display.max_colwidth': 50,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [10]:
# Connection settings are read from the environment so that credentials are
# not stored in the notebook. Non-secret settings fall back to the local
# development values; the password deliberately has no default.
user = os.environ.get('HR_DB_USER', 'huda')
password = os.environ.get('HR_DB_PASSWORD')
host = os.environ.get('HR_DB_HOST', '127.0.0.1')
port = int(os.environ.get('HR_DB_PORT', 1433))
database = os.environ.get('HR_DB_NAME', 'HRSystemDB')

if password is None:
    raise RuntimeError(
        'Set the HR_DB_PASSWORD environment variable before running this notebook.'
    )


def get_connection():
    """Create the SQLAlchemy engine for the HR database (mssql+pyodbc).

    The user name and password are percent-encoded so that characters such as
    ``@``, ``:`` or ``/`` cannot corrupt the URL. The driver name is written
    as ``SQL+Server``, the query-string encoding of ``SQL Server``.

    Returns:
        The engine returned by ``sqlalchemy.create_engine``.
    """
    return create_engine(
        url=(
            f"mssql+pyodbc://{quote(user, safe='')}:{quote(password, safe='')}"
            f"@{host}:{port}/{database}?driver=SQL+Server"
        ),
    )

engine = get_connection()
conn = engine.connect()

In [59]:
# NOTE(review): applicant_id is not referenced by any query in this cell.
applicant_id = 31790


def _read_query(sql):
    """Execute ``sql`` on the module-level ``engine`` and wrap the result in a DataFrame."""
    return pd.DataFrame(engine.execute(sql))


# Jobs joined with their lookup tables.
# NOTE(review): every join here is a RIGHT JOIN, which keeps lookup rows even
# when no Job matches them, so rows with a NULL JobID can be returned —
# confirm that LEFT JOIN was not the intent.
df_job = _read_query(
    """
    SELECT Job.JobID, Job.UsiaMax, Job.SalaryMin, Job.SalaryMax, City.Name AS CityName, Province.Name AS ProvinceName, EducationLevel.EducationLevelName, Major.MajorName, Job.DriverLicenseType, Job.UsingGlasses, Job.Gender, Job.MaritalStatus, Job.JobTitle, FunctionPosition.FunctionPositionName, Job.Description, Job.Requirement
    FROM (((((Job
    RIGHT JOIN FunctionPosition ON Job.FunctionPositionID = FunctionPosition.FunctionPositionID)
    RIGHT JOIN EducationLevel ON Job.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN City ON Job.CityID = City.CityID)
    RIGHT JOIN Province ON Job.ProvinceID = Province.ProvinceID)
    RIGHT JOIN Major ON Job.MajorID = Major.MajorID)
    """
)

# Plain lookup tables (id + display name).
df_function = _read_query(
    """
    SELECT FunctionPositionID, FunctionPositionName
    FROM FunctionPosition
    """
)

df_education = _read_query(
    """
    SELECT EducationLevelID, EducationLevelName
    FROM EducationLevel
    """
)

df_city = _read_query(
    """
    SELECT CityID, Name AS CityName
    FROM City
    """
)

df_province = _read_query(
    """
    SELECT ProvinceID, Name AS ProvinceName
    FROM Province
    """
)

df_major = _read_query(
    """
    SELECT MajorID, MajorName
    FROM Major
    """
)

# Applicant data. Each query LEFT JOINs Pipeline without selecting any of its
# columns.
# NOTE(review): that join can repeat a row once per matching Pipeline entry —
# confirm this is wanted.
df_applicant = _read_query(
    """
    SELECT Applicant.ApplicantID, Applicant.Dob, Applicant.ExpectedSalary, City.Name AS CityName, Province.Name AS ProvinceName, Applicant.DriverLicenseType, Applicant.IsUsingGlasses, Applicant.Gender, Applicant.MaritalStatus, Applicant.Strengthness
    FROM (((Applicant
    RIGHT JOIN City ON Applicant.CurrentAddressCityID = City.CityID)
    RIGHT JOIN Province ON Applicant.CurrentAddressProvinceID = Province.ProvinceID)
    LEFT JOIN Pipeline ON Applicant.ApplicantID = Pipeline.ApplicantID)
    """
)

df_applicant_education = _read_query(
    """
    SELECT ApplicantEducation.ApplicantID, ApplicantEducation.DateStart, ApplicantEducation.DateEnd, EducationLevel.EducationLevelName, Major.MajorName
    FROM (((ApplicantEducation
    RIGHT JOIN EducationLevel ON ApplicantEducation.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN Major ON ApplicantEducation.MajorID = Major.MajorID)
    LEFT JOIN Pipeline ON ApplicantEducation.ApplicantID = Pipeline.ApplicantID)
    """
)

df_applicant_experience = _read_query(
    """
    SELECT ApplicantExperience.ApplicantID, ApplicantExperience.DateFrom, ApplicantExperience.DateTo, ApplicantExperience.Position, ApplicantExperience.JobDescription
    FROM (ApplicantExperience
    LEFT JOIN Pipeline ON ApplicantExperience.ApplicantID = Pipeline.ApplicantID)
    """
)

In [60]:
# Column groups of df_job, split by the kind of value they hold.
job_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'JobTitle', 'FunctionPositionName', 'Description', 'Requirement']
job_num = ['UsiaMax', 'SalaryMin', 'SalaryMax']
job_bol = ['UsingGlasses']

# General clean-up: index the frame by JobID and blank out missing values.
# The guard keeps this cell re-runnable: once JobID has become the index it is
# no longer a column, and an unguarded set_index would raise KeyError.
if 'JobID' in df_job.columns:
    df_job.set_index(['JobID'], inplace=True)
df_job.fillna('', inplace=True)

# Text columns: lower-case every value, then blank any cell in the frame whose
# value is exactly the string 'none'.
df_job[job_str] = df_job[job_str].applymap(str.lower)
df_job.replace('none', '', inplace=True)

In [61]:
# Normalise the job descriptions in three passes: clean_text, then
# change_words with the slangwords mapping, then the Sastrawi stemmer.
descriptions_cleaned = df_job.Description.map(clean_text)
descriptions_reworded = descriptions_cleaned.apply(
    lambda text: change_words(slangwords, text)
)
df_job.Description = descriptions_reworded.map(stemmer.stem)

In [62]:
# Preview the processed Description column.
df_job.Description

JobID
1       under asisten front office manager at hotel me...
2                     becoming book keeper for hotel merr
3       developing of android application and their in...
4       laku tugas sekretariat guna terminologi prosed...
5       bantu handle kerja atas buat surat surat atur ...
                              ...                        
3090    pimpin usaha dan jadi motivator bagi karyawan ...
3091                                                     
3092    kuasa brevet dan lapor pajak dapat buat dan au...
3093    erti dan paham atur undang yang laku serta pro...
3099    pikir kreatif untuk hasil ide ide dan konsep k...
Name: Description, Length: 1008, dtype: object

In [74]:
# Spot-check the stemmer on a two-word phrase.
stemmer.stem('bertanggung jawab')

'tanggung jawab'

In [63]:
# Unique whitespace-separated tokens across all job descriptions.
all_descriptions = ' '.join(df_job.Description.values)
corpus = list(set(all_descriptions.split()))

In [64]:
# Drop the tokens that are known root words. The lookup goes through a set so
# each membership test is constant-time instead of a scan of the whole
# rootwords list per token; the resulting list and its order are unchanged.
_rootword_lookup = set(rootwords)
corpus = [token for token in corpus if token not in _rootword_lookup]

In [71]:
# Sort the remaining tokens in place.
corpus.sort()

In [72]:
# Print every remaining token on its own line for manual review.
for token in corpus:
    print(token)

abilities
able
about
acc
accommodation
accordance
account
accounting
accounts
accrued
accumatica
accurate
achieve
achieves
across
act
action
activities
activity
acurate
address
adjustment
administration
administrative
ads
advertise
advertisement
advertisements
advertising
advice
advise
affair
affairs
after
age
agencies
agency
agent
agents
aging
agreement
all
also
alterations
always
an
analysing
analysis
analytics
analyze
analyzing
android
annual
apartement
apartment
applicable
applicant
applicants
application
applications
apply
appointment
appointments
appraising
apprentice
approaches
areas
arrange
arrangement
arrangements
arranging
art
articles
as
aspects
assess
assessing
assessment
assest
asset
assets
assigned
assist
assistance
at
attendance
attendant
attentation
attract
attractive
authority
autocad
awareness
back
backup
balance
banner
banquet
bap
base
bast
become
becoming
bekerjasama
benefit
berfikir
bertanggungjawab
bertannggung
best
beverage
beverages
bi
bill
billboard
billing
bil

In [46]:
# Collect the tokens whose detected language is English with a confidence
# above 0.75. translator.detect is called once per token.
english_corpus = []
for token in corpus:
    detection = translator.detect(token)
    is_confident_english = detection.lang == 'en' and detection.confidence > .75
    if is_confident_english:
        english_corpus.append(token)

ConnectTimeout: _ssl.c:980: The handshake operation timed out

In [None]:
# Collect the tokens whose detected language is Indonesian with a confidence
# above 0.75. translator.detect is called once per token.
# Indonesian is reported as 'id' (ISO 639-1); the previous comparison against
# 'in' alone could not match that code. The legacy code 'in' is still accepted
# so nothing that matched before is lost.
indonesian_corpus = []
for i in corpus:
    translated = translator.detect(i)
    if translated.lang in ('id', 'in') and translated.confidence > .75:
        indonesian_corpus.append(i)

In [None]:
# Tokens that ended up in neither the English nor the Indonesian list.
# A combined set replaces the two list scans per token; corpus order is kept.
_classified_tokens = set(english_corpus) | set(indonesian_corpus)
another_corpus = [token for token in corpus if token not in _classified_tokens]

In [None]:
# Display the tokens collected in english_corpus.
english_corpus

['from',
 'warehouse',
 'inspection',
 'informative',
 'efficiency',
 'should',
 'reach',
 'consistency',
 'head',
 'account',
 'followers',
 'includes',
 'cook',
 'becoming',
 'learners',
 'establish',
 'agency',
 'research',
 'promoting',
 'relationship',
 'back',
 'art',
 'interests',
 'foodcourt',
 'monthly',
 'information',
 'outlet',
 'testing',
 'the',
 'prosedur',
 'photoshop',
 'troubleshooting',
 'with',
 'waitress',
 'about',
 'internal',
 'vehicles',
 'which',
 'eats',
 'cover',
 'microsoft',
 'office',
 'official',
 'optimally',
 'closing',
 'selecting',
 'creating',
 'identity',
 'effictiveness',
 'health',
 'asersi',
 'cashflow',
 'field',
 'function',
 'business',
 'early',
 'house',
 'treatment',
 'softcopy',
 'hardcopi',
 'influencers',
 'workstations',
 'action',
 'operating',
 'highest',
 'leadership',
 'subordinates',
 'offline',
 'apply',
 'intermediate',
 'rectification',
 'specifications',
 'vehicle',
 'creativity',
 'including',
 'nursery',
 'their',
 'or',
 's

In [18]:
# Display english_corpus (same expression as the earlier display cell).
english_corpus

['from',
 'warehouse',
 'inspection',
 'informative',
 'efficiency',
 'should',
 'reach',
 'consistency',
 'head',
 'account',
 'followers',
 'includes',
 'cook',
 'becoming',
 'learners',
 'establish',
 'agency',
 'research',
 'promoting',
 'relationship',
 'back',
 'art',
 'interests',
 'foodcourt',
 'monthly',
 'information',
 'outlet',
 'testing',
 'the',
 'prosedur',
 'photoshop',
 'troubleshooting',
 'with',
 'waitress',
 'about',
 'internal',
 'vehicles',
 'which',
 'eats',
 'cover',
 'microsoft',
 'office',
 'official',
 'optimally',
 'closing',
 'selecting',
 'creating',
 'identity',
 'effictiveness',
 'health',
 'asersi',
 'cashflow',
 'field',
 'function',
 'business',
 'early',
 'house',
 'treatment',
 'softcopy',
 'hardcopi',
 'influencers',
 'workstations',
 'action',
 'operating',
 'highest',
 'leadership',
 'subordinates',
 'offline',
 'apply',
 'intermediate',
 'rectification',
 'specifications',
 'vehicle',
 'creativity',
 'including',
 'nursery',
 'their',
 'or',
 's

In [171]:
# Number of tokens currently in corpus.
len(corpus)

1981

In [None]:
# Disabled experiment, kept for reference: translate every corpus token from
# English to Indonesian and drop the tokens whose translation equals the input.
# NOTE(review): `slang` and `stemmed`, used by the last loop, are not defined
# in any visible cell.

# translator = GoogleTranslator(source='en', target='id')

# corpus.sort()

# corpused = {}
# for original in corpus:
#     translated = translator.translate(original)
#     corpused[original] = str.lower(translated)

# no_translate = []
# for key in corpused:
#     if key == corpused[key]:
#         no_translate.append(key)

# for key in no_translate:
#     del corpused[key]

# slang_2 = []

# for i,j in zip(slang, stemmed):
#     if i == j:
#         slang_2.append(i)

In [90]:
# Spot-check the stemmer on a single prefixed word.
stemmer.stem('melarang')

'larang'

In [103]:
# NOTE(review): `root_words` is not assigned in any visible cell — the list
# loaded earlier is named `rootwords`. Unless the star import provides
# `root_words`, this raises NameError on a fresh kernel; confirm which name is meant.
'abad' in root_words

False

In [62]:
# Show every job description that contains "pd" surrounded by spaces.
has_pd_token = df_job.Description.str.contains(' pd ')
df_job.Description[has_pd_token].values

array(['menerima pd dari pajak ijin mengecek lampiran pd',
       'menerima pd dari pajak ijin mengecek lampiran pd',
       'menerima pd dari pajak ijin mengecek lampiran pd',
       'menerima pd dari pajak ijin mengecek lampiran pd',
       'membuat dan collect invoice merapikan atau mengarsip kontrak invoice penawaran dan po menyimpan dokumen menjalankan alur pembelian mii produk it membuat pd permintaan dana menjadi pemegang nomor wa customer service eats aplikasi'],
      dtype=object)