In [24]:
import pyodbc
import re
import json

import numpy as np
import pandas as pd
import tensorflow as tf

from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity

from collections import defaultdict
from gensim import corpora, models, similarities
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from gensim.models.word2vec import Word2Vec

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from googletrans import Translator
from deep_translator import GoogleTranslator

from FlaskApp.transform import *

from job_model import JobModel
from app_model import AppModel

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [25]:
translator = Translator(service_urls=['translate.googleapis.com'])
factory = StemmerFactory()
stemmer = factory.create_stemmer()

with open('data/slangwords.json', 'r') as file:
    slangwords = json.load(file)

with open('data/englishwords.json', 'r') as file:
    englishwords = json.load(file)

In [26]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

In [27]:
user = 'huda'
password = 'Vancha12'
host = '127.0.0.1'
port = 1433
database = 'HRSystemDB'


def get_connection():         
    return create_engine(
        url=f"mssql+pyodbc://{user}:{password}@{host}:{port}/{database}?driver=SQL Server",
    )

engine = get_connection()
conn = engine.connect()

In [112]:
applicant_id = 31790

df_job = pd.DataFrame(engine.execute(
    """
    SELECT Job.JobID, Job.UsiaMax, Job.SalaryMin, Job.SalaryMax, City.Name AS CityName, Province.Name AS ProvinceName, EducationLevel.EducationLevelName, Major.MajorName, Job.DriverLicenseType, Job.UsingGlasses, Job.Gender, Job.MaritalStatus, Job.JobTitle, FunctionPosition.FunctionPositionName, Job.Description, Job.Requirement
    FROM (((((Job
    RIGHT JOIN FunctionPosition ON Job.FunctionPositionID = FunctionPosition.FunctionPositionID)
    RIGHT JOIN EducationLevel ON Job.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN City ON Job.CityID = City.CityID)
    RIGHT JOIN Province ON Job.ProvinceID = Province.ProvinceID)
    RIGHT JOIN Major ON Job.MajorID = Major.MajorID)
    """
))

df_function = pd.DataFrame(engine.execute(
    """
    SELECT FunctionPositionID, FunctionPositionName
    FROM FunctionPosition
    """
))

df_education = pd.DataFrame(engine.execute(
    """
    SELECT EducationLevelID, EducationLevelName
    FROM EducationLevel
    """
))

df_city = pd.DataFrame(engine.execute(
    """
    SELECT CityID, Name AS CityName
    FROM City
    """
))

df_province = pd.DataFrame(engine.execute(
    """
    SELECT ProvinceID, Name AS ProvinceName
    FROM Province
    """
))

df_major = pd.DataFrame(engine.execute(
    """
    SELECT MajorID, MajorName
    FROM Major
    """
))

df_applicant = pd.DataFrame(engine.execute(
    f"""
    SELECT Applicant.ApplicantID, Applicant.Dob, Applicant.ExpectedSalary, City.Name AS CityName, Province.Name AS ProvinceName, Applicant.DriverLicenseType, Applicant.IsUsingGlasses, Applicant.Gender, Applicant.MaritalStatus, Applicant.Strengthness
    FROM (((Applicant
    RIGHT JOIN City ON Applicant.CurrentAddressCityID = City.CityID)
    RIGHT JOIN Province ON Applicant.CurrentAddressProvinceID = Province.ProvinceID)
    LEFT JOIN Pipeline ON Applicant.ApplicantID = Pipeline.ApplicantID)
    """
))

df_applicant_education = pd.DataFrame(engine.execute(
    f"""
    SELECT ApplicantEducation.ApplicantID, ApplicantEducation.DateStart, ApplicantEducation.DateEnd, EducationLevel.EducationLevelName, Major.MajorName
    FROM (((ApplicantEducation
    RIGHT JOIN EducationLevel ON ApplicantEducation.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN Major ON ApplicantEducation.MajorID = Major.MajorID)
    LEFT JOIN Pipeline ON ApplicantEducation.ApplicantID = Pipeline.ApplicantID)
    """
))

df_applicant_experience = pd.DataFrame(engine.execute(
    f"""
    SELECT ApplicantExperience.ApplicantID, ApplicantExperience.DateFrom, ApplicantExperience.DateTo, ApplicantExperience.Position, ApplicantExperience.JobDescription
    FROM (ApplicantExperience
    LEFT JOIN Pipeline ON ApplicantExperience.ApplicantID = Pipeline.ApplicantID)
    """
))

In [113]:
job_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'JobTitle', 'FunctionPositionName', 'Description', 'Requirement']
job_num = ['UsiaMax', 'SalaryMin', 'SalaryMax']
job_bol = ['UsingGlasses']

'''general'''
df_job.set_index(['JobID'], inplace=True)
df_job.fillna('', inplace=True)

'''str'''
df_job[job_str] = df_job[job_str].applymap(str.lower)
df_job.replace('none', '', inplace=True)

In [114]:
df_job.Description = df_job.Description.map(clean_text).map(maintain_alphabet)

In [32]:
df_job.Description

JobID
1       di bawah asisten depan kantor pengelola pada g...
2       menjadi buku penjaga untuk gunawangsa hotel se...
3       mengembangkan dari android aplikasi dan milik ...
4       melakukan tugas kesekretariatan menggunakan te...
5       bantu menangani pekerjaan atasan membuat surat...
                              ...                        
3090    memimpin perusahaan dan menjadi motivator bagi...
3091                                                     
3092    menguasai brevet dan pelaporan pajak dapat mem...
3093    mengerti dan memahami peraturan peraturan yang...
3099    berpikir kreatif untuk menghasilkan ide ide da...
Name: Description, Length: 1008, dtype: object

In [42]:
corpus = list(set(' '.join([i for i in df_job.Description.values]).split()))

In [18]:
translator = GoogleTranslator(source='en', target='id')

corpus.sort()

corpused = {}
for original in corpus:
    translated = translator.translate(original)
    corpused[original] = str.lower(translated)

no_translate = []
for key in corpused:
    if key == corpused[key]:
        no_translate.append(key)

for key in no_translate:
    del corpused[key]

In [22]:
root_words = pd.read_csv('data/combined_root_words.txt')
root_words = [i[0] for i in root_words.values]

In [43]:
raw = [i for i in corpus if i in root_words]

In [44]:
slang = [i for i in corpus if i not in root_words]

In [51]:
stemmed = [stemmer.stem(i) for i in slang]

In [65]:
slang_2 = []

for i,j in zip(slang, stemmed):
    if i == j:
        slang_2.append(i)

In [202]:
stemmer.stem('diunggah')

'unggah'

In [217]:
'antar' in root_words

False

In [220]:
df_job.Description[df_job.Description.str.contains(' ppc ')].values

array(['job descriptions creating content including text posts video and images for use on social media promoting products services and content over social media in way that is consistent with an organisation brand and social media strategy scheduling social media posts using applications such as hootsuite and tweetdeck interacting with customers and dealing with customers enquiries developing new social media strategies and campaigns managing budget to be spent on promoting social media posts and pay per click ppc advertising keeping track of data and analysing the performance of social media campaigns collaborating with colleagues from across marketing departments to ensure branding is consistent meeting with clients and other stakeholders such as social media influencers these meetings may involve pitching potential marketing campaigns'],
      dtype=object)

In [72]:
for i in slang_2:
    print('"' + i + '": ')

"postingan": 
"enerjik": 
"ssp": 
"cm": 
"listiani": 
"operasionalisasi": 
"produktifitas": 
"intermediat": 
"pengecheckan": 
"indosat": 
"dengn": 
"ria": 
"materai": 
"-nya": 
"merk": 
"digital": 
"khusunya": 
"pln": 
"firewall": 
"kelengkapanya": 
"mengantur": 
"selalu": 
"saldo": 
"cisco": 
"mecakup": 
"mereport": 
"bank": 
"akumatika": 
"ios": 
"auditor": 
"pemda": 
"kolaboratif": 
"gym": 
"corel": 
"spreadsheet": 
"potensial": 
"ob": 
"copywriting": 
"mengautomasi": 
"mnrt": 
"termasukaplikasi": 
"assistensi": 
"posting": 
"maomm": 
"rapih": 
"spektrum": 
"untensil": 
"bauj": 
"fda": 
"token": 
"mejadi": 
"hootsuite": 
"memoliki": 
"osd": 
"erp": 
"twitter": 
"ap": 
"qc": 
"penembuatan": 
"anline": 
"managar": 
"memfollow": 
"permanen": 
"orginiza": 
"sby": 
"latte": 
"ekspetasi": 
"prabayar": 
"antar": 
"surveyor": 
"elakukan": 
"voucher": 
"foodcourt": 
"nunggak": 
"ensure": 
"youtube": 
"sosmed": 
"jabodetabek": 
"maintan": 
"saat": 
"jakarta": 
"saja": 
"warni": 
"ketidakseimb