In [1]:
import sys
import shelve
import pickle
import os
import math
from utils import helper, textprocessing
from collections import Counter
import json


# Load data from files
db_file = os.path.join(os.getcwd(), 'db', 'index.json')
urls_file = os.path.join(os.getcwd(), 'db', 'urls.json')
lengths_file = os.path.join(os.getcwd(), 'db', 'lengths.json')
stopwords_file = os.path.join(os.getcwd(), 'vietnamese-stopwords-dash.txt')



In [2]:
with open(urls_file, mode='rb') as f:
    urls = json.load(f)
with open(lengths_file, mode='rb') as f:
    lengths = json.load(f)
with open(stopwords_file, mode='r', encoding='utf-8') as f:
    stopwords_set = set(f.read().split())
with open(db_file, mode='rb') as f:
    index_db = json.load(f)



In [3]:
urls

['https://www.careerlink.vn/tim-viec/lai-chau-chuyen-vien-kinh-doanh/2729747?source=site',
 'https://www.careerlink.vn/tim-viec-lam/lai-chau-chuyen-vien-kinh-doanh/2729747?source=site',
 'https://www.careerlink.vn/tim-viec-lam/quan-ly-cua-hang-nhan-hang-thoi-trang-calvin-klein-binh-tan/2729902?source=site',
 'https://www.careerlink.vn/tim-viec-lam/vietinbank-quan-thu-duc-chuyen-vien-tu-van-vien-bao-hiem-kenh-hop-tac-ngan-hang/2745151?source=site',
 'https://www.careerlink.vn/tim-viec-lam/outside-sales-staff-packing-material-ha%CC%80-nam/2725983?source=site',
 'https://www.careerlink.vn/tim-viec-lam/cn-ha-noi-chuyen-vien-kinh-doanh-thu-nhap-len-den-30tr-thang/2729249?source=site',
 'https://www.careerlink.vn/tim-viec-lam/h.me-linh-chuyen-vien-tu-van-bao-hiem-metlife-kenh-lien-doanh-ngan-hang-bidv/2745005?source=site',
 'https://www.careerlink.vn/tim-viec-lam/quan-hoan-kiem-ha-noi-chuyen-vien-tu-van-bao-hiem-du-an-hop-tac-ngan-hang-acb/2740731?source=site',
 'https://www.careerlink.vn/ti

In [4]:
len(lengths)

6881

In [5]:
lengths

[7.227081356768379,
 7.644693986739235,
 15.401440338885275,
 9.6409072370293,
 16.236482070387698,
 10.801340377722688,
 13.050459736183974,
 11.120357304668309,
 13.60192001588454,
 6.073159655954687,
 12.299249236679078,
 11.693148784528708,
 8.023881390076584,
 15.86902325710593,
 10.001773177119441,
 9.889813619082327,
 10.595921698697897,
 16.068272385701455,
 16.58347056849082,
 22.36051171124752,
 16.78096992459776,
 18.015680284745304,
 6.931618822256752,
 22.86900007745477,
 14.373143315838954,
 10.798300795966686,
 11.017703076962073,
 7.7217234361441225,
 6.911044611083478,
 12.840951966902244,
 12.805458719461312,
 6.739205968134995,
 8.541095060043057,
 11.092317498214957,
 5.130257994796684,
 9.323220499080174,
 16.78840041165356,
 13.560490424024135,
 7.229668508677082,
 11.810800679979856,
 11.59699598981411,
 17.097958624301327,
 9.301662368496013,
 21.230736243538534,
 21.509712876414593,
 13.785658787003495,
 13.047200337742536,
 8.882538037332475,
 20.8523974216377

In [6]:
index_db

{'lai_châu': {'df': 5,
  'postings_list': {'0': 1, '1': 1, '1965': 1, '5850': 1, '5851': 1}},
 'chuyên_viên': {'df': 1613,
  'postings_list': {'1': 1,
   '3': 1,
   '5': 1,
   '6': 1,
   '7': 1,
   '8': 1,
   '11': 1,
   '15': 1,
   '16': 1,
   '28': 1,
   '31': 1,
   '64': 1,
   '67': 1,
   '69': 1,
   '70': 1,
   '71': 1,
   '72': 1,
   '73': 1,
   '83': 1,
   '85': 1,
   '88': 1,
   '90': 1,
   '92': 1,
   '93': 1,
   '94': 1,
   '96': 1,
   '98': 1,
   '106': 1,
   '109': 1,
   '110': 1,
   '112': 1,
   '114': 1,
   '128': 1,
   '149': 1,
   '151': 1,
   '153': 1,
   '155': 1,
   '157': 1,
   '169': 1,
   '170': 1,
   '173': 1,
   '180': 1,
   '182': 1,
   '190': 1,
   '193': 1,
   '194': 1,
   '196': 1,
   '198': 1,
   '200': 1,
   '202': 1,
   '208': 1,
   '211': 1,
   '212': 1,
   '215': 1,
   '216': 1,
   '217': 1,
   '218': 1,
   '219': 1,
   '220': 1,
   '221': 1,
   '223': 1,
   '224': 1,
   '225': 1,
   '226': 1,
   '227': 1,
   '228': 1,
   '230': 1,
   '231': 1,
   '232':

In [7]:
# Get query
query = "làm việc tại nhà từ xa"
# Load inverted index

# Construct vocabulary from inverted index
vocabulary = set(index_db.keys())
num_docs = len(urls)



In [8]:
# Preprocess query
tokens = textprocessing.preprocess_text(query, stopwords_set)
tokens = [token for token in tokens if token in vocabulary]



In [9]:
print(tokens)

['làm_việc', 'nhà']


In [10]:
vocabulary

{'fmcg',
 'tín_chấp',
 'giã_khánh',
 'coordidator',
 'learning',
 'tiên_du_lương',
 'giặt',
 'nguyên',
 'long',
 'itec',
 'cống',
 'thiết_kế',
 'administrative',
 'k',
 'cung_ứng',
 'quảng_ninhtư',
 'ria',
 'ngọc',
 'nhà_bè',
 'tuyển_kế_toán',
 'nhà_bếp',
 'bình_định',
 'mẹ',
 'imes',
 'buôn',
 'business',
 'lắp_đặt',
 'đội_trưởng',
 'agency',
 'đak',
 'atm',
 'hưng_hà',
 'vĩnh_tường',
 'nh',
 'ngãi',
 'căn_hộ',
 'development_staff',
 'nguyên_vật_liệu',
 'fpt_shop',
 'nhân_lực',
 'hub',
 'cá_nhân_lý',
 'admissions',
 'commercial',
 'node',
 'tech',
 'lai',
 'tài_liệu',
 'bo_mạch',
 'lãng_lạng',
 'vận_tải',
 'đào_tạo',
 'educational',
 'hữu',
 'chi_phí',
 'tài_xế',
 'electrical',
 'khám_sàng',
 'vietinbank_phú',
 'kfc_tập',
 'us',
 'vệ_sinh',
 'phi_nhân_thọ',
 'v',
 'hàng_dương',
 'unity',
 'onsite',
 'linh_đàm',
 'lạt',
 'villa',
 'nhập_khẩu',
 'văn_hà',
 'picity',
 'ứng_viên',
 'lập_trình_viên',
 'united',
 'gây_mê_khoa',
 'tphcm',
 'store_manager',
 'bcis',
 'tường',
 'french',
 'nhi

In [11]:
# Caculate weights for query
query_bow = Counter(tokens)



In [12]:
query_weights = {}

for term, freq in query_bow.items():
    df = index_db[term]['df']
    query_weights[term] = helper.idf(df, num_docs) * helper.tf(freq)
    


In [13]:
for term, freq in query_bow.items():
    print(helper.idf(df,num_docs),term)
    

5.370783366402753 làm_việc
5.370783366402753 nhà


In [14]:
query_weights.items()

dict_items([('làm_việc', 3.8877593788243114), ('nhà', 5.370783366402753)])

In [15]:
# Normalize query weights
query_length = math.sqrt(sum((e ** 2 for e in query_weights.values())))
for term, value in query_weights.items():
    query_weights[term] = value / query_length



In [16]:
query_weights.items()

dict_items([('làm_việc', 0.5863684575241636), ('nhà', 0.8100444629899849)])

In [17]:
# Caculate scores
scores = [[i, 0] for i in range(num_docs)]
for term, query_weight in query_weights.items():
    df = index_db[term]['df']
    postings_list = index_db[term]['postings_list']
    for docId, freq in postings_list.items():
        doc_weight = helper.idf(df, num_docs) * helper.tf(freq)
        docId_int = int(docId)
        scores[docId_int][1] += query_weight * doc_weight / lengths[docId_int]
        print(lengths[docId_int])

9.323220499080174
13.560490424024135
8.882538037332475
13.255652137390445
17.822716532587247
10.771515782573335
12.434042364896728
10.26400023873114
7.907394364827707
10.201909094023764
6.909915546831497
18.41251934257911
15.499549412151225
10.513559380254245
9.89595696763551
10.781977919976136
11.73316011436296
18.285884281766787
11.000103856017054
10.785542108905098
11.176836286609106
12.141789936645115
15.49136369171643
13.314581682569226
14.250728852849706
12.008504555747038
12.399422084533164
13.283964204875394
13.829225592129296
15.887433965795749
13.086430019922332
18.469196944048317
11.081748108313903
10.326772015528105
16.148500108364914
15.827428518882883
17.124714761711274
13.316831053440453
8.249225898960551
9.125162868414773
11.121578970622988
9.128846129120976
10.068699110521742
12.654381855920029
9.892526474291223
13.131512164960785
11.78385210729599
12.519778798215608
12.054694940914779
14.041559935870923
13.942127690118784
9.219637306972214
12.83995077648288
13.5217962

In [18]:

# Sort scores and display results
scores.sort(key=lambda e: e[1], reverse=True)
for index, score in scores[:20]:
    if score == 0:
        break
    print('{} - {}'.format(urls[index], score))

https://www.careerlink.vn/tim-viec-lam/nhan-vien-cham-soc-khach-hang-tai-nha-fpt-telecom-lam-viec-tai-can-tho/2742172?source=site - 0.5295806663137334
https://www.careerlink.vn/tim-viec-lam/nhan-vien-ky-thuat-toa-nha/2736238?source=site - 0.466546794017413
https://careerbuilder.vn/vi/tim-viec-lam/nhan-vien-ky-thuat-toa-nha.35BF3158.html - 0.466546794017413
https://careerbuilder.vn/vi/tim-viec-lam/nhan-vien-ky-thuat-toa-nha.35BF207A.html - 0.466546794017413
https://careerbuilder.vn/vi/tim-viec-lam/nhan-vien-kinh-doanh-du-an-nha-go.35BF3BD0.html - 0.4564629769304444
https://careerbuilder.vn/vi/tim-viec-lam/nhan-vien-kinh-doanh-tai-nha-phan-phoi-hai-phong.35BF2045.html - 0.4467827580010514
https://careerbuilder.vn/vi/tim-viec-lam/chuyen-vien-kinh-doanh-telesales-nha-tot.35BF21BC.html - 0.4176619822397761
https://careerbuilder.vn/vi/tim-viec-lam/nhan-vien-ky-thuat-toa-nha-quan-7-quan-4.35BF1CEA.html - 0.4126834263542422
https://www.careerlink.vn/tim-viec-lam/truong-bo-phan-quan-ly-toa-nha/