In [170]:
import numpy as np
import pandas as pd
import string
import pickle
import spacy
import time
from tabulate import tabulate
from tqdm.notebook import tqdm
from tqdm import tqdm
import re
nlp_id = spacy.blank('id')
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import RegexpTokenizer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [171]:
df = pd.read_csv("dataset/News.csv")
df.isnull().sum()
# Dropping the rows having None values
df = df.dropna(subset=['content'])
# Resetting the indices
df = df.reset_index(drop=True)

In [172]:
def save_py_obj(filename, fileobj):
    with open(f"dataset/{filename}", 'wb') as pkl_file:
        pickle.dump(fileobj, pkl_file)

def load_py_obj(filename):
    with open(f"dataset/{filename}", 'rb') as pkl_file:
        return pickle.load(pkl_file)

## Preprocessing

In [5]:
def nlp_tokenization(contents):
    list_tokens_from_docs = []
    # for content in contents:
    for content in tqdm(contents):
        # sentence stemming
        content = stemmer.stem(content)
        nlp_contents = nlp_id(content)
        clean_token = []
        for token_of_nlp_contents in nlp_contents:
            # remove punctuation & remove stopword
            if not token_of_nlp_contents.is_digit and not token_of_nlp_contents.is_punct \
                and not token_of_nlp_contents.is_stop:
                clean_token.append(token_of_nlp_contents.text)
        list_tokens_from_docs.append(clean_token)
    return list_tokens_from_docs

In [28]:
st = time.time()
# Use only samples from the dataset to test the functionality of the code.
# nlp_tokens = nlp_tokenization(df['content'][0:30])
nlp_tokens = nlp_tokenization(df['content'])
et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

100%|███████████████████████████████████████████████████████████████| 14334/14334 [1:51:39<00:00,  2.14it/s]

Execution time: 6699.369220972061 seconds





In [None]:
save_py_obj("nlp_tokens.pkl", nlp_tokens)

In [154]:
nlp_tokens = load_py_obj("nlp_tokens.pkl")

In [106]:
nlp_tokens

[['perintah',
  'menteri',
  'hukum',
  'hak',
  'asasi',
  'manusia',
  'menkumham',
  'yasonna',
  'h',
  'laoly',
  'luas',
  'batas',
  'hadap',
  'warga',
  'negara',
  'asing',
  'alias',
  'wna',
  'masuk',
  'wilayah',
  'indonesia',
  'utama',
  'larang',
  'kerja',
  'asing',
  'bijak',
  'maktub',
  'atur',
  'menteri',
  'hukum',
  'ham',
  'permenkumham',
  'nomor',
  'rincinya',
  'atur',
  'batas',
  'orang',
  'asing',
  'masuk',
  'wilayah',
  'indonesia',
  'laku',
  'batas',
  'giat',
  'masyarakat',
  'ppkm',
  'darurat',
  'permenkumham',
  'laku',
  'rabu',
  'juli',
  'tenggang',
  'sosialisasi',
  'koordinasi',
  'kategori',
  'wna',
  'masuk',
  'daftar',
  'kecuali',
  'kategori',
  'wna',
  'masuk',
  'wilayah',
  'indonesia',
  'simak',
  'infografis'],
 ['bulu',
  'tangkis',
  'andal',
  'indonesia',
  'buru',
  'medali',
  'olimpiade',
  'tokyo',
  'cabang',
  'olahraga',
  'cabor',
  'gelar',
  'juli',
  'agustus',
  'tanggung',
  'atlet',
  'indonesia',


In [184]:
def remove_alphanumeric_char(nlp_tokens):
    # example : --sebagai, -sebagai
    hyphen_separator = RegexpTokenizer(r'\w+')
    # example : 000anak, 000golongan
    num_letter_separator = RegexpTokenizer(r'\d+|\D+')
    clean_tokens = []
    for tokens in tqdm(nlp_tokens):
        clean_token_ofnum = []
        for token in tokens:
            pre_tokens = num_letter_separator.tokenize(token)
            for pre_token in pre_tokens:
                if len(pre_token) > 1:
                    pre_token = nlp_id(pre_token)
                    if not pre_token[0].is_digit and not pre_token[0].is_punct:
                        clean_token_ofnum.append(pre_token[0].text)
        clean_token_ofhyp = []
        for token in clean_token_ofnum:
            pre_tokens = hyphen_separator.tokenize(token)
            clean_token_ofhyp.append(pre_tokens[0])
        clean_tokens.append(clean_token_ofhyp)
    return clean_tokens

In [156]:
nlp_tokens = remove_alphanumeric_char(nlp_tokens)
# nlp_tokens = remove_alphanumeric_char(nlp_tokens[0:10])

100%|████████████████████████████████████| 14334/14334 [00:58<00:00, 243.30it/s]


## Pairing Term and Document ID

In [123]:
def pair_token_docID(nlp_tokens):
    doc_list_ofToken_DocID = []
    # index_docID
    for docID, doc_list_T in tqdm(enumerate(nlp_tokens)):
        for doc_list in doc_list_T:
            # doc_list_ofToken_DocID.append((doc_list.text, docID+1))
            doc_list_ofToken_DocID.append((doc_list, docID+1))
    return doc_list_ofToken_DocID

def remove_space(nlp_tokens_tup):
    new_tokens = []
    for i in nlp_tokens_tup:
        if not i[0].isspace():
            new_tokens.append(i)
    return new_tokens

def sort_token_docID(tup):
    return(sorted(tup, key = lambda x: x[0]))

In [158]:
st = time.time()
nlp_tokens_docId = pair_token_docID(nlp_tokens)
nlp_tokens_docId_clean_space = remove_space(nlp_tokens_docId)
nlp_tokens_docId_sorted = sort_token_docID(nlp_tokens_docId_clean_space)
# nlp_tokens_docId_sorted = sort_token_docID(nlp_tokens_docId)
et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

14334it [00:00, 17103.54it/s]


Execution time: 4.819438457489014 seconds


[('a', 1076),
 ('aa', 1700),
 ('aa', 1700),
 ('aa', 1700),
 ('aa', 1730),
 ('aa', 1730),
 ('aa', 1730),
 ('aa', 1790),
 ('aa', 1790),
 ('aa', 1790),
 ('aa', 1790),
 ('aa', 1795),
 ('aa', 1795),
 ('aa', 1795),
 ('aa', 1795),
 ('aa', 1796),
 ('aa', 1796),
 ('aa', 1796),
 ('aa', 1796),
 ('aa', 1796)]

In [174]:
nlp_tokens_docId_sorted[200:210]

[('aai', 11536),
 ('aakhirahu', 6805),
 ('aakhirahu', 6805),
 ('aal', 2426),
 ('aal', 2428),
 ('aal', 2429),
 ('aal', 2577),
 ('aal', 7915),
 ('aal', 8406),
 ('aal', 10068)]

## Inverted Index

In [175]:
from collections import deque

In [178]:
def inverted_index(tokens_docID):
    vocab = {}
    for token, docId in tqdm(tokens_docID):
        if not token in vocab:
            vocab[token] = deque()
            vocab[token].append(docId)
        else:
            temp_post_list = vocab[token]
            temp_post_list.append(docId)
            # prevent duplication
            temp_post_list = sorted(set(temp_post_list))
            # back to linkedlist
            temp_post_list = deque(temp_post_list)
            vocab[token] = temp_post_list
    vocab_freq = {}
    # pairing term and doc frequency
    for key, val in tqdm(vocab.items()):
        vocab_freq[(key, len(val))] = val
    return vocab_freq

In [179]:
st = time.time()
dict_inverted_index = inverted_index(nlp_tokens_docId_sorted)
et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

100%|██████████████████████████████| 3113513/3113513 [05:10<00:00, 10033.03it/s]
100%|█████████████████████████████████| 66636/66636 [00:00<00:00, 847419.44it/s]

Execution time: 310.4224021434784 seconds





In [180]:
dict_inverted_index

{('a', 1): deque([1076]),
 ('aa',
  37): deque([1700,
        1730,
        1790,
        1795,
        1796,
        2909,
        2998,
        2999,
        3555,
        3605,
        3636,
        5065,
        5231,
        5597,
        5693,
        6451,
        7380,
        7381,
        7387,
        7389,
        7402,
        7414,
        7455,
        7492,
        8659,
        8849,
        10239,
        10606,
        10621,
        10988,
        11087,
        11178,
        11994,
        12033,
        12587,
        12776,
        13328]),
 ('aaa', 3): deque([3605, 8951, 10739]),
 ('aaaa', 1): deque([6730]),
 ('aaaaa', 1): deque([6730]),
 ('aaahhhh', 1): deque([7348]),
 ('aaamiin', 1): deque([7478]),
 ('aaaron', 1): deque([1977]),
 ('aab',
  12): deque([319,
        769,
        1428,
        3372,
        7510,
        7513,
        12160,
        12167,
        12174,
        12187,
        12189,
        12191]),
 ('aacademy', 1): deque([2504]),
 ('aad', 7):

## Processing Boolean queries

In [41]:
class BooleanQueries:
    
    def __init__(self, inverted_index, debug=False):
        self.inverted_index = inverted_index
        self.debug = debug
        keys_of_inverted_index = self.inverted_index.keys()
        self.keys_of_inverted_index = dict(keys_of_inverted_index)

    def intersection_query(self, query1, query2):
        # get doc frequency from query
        doc_freq1 = self.keys_of_inverted_index[query1]
        doc_freq2 = self.keys_of_inverted_index[query2]
        # get posting list
        pl_q1 = self.inverted_index[(query1, doc_freq1)]
        pl_q2 = self.inverted_index[(query2, doc_freq2)]
        answer = set(pl_q1) & set(pl_q2)
        if self.debug:
            data = [[pl_q1, pl_q2, answer]]
            headers = [f"posting list of {query1}", f"posting list of {query2}", "intersection result"]
            print(tabulate(data, headers=headers))
        return answer
    
    def union_query(self, query1, query2):
        # get doc frequency from query
        doc_freq1 = self.keys_of_inverted_index[query1]
        doc_freq2 = self.keys_of_inverted_index[query2]
        # get posting list
        pl_q1 = self.inverted_index[(query1, doc_freq1)]
        pl_q2 = self.inverted_index[(query2, doc_freq2)]
        answer = set(pl_q1) | set(pl_q2)
        if self.debug:
            data = [[pl_q1, pl_q2, answer]]
            headers = [f"posting list of {query1}", f"posting list of {query2}", "union result"]
            print(tabulate(data, headers=headers))
        return answer
    
    def negation_query(self, query):
        # get doc frequency from query
        doc_freq = self.keys_of_inverted_index[query]
        # get posting list
        pl_q = self.inverted_index[(query, doc_freq)]
        # get docID membership
        self.docID_membership()
        answer = set(self.docID_space) - set(pl_q)
        if self.debug:
            data = [[pl_q, self.docID_space , answer]]
            headers = [f"posting list of {query}", "docID membership", "negation result"]
            print(tabulate(data, headers=headers))
        return answer
    
    def docID_membership(self):
        docID_space = set()
        for posting_lists in self.inverted_index.values():
            for pl in posting_lists:
                docID_space.add(pl)
        self.docID_space = sorted(docID_space)

In [16]:
# create sample data from inverted index to test the query
mock_inverted_index = {}
num_mock = 20
track_mock = 0
for key, values in dict_inverted_index.items():
    mock_inverted_index[key] = values
    track_mock += 1
    if track_mock == num_mock:
        break
mock_inverted_index

{('a', 5): deque([13, 17, 18, 27, 28]),
 ('abai', 1): deque([16]),
 ('abdullah', 1): deque([8]),
 ('acara', 1): deque([23]),
 ('aces', 1): deque([23]),
 ('adaptasi', 2): deque([21, 25]),
 ('adil', 1): deque([14]),
 ('aflah', 1): deque([17]),
 ('afrika', 1): deque([23]),
 ('agam', 1): deque([15]),
 ('aganda', 1): deque([18]),
 ('agen', 2): deque([25, 28]),
 ('aguero', 1): deque([25]),
 ('agustus', 6): deque([2, 15, 17, 18, 19, 26]),
 ('ahli', 1): deque([23]),
 ('ahsan', 2): deque([17, 18]),
 ('air', 3): deque([13, 17, 20]),
 ('aisah', 1): deque([17]),
 ('akademi', 1): deque([26]),
 ('akibat', 2): deque([14, 23])}

In [43]:
# booleanQuery = BooleanQueries(mock_inverted_index, debug=True)
booleanQuery = BooleanQueries(dict_inverted_index, debug=True)

In [44]:
booleanQuery.intersection_query('agustus', 'air')
print("\n")
booleanQuery.union_query('agustus', 'air')
print("\n")
booleanQuery.negation_query('agustus')

posting list of agustus                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

{1,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 16,
 20,
 21,
 22,
 23,
 24,
 25,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 35,
 36,
 38,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195

In [183]:
# type(stemmer.stem("hello saya 'hello"))
stemmer.stem("hello saya 'hello")

'hello saya hello'