In [None]:
# Standard library
import pickle
import string

# Third-party
import numpy as np
import pandas as pd
import spacy
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tabulate import tabulate

# Blank spaCy pipeline for language code 'id' (Indonesian); used for tokenization.
nlp_id = spacy.blank('id')

# Sastrawi stemmer instance shared by the preprocessing helpers.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# Load the news corpus; the path is relative to the notebook's working directory.
df = pd.read_csv("dataset/News.csv")
# Report missing values per column. Printed explicitly because only the last
# expression of a cell is displayed automatically.
print(df.isnull().sum())
# Drop the rows whose 'content' is missing, then renumber the rows from 0.
df = df.dropna(subset=['content']).reset_index(drop=True)

## Preprocessing

In [None]:
def sentence_stem(token_DocID):
    """Stem every string in ``token_DocID`` with the module-level ``stemmer``.

    Args:
        token_DocID: iterable of strings to stem (the notebook passes a slice
            of the article texts).

    Returns:
        list: the stemmed strings, in the same order as the input.
    """
    return [stemmer.stem(text) for text in token_DocID]

In [None]:
def nlp_tokenization(contents):
    """Tokenize each document with ``nlp_id`` and filter out noise tokens.

    Every text is lower-cased before it is tokenized. Tokens whose
    ``is_digit``, ``is_punct`` or ``is_stop`` flag is set are discarded.

    Args:
        contents: iterable of document strings.

    Returns:
        list[list]: one list per document holding the surviving token objects
        (not plain strings), in their original order.
    """
    def is_kept(token):
        # Equivalent to "not digit and not punct and not stop".
        return not (token.is_digit or token.is_punct or token.is_stop)

    return [
        [token for token in nlp_id(document.lower()) if is_kept(token)]
        for document in contents
    ]

In [None]:
# Only the first 30 articles are processed here, to test the functionality of
# the code; pass the whole df['content'] column to process the entire dataset.
sample_contents = df['content'].iloc[:30]
nlp_stem = sentence_stem(sample_contents)
nlp_tokens = nlp_tokenization(nlp_stem)

## Pairing Term and Document ID

In [None]:
def pair_token_docID(nlp_tokens):
    """Flatten per-document token lists into ``(token_text, docID)`` pairs.

    DocIDs are 1-based and follow each document's position in ``nlp_tokens``.

    Args:
        nlp_tokens: iterable of documents, each an iterable of objects that
            expose a ``.text`` attribute.

    Returns:
        list[tuple]: ``(text, docID)`` pairs in document order, then token order.
    """
    return [
        (token.text, doc_number)
        for doc_number, doc_tokens in enumerate(nlp_tokens, start=1)
        for token in doc_tokens
    ]

def remove_space(nlp_tokens_tup):
    """Drop the pairs whose token text consists solely of whitespace.

    Args:
        nlp_tokens_tup: iterable of pairs whose first element is a string.

    Returns:
        list: the remaining pairs in their original order. An empty-string
        token is kept, because ``"".isspace()`` is False.
    """
    return [pair for pair in nlp_tokens_tup if not pair[0].isspace()]

def sort_token_docID(tup):
    """Return a new list of the pairs ordered by their first element (the token).

    The sort is stable, so pairs that share a token keep their incoming
    relative order. The input is not modified.
    """
    ordered = list(tup)
    ordered.sort(key=lambda pair: pair[0])
    return ordered

In [None]:
# Build the (token, docID) pairs, drop whitespace-only tokens, then order by token.
nlp_tokens_docId = pair_token_docID(nlp_tokens)
nlp_tokens_docId_clean_space = remove_space(nlp_tokens_docId)
nlp_tokens_docId_sorted = sort_token_docID(nlp_tokens_docId_clean_space)
# Preview only the first pairs rather than echoing the whole list as cell output.
nlp_tokens_docId_sorted[:20]

## Inverted Index

In [None]:
from collections import deque

In [None]:
def inverted_index(tokens_docID):
    """Build an inverted index from ``(token, docID)`` pairs.

    Args:
        tokens_docID: iterable of ``(token, docID)`` pairs. Tokens must be
            hashable; docIDs must be hashable and mutually orderable.

    Returns:
        dict: maps ``(token, document_frequency)`` to a ``deque`` of the
        token's unique docIDs in ascending order. Entries appear in the order
        in which each token is first seen in the input.
    """
    postings = {}
    for token, doc_id in tokens_docID:
        # A set de-duplicates docIDs as they arrive, so the posting list does
        # not have to be re-sorted and rebuilt on every repeated occurrence.
        postings.setdefault(token, set()).add(doc_id)
    # Sort each posting list exactly once; the key pairs the term with its
    # document frequency (the number of distinct docIDs).
    return {
        (token, len(doc_ids)): deque(sorted(doc_ids))
        for token, doc_ids in postings.items()
    }

In [None]:
# Build the inverted index, keyed by (term, document frequency), from the sorted pairs.
dict_inverted_index = inverted_index(nlp_tokens_docId_sorted)

In [None]:
# Preview the first entries only, rather than echoing the entire index as cell output.
dict(list(dict_inverted_index.items())[:20])

## Processing Boolean queries

In [None]:
class BooleanQueries:
    """Answer two-term AND / OR and single-term NOT queries over an inverted index.

    The index must map ``(term, doc_frequency)`` tuples to an iterable of
    docIDs. A term that is not present in the index is treated as having an
    empty posting list rather than raising ``KeyError``.

    Attributes:
        inverted_index: the index passed to the constructor (not copied).
        debug: when true, every query prints a table of its inputs and result.
        keys_of_inverted_index: ``term -> doc_frequency`` lookup built from
            the index keys at construction time.
        docID_space: sorted list of every docID found in the index; set by
            ``docID_membership`` (and therefore by ``negation_query``).
    """

    def __init__(self, inverted_index, debug=False):
        """Store the index and build the ``term -> doc_frequency`` lookup.

        Args:
            inverted_index: dict keyed by ``(term, doc_frequency)`` tuples
                whose values are iterables of docIDs.
            debug: print a table of operands and result for every query.
        """
        self.inverted_index = inverted_index
        self.debug = debug
        # Each key is a (term, doc_frequency) pair, so dict() over the keys
        # yields the term -> doc_frequency mapping needed to rebuild a full key.
        self.keys_of_inverted_index = dict(self.inverted_index.keys())

    def _posting_list(self, query):
        """Return the posting list stored for ``query``; empty if the term is unknown."""
        doc_freq = self.keys_of_inverted_index.get(query)
        if doc_freq is None:
            # Unknown term: it occurs in no document.
            return ()
        return self.inverted_index[(query, doc_freq)]

    def _print_debug(self, headers, row):
        """Print a one-row table of a query's operands and result when debugging."""
        if self.debug:
            print(tabulate([row], headers=headers))

    def intersection_query(self, query1, query2):
        """Return the set of docIDs that contain both ``query1`` AND ``query2``."""
        pl_q1 = self._posting_list(query1)
        pl_q2 = self._posting_list(query2)
        answer = set(pl_q1) & set(pl_q2)
        self._print_debug(
            [f"posting list of {query1}", f"posting list of {query2}", "intersection result"],
            [pl_q1, pl_q2, answer],
        )
        return answer

    def union_query(self, query1, query2):
        """Return the set of docIDs that contain ``query1`` OR ``query2``."""
        pl_q1 = self._posting_list(query1)
        pl_q2 = self._posting_list(query2)
        answer = set(pl_q1) | set(pl_q2)
        self._print_debug(
            [f"posting list of {query1}", f"posting list of {query2}", "union result"],
            [pl_q1, pl_q2, answer],
        )
        return answer

    def negation_query(self, query):
        """Return the set of indexed docIDs that do NOT contain ``query``.

        The universe is every docID that occurs somewhere in the index, so a
        document with no indexed term can never appear in the result.
        """
        pl_q = self._posting_list(query)
        # Refresh the docID universe from the current index contents.
        self.docID_membership()
        answer = set(self.docID_space) - set(pl_q)
        self._print_debug(
            [f"posting list of {query}", "docID membership", "negation result"],
            [pl_q, self.docID_space, answer],
        )
        return answer

    def docID_membership(self):
        """Recompute ``self.docID_space``: the sorted list of all docIDs in the index."""
        self.docID_space = sorted({
            doc_id
            for posting_list in self.inverted_index.values()
            for doc_id in posting_list
        })

In [None]:
# create sample data from inverted index to test the query
mock_inverted_index = {}
num_mock = 20
track_mock = 0
for key, values in dict_inverted_index.items():
    mock_inverted_index[key] = values
    track_mock += 1
    if track_mock == num_mock:
        break
mock_inverted_index

In [None]:
# debug=True makes every query print a table of its posting lists and result.
booleanQuery = BooleanQueries(mock_inverted_index, debug=True)

In [None]:
# Run AND then OR on the same pair of terms, each followed by blank lines,
# and finish with NOT so that its result is the cell's displayed value.
for run_query in (booleanQuery.intersection_query, booleanQuery.union_query):
    run_query('agustus', 'air')
    print("\n")
booleanQuery.negation_query('agustus')