## Query Processing

We first load the index from the stored pickle file. Then, using this positional index, we keep one pointer per query word and look for documents that contain all of the query's terms in the correct order.

**Input Format**    

A query sentence of length <= 5

**Output Format**  

Number of documents retrieved: Returned value \
List of Documents retrieved


In [2]:
# Importing necessary modules

import nltk
import re
import pickle as pkl
import nltk
from nltk.tokenize import word_tokenize
from ipynb.fs.defs.data_preprocessing import remove_punc
from nltk.corpus import stopwords
nltk.download('stopwords')
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/frostrot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading Data


In [3]:
# Load the positional index from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load index files you trust.

with open("./pickle_files/index.pkl", "rb") as index_file:
    index = pkl.load(index_file)

### Filtering

In [4]:
# Remove stopwords and convert shortened words (contractions) into their extended forms

# Contraction -> expanded form. Built once at import time instead of on every call.
_EXTENDED_FORMS = {
    "aren't": 'are not', "can't": 'cannot', "couldn't": 'could not',
    "didn't": 'did not', "doesn't": 'does not', "don't": 'do not',
    "hadn't": 'had not', "hasn't": 'has not', "haven't": 'have not',
    "he'd": 'he would', "he'll": 'he will', "he's": 'he is',
    "i'd": 'i would', "i'll": 'i will', "i'm": 'i am',
    "isn't": 'is not', "it's": 'it is', "it'll": 'it will',
    "i've": 'i have', "let's": 'let us', "mightn't": 'might not',
    "mustn't": 'must not', "n't": 'not', "shan't": 'shall not',
    "she'd": 'she would', "she'll": 'she will', "she's": 'she is',
    "shouldn't": 'should not', "that's": 'that is', "there's": 'there is',
    "they'd": 'they would', "they'll": 'they will', "they're": 'they are',
    "they've": 'they have', "we'd": 'we would', "we're": 'we are',
    "weren't": 'were not', "we've": 'we have', "what'll": 'what will',
    "what're": 'what are', "what's": 'what is', "what've": 'what have',
    "where's": 'where is', "who'd": 'who would', "who'll": 'who will',
    "who're": 'who are', "who's": 'who is', "who've": 'who have',
    "won't": 'will not', "wouldn't": 'would not', "you'd": 'you would',
    "you'll": 'you will', "you're": 'you are', "you've": 'you have',
    "'re": ' are', "wasn't": 'was not', "we'll": 'we will',
    "'cause": 'because', "could've": 'could have', "how'd": 'how did',
    "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is',
    "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will',
    "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have',
    "i'd've": 'i would have', "i'll've": 'i will have', "it'd": 'it would',
    "it'd've": 'it would have', "it'll've": 'it will have',
    "ma'am": 'madam', "mayn't": 'may not', "might've": 'might have',
    "mightn't've": 'might not have', "must've": 'must have',
    "mustn't've": 'must not have', "needn't": 'need not',
    "needn't've": 'need not have', "o'clock": 'of the clock',
    "oughtn't": 'ought not', "oughtn't've": 'ought not have',
    "sha'n't": 'shall not', "shan't've": 'shall not have',
    "she'd've": 'she would have', "she'll've": 'she will have',
    "should've": 'should have', "shouldn't've": 'should not have',
    "so've": 'so have', "so's": 'so as', "this's": 'this is',
    "that'd": 'that would', "that'd've": 'that would have',
    "there'd": 'there would', "there'd've": 'there would have',
    "here's": 'here is', "they'd've": 'they would have',
    "they'll've": 'they will have', "to've": 'to have',
    "we'd've": 'we would have', "we'll've": 'we will have',
    "what'll've": 'what will have', "when's": 'when is',
    "when've": 'when have', "where'd": 'where did',
    "where've": 'where have', "who'll've": 'who will have',
    "why's": 'why is', "why've": 'why have', "will've": 'will have',
    "won't've": 'will not have', "would've": 'would have',
    "wouldn't've": 'would not have', "y'all": 'you all',
    "y'all'd": 'you all would', "y'all'd've": 'you all would have',
    "y'all're": 'you all are', "y'all've": 'you all have',
    "you'd've": 'you would have', "you'll've": 'you will have',
}


def stopword(x):
    """Expand contractions, blank out English stopwords and join the tokens.

    Each token that is a key of the contraction table is replaced by its
    expanded form; the (possibly replaced) token is then turned into an
    empty string if it is an NLTK English stopword. The resulting tokens
    are passed through ``remove_punc`` and joined with single spaces.

    Args:
        x: Sequence of string tokens. It is not modified; a new list is
            built instead of overwriting the caller's tokens in place.

    Returns:
        The space-joined result of ``remove_punc`` on the processed tokens.
    """
    # Read the stopword corpus once and use a set: the previous version
    # re-read the list and scanned it linearly for every single token.
    stop_words = set(stopwords.words('english'))

    processed = []
    for token in x:
        token = _EXTENDED_FORMS.get(token, token)
        # Stopwords are blanked rather than dropped, so the token count
        # handed to remove_punc matches the input.
        processed.append('' if token in stop_words else token)

    # remove_punc is imported from the preprocessing notebook; presumably it
    # strips punctuation from each token -- confirm against its definition.
    return " ".join(remove_punc(processed))

In [5]:
# Filter the parsed text by converting it to lowercase and removing tags and extra spaces.

def filter(item):
    """Normalise a text string for querying; pass other values through.

    A ``str`` input is lower-cased, tokenised with ``word_tokenize``, run
    through ``stopword`` and finally stripped of any literal backslash-N
    sequences. Anything that is not exactly a ``str`` is returned unchanged.
    """
    # Guard clause: only plain strings are processed.
    if type(item) != str:
        return item

    tokens = word_tokenize(item.lower())
    cleaned = stopword(tokens)
    # The pattern matches a backslash followed by an upper-case N.
    return re.sub(r'\\N', '', cleaned)

### Querying Inputs

In [8]:
# Positional query processing

def positional_query(text):
    """Return the documents that contain the query as an exact phrase.

    The query is normalised with ``filter`` and split into terms. One cursor
    per term walks that term's posting list in the module-level ``index``.
    A document matches when every cursor points into the same document and
    the positions are consecutive in query order.

    Assumes ``index`` maps each term to a list of ``(doc_id, position)``
    postings sorted in ascending order -- confirm against the index builder.

    Args:
        text: Raw query sentence.

    Returns:
        List of matching document ids, without duplicates, in the order in
        which they were first found. Empty if the query is empty after
        filtering, if any term is missing from the index, or if nothing
        matches.

    Side effects:
        Prints the filtered query terms, and a notice when none are left.
    """
    sentence_words = filter(text).split()
    print(sentence_words)
    if not sentence_words:
        print("Sentence empty after filtering")
        return []

    # A term that is absent from the index can never be part of a match.
    if any(word not in index for word in sentence_words):
        return []

    postings = [index[word] for word in sentence_words]
    pointers = [0] * len(postings)
    documents = {}  # dict used as an insertion-ordered set of doc ids

    # Stop as soon as any posting list is exhausted.
    while all(ptr < len(plist) for ptr, plist in zip(pointers, postings)):
        # Shift every position back by the term's offset in the query, so a
        # phrase occurrence makes all keys identical.
        keys = [
            (plist[ptr][0], plist[ptr][1] - offset)
            for offset, (plist, ptr) in enumerate(zip(postings, pointers))
        ]
        if all(key == keys[0] for key in keys):
            documents[keys[0][0]] = 1

        # Advance the cursor that lags furthest behind. Comparing the shifted
        # keys (not the raw postings) matters when a term repeats in the
        # query: on raw postings the tie always advanced the first cursor and
        # phrases such as "new york new" were never found.
        laggard = min(range(len(keys)), key=keys.__getitem__)
        pointers[laggard] += 1

    return list(documents)

In [9]:
# Read the query from the user, echo it back, then run the phrase search.
query = input("Enter the Sentence to Query: ").strip()
print(query)

documents = positional_query(query)

# Report how many documents matched, followed by the documents themselves.
print('Number Of Documents Retrieved: ', len(documents))
print('List Of Documents Retrieved: \n', list(documents))

this is me
[]
Sentence empty after filtering
Number Of Documents Retrieved:  0
List Of Documents Retrieved: 
 []
