<a href="https://colab.research.google.com/github/chunter3/Information_Retrieval_Projects/blob/master/Tokenizing%2C_Inverted_Indexes%2C_%26_Query_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
%matplotlib inline

In [2]:
# Problem 1 (start)

In [None]:
# Loading the "20newsgroups" sample dataset

from sklearn.datasets import fetch_20newsgroups
# `shuffle` takes a real boolean: a string such as 'False' is truthy (the data gets shuffled anyway)
# and may be rejected by scikit-learn's parameter validation. shuffle=True and random_state=42 are
# the library defaults, written out explicitly so the document order stays fixed; the docIDs
# expected in later cells depend on that order.
newsGroups_ds = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
len(newsGroups_ds.data) # the total number of samples in the 20newsgroups dataset (of which there are 18846)

In [4]:
# Tokenization function (using the regular expression (re) module)

def Tokenize(docData):
  """Split every document in ``docData`` into alphanumeric tokens.

  Each run of characters other than A-Z, a-z and 0-9 (punctuation, whitespace,
  underscores, ...) is collapsed into a single space, and the result is split
  on whitespace. Case is preserved.

  Args:
    docData: iterable of document strings.

  Returns:
    A list of lists: one list of token strings per document, in input order.
  """
  return [re.sub('[^A-Za-z0-9]+', ' ', doc).split() for doc in docData]

In [None]:
# Tokenize every document in the dataset; the bare call displays the full nested token list as the cell output
Tokenize(newsGroups_ds.data)

In [6]:
# Problem 1 (end)

In [7]:
# Problem 2 (start)

In [8]:
# Inverted index function

def InvertedIndex(tokenizedDocs):
  """Build an inverted index mapping each token to the documents that contain it.

  Documents are numbered from 1 in the order they appear in ``tokenizedDocs``.

  Args:
    tokenizedDocs: iterable of token lists, one list per document.

  Returns:
    A dict mapping each token to its posting list: the docIDs of every
    document containing the token, in ascending order and without duplicates.
  """
  invertedIndex = {}
  for docID, tokenizedDoc in enumerate(tokenizedDocs, start=1):
    for token in tokenizedDoc:
      postings = invertedIndex.setdefault(token, [])
      # docIDs only ever grow, so a repeat of this token inside the current
      # document can only collide with the last posting. A repeat must be
      # skipped, never allowed to replace the postings gathered so far.
      if not postings or postings[-1] != docID:
        postings.append(docID)
  return invertedIndex

In [None]:
# Tokenize the whole dataset once and build the inverted index from the result
tokenizedDocs = Tokenize(newsGroups_ds.data)
invertedIndex = InvertedIndex(tokenizedDocs)
invertedIndex  # bare expression: displays the entire index as the cell output

In [10]:
 # Token count function; determines number of appearances of each token across all documents
 
def TokenCount(tokenizedDocs):
  """Count how many times each token occurs across all documents.

  Args:
    tokenizedDocs: iterable of token lists, one list per document.

  Returns:
    A list of ``(token, count)`` tuples ordered by count, highest first.
    Tokens with equal counts stay in first-seen order.
  """
  frequencies = {}
  for doc in tokenizedDocs:
    for word in doc:
      frequencies[word] = frequencies.get(word, 0) + 1
  ranked = list(frequencies.items())
  ranked.sort(key=lambda pair: pair[1], reverse=True)  # stable sort: ties keep insertion order
  return ranked

In [None]:
# Display the per-token occurrence counts computed over every tokenized document
TokenCount(tokenizedDocs)

In [12]:
# Problem 2 (end)

In [13]:
# Problem 3 (start)

In [14]:
# Intersect auxiliary function (part a of problem 3); returns a list of unique docIDs from the intersection of two list values from the inverted index

def Intersect(p1, p2):
  """Return the docIDs common to two posting lists.

  Both lists must be sorted in ascending order (as the posting lists of the
  inverted index are). The inputs are only read, never modified, so posting
  lists taken straight from the index can be passed in safely.

  Args:
    p1: ascending list of docIDs.
    p2: ascending list of docIDs.

  Returns:
    A new ascending list holding the docIDs present in both inputs.
  """
  ans = []
  i = j = 0
  # Walk both lists with cursors; always advance the cursor sitting on the
  # smaller docID, and advance both when the docIDs match.
  while i < len(p1) and j < len(p2):
    if p1[i] == p2[j]:
      ans.append(p1[i])
      i += 1
      j += 1
    elif p1[i] < p2[j]:
      i += 1
    else:
      j += 1
  return ans

In [15]:
# Let's examine a test input to verify the intersect function's correctness. Given the query 'Mamatha' AND 'po4', Intersect() should return [1,334,11486]
# Note: p1 and p2 are the index's own posting-list objects, not copies
p1 = invertedIndex['Mamatha']
p2 = invertedIndex['po4']

In [None]:
# Display the posting list looked up for 'Mamatha'
p1

In [None]:
# Display the posting list looked up for 'po4'
p2

In [18]:
# Testing intersect function
# Copies are passed so the check never depends on whether Intersect modifies its arguments;
# the index's posting lists stay intact and the cell can be re-run.
actual = Intersect(list(p1), list(p2))
assert actual == [1, 334, 11486]

In [19]:
# Sort auxiliary function (part b of problem 3); returns a word list that's sorted in ascending order of the words' number of postings (according to the inverted index)

def Sort(wordLst, index):
  """Order the distinct words of ``wordLst`` by how many postings each has in ``index``.

  Args:
    wordLst: iterable of words (hashable).
    index: mapping of word -> posting list.

  Returns:
    A new list of the distinct words, fewest postings first. A word missing
    from ``index`` counts as having 0 postings. Words with equal counts keep
    the order of their first appearance in ``wordLst``.
  """
  def postingCount(term):
    return len(index[term]) if term in index else 0

  uniqueTerms = list(dict.fromkeys(wordLst))  # drop repeats, keep first-seen order
  uniqueTerms.sort(key=postingCount)          # stable, so ties stay in first-seen order
  return uniqueTerms

In [None]:
# Let's examine a test input to verify the sort function's correctness. Given the word list ['Mamatha', 'The', 'only', 'which'], Sort() should return ['Mamatha', 'only', 'which', 'The']
# (i.e. the words ordered from fewest to most postings in invertedIndex)

wordLst = ['Mamatha', 'The', 'only', 'which']
Sort(wordLst,invertedIndex)

In [21]:
# Problem 3 (end)

In [22]:
# Problem 4 (start)

In [23]:
# Re-tokenize the dataset and rebuild the inverted index so Problem 4 starts from a fresh index,
# independent of anything earlier cells did to the previous one
tokenizedDocs = Tokenize(newsGroups_ds.data)
invertedIndex = InvertedIndex(tokenizedDocs)

In [24]:
# Query search function

def QuerySearch(query,index):
  """Evaluate a boolean AND query against the inverted index.

  The query is a whitespace-separated string of terms joined by the operator
  ``AND`` (e.g. ``"Mamatha AND Devineni AND Ratnam"``). Terms are
  case-sensitive and matched exactly against the index keys.

  Args:
    query: the query string.
    index: dict mapping term -> ascending posting list of docIDs. It is only
      read; neither the dict nor its posting lists are modified.

  Returns:
    A new list of the docIDs that contain every query term. A single-term
    query returns that term's postings; an empty query, or a query with any
    term absent from the index, returns [].
  """
  # Drop the operator as a whole word only, so terms that merely contain
  # the letters "AND" are left intact.
  tokens = [token for token in query.split() if token != 'AND']
  # A repeated term adds nothing to an AND query; drop repeats, keep order.
  tokens = list(dict.fromkeys(tokens))
  if not tokens:
    return []
  if len(tokens) == 1:
    return list(index.get(tokens[0], []))
  # Intersect the rarest terms first so intermediate results stay small.
  sortedWordLst = Sort(tokens,index)
  ans = list(index.get(sortedWordLst[0], []))
  for word in sortedWordLst[1:]:
    if not ans:
      break        # nothing left to match; further intersections cannot add docIDs
    # Pass a copy so the index's own posting list is never exposed to the helper.
    ans = Intersect(ans, list(index.get(word, [])))
  return ans

In [None]:
query1 = "Mamatha"   # A single-term query; the term is expected to be present in the inverted index
QuerySearch(query1,invertedIndex)

In [None]:
query2 = "ETRIAN"   # A single-term query; the term is expected to be absent from the inverted index
QuerySearch(query2,invertedIndex)

In [None]:
query3 = "Mamatha AND Devineni AND Ratnam"  # A boolean AND query in which every term is expected to be in the inverted index
QuerySearch(query3,invertedIndex)

In [None]:
query4 = "Mamatha AND ETRIAN AND Ratnam"  # A boolean AND query with at least one term that is expected to be absent from the inverted index
QuerySearch(query4,invertedIndex)