In [153]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [154]:
cd drive/MyDrive/NLP

[Errno 2] No such file or directory: 'drive/MyDrive/NLP'
/content/drive/MyDrive/NLP


In [156]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [234]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
import re,os,time,string,time
from pprint import pprint
import pandas as pd
from csv import reader
from nltk.tokenize import MWETokenizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import math


class InvertedIndex:
    """
    Construct Inverted Index
    """
    def __init__(self):
        self = self
        self.multiword = [] 
        self.DocID = None
        self.invertedindex = None
        self.time = None
    
    # pos tagging for lemmatization 
    # pos tagging rules
    def pos_tagger (self,nltk_tag):
      if nltk_tag.startswith('J'):
        return wordnet.ADJ
      elif nltk_tag.startswith('V'):
        return wordnet.VERB
      elif nltk_tag.startswith('N'):
        return wordnet.NOUN
      elif nltk_tag.startswith('R'):
        return wordnet.ADV
      else:         
        return None

    # read csv file for multi words
    def read_csv_file(self,path:str) -> None:
        wordlist = []
        filelist = os.listdir(path)
        for x in filelist:
            if x.endswith(".csv"):
                df = pd.read_csv(path+x)
                for row in df['name']:
                    self.multiword.append(row)
        
    def read_data(self, path: str) -> list:
        """
        Read files from a directory and then append the data of each file into a list.
        """
        self.time= time.time()
        alllist = []
        filelist = os.listdir(path)
        self.path = path
        self.read_csv_file(path)
        for x in filelist:
            if x.endswith(".txt"):
                file_obj = open(path+x, "r")
                self.DocID = str(x).replace(".txt","")
                wordslist = self.process_document(file_obj)
                alllist+= wordslist
        return alllist

    def process_document(self, document: str) -> list:
        """
        pre-process a document and return a list of its terms
        str->list"""
        
        #lemmatization
        lemmatizer=WordNetLemmatizer()
        
        wordslist = []
        # read documents
        raw = document.read()
        
        # tokenize the word 
        tokenizer = TweetTokenizer()
        tokens = (tokenizer.tokenize(raw))
        
        #remove punctuation
        table = str.maketrans('', '', '\t')
        tokens = [word.translate(table) for word in tokens]
        punctuations = (string.punctuation).replace("'", "")
        trans_table = str.maketrans('', '', punctuations)
        stripped_words = [word.translate(trans_table) for word in tokens]
        tokens = [str for str in stripped_words if str]
        
        #remove some symbols still exist
        punctuations_symbols = "≠−→←“•’´―⁄ ”"
        for items in tokens:
          if items in punctuations_symbols:
            tokens.remove(items)

        #uni gram tokens for verb and nouns
        uni_tokens = tokens

        # pos tagging
        pos_tagged = nltk.pos_tag(uni_tokens)
        wordnet_tagged = [(x[0], self.pos_tagger(x[1])) for x in pos_tagged]
        
        # lemmatisation
        lemmatized_sentence = []
        for word, tag in wordnet_tagged:
            if tag is None:
                # if there is no available tag, append the token as is
                lemmatized_sentence.append(word)
            else:       
                # else use the tag to lemmatize the token
                lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
                # print(lemmatized_sentence)   
        uni_tokens = lemmatized_sentence
        
        #put multi word from csv file
        multi_word_list = []
        for words in self.multiword:
            multi_word_list.append(words.split())
        tk = MWETokenizer(multi_word_list)
        tokens = tk.tokenize(tokens)
        
        #make lower case
        tokens_lower = [word.lower() for word in tokens]
        uni_tokens_lower = [word.lower() for word in uni_tokens] 
        
        #remove stopwords
        stop_words = set(stopwords.words("english"))
        tokens = [word for word in tokens_lower if not word in stop_words]
        uni_tokens = [word for word in uni_tokens_lower if not word in stop_words]
        
        #make multi word lower
        multiword_list_lower =[]
        for item in self.multiword:
            multiword_list_lower.append(item.lower())

        # put every uni gram word in the list
        number_of_words =0
        for words in uni_tokens:
            number_of_words +=1
            inlist =[]
            inlist.append(words)
            inlist.append(self.DocID)
            inlist.append(number_of_words)
            wordslist.append(inlist)

        #just put csv file multi words in the list
        number_of_words = 0
        for words in tokens:
            number_of_words +=1
            if words in multiword_list_lower:
              if words not in wordslist:
                pos = len(words.split("_")) # positional
                number_of_words += (pos-1)
                inlist =[]
                inlist.append(words)
                inlist.append(self.DocID)
                inlist.append(number_of_words)
                wordslist.append(inlist)


        return wordslist
    
    def index_corpus(self, documents: list) -> None:
        """
        index given documents
        list->None"""
        documents.sort()
        inverted_index = {}
        for index in documents:
          term = index[0]
          file_index = index[1]
          pos = index[2] #positional informtaion for each word
          if term in inverted_index:
          #increment total freq by 1
            inverted_index[term][0] = inverted_index[term][0] +1
          #check if the tem has existed in that DocID before
            if file_index in inverted_index[term][1]:
              inverted_index[term][1][file_index].append(pos)
            else:
              inverted_index[term][1][file_index] = [pos]
          #If term does not exist in the positional index dictionary
          #first encounter
          else:
            inverted_index[term] =[]
            inverted_index[term].append(1)
            inverted_index[term].append({})
            inverted_index[term][1][file_index] = [pos]
            
        
        print("inverted index: ",inverted_index)
        execution_time = time.time() - self.time
        print("execution time :",execution_time)
        print("length of inverted index: ",len(inverted_index))
        self.invertedindex = inverted_index
        self.dump(inverted_index)
        
     
    def dump(self, examples: list) -> None:
        """
        provide a dump function to show index entries for a given set of terms        
        """

        #preprocess the word in the list to match 
        lemmatizer=WordNetLemmatizer()
        tokenizer = TweetTokenizer()
        stop_words = set(stopwords.words("english"))
        devex = './'
        filelist = os.listdir(devex)
        for x in filelist:
            if x.endswith(".txt"):
                file_obj = open(devex+x, "r")
                for words in file_obj:
                    
                    multiwords =[]
                    tokens = []
                    tokens = (tokenizer.tokenize(words))
                   
                    
                    table = str.maketrans('', '', '\t')
                    tokens = [word.translate(table) for word in tokens]
                    punctuations = (string.punctuation).replace("'", "")
                    trans_table = str.maketrans('', '', punctuations)
                    stripped_words = [word.translate(trans_table) for word in tokens]
                    tokens = [str for str in stripped_words if str]
                    
                    punctuations_symbols = "≠−→←“•’´―⁄ ”"

                    for items in tokens:
                      if items in punctuations_symbols:
                        tokens.remove(items)
                    
                    # if length of word is larger than 1, put in the MWET tokenizer
                    if(len(tokens)) >1:
                      multiwords =[]
                      multiwords.append(tokens)
                      tk = MWETokenizer(multiwords)
                      tokens = tk.tokenize(tokens)
                    # else put the pos tagging and lemmatization for uni gram
                    else:
                       pos_tagged = nltk.pos_tag(tokens)
                       wordnet_tagged = [(x[0], self.pos_tagger(x[1])) for x in pos_tagged]
                      
                      
                       # lemmatisation
                       lemmatized_sentence = []
                       for word, tag in wordnet_tagged:
                          if tag is None:
                              # if there is no available tag, append the token as is
                              lemmatized_sentence.append(word)
                          else:       
                              # else use the tag to lemmatize the token
                              lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
                       tokens = lemmatized_sentence
                
                    # make the word lower case
                    if(len(tokens))>1:
                      tokens= [word.lower() for word in tokens]
                    else:
                      tokens[0] = tokens[0].lower()
                    
                    #remove stop word in tokens
                    tokens = [word for word in tokens if not word in stop_words]
                  
                    # check the word in the inverted list
                    if not tokens:
                       print(words,"not in the inverted index" )
                    else:
                      exist = 0
                      for key in examples:
                        if(tokens[0] == key):
                          exist =1
                          print(key,examples[key])
                      if exist == 0:
                        print(words,"not in the inverted index")
                      
    def preprocess(self,term:str) -> list:
       "preprocess the input for proximity search"
       lemmatizer=WordNetLemmatizer()
       tokenizer = TweetTokenizer()
       stop_words = set(stopwords.words("english"))
       multiwords =[]
       tokens = []
       tokens = (tokenizer.tokenize(term))
                   
                    
       table = str.maketrans('', '', '\t')
       tokens = [word.translate(table) for word in tokens]
       punctuations = (string.punctuation).replace("'", "")
       trans_table = str.maketrans('', '', punctuations)
       stripped_words = [word.translate(trans_table) for word in tokens]
       tokens = [str for str in stripped_words if str]
                    
       punctuations_symbols = "≠−→←“•’´―⁄ ”"

       for items in tokens:
          if items in punctuations_symbols:
              tokens.remove(items)
                    

       if(len(tokens)) >1:
          multiwords =[]
          multiwords.append(tokens)
          tk = MWETokenizer(multiwords)
          tokens = tk.tokenize(tokens)            
       else:
          pos_tagged = nltk.pos_tag(tokens)
          wordnet_tagged = [(x[0], self.pos_tagger(x[1])) for x in pos_tagged]
                                  
       # lemmatisation
          lemmatized_sentence = []
          for word, tag in wordnet_tagged:
              if tag is None:
          # if there is no available tag, append the token as is
                lemmatized_sentence.append(word)
              else:       
           # else use the tag to lemmatize the token
                lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
          tokens = lemmatized_sentence
                
                    
       if(len(tokens))>1:
          tokens= [word.lower() for word in tokens]
       else:
          tokens[0] = tokens[0].lower()
                    
                    
       tokens = [word for word in tokens if not word in stop_words]
                  
       return tokens

    def proximity_search(self, term1: str, term2: str, window_size: int) -> dict:
        """
        This is Task 2"""

        answers = {}

        term1 = self.preprocess(term1)
        term2 = self.preprocess(term2)

        if window_size ==1:
          if term_1 != term_2:
            print("window size error")
            return 0
        elif window_size <1:
          print("window size error")
          return 0
        else:
          pass
          


        term1_indices = None
        term2_indices = None

        # find the index of each term 
        for key in self.invertedindex:
          if (key == term1[0]):
            term1_indices = self.invertedindex[key]
          elif (key == term2[0]):
            term2_indices = self.invertedindex[key]

        # check if term has key
        if len(term1_indices) != 0 :
          if len(term2_indices) != 0:
            pass
          else:
            print("try proper word")
        else:
          print("try proper word")

        #find the common index for both terms
        common_indexes =[]
        for index in term1_indices[1]:
          if index in term2_indices[1]:
            common_indexes.append(index)

        # find the positional information according to windows size
        for index in common_indexes:
          for term_1 in term1_indices[1][index]:
            for term_2 in term2_indices[1][index]:
              if abs(term_1-term_2) <= (window_size-1):
                if index not in answers:
                  answers[index] = [(term1,term2)]
                  answers[index].append([(term_1,term_2)])
                else:
                  if (term_1 and term_2) not in answers[index]:
                    answers[index].append([(term_1,term_2)])

        print("proximity search result: ",answers)
        return answers

        

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [235]:
def main():
    "main call function"
    index = InvertedIndex() # initilaise the index
    corpus = index.read_data('Simpsons2022/') # specify the directory path in which files are located
    index.index_corpus(corpus) # index documents/corpus
    index.proximity_search('Bart','First',2)
    
    return index
    
index = main()

execution time : 51.140666246414185
length of inverted index:  12828
bart [1108, {'3.1': [149, 323, 354, 404, 426, 431, 444, 458, 612, 622, 903, 907, 1141, 1164, 1283, 1850, 1885], '3.10': [983], '3.11': [445, 743, 772], '3.12': [106, 146, 241, 294, 506, 509, 527, 583, 688], '3.13': [2, 11, 75, 95, 119, 130, 163, 223, 233, 258, 306, 318, 324, 336, 342, 366, 379, 396, 645, 732, 755, 821, 859, 939, 978, 997, 1171, 1182], '3.14': [79, 399, 419], '3.15': [74, 119, 147, 303, 328, 416, 554, 581, 595, 745], '3.16': [1, 9, 71, 92, 104, 139, 170, 238, 251, 263, 292, 312, 328, 341, 352, 361, 409, 489, 501, 509, 620, 642, 729, 824, 932, 1008, 1071], '3.17': [98, 1083], '3.18': [51, 121, 154, 267, 299, 303, 316, 391, 400, 442, 518, 640, 668, 698, 719, 863, 899, 977, 1004], '3.19': [151, 258, 283, 366, 384, 394, 812, 1316], '3.2': [315, 848], '3.20': [326, 1773], '3.21': [151, 160, 231, 272, 275, 328, 363, 390, 407, 418, 611, 696, 848], '3.22': [118, 252, 269, 284, 362, 372], '3.23': [18, 148, 156,