# Importing libraries

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
import pickle
import operator

#nltk.download('all')

# Reading Corpus

In [8]:
# Root directory that is scanned for corpus documents.
path = "corpus/"

# Full path of every file found at any depth below the corpus root,
# in the order os.walk reports them.
files = []
for root, _subdirs, names in os.walk(path):
    files.extend(os.path.join(root, name) for name in names)

# Removing tags and headers, lowercasing and tokenizing

In [13]:
# Per-document positional index:
#   docs_words[doc_id][token] -> list of positions of that token in the document
# doc_id is the 0-based position of the file in `files`.
docs_words = {}

# The tokenizer is created once and reused for every document.
tknzr = TweetTokenizer()

for i, f in enumerate(files):
    # Undecodable bytes are skipped; the context manager closes the handle.
    with open(f, errors='ignore') as doc:
        html = doc.read()

    # Discard everything in front of the opening <html> tag. The spellings are
    # tried in order; when none of them is present the document is kept whole
    # (slicing with index -1 would keep only its last character).
    for tag in ("<html", "<Html", "<HTML"):
        index = html.find(tag)
        if index != -1:
            html = html[index:]
            break

    # Name the parser explicitly so the extracted text does not depend on
    # which optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, "html.parser")

    # Remove script and style elements so their contents are not indexed.
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()

    # Strip each line, split lines on double spaces, and drop empty pieces.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)

    tokens = tknzr.tokenize(text)

    # token -> positions; j counts every token, so positions refer to the
    # token stream before stop-word removal.
    words_pos = {}
    for j, t in enumerate(tokens):
        t = t.lower().replace("'", "")  # lower-case and drop apostrophes
        words_pos.setdefault(t, []).append(j)

    docs_words[i] = words_pos

#print(docs_words)

# Removing Stop words

In [14]:
# Load the stop words, one per line; the context manager closes the file.
with open("stoplist.txt", "r") as f:
    stoplist = f.read().splitlines() #Stoplist words

# Set copy of the stop list so each membership test is O(1) instead of a
# linear scan; `stoplist` itself stays a list.
stop_words = set(stoplist)

# Remove stop words and single-character tokens from every document's index.
for words_pos in docs_words.values():
    # Iterate over a snapshot of the keys because entries are deleted.
    for key in list(words_pos):
        if key in stop_words or len(key) == 1:
            del words_pos[key]
#print(docs_words)            

# Inverted Index using Dictionary

In [15]:
# Inverted index built from the per-document index:
#   words_docs[term][doc_id] -> position list taken from docs_words[doc_id][term]
words_docs = {}

for doc_id in range(len(docs_words)):
    for term, positions in docs_words[doc_id].items():
        # Create the term's postings dict on first sight, then record this doc.
        words_docs.setdefault(term, {})[doc_id] = positions

#print(words_docs)

# Stemming

In [16]:
ps = PorterStemmer()

# Re-key the inverted index by stem. A snapshot of the keys is iterated
# because entries are removed and added while the loop runs.
for term in list(words_docs):
    root = ps.stem(term)
    if root == term:
        continue  # already in stemmed form, nothing to move

    postings = words_docs.pop(term)
    if root not in words_docs:
        # First term to map onto this stem: move its postings over unchanged.
        words_docs[root] = postings
        continue

    # The stem already has postings: merge document by document.
    merged = words_docs[root]
    for doc_id, positions in postings.items():
        if doc_id in merged:
            # Both forms occur in this document: concatenate position lists.
            merged[doc_id] = merged[doc_id] + positions
        else:
            merged[doc_id] = positions

#print(docs_words)

# Inverted Index WITHOUT Dictionary

In [17]:
# Flat form of the index: one (term_id, doc_id) pair per posting, where
# term_id is the term's position in words_docs iteration order.
list_of_tuples = [
    (term_id, doc_id)
    for term_id, term in enumerate(words_docs)
    for doc_id in words_docs[term]
]

# Finding total occurrences of a term

In [18]:
def get_total_occurences(key):
    """Return the number of occurrences of term *key* across all documents.

    The term is looked up in the module-level ``words_docs`` index and the
    lengths of its per-document position lists are added together.

    Raises:
        KeyError: if *key* is not present in ``words_docs``.
    """
    return sum(len(positions) for positions in words_docs[key].values())

# Writing term_index.txt and term_info.pickle

In [19]:
# term -> (term id, total occurrences, number of documents containing the term)
term_info = {}

# The context manager flushes and closes term_index.txt once all rows are
# written, so the file is complete before anything else reuses the name.
with open("term_index.txt", "w", errors="ignore") as term_index:
    for i, key in enumerate(words_docs):
        ocr = get_total_occurences(key)
        docs = len(words_docs[key])
        term_info[key] = (i, ocr, docs)

        # One tab-separated row per term:
        # term id, occurrences, document count, postings dict.
        term_index.write(str(i) + "\t" + str(ocr) + "\t" + str(docs) + "\t" + str(words_docs[key]) + "\n")

# Making pickles of Inverted Index and Term info

In [20]:
# Serialize the inverted index and the term statistics, one pickle file each.
for pickle_name, payload in (
    ('Inverted-Index.pickle', words_docs),
    ('Term-Info.pickle', term_info),
):
    with open(pickle_name, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Delta Encoding

In [21]:
def delta_encoding(term):
    """Return the gap-encoded positions of *term* for each document.

    The term's postings are read from the module-level ``words_docs`` index.
    For every document the positions are sorted ascending and each one is
    replaced by its difference from the previous position (the first is
    measured from 0).

    The index is only read: sorting is done on copies, so the position lists
    stored in ``words_docs`` keep their original order.

    Returns:
        dict mapping doc id -> list of gaps, with doc ids inserted in
        ascending order.

    Raises:
        KeyError: if *term* is not present in ``words_docs``.
    """
    postings = words_docs[term]
    res = {}

    for doc_id in sorted(postings):
        gaps = []
        prev = 0
        # sorted() returns a new list, leaving the shared index untouched.
        for position in sorted(postings[doc_id]):
            gaps.append(position - prev)
            prev = position
        res[doc_id] = gaps

    return res


# Writing Delta Encoded term_index.txt

In [22]:
# Write one row per term: term id, total occurrences, document count, then the
# postings with both doc ids and positions stored as gaps.
# The context manager flushes and closes the file when the loop finishes.
with open("delta_encoded_term_index.txt", "w", errors="ignore") as term_index:
    for i, key in enumerate(words_docs):
        ocr = get_total_occurences(key)
        docs = len(words_docs[key])

        term_index.write(str(i) + "\t" + str(ocr) + "\t" + str(docs) + "\t")
        d = delta_encoding(key)

        # Each doc id is written as its difference from the previous doc id
        # in the order delta_encoding returns them (the first is measured
        # from 0).
        prev = 0
        term_index.write("{ ")
        for doc in d:
            term_index.write(str(doc - prev) + ": " + str(d[doc]) + ", ")
            prev = doc
        term_index.write(" }\n")