In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle

# nltk.download()
# Fetch only the 'punkt' package; nltk's word_tokenize, used throughout this
# notebook, relies on it. The bare nltk.download() above stays commented out.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dkiva\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [22]:
def remove_header(data):
    """Drop everything that precedes the first blank line of ``data``.

    The header is taken to be the text before the first occurrence of two
    consecutive newlines. The returned text starts at that separator, so it
    still begins with the two newline characters.

    Args:
        data: Raw document text.

    Returns:
        The text from the first blank line onwards, or ``data`` unchanged when
        it contains no blank line.
    """
    try:
        ind = data.index('\n\n')
    except ValueError:
        # No blank line, hence no header to strip. Only this expected error is
        # handled so that unrelated failures are not silently hidden.
        return data
    return data[ind:]


def convert_lower_case(data):
    """Lower-case ``data`` element-wise with ``np.char.lower``.

    Returns:
        The numpy array produced by ``np.char.lower``.
    """
    lowered = np.char.lower(data)
    return lowered


def remove_stop_words(data):
    """Remove English stop words from ``data``.

    The text is tokenized with ``word_tokenize`` and every token that is not in
    NLTK's English stop-word list is kept, joined by single spaces.

    Args:
        data: Text (or anything ``str()`` can render) to filter.

    Returns:
        A numpy string array holding the filtered, whitespace-stripped text.

    Note:
        ``stopwords.words`` needs the NLTK 'stopwords' corpus to be installed
        (``nltk.download('stopwords')``).
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned once per token.
    stop_words = set(stopwords.words('english'))
    kept = [w for w in word_tokenize(str(data)) if w not in stop_words]
    return np.char.strip(" ".join(kept))


def remove_punctuation(data):
    """Replace punctuation and newlines in ``data`` with spaces, drop commas.

    Each symbol is replaced by a single space, and after every symbol one pass
    collapses double spaces into single ones. Commas are handled last and are
    deleted outright rather than replaced by a space.

    Args:
        data: A string or numpy string array.

    Returns:
        The numpy string array produced by ``np.char.replace``.
    """
    # The backslash is escaped explicitly. The earlier literal spelled it as
    # "\]", an invalid escape sequence that newer Python versions warn about.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data



def remove_apostrophe(data):
    """Delete every ``'`` character from ``data`` using ``np.char.replace``."""
    without_apostrophes = np.char.replace(data, "'", "")
    return without_apostrophes


def remove_single_characters(data):
    """Keep only the tokens of ``data`` that are longer than one character.

    The text is tokenized with ``word_tokenize``; surviving tokens are joined
    with spaces and the result is stripped with ``np.char.strip``.
    """
    survivors = [token for token in word_tokenize(str(data)) if len(token) > 1]
    # Each token is prefixed with a space, then the leading one is stripped.
    padded = "".join(" " + token for token in survivors)
    return np.char.strip(padded)


def convert_numbers(data):
    """Spell out each decimal digit in ``data`` as a space-padded word.

    Digits are replaced one character at a time, so "10" becomes
    " one  zero " rather than "ten".
    """
    replacements = (
        ("0", " zero "),
        ("1", " one "),
        ("2", " two "),
        ("3", " three "),
        ("4", " four "),
        ("5", " five "),
        ("6", " six "),
        ("7", " seven "),
        ("8", " eight "),
        ("9", " nine "),
    )
    for digit, spelled in replacements:
        data = np.char.replace(data, digit, spelled)
    return data



def stemming(data):
    """Stem every token of ``data`` with NLTK's ``PorterStemmer``.

    The text is tokenized with ``word_tokenize``; the stems are joined with
    spaces and the result is stripped with ``np.char.strip``.
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(str(data))]
    # Each stem is prefixed with a space, then the leading one is stripped.
    padded = "".join(" " + stem for stem in stems)
    return np.char.strip(padded)


def preprocess(data, query):
    """Normalize ``data`` through the notebook's text-cleaning pipeline.

    Args:
        data: Raw text of a document or of a search query.
        query: When falsy, ``data`` is treated as a document and its header is
            removed first; when truthy that step is skipped.

    Returns:
        The value returned by the final ``stemming`` step.
    """
    if not query:
        data = remove_header(data)

    # Order matters: digits are spelled out before punctuation is removed, and
    # remove_punctuation deletes commas instead of turning them into spaces.
    pipeline = (
        convert_lower_case,
        convert_numbers,
        remove_punctuation,
        remove_apostrophe,
        remove_single_characters,
        stemming,
    )
    for step in pipeline:
        data = step(data)
    return data

In [23]:
# Build the positional inverted index over every file in the sonnets corpus.
#
# Result (same layout as before):
#   postings  - one-row DataFrame, one column per token; the cell holds a list
#               of [doc_id, {positions}] entries.
#   frequency - one-row DataFrame, one column per token; the cell holds the
#               number of documents containing the token.
#   doc       - number of documents processed.
#
# The index is accumulated in a plain dict and turned into DataFrames once at
# the end. Inserting one column per token was quadratic, and the chained
# assignment `frequency[token][0] += 1` no longer updates the frame when
# pandas Copy-on-Write is enabled.
posting_lists = {}  # token -> [[doc_id, {positions}], ...]
doc = 0

# with open("./corpuses/sonnets.txt", 'r') as f:
    # text = f.read().strip()

corpus_dir = './corpuses/sonnets/'
# NOTE(review): doc ids follow os.listdir order, which is not guaranteed to be
# stable across platforms - confirm ids line up with file names before passing
# a doc id to print_document.
for sonnet in os.listdir(corpus_dir):
    # Read as UTF-8, matching how print_document opens the same files.
    with open(os.path.join(corpus_dir, sonnet), 'r', encoding='utf-8') as f:
        text = f.read().strip()
    preprocessed_text = preprocess(text, False)
    if doc % 100 == 0:
        print(doc)

    tokens = word_tokenize(str(preprocessed_text))

    for pos, token in enumerate(tokens):
        token_postings = posting_lists.setdefault(token, [])
        # Documents are processed in increasing doc order, so an entry for the
        # current document can only be the last one in the list.
        if token_postings and token_postings[-1][0] == doc:
            token_postings[-1][1].add(pos)
        else:
            token_postings.append([doc, {pos}])
    doc += 1

# Columns used to be inserted at position 0, i.e. newest token first; keep
# that order so the displayed frames look the same.
token_order = list(reversed(posting_lists))
postings = pd.DataFrame({token: [posting_lists[token]] for token in token_order})
# Document frequency equals the number of postings entries for the token.
frequency = pd.DataFrame({token: [len(posting_lists[token])] for token in token_order})

0
100


In [24]:
# Show the per-token document frequencies (one row, one column per token).
# print(postings)
print(frequency)

   discas  cool  quench  disarm  virgin  legion  votari  trip  chast  nymph  \
0       1     1       1       1       1       1       1     1      1      1   

   ...  beauti  therebi  that  increas  desir  we  creatur  fairest  from  one  
0  ...      43        2   131        5     13  11        3        4    61   88  

[1 rows x 2431 columns]


In [45]:
def get_word_postings(word):
    """Print the preprocessed form of ``word`` and its index entries.

    The word goes through ``preprocess`` as a query. Its normalized form, its
    entry in ``frequency`` and its entry in ``postings`` are printed in that
    order. A word missing from the index raises ``KeyError`` on lookup.
    """
    stem = str(preprocess(word, True))
    print(stem)
    # Each lookup happens right before its print, one table after the other.
    for label, table in (("Frequency:", frequency), ("Postings List:", postings)):
        print(label, table[stem][0])


def get_positions(posting_values, doc):
    """Return the positions recorded for ``doc`` in a postings list.

    Args:
        posting_values: Iterable of ``[doc_id, positions]`` entries.
        doc: Document id to look for.

    Returns:
        The positions object of the first entry whose doc id equals ``doc``
        (the stored object itself, not a copy), or an empty set when no entry
        matches.
    """
    for posting_value in posting_values:
        if posting_value[0] == doc:
            return posting_value[1]
    # `{}` would be an empty dict; return an empty set so that hits and misses
    # have the same type.
    return set()


def gen_init_set_matchings(word):
    """List every ``(doc_id, position)`` occurrence of ``word`` in ``postings``.

    ``word`` must already be a column of ``postings``; otherwise the lookup
    raises ``KeyError``.
    """
    return [
        (entry[0], position)
        for entry in postings[word][0]
        for position in entry[1]
    ]


def match_positional_index(init, b):
    """Find documents where the tokens in ``b`` directly follow an occurrence.

    Args:
        init: Iterable of ``(doc_id, position)`` occurrences of the first
            phrase token.
        b: The remaining phrase tokens, in order. The i-th token (1-based)
            must occur at ``position + i`` in the same document.

    Returns:
        Set of doc ids containing the whole phrase. The set is empty when
        ``b`` is empty or when any token of ``b`` is not in ``postings``.
    """
    if not b:
        # With no follow-up tokens there is nothing to confirm.
        return set()

    # One {doc_id: positions} map per follow-up token, built once up front
    # instead of rescanning the postings list for every candidate.
    position_maps = []
    for k in b:
        if k not in postings:
            # A token that never occurs in the corpus cannot be part of a match.
            return set()
        position_maps.append({entry[0]: entry[1] for entry in postings[k][0]})

    matched_docs = set()
    for p in init:
        doc = p[0]
        start = p[1]
        if all(
            start + offset in positions_by_doc.get(doc, ())
            for offset, positions_by_doc in enumerate(position_maps, start=1)
        ):
            matched_docs.add(doc)
    return matched_docs



def run_query(query):
    """Run a word or exact-phrase query against the positional index.

    The query is preprocessed like a document (minus header removal) and
    tokenized. The processed text, the token list and the matches are printed.

    Args:
        query: Raw query text.

    Returns:
        For a single-token query, the list of doc ids from that token's
        postings. For a multi-token query, the set of doc ids containing the
        tokens at consecutive positions. Tokens absent from the index give an
        empty result; a query with no tokens gives an empty set.
    """
    processed_query = preprocess(query, True)
    print(processed_query)

    query_tokens = word_tokenize(str(processed_query))
    print(query_tokens)

    if not query_tokens:
        # Nothing survived preprocessing (e.g. only single characters).
        total_matched_docs = set()
        print("Total Document Matches:", total_matched_docs)
        return total_matched_docs

    init_word = query_tokens[0]
    remaining_tokens = query_tokens[1:]

    if not remaining_tokens:
        # Look up the preprocessed token: the index is keyed by stems, so the
        # raw query text is generally not a valid key.
        if init_word in postings:
            matched = [a[0] for a in postings[init_word][0]]
        else:
            matched = []
        print("Total Document Matches:", matched)
        return matched

    if any(token not in postings for token in query_tokens):
        # A phrase containing an unindexed token cannot occur anywhere.
        total_matched_docs = set()
    else:
        init_matches = gen_init_set_matchings(init_word)
        total_matched_docs = match_positional_index(init_matches, remaining_tokens)
    print("Total Document Matches:", total_matched_docs)
    return total_matched_docs



def print_document(document):
    """Print the UTF-8 text of the file named ``document`` in the sonnets corpus.

    Args:
        document: File name, appended to ``'./corpuses/sonnets/'``.
    """
    path = './corpuses/sonnets/' + document
    with open(path, 'r', encoding='utf-8') as handle:
        print(handle.read())


In [28]:
# Print the normalized form, document frequency and postings list for a few
# sample words.
get_word_postings("lively")
get_word_postings("king")
get_word_postings("time")
get_word_postings("one")
get_word_postings("be")

live
Frequency: 37
Postings List: [[2, {104}], [3, {57, 99}], [4, {104}], [5, {91}], [9, {111}], [10, {39}], [12, {19}], [15, {106, 52, 87}], [16, {115}], [17, {105}], [18, {115, 31}], [21, {55}], [30, {69}], [34, {30}], [35, {46}], [36, {92}], [38, {43}], [42, {83}], [53, {34, 74}], [54, {56, 102}], [62, {102}], [66, {8, 72, 42, 86, 61}], [67, {32, 49, 13}], [71, {90, 12}], [78, {96}], [80, {4, 101}], [82, {94}], [92, {4, 36}], [93, {76}], [104, {98}], [106, {85}], [123, {109}], [126, {64}], [127, {94}], [143, {97}], [145, {65}], [152, {41}]]
king
Frequency: 4
Postings List: [[28, {113}], [62, {49}], [86, {110}], [114, {50}]]
time
Frequency: 46
Postings List: [[0, {20}], [2, {100, 14}], [4, {35}], [5, {56, 64, 77}], [10, {57}], [11, {10, 101, 79}], [14, {97, 79}], [15, {71, 15}], [16, {8, 112}], [17, {90}], [18, {104, 51, 3}], [21, {24}], [29, {31}], [31, {42}], [36, {107}], [37, {70}], [38, {91, 85}], [43, {98}], [46, {54}], [48, {8, 67, 4, 36}], [51, {68}], [54, {30}], [56, {21, 14}

In [50]:
# Run a few two-word phrase queries; each call prints its own trace and the
# returned doc ids are printed once more here.
query = "to be"
print(run_query(query))
print(run_query("thee partake"))
print(run_query("raised love"))
print(run_query("liquid prisoner"))


to be
['to', 'be']
Total Document Matches: {1, 2, 3, 130, 5, 132, 133, 136, 140, 143, 149, 150, 40, 73, 74, 80, 100, 117, 120, 127}
{1, 2, 3, 130, 5, 132, 133, 136, 140, 143, 149, 150, 40, 73, 74, 80, 100, 117, 120, 127}
thee partak
['thee', 'partak']
Total Document Matches: {148}
{148}
rais love
['rais', 'love']
Total Document Matches: {149}
{149}
liquid prison
['liquid', 'prison']
Total Document Matches: {4}
{4}


In [49]:
# Print the corpus file named '4' (4 is the doc id returned for
# "liquid prisoner" above).
# print_document('120')
print_document('4')

                     5
  Those hours that with gentle work did frame
  The lovely gaze where every eye doth dwell
  Will play the tyrants to the very same,
  And that unfair which fairly doth excel:
  For never-resting time leads summer on
  To hideous winter and confounds him there,
  Sap checked with frost and lusty leaves quite gone,
  Beauty o'er-snowed and bareness every where:
  Then were not summer's distillation left
  A liquid prisoner pent in walls of glass,
  Beauty's effect with beauty were bereft,
  Nor it nor no remembrance what it was.
    But flowers distilled though they with winter meet,
    Leese but their show, their substance still lives sweet.


