For each SELECTED result, create snippet...

# Snippet should include:
1. document title
2. the two sentences from the document with the highest COSINE SIMILARITY with respect to the query q.


In [1]:
from classes.inverted_index import inverted_index
from classes.persist_index_memory import persist_index_memory
from classes.utilities import doc_utilities
from classes.preprocessing import preprocessing


# 1. load dataset

In [2]:
# Read the document collection from disk and prepare it for indexing.
u = doc_utilities()
u.read_data_set(file='data/12000_docs.p')
# Storage object passed to the inverted_index constructor below.
memory_unit = persist_index_memory()
u.process_documents_for_indexing()
# Build the inverted index over the prepared collection.
# process_text=True presumably applies the text preprocessing to every
# document while indexing -- TODO confirm against inverted_index.create_index.
i_i = inverted_index(memory_unit)
i_i.create_index(collection=u.get_collection_json(),
                     process_text=True)

[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
12700it [00:34, 367.91it/s]


# 2. create index from dataset (built in the cell above; check its size)

In [3]:
# Sanity check: this should be roughly 100k tokens.
print("Index size = {size}".format(size=i_i.get_index_size()))

Index size = 94679


# 3. given query, find relevant results 

In [4]:
def return_matches(CR=None, len_DC=1699999):
    """Regroup a token-first lookup result into a document-first mapping.

    Parameters
    ----------
    CR : mapping, optional
        Maps each query token to an iterable of postings. Every posting must
        expose ``docId`` and ``frequency`` attributes. ``None`` (the default)
        is treated as an empty lookup result.
    len_DC : int, optional
        Kept only so existing callers keep working. Earlier versions
        pre-allocated one empty list for every id in ``1..len_DC`` on each
        call and raised ``KeyError`` for ids outside that range; buckets are
        now created on demand, so the value is no longer needed.

    Returns
    -------
    dict
        ``{docId: [('match: <token>', frequency), ...]}`` containing only the
        documents that matched at least one token. Keys are in ascending
        docId order and each list follows the iteration order of ``CR``.
    """
    if CR is None:
        CR = {}

    doc_first = {}
    for token in CR:
        label = 'match: ' + str(token)
        for posting in CR[token]:
            # Create the per-document bucket lazily instead of allocating
            # millions of empty lists up front.
            doc_first.setdefault(posting.docId, []).append((label, posting.frequency))

    # Sorting reproduces the ascending-id key order of the old pre-filled dict.
    return {doc_id: doc_first[doc_id] for doc_id in sorted(doc_first)}

def match_scaling(CR=None, match_num=None):
    """Keep only the documents that matched exactly ``match_num`` query tokens.

    Parameters
    ----------
    CR : dict, optional
        Document-first matches: ``{docId: [match, ...]}``.
    match_num : int, optional
        Required length of a document's match list. When it is omitted
        nothing is selected and an empty dict is returned, which is what the
        previous default (the ``int`` type object, which never equals a
        length) produced as well.

    Returns
    -------
    dict
        The entries of ``CR`` whose match list has exactly ``match_num`` items.
    """
    if CR is None or match_num is None:
        return {}
    return {doc_id: CR[doc_id] for doc_id in CR if len(CR[doc_id]) == match_num}

In [5]:
query = "When is shark week"
# Run the query through the preprocessing steps one at a time:
# punctuation removal -> tokenization -> stopword removal -> lowercasing -> stemming.
p = preprocessing()
text = p.remove_punctuation(text=query)
tokens = p.tokenize(text=text)
# NOTE(review): stopwords are removed before capitalization is, and "when"
# survives in the printed result of this cell -- confirm whether lowercasing
# should happen first.
tokens = p.remove_stopwords(tokens=tokens)
tokens = p.remove_capitalization(tokens=tokens)
q = p.stem(tokens=tokens)
# Re-join the processed tokens into one space-separated string.
q = ' '.join(q)
print('Query pre processed = {}'.format(q))

Query pre processed = when shark week


[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Look up each preprocessed query token in the inverted index.
CR = i_i.lookup_query(q.split())

In [7]:
# Size of the lookup result (presumably one entry per query token -- TODO confirm).
len(CR)

3

In [8]:
# Regroup the token-first lookup result by document id.
# NOTE(review): CR is rebound to a different structure here, so this cell is
# not idempotent -- re-running it without re-running the lookup cell fails.
CR = return_matches(CR)

In [9]:
# Inspect the document-first matches: {docId: [('match: <token>', frequency), ...]}.
CR

{33: [('match: when', 1), ('match: week', 1)],
 35: [('match: when', 4)],
 63: [('match: when', 1)],
 77: [('match: when', 1)],
 84: [('match: week', 3)],
 89: [('match: when', 1)],
 98: [('match: week', 1)],
 101: [('match: when', 2), ('match: week', 2)],
 130: [('match: week', 1)],
 146: [('match: when', 1)],
 154: [('match: week', 1)],
 165: [('match: when', 1)],
 183: [('match: when', 1)],
 216: [('match: when', 1)],
 218: [('match: week', 1)],
 227: [('match: when', 3)],
 231: [('match: when', 2), ('match: week', 2)],
 248: [('match: week', 1)],
 265: [('match: when', 1)],
 285: [('match: week', 2)],
 288: [('match: when', 2), ('match: week', 1)],
 289: [('match: shark', 2)],
 294: [('match: when', 3)],
 295: [('match: week', 1)],
 296: [('match: when', 1)],
 305: [('match: when', 2)],
 307: [('match: week', 1)],
 353: [('match: week', 2)],
 403: [('match: when', 1)],
 427: [('match: week', 1)],
 473: [('match: week', 1)],
 498: [('match: when', 1)],
 538: [('match: when', 1)],
 5

In [10]:
# Keep only the documents that matched exactly 2 of the query tokens.
# Requiring a match on every query token would instead be:
#   CR = match_scaling(CR, len(q.split()))
CR = match_scaling(CR, 2)

In [11]:
# Ids of the documents that survived the match-count filter.
CR.keys()

dict_keys([33, 101, 231, 288, 836, 1463, 1590, 2291, 2411, 2909, 2926, 3025, 3556, 3866, 4443, 4558, 4768, 5371, 5391, 5987, 6112, 6210, 6724, 6794, 7281, 7580, 8044, 8081, 8151, 8159, 8450, 8567, 8829, 9137, 10277, 11368, 11384, 11525, 11824, 12607])

# 4. rank relevant results

In [12]:
from classes.ranking import ranking

r_ranking = ranking()
# Ordered list of the selected document ids; positions in this list are used
# later to map ranked results back to document ids.
resources = list(CR.keys())

[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# NOTE(review): this name is not referenced again in the visible cells.
processed_query_tokens = q.split()

In [14]:
# Build the term-document matrix; this step is necessary for the idf part of
# the ranking (it is passed on as i_i.doc_term_matrix_all below).
i_i.create_term_document_matrix()

In [15]:
# Maximum term frequencies computed over the selected documents only
# (presumably used to normalise tf in the ranking -- TODO confirm).
max_freq = r_ranking.get_max_frequencies(index=CR) 

100%|██████████| 40/40 [00:00<00:00, 233016.89it/s]


In [16]:
# Store the frequencies on the index storage; the ranking calls read them from there.
i_i.storage.max_frequency_terms_per_doc = max_freq

In [17]:
# Rank the selected documents against the query (num_results=5 presumably
# limits the output to the five best -- TODO confirm). Each item of `res` is
# later unpacked as (position in `resources`, weight).
# NOTE(review): the raw `query` string is passed here, not the preprocessed
# `q` -- presumably relevance_ranking preprocesses it itself; confirm.
res = r_ranking.relevance_ranking(query = query,
                           num_results=5,
                            index=i_i.index,
                            resources=resources,
                            max_freq=i_i.storage.max_frequency_terms_per_doc,
                            N=len(i_i.storage.index),
                            term_doc_matrix=i_i.doc_term_matrix_all)

# 5. generate snippets 

## calculate cosine similarity between every 'sentence' in doc and q

In [18]:
import numpy as np

def cos_similarity(d_weights, q_weights):
    """Cosine similarity between a document-side and a query-side weight vector.

    Parameters
    ----------
    d_weights, q_weights : sequence of numbers
        Term weights in matching order. If the lengths differ, the dot product
        only covers the common prefix (``zip`` stops at the shorter input)
        while each norm still uses its full vector.

    Returns
    -------
    float
        A value in ``[-1, 1]``: +1 for identical direction, 0 for orthogonal
        vectors, -1 for opposite direction. When either vector has zero norm
        (including empty input) the similarity is undefined and ``0.0`` is
        returned instead of dividing by zero.
    """
    numerator = sum(d * q for d, q in zip(d_weights, q_weights))
    d_norm = sum(d * d for d in d_weights)
    q_norm = sum(q * q for q in q_weights)
    denom = np.sqrt(d_norm * q_norm)
    if denom == 0:
        # A zero vector has no direction; avoid nan and the RuntimeWarning.
        return 0.0
    return numerator / denom

In [19]:
weigher = ranking()
p = preprocessing()
def gen_snippets(document=None, i_i=None, query=''):
    """Pick the (up to) two sentences of a document most similar to the query.

    Parameters
    ----------
    document : sequence of str, optional
        The sentences of one document, in reading order. ``None`` is treated
        as an empty document.
    i_i : inverted_index
        Index whose ``index``, ``storage`` and ``doc_term_matrix_all`` are
        handed to the ranking helper. Required in practice.
    query : str
        The user query.

    Returns
    -------
    collections.deque
        At most two sentences: the ones with the highest cosine similarity to
        the query, listed in the order they appear in the document. Sentences
        whose similarity is NaN are never selected.

    Notes
    -----
    ``i_i.storage.max_frequency_terms_per_doc`` is overwritten per sentence
    while scoring and restored to its previous value before returning, so
    later ranking calls still see the document-level frequencies.

    NOTE(review): the recorded run of this notebook ends in
    ``TypeError: 'Appearance' object is not subscriptable``. Elsewhere in the
    notebook ``get_max_frequencies`` is given the document-first match dict
    rather than ``i_i.index`` -- confirm which structure it expects.
    """
    # Local import: this cell is defined before the notebook-level
    # ``import collections`` runs.
    import collections

    sentences = list(document) if document is not None else []

    q_weights = weigher.relevance_ranking(query=query,
                                          num_results=len(query),
                                          index=i_i.index,
                                          resources=[],
                                          max_freq=i_i.storage.max_frequency_terms_per_doc,
                                          N=len(i_i.storage.index),
                                          term_doc_matrix=i_i.doc_term_matrix_all,
                                          weigh=True)
    # Order the weights by their first component so both vectors line up.
    q_w = [w[1] for w in sorted(q_weights, key=lambda x: x[0])]

    saved_max_freq = i_i.storage.max_frequency_terms_per_doc
    scored = []  # (cosine similarity, position of the sentence in the document)
    try:
        for position, s in enumerate(sentences):
            s_max_freqs = weigher.get_max_frequencies(index=i_i.index,
                                                      sentence_tokens=p.the_works(text=s))
            i_i.storage.max_frequency_terms_per_doc = s_max_freqs

            s_weights = weigher.relevance_ranking(query=s,
                                                  num_results=len(s),
                                                  index=i_i.index,
                                                  resources=[],
                                                  max_freq=i_i.storage.max_frequency_terms_per_doc,
                                                  N=len(i_i.storage.index),
                                                  term_doc_matrix=i_i.doc_term_matrix_all,
                                                  weigh=True)
            ordered_s_weights = [w[1] for w in sorted(s_weights, key=lambda x: x[0])]

            cosine_sim = cos_similarity(ordered_s_weights, q_w)
            if cosine_sim != cosine_sim:
                # NaN (undefined similarity) cannot be ranked; skip the sentence.
                continue
            scored.append((cosine_sim, position))
    finally:
        # Do not leave sentence-level frequencies behind in the shared index.
        i_i.storage.max_frequency_terms_per_doc = saved_max_freq

    # Take the two best scores (earlier sentence wins ties), then restore
    # reading order. The previous logic kept the last two "new best"
    # sentences, which is not the top two.
    best = sorted(scored, key=lambda item: (-item[0], item[1]))[:2]
    best_positions = sorted(position for _, position in best)
    return collections.deque((sentences[position] for position in best_positions), maxlen=2)

[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Inspect the inputs that the snippet generation below receives.
resources, query

([33,
  101,
  231,
  288,
  836,
  1463,
  1590,
  2291,
  2411,
  2909,
  2926,
  3025,
  3556,
  3866,
  4443,
  4558,
  4768,
  5371,
  5391,
  5987,
  6112,
  6210,
  6724,
  6794,
  7281,
  7580,
  8044,
  8081,
  8151,
  8159,
  8450,
  8567,
  8829,
  9137,
  10277,
  11368,
  11384,
  11525,
  11824,
  12607],
 'When is shark week')

In [21]:
import nltk.data
import collections
from tqdm import tqdm

# id corresponds to doc's index in "resources"

def generate_snippets(ranked_res, resources, query, i_i):
    """Build the frontend payload (title + snippet) for every ranked result.

    Parameters
    ----------
    ranked_res : iterable of (int, weight)
        Ranked results; the first item of each pair is the position of the
        document in ``resources``.
    resources : sequence
        Document ids, indexed by the positions found in ``ranked_res``.
    query : str
        The user query, forwarded to ``gen_snippets``.
    i_i : inverted_index
        Index whose storage holds the original documents under ``'content'``.

    Returns
    -------
    list of dict
        One ``{'title': ..., 'snippet': ...}`` entry per ranked result, in
        ranking order. The snippet sentences are joined with ``'...'``.
    """
    sent_split = nltk.data.load('tokenizers/punkt/english.pickle')
    json_frontend = []
    # Iterate over the argument, not the notebook-level `res`, so the function
    # works for whatever ranking the caller passes in.
    for res_idx, _weight in tqdm(ranked_res):
        doc_id = resources[res_idx]

        # retrieve original unprocessed doc-string
        original = i_i.storage.get(doc_id)['content']

        # the first blank-line-separated chunk is the title, the rest is body text
        title, *text = original.split('\n\n')
        text = ' '.join(text)

        # separate text into sentences
        sentences = sent_split.tokenize(text)

        # generate snippet
        doc_snippet = gen_snippets(document=sentences, i_i=i_i, query=query)

        # prepare for frontend
        json_frontend.append({'title': title, 'snippet': '...'.join(doc_snippet)})

    return json_frontend

In [22]:
# Bind the payload to a notebook-level name so later cells can reuse it,
# then display it as this cell's output.
json_frontend = generate_snippets(res, resources, query, i_i)
json_frontend

  

  0%|          | 0/94679 [00:00<?, ?it/s][A

TypeError: 'Appearance' object is not subscriptable

In [None]:
# Show the frontend payload; this needs `json_frontend` to be bound at
# notebook level by an earlier cell.
json_frontend

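**What does the cosine-similarity score mean, and are we looking for the greatest value?** The score ranges from -1 to +1, and yes: the sentences with the greatest value are the ones most similar to the query.
* -1: exactly opposite
* +1: identical
* 0: orthogonal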