For each SELECTED result, create snippet...

# Snippet should include:
1. document title
2. 2 sentences from doc with highest COSINE SIMILARITY w/respect to q. 


In [1]:
from classes.inverted_index import inverted_index
from classes.persist_index_memory import persist_index_memory
from classes.utilities import doc_utilities
from classes.preprocessing import preprocessing


# 1. load dataset

In [2]:
u = doc_utilities()
u.read_data_set(file='data/12000_docs.p')
memory_unit = persist_index_memory()
u.process_documents_for_indexing()
i_i = inverted_index(memory_unit)
i_i.create_index(collection=u.get_collection_json(),
                     process_text=True)

[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
12700it [00:34, 365.96it/s]


# 2. create index from dataset

In [3]:
print("Index size = {}".format(i_i.get_index_size()))
# This should be roughly 100k tokens

Index size = 94679


# 3. given query, find relevant results 

In [4]:
def return_matches(CR={}, len_DC=1699999):
    doc_first = {}
    
    # better way of generating pertinent indices?
    for i in range(1, len_DC+1):
        doc_first[i] = []    

    for token in CR:
        for val in CR[token]:
            match = ('match: ' + str(token), val.frequency)
            doc_first[val.docId].append(match)
    return {k: doc_first[k] for k in doc_first if len(doc_first[k]) > 0}

def match_scaling(CR = {}, match_num = int):
    return {k: CR[k] for k in CR if len(CR[k]) == match_num}

In [None]:
# query = "test rEd!"
# p = preprocessing()
# text = p.remove_punctuation(text=query)
# tokens = p.tokenize(text=text)
# tokens = p.remove_stopwords(tokens=tokens)
# tokens = p.remove_capitalization(tokens=tokens)
# q = p.stem(tokens=tokens)
# q = ' '.join(q)
# print('Query pre processed = {}'.format(q))

In [36]:
CR = i_i.lookup_query(q)
CR = return_matches(CR)

In [7]:
# select resources that match every token from query
CR = match_scaling(CR, len(q.split()))
CR = match_scaling(CR, 2)

# 4. rank relevant results

In [9]:
from classes.ranking import ranking

r_ranking = ranking()
resources = list(CR.keys())

[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
processed_query_tokens = q.split()

['test', 'red']

In [13]:
# this step is necessary for idf
i_i.create_term_document_matrix()

In [14]:
max_freq = r_ranking.get_max_frequencies(index=CR) 

100%|██████████| 7/7 [00:00<00:00, 53382.05it/s]


In [15]:
i_i.storage.max_frequency_terms_per_doc = max_freq

In [16]:
i_i.storage.max_frequency_terms_per_doc

[3, 2, 1, 1, 4, 2, 3]

In [17]:
resources

[3918, 5128, 6680, 6882, 8081, 10650, 11014]

In [18]:
query = 'state nation disput local employ the'

In [19]:
res = r_ranking.relevance_ranking(query = query,
                           num_results=5,
                            index=i_i.index,
                            resources=resources,
                            max_freq=i_i.storage.max_frequency_terms_per_doc,
                            N=len(i_i.storage.index),
                            term_doc_matrix=i_i.doc_term_matrix_all)

# 5. generate snippets 

## calculate cosine similarity between every 'sentence' in doc and q

In [20]:
import numpy as np

def cos_similarity(d_weights, q_weights):
    numerator = sum([d*q for d,q in zip(d_weights, q_weights)])
    d_norm = sum([d*d for d in d_weights])
    q_norm = sum([q*q for q in q_weights])
    denom = np.sqrt(d_norm * q_norm)
    return numerator/denom

In [31]:
weigher = ranking()
p = preprocessing()
def gen_snip(document=[], i_i={}, query=''):
    q_weights = weigher.relevance_ranking(query = query,
                           num_results=len(sentences[0]),
                            index=i_i.index,
                            resources=[],
                            max_freq=i_i.storage.max_frequency_terms_per_doc,
                            N=len(i_i.storage.index),
                            term_doc_matrix=i_i.doc_term_matrix_all,
                            weigh=True)
    
    q_w=ordered_q_weights = [w[1] for w in sorted(q_weights, key=lambda x: x[0])]
    
    cos_score = -1.0
    snippet = collections.deque(maxlen=2)

    for i in range(len(document)):
        s = document[i]
        s_max_freqs = weigher.get_max_frequencies(index=i_i.index, sentence_tokens=p.the_works(text=s))
        i_i.storage.max_frequency_terms_per_doc = s_max_freqs

        s_weights = weigher.relevance_ranking(query = s,
                               num_results=len(s),
                                index=i_i.index,
                                resources=[],
                                max_freq=i_i.storage.max_frequency_terms_per_doc,
                                N=len(i_i.storage.index),
                                term_doc_matrix=i_i.doc_term_matrix_all,
                                weigh=True)

        ordered_s_weights = [w[1] for w in sorted(s_weights, key=lambda x: x[0])]

        cosine_sim = cos_similarity(ordered_s_weights, q_w)

        if cosine_sim > cos_score:
            cos_score = cosine_sim
            snippet.append(s)
    return snippet

[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
resources, query

([3918, 5128, 6680, 6882, 8081, 10650, 11014],
 'state nation disput local employ the')

In [34]:
import nltk.data
import collections
from tqdm import tqdm

# id corresponds to doc's index in "resources"

def generate_snippets(ranked_res, resources, query, i_i):
    sent_split = nltk.data.load('tokenizers/punkt/english.pickle')
    json_frontend = []
    for id, weight in tqdm(res):
        doc_id = resources[id]

        # retrieve original unprocessed doc-string    
        original = i_i.storage.get(doc_id)['content']

        # get doc title
        title, *text = original.split('\n\n')
        text = ' '.join(text)

        # separate text into sentences
        sentences = sent_split.tokenize(text)

        # generate snippet
        doc_snippet = gen_snippets(document=sentences, i_i=i_i, query=query)
    #     print(doc_snippet)

        # prepare for frontend
        json_frontend.append({'title':title, 'snippet':'...'.join(doc_snippet)})
    
    return json_frontend

In [35]:
generate_snippets(res, resources, query, i_i)


  

 20%|██        | 1/5 [00:04<00:16,  4.03s/it][A

Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index
Term themselv not found in index



 40%|████      | 2/5 [00:15<00:18,  6.33s/it][A
 60%|██████    | 3/5 [00:21<00:12,  6.18s/it][A
 80%|████████  | 4/5 [00:31<00:07,  7.40s/it][A
100%|██████████| 5/5 [00:34<00:00,  5.96s/it][A

[{'title': 'Marianne Plehn',
  'snippet': 'But the field had changed so much in 20 years that Plehn took a different approach....The illustrations and photographs continued to be used for decades.'},
 {'title': 'List of Wallace and Gromit characters',
  'snippet': 'A very homely sort who doesn\'t mind the odd adventure."...Some of Wallace\'s contraptions are based on real-life inventions.'},
 {'title': 'Claudia UmpiÃ©rrez',
  'snippet': 'Claudia InÃ©s UmpiÃ©rrez RodrÃ\xadguez (born 6 January 1983) is a Uruguayan association football referee and lawyer by profession....Claudia was enthusiastic and wanted to register, but as a minor she could not.'},
 {'title': 'Meltdown (security vulnerability)',
  'snippet': 'It allows a rogue process to read all memory, even when it is not authorized to do so.'},
 {'title': 'Harold W. Parsons',
  'snippet': "It was a bargain, and later sold for much more than it cost....Their letter are now at Cambridge University, King's College Archive Centre."}]

In [28]:
json_frontend

[{'title': 'Marianne Plehn',
  'snippet': 'But the field had changed so much in 20 years that Plehn took a different approach....The illustrations and photographs continued to be used for decades.'},
 {'title': 'List of Wallace and Gromit characters',
  'snippet': 'A very homely sort who doesn\'t mind the odd adventure."...Some of Wallace\'s contraptions are based on real-life inventions.'},
 {'title': 'Claudia UmpiÃ©rrez',
  'snippet': 'Claudia InÃ©s UmpiÃ©rrez RodrÃ\xadguez (born 6 January 1983) is a Uruguayan association football referee and lawyer by profession....Claudia was enthusiastic and wanted to register, but as a minor she could not.'},
 {'title': 'Meltdown (security vulnerability)',
  'snippet': 'It allows a rogue process to read all memory, even when it is not authorized to do so.'},
 {'title': 'Harold W. Parsons',
  'snippet': "It was a bargain, and later sold for much more than it cost....Their letter are now at Cambridge University, King's College Archive Centre."}]

**(what does this number mean? Are we looking for the greatest value?)**
* -1: exactly opposite; 
* +1: identical
* 0: orthogonal