For each SELECTED result, create snippet...

# Snippet should include:
1. document title
2. The two sentences from the document with the highest cosine similarity with respect to the query q.


In [1]:
from classes.inverted_index import inverted_index
from classes.persist_index_memory import persist_index_memory
from classes.utilities import doc_utilities
from classes.preprocessing import preprocessing


# 1. load dataset and build the inverted index

In [2]:
# Read the raw document collection from disk.
# NOTE(review): the '.p' extension suggests a pickle file — confirm it comes
# from a trusted source before loading.
u = doc_utilities()
u.read_data_set(file='data/12000_docs.p')
# Storage backend handed to the inverted index below.
memory_unit = persist_index_memory()
u.process_documents_for_indexing()
# Build the inverted index over the whole collection. The last recorded run
# processed 12700 items in about one minute, so this is the slow cell.
i_i = inverted_index(memory_unit)
i_i.create_index(collection=u.get_collection_json(),
                     process_text=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gerar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gerar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
12700it [01:01, 207.76it/s]


# 2. check the size of the index created from the dataset

In [3]:
# Sanity check on the index that was just built.
print("Index size = {}".format(i_i.get_index_size()))
# This should be roughly 100k tokens (the last recorded run printed 94679).

Index size = 94679


# 3. given query, find relevant results 

In [4]:
def return_matches(CR={}, len_DC=1699999):
    """Pivot a token-first lookup result into a document-first mapping.

    Parameters
    ----------
    CR : dict
        Maps each query token to an iterable of posting objects. Every
        posting must expose ``docId`` and ``frequency`` attributes.
        The argument is only read, never mutated, so the shared default
        dict is harmless.
    len_DC : int
        Accepted for backward compatibility only. Earlier versions used it
        to pre-allocate one empty list per possible docId; entries are now
        created on demand, so the value is ignored.

    Returns
    -------
    dict
        ``{docId: [('match: <token>', frequency), ...]}`` holding only the
        documents with at least one match, with keys in ascending docId
        order. Within a document, matches keep the iteration order of ``CR``.
    """
    doc_first = {}

    for token in CR:
        label = 'match: ' + str(token)
        for val in CR[token]:
            # Create the per-document list lazily instead of pre-filling
            # 1..len_DC; this also accepts docIds outside that range.
            doc_first.setdefault(val.docId, []).append((label, val.frequency))

    # Pre-filling used to yield keys in ascending docId order; sort to keep
    # that ordering for callers that rely on it.
    return {doc_id: doc_first[doc_id] for doc_id in sorted(doc_first)}

def match_scaling(CR = {}, match_num = int):
    """Keep only the entries of ``CR`` whose value has exactly ``match_num`` items.

    Parameters
    ----------
    CR : dict
        Mapping whose values support ``len()``.
    match_num : int
        Required length of a value for its entry to be kept.
        NOTE(review): the default is the ``int`` type itself, not a number,
        so omitting this argument never equals any length and the result is
        always empty. It was presumably meant as a type annotation — confirm
        before changing it.

    Returns
    -------
    dict
        New dict with the surviving ``key: value`` pairs, in ``CR``'s order.
    """
    selected = {}
    for key, matches in CR.items():
        if len(matches) == match_num:
            selected[key] = matches
    return selected

In [5]:
# Raw user query; the odd casing and punctuation exercise the preprocessing steps.
query = "test rEd!"
p = preprocessing()
# Normalise the query step by step: strip punctuation, tokenize, drop
# stopwords, lowercase, stem.
text = p.remove_punctuation(text=query)
tokens = p.tokenize(text=text)
# NOTE(review): stopwords are removed before lowercasing; if the stopword
# check is case-sensitive, capitalised stopwords would survive — confirm
# against the preprocessing class.
tokens = p.remove_stopwords(tokens=tokens)
tokens = p.remove_capitalization(tokens=tokens)
q = p.stem(tokens=tokens)
# q is rebound from the stemmed tokens to a single space-separated string.
q = ' '.join(q)
print('Query pre processed = {}'.format(q))
# Look the processed query up in the inverted index; the result is consumed
# token-first by return_matches (token -> postings with docId / frequency).
CR = i_i.lookup_query(q)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gerar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Query pre processed = test red


In [6]:
# Pivot the token-first lookup result into docId -> [('match: <token>', frequency), ...].
# NOTE(review): this rebinds CR to a differently shaped dict, so re-running
# this cell without first re-running the lookup cell will fail.
CR = return_matches(CR)

In [7]:
# Select resources that match every token from the query. The required number
# of matches is derived from the processed query itself, so the filter stays
# correct when the query has more or fewer than two tokens.
CR = match_scaling(CR, len(q.split()))

In [8]:
# Inspect the surviving documents: docId -> one ('match: <token>', frequency) pair per query token.
CR

{3918: [('match: test', 3), ('match: red', 3)],
 5128: [('match: test', 1), ('match: red', 2)],
 6680: [('match: test', 1), ('match: red', 1)],
 6882: [('match: test', 1), ('match: red', 1)],
 8081: [('match: test', 4), ('match: red', 1)],
 10650: [('match: test', 1), ('match: red', 2)],
 11014: [('match: test', 3), ('match: red', 1)]}

# 4. rank relevant results

In [9]:
from classes.ranking import ranking

r_ranking = ranking()
# docIds of the candidate documents, in CR's key order.
resources = list(CR.keys())

In [10]:
# Candidate docIds that will be ranked.
resources

[3918, 5128, 6680, 6882, 8081, 10650, 11014]

In [11]:
# Same content as CR, viewed as (docId, matches) pairs.
CR.items()

dict_items([(3918, [('match: test', 3), ('match: red', 3)]), (5128, [('match: test', 1), ('match: red', 2)]), (6680, [('match: test', 1), ('match: red', 1)]), (6882, [('match: test', 1), ('match: red', 1)]), (8081, [('match: test', 4), ('match: red', 1)]), (10650, [('match: test', 1), ('match: red', 2)]), (11014, [('match: test', 3), ('match: red', 1)])])

In [12]:
# Split the preprocessed query string back into its individual tokens.
processed_query_tokens = q.split()
processed_query_tokens

['test', 'red']

In [13]:
# This step is necessary for idf: it builds the term-document matrix that is
# later passed to the ranking call as i_i.doc_term_matrix_all.
i_i.create_term_document_matrix()

In [14]:
# Highest per-token match frequency for each candidate document, as a list
# in the same order as CR's keys (and therefore as `resources`).
max_freq = r_ranking.get_max_frequencies(index=CR) 

100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<?, ?it/s]


In [15]:
# Stash the per-document maxima on the index storage so the ranking call can read them from there.
i_i.storage.max_frequency_terms_per_doc = max_freq

In [16]:
# Confirm the stored values: one entry per candidate document.
i_i.storage.max_frequency_terms_per_doc

[3, 2, 1, 1, 4, 2, 3]

In [17]:
# Repeated display of the candidate docIds, for comparison with the maxima listed just before.
resources

[3918, 5128, 6680, 6882, 8081, 10650, 11014]

In [18]:
# Rank the candidate documents against the same processed query `q` that was
# used to select them. `resources` and the max-frequency list were both
# derived from `q`, so ranking against any other query string would score
# the candidates on unrelated terms.
# NOTE(review): N is taken from len(i_i.storage.index); confirm that this is
# the number of documents the idf computation expects, not the number of terms.
res = r_ranking.relevance_ranking(query=q,
                                  num_results=5,
                                  index=i_i.index,
                                  resources=resources,
                                  max_freq=i_i.storage.max_frequency_terms_per_doc,
                                  N=len(i_i.storage.index),
                                  term_doc_matrix=i_i.doc_term_matrix_all)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gerar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Id =  0
Id =  1
Id =  2
Id =  3
Id =  4
Id =  5
Id =  6


# calculate cosine similarity between every 'sentence' in doc and q

## retrieve original, unprocessed doc-string

In [22]:
# Fetch the stored record for doc id 1 and keep its 'content' field.
# NOTE(review): the id is hard-coded and is not one of the ids in `resources`
# — presumably a stand-in while prototyping the snippet step; confirm.
original = i_i.storage.get(1)['content']

In [23]:
# Show the raw document text; paragraphs are separated by blank lines ('\n\n').
original

"Moroccoâ\x80\x93Saudi Arabia relations\n\nMoroccanâ\x80\x93Saudi Arabian relations refers to the current and historical relations between Morocco and Saudi Arabia. Morocco has an embassy in Riyadh and Saudi Arabia has an embassy in Rabat.\n\nBoth two nations have a long and traditional friendly relations based on many historical commons. Both Morocco and Saudi Arabia are two monarch states ruled by respective families, the Alaouites and the Al-Sauds, adhere to Sunni Islam, fear of growing Iranian and Shiite influence and their relationship is mostly described as excellent.\n\nMorocco and Saudi Arabia have together taken steps to curb Iranian influence in the Arab world, although Morocco has a moderate approach to Iran while Saudi Arabia is more cautious and hostile of Iran.\n\nSaudi Arabia has recently supported Morocco over the dispute of Western Sahara and doesn't recognize the legitimacy of Sahrawi Arab Democratic Republic, through the Arab League. Saudi Arabia, on the other side, 

## get doc title

In [24]:
# Starred unpacking: the first '\n\n'-separated chunk becomes the title and
# every remaining chunk is collected into a list.
# https://stackoverflow.com/questions/31426095/assign-multiple-values-of-a-list
title, *body_paragraphs = original.split('\n\n')

# Flatten the remaining paragraphs into one space-separated string.
text = ' '.join(body_paragraphs)

In [25]:
# The document title, i.e. the text before the first blank line.
title

'Moroccoâ\x80\x93Saudi Arabia relations'

In [26]:
import nltk.data

# Load the English punkt sentence tokenizer and split the document body into sentences.
# NOTE(review): this needs the punkt resource to be installed locally, and
# the pickle path may differ between NLTK versions — confirm on the target
# environment.
sent_split = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = sent_split.tokenize(text)
# Peek at the first sentence to check the split.
sentences[0]

'Moroccanâ\x80\x93Saudi Arabian relations refers to the current and historical relations between Morocco and Saudi Arabia.'