In [5]:
import logging
from classes.utilities import doc_utilities
from classes.persist_index_memory import persist_index_memory
from classes.inverted_index import inverted_index
from classes.ranking import ranking
from classes.preprocessing import preprocessing
from classes.snip import snip
from classes.match import match
import nltk

logging.basicConfig(level=logging.DEBUG, format='%(filename)s %(levelname)s: %(asctime)s: %(message)s')
logger = logging.getLogger('main')
logger.info('Executing indexing module')
logger.info('Reading file')

# LOAD DATASET
u = doc_utilities()
u.read_data_set(file='data/12000_docs.p')
memory_unit = persist_index_memory()
u.process_documents_for_indexing()
i_i = inverted_index(memory_unit)

# CREATE INDEX FROM DATASET
i_i.create_index(collection=u.get_collection_json(),
                     process_text=True)
i_i.create_term_document_matrix()

# GIVEN QUERY FROM FRONT-END, FIND RELEVANT RESULTS
query = 'When is shark week?' # user input
print('input:',query)
matcher = match()
q = preprocessing().the_works(query)
CR = i_i.lookup_query(q)
CR = matcher.boolean(CR)
# added in case not every token matches
doctoken_matchnums =[len(i) for i in CR.values()]
scaler = max(doctoken_matchnums)
CR = matcher.scale(CR,scaler)

# RANK RELEVANT RESULTS
r_ranking = ranking()
resources = list(CR.keys())
max_freq = r_ranking.get_max_frequencies(index=CR) # , num_docs=len(i_i.storage.index)
# Now save this into the persisted memory object within the index
i_i.storage.max_frequency_terms_per_doc = max_freq
res = r_ranking.relevance_ranking(query = query,
                           num_results=5,
                            index=i_i.index,
                            resources=resources,
                            max_freq=i_i.storage.max_frequency_terms_per_doc,
                            N=len(i_i.storage.index),
                            term_doc_matrix=i_i.doc_term_matrix_all)

<ipython-input-5-db4d6e435bb1> INFO: 2019-10-12 17:03:25,676: Executing indexing module
<ipython-input-5-db4d6e435bb1> INFO: 2019-10-12 17:03:25,676: Reading file
persist_index_memory.py INFO: 2019-10-12 17:03:25,698: Instantiating storage object
utilities.py INFO: 2019-10-12 17:03:25,698: Converting document to be ready to be indexed
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
inverted_index.py INFO: 2019-10-12 17:03:25,954: Instantiating inverted index object
inverted_index.py DEBUG: 2019-10-12 17:03:25,955: Collection length = 12700
12700it [00:39, 323.55it/s]


input: When is shark week?


[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 314/314 [00:00<00:00, 600005.22it/s]


In [None]:
# GENERATE RANKED JSON_SNIPPETS FOR FRONT-END
# snipper = snip(r_ranking)
# json = snipper.get_snippets(res, resources=resources, query=query, i_i=i_i)
print('output:',json)

In [6]:
res

[(101, 120.96872540393481),
 (231, 120.96872540393481),
 (285, 120.96872540393481),
 (289, 120.96872540393481),
 (33, 60.484362701967406)]

In [10]:
resources[101]

4126

In [13]:
sent_split = nltk.data.load('tokenizers/punkt/english.pickle')
json_frontend = []

for id, weight in tqdm(res):
    doc_id = resources[id]

    # retrieve original unprocessed doc-string    
    original = i_i.storage.get(doc_id)['content']

    # get doc title
    title, *text = original.split('\n\n')
    text = ' '.join(text)

    # separate text into sentences
    sentences = sent_split.tokenize(text)
    # generate snippet
#     doc_snippet = self.gen_snip(document=sentences, i_i=i_i, query=query)

100%|██████████| 5/5 [00:00<00:00, 377.69it/s]


In [14]:
sentences

['"One Art" is a poem by American poet Elizabeth Bishop.',
 'The poem was included in her collection "Geography III".',
 "It is also the name of a collection of Bishop's letters.",
 'Bishop wrote seventeen drafts of the poem, with titles including "How to Lose Things," "The Gift of Losing Things," and "The Art of Losing Things".',
 'By the fifteenth draft, Bishop had chosen "One Art" as her title.',
 'The poem was written over the course of two weeks, an unusually short time for Bishop.',
 'Some of the piece is adapted from a longer poem, "Elegy", that Bishop never completed or published.',
 'Bishop\'s life was marked by loss and instability, which is reflected in many of the poems of "Geography III".',
 '"One Art" is narrated by a speaker who details losing small items, which gradually become more significant, moving, for example, from the misplacement of "door keys" to the loss of "two cities" where the speaker presumably lived.',
 'The poem is a villanelle, an originally French poet

In [94]:
def gen_snip(document=[], i_i={}, query=''):
    q = preproc.the_works(query)
    q_max_freqs = r_ranking.get_max_frequencies(index=i_i.index, sentence_tokens=q)
    i_i.storage.max_frequency_terms_per_doc = q_max_freqs
    
    print(q_max_freqs)
        
    # derive weights for tokens in query
    q_weights=relevance_ranking(query = query,
                           num_results=len(q),
                            index=i_i.index,
                            resources=[],
                            max_freq=i_i.storage.max_frequency_terms_per_doc,
                            N=len(i_i.storage.index),
                            term_doc_matrix=i_i.doc_term_matrix_all,
                            weigh=True,
                            qu_we=True)     
    
    q_w = [w[1] for w in sorted(q_weights, key=lambda x: x[0])]

    print('num results for q_weights', query)
    print('query weights generated', q_weights)
    cos_score = -1.0

In [99]:
gen_snip(document=sentences, i_i=i_i, query=query)

[7, 10]
num results for q_weights When is shark week?
query weights generated [(0, 17.2812464862764), (1, 12.096872540393482)]


In [98]:
def relevance_ranking(query='', num_results=5, index=[], resources=[], max_freq=[], N=0, term_doc_matrix=[],
                      weigh=False, qu_we=False):
    # Now we will preprocess and tokenize our query.
    q=preproc.the_works(text=query)
    results={}

    if weigh: resources=q

    # We will iterate through all the documents in the resources list
    for id, val in enumerate(resources):
        TF=0
        IDF=0
        #Iterate through query tokens
#             print('Id = ', id, '-->', val)
        for w in q:
            if w not in index:
                print('Term {} not found in index'.format(w))
                continue
            if not qu_we: freq = r_ranking.get_term_frequency(entries=index[w], doc_id=id)
            else: freq = query.count(w) 
                
            max_d = max_freq[id] #For base 0 reason
            # Now calculate TF
            if max_d==0: TF=0
            else: TF=TF+freq/max_d
            # Now calculate IDF
            n_w=r_ranking.n_w(term_doc_matrix=term_doc_matrix, term=w)
            IDF = IDF+math.log2(N/n_w)
        # Now let's join the TF-IDF
        results[id] = TF*IDF
    sorted_result = sorted(results.items(), key=operator.itemgetter(1), reverse = True)
    return sorted_result[:num_results]

In [69]:
weigh = True
q=preproc.the_works(text=query)
results={}

if weigh: resources=q

In [79]:
i_i.index[w]

[{'docId': 289, 'frequency': 2},
 {'docId': 2574, 'frequency': 1},
 {'docId': 3095, 'frequency': 1},
 {'docId': 3556, 'frequency': 7},
 {'docId': 3860, 'frequency': 1},
 {'docId': 5138, 'frequency': 1},
 {'docId': 7047, 'frequency': 1},
 {'docId': 7785, 'frequency': 1},
 {'docId': 9185, 'frequency': 4},
 {'docId': 9334, 'frequency': 1},
 {'docId': 9615, 'frequency': 1},
 {'docId': 9749, 'frequency': 1},
 {'docId': 10161, 'frequency': 2},
 {'docId': 10955, 'frequency': 1}]

In [72]:
# We will iterate through all the documents in the resources list
for id, val in enumerate(resources):
    TF=0
    IDF=0
    #Iterate through query tokens
    for w in q:
        print('word', w)
        print(w, 'in index', w in i_i.index)
        print('entries',i_i.index[w])
        if w not in i_i.index:
            print('Term {} not found in index'.format(w))
            continue
        if not qu_we: freq = r_ranking.get_term_frequency(entries=i_i.index[w], doc_id=id)
        else: query.count(w)
            
        print('freq', freq)
        
        max_d = max_freq[id] #For base 0 reason
        
        # Now calculate TF
        if max_d==0: TF=0
        else: TF=TF+freq/max_d
            
        print('TF', TF)
        
        # Now calculate IDF
        n_w=r_ranking.n_w(term_doc_matrix=term_doc_matrix, term=w)
        IDF = IDF+math.log2(N/n_w)
    
        print('IDF', IDF)
        
    # Now let's join the TF-IDF
    results[id] = TF*IDF
sorted_result = sorted(results.items(), key=operator.itemgetter(1), reverse = True)

word shark
shark in index True
entries [{'docId': 289, 'frequency': 2}, {'docId': 2574, 'frequency': 1}, {'docId': 3095, 'frequency': 1}, {'docId': 3556, 'frequency': 7}, {'docId': 3860, 'frequency': 1}, {'docId': 5138, 'frequency': 1}, {'docId': 7047, 'frequency': 1}, {'docId': 7785, 'frequency': 1}, {'docId': 9185, 'frequency': 4}, {'docId': 9334, 'frequency': 1}, {'docId': 9615, 'frequency': 1}, {'docId': 9749, 'frequency': 1}, {'docId': 10161, 'frequency': 2}, {'docId': 10955, 'frequency': 1}]
freq 0
TF 0.0


NameError: name 'term_doc_matrix' is not defined

In [48]:
#     # save only the 2 closest sentences
#     snippet = collections.deque(maxlen=2)

#     # derive weights for sentences in document...
#     for i in range(len(document)):
#         s = document[i]
#         s_max_freqs = r_ranking.get_max_frequencies(index=i_i.index, sentence_tokens=preproc.the_works(text=s))
#         i_i.storage.max_frequency_terms_per_doc = s_max_freqs
#         #print('num_results for s_weights',s)

#         s_weights = r_ranking.relevance_ranking(query = s,
#                                num_results=len(s),
#                                 index=i_i.index,
#                                 resources=[],
#                                 max_freq=i_i.storage.max_frequency_terms_per_doc,
#                                 N=len(i_i.storage.index),
#                                 term_doc_matrix=i_i.doc_term_matrix_all,
#                                 weigh=True)

#         ordered_s_weights = [w[1] for w in sorted(s_weights, key=lambda x: x[0])]
#         #print('weighted words',ordered_s_weights)

#         #print('doc weights generated', ordered_s_weights)
#         cosine_sim = cos_similarity(ordered_s_weights, q_w)

#         print('cosine score', cosine_sim)

#         #print('cosine similarity calculated', cosine_sim)
#         if cosine_sim > cos_score:
#             cos_score = cosine_sim
#             snippet.append(s)
#     return snippet

In [26]:
preproc = preprocessing()

[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
from collections import Counter
import math
import logging
from classes.preprocessing import preprocessing
import operator
from classes.inverted_index import *
from classes.ranking import ranking
from tqdm import tqdm
import collections
import nltk

In [2]:
def cos_similarity(d_weights, q_weights):
    """
    calculate cosine similarity between 2 vectors (lists of numbers)
    For snippet generation
    """
    numerator = sum([d*q for d,q in zip(d_weights, q_weights)])
    d_norm = sum([d*d for d in d_weights])
    q_norm = sum([q*q for q in q_weights])
    denom = np.sqrt(d_norm * q_norm)
    return numerator/denom

In [3]:
def get_snippets(ranked_results, resources=[], query="", i_i={}):

    #print('get_snippets called')
    sent_split = nltk.data.load('tokenizers/punkt/english.pickle')
    json_frontend = []

    for id, weight in tqdm(ranked_results):
     #   print('id', id)
        doc_id = resources[id]
      #  print('doc id found')
        # retrieve original unprocessed doc-string    
        original = i_i.storage.get(doc_id)['content']
       # print('doc retrieved from i_i\'s stroage')
        # get doc title
        title, *text = original.split('\n\n')
        text = ' '.join(text)

        # separate text into sentences
        sentences = sent_split.tokenize(text)

     #   print('before gen snip call', id)
        # generate snippet
        doc_snippet = self.gen_snip(document=sentences, i_i=i_i, query=query)

        # prepare for frontend
        json_frontend.append({'title':title,     'snippet':'...'.join(doc_snippet)})

      #  print('title', title, 'snippet','...'.join(doc_snippet))

    return json_frontend

In [None]:
class snip:
    """
    Class to handle generating snippets operations.
    """
    def __init__(self, ranker=''):
        self.preproc = preprocessing()
        self.ranker = ranker
        pass