In [1]:
import pandas as pd
import numpy as np
import math
from collections import Counter

import xml.etree.ElementTree as ET

import logging

from query_utils import *

In [2]:
get_grades('corpus.qrel')


{'202': {'clueweb12-0904wb-87-00462': '0',
  'clueweb12-0900tw-23-04411': '0',
  'clueweb12-0900tw-24-00475': '0',
  'clueweb12-0502wb-85-19368': '0',
  'clueweb12-1415wb-40-03869': '0',
  'clueweb12-1406wb-39-07752': '0',
  'clueweb12-0405wb-59-02836': '0',
  'clueweb12-0204wb-17-00480': '0',
  'clueweb12-0011wb-73-29217': '0',
  'clueweb12-1900tw-27-05755': '0',
  'clueweb12-1900tw-32-20698': '0',
  'clueweb12-1116wb-73-22091': '0',
  'clueweb12-0407wb-28-19842': '0',
  'clueweb12-0002wb-14-02885': '0',
  'clueweb12-0009wb-43-04224': '0',
  'clueweb12-1206wb-33-15528': '0',
  'clueweb12-0102wb-86-24545': '0',
  'clueweb12-1807wb-86-09003': '0',
  'clueweb12-1000wb-85-18667': '0',
  'clueweb12-0800tw-57-00770': '0',
  'clueweb12-1013wb-15-21838': '0',
  'clueweb12-0511wb-25-03910': '0',
  'clueweb12-1602wb-36-11840': '0',
  'clueweb12-0013wb-55-03388': '0',
  'clueweb12-1300tw-73-05391': '0',
  'clueweb12-0206wb-76-27846': '-2',
  'clueweb12-1100tw-35-08357': '0',
  'clueweb12-1007wb-

In [3]:

topics=parse_topics('topics.xml')
topics

{'202': {'type': 'uss carl vinson',
  'desc': 'Find the home page of the USS Carl Vinson (CVN70) carrier.'},
 '214': {'type': 'capital gains tax rate',
  'desc': 'What does the US capital tax rate consist of and how is it broken down?'},
 '216': {'type': 'nicolas cage movies',
  'desc': 'What movies has Nicolas Cage appeared in?'},
 '221': {'type': 'electoral college 2008 results',
  'desc': 'What were the results of the electoral college for the 2008 US presidential race?'},
 '227': {'type': 'i will survive lyrics',
  'desc': 'Find the lyrics to the song "I Will Survive".'},
 '230': {'type': "world's biggest dog",
  'desc': "What is the world's biggest dog?"},
 '234': {'type': 'dark chocolate health benefits',
  'desc': 'What are the health benefits associated with eating dark chocolate?'},
 '243': {'type': 'afghanistan flag',
  'desc': 'Find pictures of the Afghanistan flag.'},
 '246': {'type': 'civil war battles in South Carolina',
  'desc': 'Which civil war battles were fought in S

In [2]:
inv_dict= lambda my_map : {v: k for k, v in my_map.items()}

docids=inv_dict(read_docids('docids.txt'))
termids=inv_dict(read_termids('termids.txt'))
term_info=read_terminfo('term_info.txt')

seek_inv_index_curried=lambda offset, doc_id: seek_inv_index(offset, doc_id, 'term_index.txt')

get_tf_curried= lambda word, doc_id: get_tf(word, doc_id, seek_inv_index_curried, termids, term_info)
get_total_documents_curried = lambda: get_total_documents('forwardindex.txt')
get_dfi_curried=lambda word: get_dfi(word, termids, term_info)
process_word_curried= lambda word: process_word(word, get_stop_words('stoplist.txt'))

In [48]:
def get_tfidf_from_corpus(word, doc_id, get_tf, get_tfidf):
    '''
    word: str
    doc_id: int
    get_tf: function(word, doc_id) returns tf
    get_tfidf: computes tfidf function(tf, word)
    '''
    tf=get_tf(word, doc_id)
    
    return get_tfidf(tf,word)

def get_tfidf(tf,word, get_total_documents, get_dfi):
    '''
    get_total_documents: function() return total D in corpus
    get_dfi: function(word: str) returns dfi of word
    '''
    
    D=get_total_documents()
    dfi=int(get_dfi(word))
    
    if tf==0:
        return 0
    
    return (
        (math.log(tf) + 1)*
        math.log(D/dfi)
    )
    
def get_cosine(q, d):
    return np.sum(np.multiply(q, d))/(np.linalg.norm(q)*np.linalg.norm(d))

def get_tfid_score(query, doc_id ,get_tfidf_from_corpus, get_tfidf ):
    '''
    query: array of str to search
    get_tfidf_from_corpus: function(word, doc_id)
    get_tfidf: function(tf)
    '''
    query=Counter(query)
    d_vec=np.zeros(( len(query) ,1 ))
    q_vec=np.zeros(( len(query) ,1 ))
    
    for index, q in enumerate(query):
        d_vec[index]=get_tfidf_from_corpus(q, doc_id)
        q_vec[index]=get_tfidf(query[q],q)
        
    logging.debug(f'd vector {d_vec}')
    logging.debug(f'q vector {q_vec}')
    
    return get_cosine(d_vec, q_vec)
    
get_tfidf_curried= lambda tf,word: get_tfidf(tf, word, get_total_documents_curried, get_dfi_curried)
get_tfidf_from_corpus_curried=lambda word, doc_id: get_tfidf_from_corpus(word, doc_id, get_tf_curried, get_tfidf_curried)
get_tfid_score_curried= lambda query, doc_id: get_tfid_score(query, doc_id, get_tfidf_from_corpus_curried, get_tfidf_curried)

get_tfid_score_curried(['test', 'word'], 3680)



0.7145115408789816

In [78]:
def run_query(query, doc_ids, score_function, process_word):
    query=process_word(query)
    print(query)
    
    for doc in doc_ids:
        print(doc)
        print(score_function(query, doc))
        
logging.basicConfig()
logging.getLogger().setLevel(logging.ERROR)

docids_q=get_relevant_docids('test hello', termids, process_word_curried, seek_inv_index_me_curried)
#print(docids_q)

run_query('test hello',docids_q, get_tfid_score_curried, process_word_curried)


['test', 'hello']
2595
0.6265259841815551
2734
0.7794005331954386
2959
0.7794005331954386
2352
0.6265259841815551
3912
0.9854902499284404
2312
0.6265259841815551
2710
0.7794005331954386
3391
0.6265259841815551
3390
0.6265259841815551
2507
0.6265259841815551
3527
0.6265259841815551
2195
0.6265259841815551
2699
0.7794005331954386
2884
0.7794005331954384
2728
0.7794005331954386
3003
0.7794005331954384
2695
0.7794005331954386
3044
0.7794005331954386
1954
0.6265259841815551
3242
0.7794005331954384
2661
0.6265259841815551
3325
0.6265259841815551
3382
0.6265259841815551
3292
0.7794005331954384
2700
0.7794005331954386
3288
0.6265259841815551
3370
0.6265259841815551
2662
0.7794005331954384
2738
0.7794005331954386
2713
0.7794005331954386
3845
0.7794005331954386
1924
0.6265259841815551
2467
0.6265259841815551
3821
0.6265259841815551
2845
0.7794005331954386
2712
0.7794005331954386
3378
0.6265259841815551
2508
0.6265259841815551
1900
0.626525984181555
2401
0.6265259841815551
2239
0.6265259841815551

In [26]:
logging.basicConfig()
logging.getLogger().setLevel(logging.ERROR)
logging.debug('message')

offset=term_info[termids['test']]['offset']
offset
seek_inv_index(offset,3680, 'term_index.txt')

2

In [73]:
# return relevant docids
def seek_inv_index_me(offset, doc_id, path, return_index=False):
    doc_id=str(doc_id)
    found=False
    with open(path, 'r') as f:
        f.seek(int(offset))
        line = f.readline().strip().split('\t')
        line = [x for x in line[1:] if x != '']

        logging.debug(f'inv index {line}')
        if return_index:
            return line
        for l in line:
            if l.split(':')[0] == doc_id:
                found=True
                return int(l.split(':')[1])
        if not found:
            logging.debug('docid not found')
            return 0
    f.close()
def get_relevant_docids(query, termids, process_word, seek_inv_index_me):
    '''
    query in preprocessed form str[]
    termids: word to id dict
    process_word: function(query)
    seek_inv_index: function(offset, 0, return_index)
    '''
    query=process_word(query)
    rel_doc=set()
    for q in query:
        tid=termids[q]
        temp=seek_inv_index_me(term_info[tid]['offset'],0, return_index=True)
        
        temp=[x.split(':')[0] for x in temp]
        rel_doc=rel_doc.union(set(temp))
    return list(rel_doc)
        
seek_inv_index_me_curried=lambda offset, doc_id, return_index=True: seek_inv_index_me(offset, doc_id, 'term_index.txt', return_index)
get_relevant_docids('test doc', termids, process_word_curried, seek_inv_index_me_curried)
    

['2595',
 '2352',
 '3087',
 '3912',
 '2312',
 '2050',
 '3391',
 '3390',
 '2507',
 '2581',
 '2012',
 '1076',
 '3527',
 '2195',
 '1954',
 '2661',
 '3325',
 '3382',
 '1866',
 '3288',
 '2620',
 '3370',
 '1020',
 '1924',
 '2467',
 '3821',
 '2845',
 '1167',
 '3378',
 '2508',
 '1367',
 '1900',
 '2401',
 '2239',
 '2277',
 '2453',
 '3304',
 '1895',
 '1129',
 '2254',
 '3790',
 '1435',
 '1989',
 '3389',
 '3782',
 '2430',
 '3359',
 '1937',
 '3689',
 '1088',
 '3062',
 '1953',
 '3388',
 '2830',
 '3431',
 '3385',
 '3023',
 '1892',
 '1995',
 '2139',
 '2046',
 '3785',
 '2680',
 '3308',
 '3971',
 '3377',
 '3780',
 '3355',
 '3196',
 '1865',
 '2482',
 '3383',
 '1769',
 '2824',
 '3677',
 '1555',
 '2403',
 '2635',
 '1054',
 '2843',
 '2385',
 '2580',
 '3269',
 '3779',
 '2098',
 '1285',
 '1687',
 '3379',
 '1141',
 '3613',
 '3301',
 '2104',
 '1371',
 '1151',
 '3673',
 '1890',
 '3234',
 '2118',
 '3386',
 '3092',
 '3521',
 '2392',
 '3041',
 '2733',
 '3240',
 '2862',
 '3420',
 '2402',
 '2764',
 '2044',
 '3692',
 