In [1]:
%load_ext autoreload
%autoreload 2
import itertools
import numpy as np
import gzip
import io
import math
import json
import time
from collections import defaultdict
import operator
import tempfile

## Cartesian product

In [2]:
# Every length-6 tuple over the values 0..5 (6**6 = 46656 tuples), materialized as a list.
all_combi = list(itertools.product(range(6), repeat=6))

In [51]:
# Sanity check: number of generated tuples and one sample tuple.
print(len(all_combi))
print(all_combi[20000])

%store all_combi

46656
(2, 3, 2, 3, 3, 2)
Stored 'all_combi' (list)


In [4]:
def pairwise(iterable):
    """Return an iterator of overlapping neighbours: s -> (s0, s1), (s1, s2), (s2, s3), ...

    Inputs with fewer than two elements produce no pairs.
    """
    current, following = itertools.tee(iterable)
    # Advance the second copy by one so it runs one element ahead of the first.
    next(following, None)
    return zip(current, following)

## Metrics

In [5]:
def idf(N, df):
    """Return log((N - df + 0.5) / (df + 0.5)).

    N  -- first count in the formula (called with the collection size in this notebook)
    df -- second count in the formula (called with a term's document frequency)
    """
    without_term = N - df + 0.5
    with_term = df + 0.5
    return math.log(without_term / with_term)

In [7]:
def weighted_tf_with_postings(weights, positions, doc_len):
    """Count term positions per position bucket and return the weighted sum of the counts.

    Buckets (by position): [0, 100), [100, 200), [200, 400), [400, 600),
    [600, 1000) and [1000, inf). Negative positions fall in no bucket and are
    ignored.

    weights   -- one weight per bucket (six values); paired positionally with the counts
    positions -- iterable of numeric term positions
    doc_len   -- accepted for interface compatibility; not used by the computation

    Returns sum(weight_i * count_i) over the buckets.
    """
    steps = [0, 100, 200, 400, 600, 1000]
    tfs = np.zeros(len(steps))
    for pos in positions:
        # Index of the last step that is <= pos; -1 means pos is below the first step.
        # side="right" puts a position equal to a boundary into the bucket that
        # starts there, so pos == 1000 lands in the final open-ended bucket
        # instead of being dropped.
        bucket = int(np.searchsorted(steps, pos, side="right")) - 1
        if bucket >= 0:
            tfs[bucket] += 1
    return sum([a * b for a, b in zip(weights, tfs)])

In [8]:
def weighted_tf(weights, tfs):
    """Return the sum of weight * tf over positionally paired items.

    Pairing stops at the shorter of the two inputs; empty input gives 0.
    """
    total = 0
    for weight, count in zip(weights, tfs):
        total = total + weight * count
    return total

In [12]:
# Quick check: only the entries with weight 1 contribute (1*2 + 1*4 + 1*0 = 6).
print(weighted_tf((0, 0, 1, 1, 0, 1), [0, 0, 2, 4, 0, 0]))

6


In [13]:
def bm25(tf, df, doclen, N, avg_doclen, k1=1.2, b=0.75):
    """Score one term with idf(N, df) times a length-normalized, saturating tf factor.

    tf         -- term frequency (here: the weighted tf produced by weighted_tf)
    df         -- passed to idf() together with N
    doclen     -- document length, compared against avg_doclen
    avg_doclen -- average document length
    k1, b      -- saturation and length-normalization parameters
    """
    term_idf = idf(N, df)
    length_norm = 1 - b + b * (doclen / avg_doclen)
    saturation = (tf * (k1 + 1)) / (tf + k1 * length_norm)
    return term_idf * saturation

In [14]:
## From Terrier
# double K = this.k_1 * (1.0D - this.b + this.b * docLength / this.averageDocumentLength);
# return WeightingModelLibrary.log((this.numberOfDocuments - this.documentFrequency + 0.5D)
#         / (this.documentFrequency + 0.5D)) * ((this.k_1 + 1.0D) * tf / (K + tf)) *
#         ((this.k_3 + 1.0D) * this.keyFrequency / (this.k_3 + this.keyFrequency));

def bm25Terrier(tf, df, doclen, N, avg_doclen, k1=1.2, k3=8, b=0.75, key_frequency=1):
    """BM25 following the Terrier formula quoted in the comment above this function.

    tf            -- term frequency in the document
    df            -- document frequency of the term
    doclen        -- document length
    N             -- number of documents in the collection
    avg_doclen    -- average document length
    k1, k3, b     -- BM25 parameters
    key_frequency -- frequency of the term in the query; the default of 1
                     makes the query-side factor (k3 + 1) / (k3 + 1)

    Returns idf * document-side tf factor * query-side key-frequency factor.
    """
    K = k1 * (1.0 - b + b * doclen / avg_doclen)
    idf_part = math.log((N - df + 0.5) / (df + 0.5))
    tf_part = (k1 + 1.0) * tf / (K + tf)
    query_part = (k3 + 1.0) * key_frequency / (k3 + key_frequency)
    return idf_part * tf_part * query_part

#bm25terrier = bm25Terrier(new_tf, df_list[i], doclen, N, AVG_LEN)

## Read file with info

## 1 query - all combinations

In [50]:
# Read doc txt: TAB separated
# qid,
# N,
# Arrays.toString(documentFrequencies),
# avgDocLen,
# docid,
# docno,
# doclen,
# tf_q1, tf_q2, tf_q3, tf_q4, tf_q5, tf_q6

# Collection statistics handed to bm25() for every document. They are fixed
# here rather than taken from the N / avgDocLen columns of the file.
N = 50220423
AVG_LEN = 963.90334


# Accumulators for the query currently being read: one score row and one
# docno per document line.
bm25Matrix_aslist = list()
docno_aslist = list()
# Finished queries: qid -> (documents x weight combinations) score matrix,
# and qid -> docnos in the same row order.
query2BM25Matrix = dict()
query2docno = dict()

start = time.time()
print("Starting", start)

counter = 0
# qid of the query being accumulated; taken from the first line read so no
# empty matrix is stored under a hard-coded id the file may not start with.
current_id = None
with open("/home/muntean/terrier-passage/tfs-per-cluster/all-matches-fields-tfs-cluster-7.txt", "r") as inputFile:
    for line in inputFile:

        # Run limiter: stop when the 10th line is reached, so at most 9
        # document lines are scored.
        counter += 1
        if counter % 10 == 0:
            break

        data = line.rstrip("\n").split("\t")
        qid = int(data[0])
        df_list = json.loads(data[2])
        docid = int(data[4])
        docno = data[5]
        doclen = int(data[6])
        # One row of per-field term frequencies for each query term.
        tf_list = np.array(json.loads(data[7]))

        if current_id is None:
            current_id = qid
        elif qid != current_id:
            # A new query starts: store what was accumulated for the previous one.
            query2BM25Matrix[current_id] = np.array(bm25Matrix_aslist)
            query2docno[current_id] = docno_aslist

            bm25Matrix_aslist = list()
            docno_aslist = list()

            current_id = qid

        # Score this document once per weight combination: for each query term
        # collapse its per-field tfs with the combination's weights, then sum
        # the per-term BM25 scores.
        bm25Array = np.zeros(len(all_combi))
        for combo_index, combo in enumerate(all_combi):
            doc_bm25 = 0
            for i, term_tf_list in enumerate(tf_list):
                new_tf = weighted_tf(combo, term_tf_list)
                bm25classic = bm25(new_tf, df_list[i], doclen, N, AVG_LEN)
                doc_bm25 += bm25classic
            bm25Array[combo_index] = doc_bm25

        bm25Matrix_aslist.append(bm25Array)
        docno_aslist.append(docno)

    # Store the last query, unless no document line was read at all.
    if current_id is not None:
        query2BM25Matrix[current_id] = np.array(bm25Matrix_aslist)
        query2docno[current_id] = docno_aslist


print("Finished, ", (time.time() - start))  # elapsed seconds (divide by 3600 for hours)

Starting 1513776048.8882127
Finished,  28.063118934631348


In [16]:
# Shape of one query's score matrix: (documents, weight combinations).
print(query2BM25Matrix[195].shape)
# docno_aslist is reset whenever the qid changes, so this is the document count of the last query read.
print(len(docno_aslist))

(10000, 5)
10000


In [17]:
# Query ids for which a score matrix was stored.
query2BM25Matrix.keys()


dict_keys([136, 195])

In [18]:
# Displays every stored per-query score matrix.
# NOTE(review): this dumps whole arrays into the cell output; a per-query shape summary would be lighter.
query2BM25Matrix

{136: array([[  0.        ,   0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ,   0.        ,   0.        ],
        ..., 
        [  0.        ,  10.40001438,  12.32755011,  13.15527257,
          13.61647204],
        [  0.        ,   0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ,   0.        ,   0.        ]]),
 195: array([[  0.        ,   0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   8.86346466,  12.7403077 ,  14.91487623,
          16.30650737],
        ..., 
        [  0.        ,   0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ,   0.       

In [19]:
# Displays the docno list of every query (same row order as the score matrices).
# NOTE(review): this dumps every docno into the cell output; consider showing only a slice.
query2docno

{136: ['clueweb09-en0000-00-03340',
  'clueweb09-en0000-00-04952',
  'clueweb09-en0000-00-05824',
  'clueweb09-en0000-00-25112',
  'clueweb09-en0000-00-26973',
  'clueweb09-en0000-01-06049',
  'clueweb09-en0000-02-07838',
  'clueweb09-en0000-02-10273',
  'clueweb09-en0000-02-26555',
  'clueweb09-en0000-02-26638',
  'clueweb09-en0000-02-27578',
  'clueweb09-en0000-03-02648',
  'clueweb09-en0000-03-02662',
  'clueweb09-en0000-03-11221',
  'clueweb09-en0000-03-11240',
  'clueweb09-en0000-03-11273',
  'clueweb09-en0000-03-11278',
  'clueweb09-en0000-03-16836',
  'clueweb09-en0000-04-26972',
  'clueweb09-en0000-04-26975',
  'clueweb09-en0000-04-26984',
  'clueweb09-en0000-04-28201',
  'clueweb09-en0000-04-30295',
  'clueweb09-en0000-05-12917',
  'clueweb09-en0000-05-17917',
  'clueweb09-en0000-06-00815',
  'clueweb09-en0000-06-03706',
  'clueweb09-en0000-06-05472',
  'clueweb09-en0000-07-06818',
  'clueweb09-en0000-07-10691',
  'clueweb09-en0000-07-28172',
  'clueweb09-en0000-07-28409',
  '

In [20]:
### order BM25 per query per column descending

# Index of the weight combination to evaluate: selects all_combi[weight_index]
# and the matching column of each per-query score matrix.
weight_index = 2
# print all_combi[2]

# 1. sort column
# we need to iterate all queries
def rankDocumentsInQuery(matrix, docno_list, weight_index):
    """Return (docno, score) pairs for one matrix column, highest score first.

    matrix       -- 2-D array; column ``weight_index`` holds the scores to rank by
    docno_list   -- document identifiers in the same order as the matrix rows
    weight_index -- column to rank on

    Documents with equal scores keep their original relative order.
    """
    column_scores = list(matrix[:, weight_index])
    ranked = sorted(
        zip(docno_list, column_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked

# 2. prepare data for rankeval
def prepareData(qid, doc2score, all_combi, weight_index, top_k=1000):
    """Yield run-file lines for the best ``top_k`` documents of one query.

    Each line has the form
        "<qid> Q0 <docno> <rank> <score> BM25P<weights>\\n"
    e.g. "1 Q0 clueweb09-en0010-57-32595 18 26.023832769770642 BM25P000002".

    qid          -- query identifier written in the first column
    doc2score    -- (docno, score) pairs, already sorted best-first
    all_combi    -- sequence of weight combinations
    weight_index -- index into ``all_combi``; its digits form the run tag suffix
    top_k        -- maximum number of documents emitted (default 1000)

    Ranks are zero-based positions in ``doc2score``.
    """
    # The run tag is the same for every line of the query, so build it once.
    run_tag = "BM25P" + "".join([str(x) for x in all_combi[weight_index]])
    for rank, item in enumerate(doc2score[:top_k]):
        fields = [str(qid), "Q0", item[0], str(rank), str(item[1]), run_tag]
        yield " ".join(fields) + "\n"
        

In [21]:
# create TEMP file
# The handle is kept open on purpose: the file is re-opened by name (tp.name)
# for evaluation, and a NamedTemporaryFile is removed once it is closed.
tp = tempfile.NamedTemporaryFile("w+")
for qid, matrix in query2BM25Matrix.items():
    docno_list = query2docno[qid]

    doc2score = rankDocumentsInQuery(matrix, docno_list, weight_index)

    # Remember the last line written for this query explicitly, so a query
    # that yields no lines does not print a stale line from a previous loop.
    last_line = None
    for line in prepareData(qid, doc2score, all_combi, weight_index):
        tp.write(line)
        last_line = line
    if last_line is not None:
        print(last_line)
# Make the written lines visible to readers that open the file by name.
tp.flush()

#     tp.seek(0)
#     print(tp.read())

136 Q0 clueweb09-enwp00-38-16113 999 12.3958077175 BM25P000002

195 Q0 clueweb09-en0005-59-20952 999 11.7401494731 BM25P000002



In [23]:
### Trec_eval
### https://github.com/cvangysel/pytrec_eval/blob/master/examples/trec_eval.py

import pytrec_eval

In [48]:
# Loading data

# Relevance judgments (qrels) parsed by pytrec_eval.
with open("/home/muntean/cw09b_urls_blocks_nostem/eval/wt-cluster-7.qrels", 'r') as f_qrel:
    qrel = pytrec_eval.parse_qrel(f_qrel)

# The run is read back from the temporary file by name, so `tp` must still be open.
with open(tp.name, 'r') as f_run:
    run = pytrec_eval.parse_run(f_run)

# Evaluate with every measure pytrec_eval supports.
evaluator = pytrec_eval.RelevanceEvaluator(
    qrel, pytrec_eval.supported_measures)

# `results` is iterated below as query id -> {measure name: value}.
results = evaluator.evaluate(run)




def print_line(measure, scope, value):
    """Print one aligned result row: measure (25 wide), scope (8 wide), value with 4 decimals."""
    row = '{:25s}{:8s}{:.4f}'.format(measure, scope, value)
    print(row)

# Per-query values, ordered by query id and then by measure name.
for query_id, query_measures in sorted(results.items()):
    for measure, value in sorted(query_measures.items()):
        print_line(measure, query_id, value)

# Take the measure names from one per-query result dict instead of relying on
# the loop variable left over from the loop above; with no results at all this
# is an empty list and nothing is aggregated.
measure_names = sorted(next(iter(results.values()), {}))

# Aggregate each measure over all queries.
for measure in measure_names:
    per_query_values = [per_query[measure] for per_query in results.values()]
    print_line(
        measure,
        'all',
        pytrec_eval.compute_aggregated_measure(measure, per_query_values))


['11pt_avg', 'G', 'P_10', 'P_100', 'P_1000', 'P_15', 'P_20', 'P_200', 'P_30', 'P_5', 'P_500', 'Rndcg', 'Rprec', 'Rprec_mult_0.20', 'Rprec_mult_0.40', 'Rprec_mult_0.60', 'Rprec_mult_0.80', 'Rprec_mult_1.00', 'Rprec_mult_1.20', 'Rprec_mult_1.40', 'Rprec_mult_1.60', 'Rprec_mult_1.80', 'Rprec_mult_2.00', 'binG', 'bpref', 'gm_bpref', 'gm_map', 'infAP', 'iprec_at_recall_0.00', 'iprec_at_recall_0.10', 'iprec_at_recall_0.20', 'iprec_at_recall_0.30', 'iprec_at_recall_0.40', 'iprec_at_recall_0.50', 'iprec_at_recall_0.60', 'iprec_at_recall_0.70', 'iprec_at_recall_0.80', 'iprec_at_recall_0.90', 'iprec_at_recall_1.00', 'map', 'map_cut_10', 'map_cut_100', 'map_cut_1000', 'map_cut_15', 'map_cut_20', 'map_cut_200', 'map_cut_30', 'map_cut_5', 'map_cut_500', 'ndcg', 'ndcg_cut_10', 'ndcg_cut_100', 'ndcg_cut_1000', 'ndcg_cut_15', 'ndcg_cut_20', 'ndcg_cut_200', 'ndcg_cut_30', 'ndcg_cut_5', 'ndcg_cut_500', 'ndcg_rel', 'num_nonrel_judged_ret', 'num_q', 'num_rel', 'num_rel_ret', 'num_ret', 'recall_10', 'recal

In [None]:
### output should be

# cluster weight [list of metrics]

In [30]:
# Loading data
# NOTE(review): these are the same loading/evaluation statements as in the
# earlier "Loading data" cell; they re-create qrel, run, evaluator and results.

with open("/home/muntean/cw09b_urls_blocks_nostem/eval/wt-cluster-7.qrels", 'r') as f_qrel:
    qrel = pytrec_eval.parse_qrel(f_qrel)

# The run is read back from the temporary file by name, so `tp` must still be open.
with open(tp.name, 'r') as f_run:
    run = pytrec_eval.parse_run(f_run)

evaluator = pytrec_eval.RelevanceEvaluator(
    qrel, pytrec_eval.supported_measures)

results = evaluator.evaluate(run)

def print_line(measure, scope, value):
    """Return "<measure>:<scope>:<value>" with the value formatted to 6 decimals.

    Unlike the earlier printing variant of the same name, nothing is written
    to stdout; the formatted string is handed back to the caller.
    """
    template = '{}:{}:{:.6f}'
    return template.format(measure, scope, value)

# Per-query rows are not produced in this cell; only the aggregates are collected.

# Take the measure names from one per-query result dict in `results`, so this
# cell does not depend on a loop variable left behind by another cell. With no
# results at all this is an empty list.
measure_names = sorted(next(iter(results.values()), {}))

# One "<measure>:all:<value>" string per measure, aggregated over all queries.
measure_list = list()
for measure in measure_names:
    per_query_values = [per_query[measure] for per_query in results.values()]
    m = print_line(
        measure,
        'all',
        pytrec_eval.compute_aggregated_measure(measure, per_query_values))
    measure_list.append(m)

In [32]:
# All aggregated measures on one comma-separated line.
",".join(measure_list)

'11pt_avg:all:0.003362,G:all:0.017024,P_10:all:0.000000,P_100:all:0.010000,P_1000:all:0.003000,P_15:all:0.000000,P_20:all:0.000000,P_200:all:0.007500,P_30:all:0.000000,P_5:all:0.000000,P_500:all:0.005000,Rndcg:all:0.024469,Rprec:all:0.000000,Rprec_mult_0.20:all:0.000000,Rprec_mult_0.40:all:0.000000,Rprec_mult_0.60:all:0.000000,Rprec_mult_0.80:all:0.000000,Rprec_mult_1.00:all:0.000000,Rprec_mult_1.20:all:0.000000,Rprec_mult_1.40:all:0.000000,Rprec_mult_1.60:all:0.000000,Rprec_mult_1.80:all:0.000000,Rprec_mult_2.00:all:0.020833,binG:all:0.016763,bpref:all:0.012153,gm_bpref:all:0.000493,gm_map:all:0.000383,infAP:all:0.002558,iprec_at_recall_0.00:all:0.022284,iprec_at_recall_0.10:all:0.008242,iprec_at_recall_0.20:all:0.006460,iprec_at_recall_0.30:all:0.000000,iprec_at_recall_0.40:all:0.000000,iprec_at_recall_0.50:all:0.000000,iprec_at_recall_0.60:all:0.000000,iprec_at_recall_0.70:all:0.000000,iprec_at_recall_0.80:all:0.000000,iprec_at_recall_0.90:all:0.000000,iprec_at_recall_1.00:all:0.000