In [1]:
import itertools
import numpy as np
import gzip
import io
import math
import json
import time
from collections import defaultdict

## Cartesian product

In [2]:
# Every 6-tuple whose entries are drawn from range(6), i.e. 6**6 = 46656 tuples.
# Each tuple is later passed as the `weights` argument of weighted_tf.
all_combi = list(itertools.product(range(6), repeat=6))

In [15]:
# Sanity check: total number of combinations, plus one sample tuple.
print(len(all_combi))
print(all_combi[20000])

46656
(2, 3, 2, 3, 3, 2)


In [4]:
def pairwise(iterable):
    """Pair each element with its successor: s -> (s0,s1), (s1,s2), (s2,s3), ..."""
    current, ahead = itertools.tee(iterable, 2)
    # Discard the first item of the second copy so it runs one step in front.
    next(ahead, None)
    pairs = zip(current, ahead)
    return pairs

## Metrics

In [5]:
def idf(N, df):
    """Return log((N - df + 0.5) / (df + 0.5)) for collection size N and document frequency df."""
    numerator = N - df + 0.5
    denominator = df + 0.5
    return math.log(numerator / denominator)

In [6]:
def weighted_tf_with_postings(weights, positions, doc_len):
    """Bucket term positions into zones and return the weighted term frequency.

    Positions are counted into six zones delimited by 0, 100, 200, 400, 600
    and 1000: five half-open intervals [lower, upper) plus a final zone for
    every position >= 1000. The per-zone counts are then combined with
    ``weights`` as a sum of element-wise products.

    Args:
        weights: one weight per zone (six values are consumed).
        positions: iterable of term positions; negative positions fall in no
            zone and are ignored.
        doc_len: unused; kept so existing callers keep working.

    Returns:
        The weighted sum of the per-zone counts (0 when ``weights`` is empty).
    """
    steps = [0, 100, 200, 400, 600, 1000]
    tfs = np.zeros(len(steps))
    for pos in positions:
        if pos >= steps[-1]:
            # Inclusive bound: with a strict `>` a position exactly on the last
            # boundary matched neither the final interval nor the tail zone and
            # was silently dropped.
            tfs[-1] += 1
            continue
        for bucket, (lower, upper) in enumerate(zip(steps, steps[1:])):
            if lower <= pos < upper:
                tfs[bucket] += 1
                break
    return sum([w * tf for w, tf in zip(weights, tfs)])

In [17]:
def weighted_tf(weights, tfs):
    """Return the sum of the element-wise products of ``weights`` and ``tfs``.

    Pairs are formed with zip, so the longer input is truncated to the
    length of the shorter one; empty input yields 0.
    """
    total = 0
    for weight, tf in zip(weights, tfs):
        total = total + weight * tf
    return total

In [9]:
def bm25(tf, df, doclen, N, avg_doclen, k1=1.2, b=0.75):
    """Return the BM25 score contribution of one term in one document.

    Args:
        tf: (possibly weighted) term frequency in the document.
        df: document frequency of the term.
        doclen: length of the document.
        N: number of documents in the collection.
        avg_doclen: average document length in the collection.
        k1: term-frequency saturation parameter.
        b: length-normalisation parameter.

    Returns:
        idf(N, df) * tf * (k1 + 1) / (tf + K), where
        K = k1 * (1 - b + b * doclen / avg_doclen).
    """
    # Multiply by 1.0 first so the ratio is a true division even when both
    # lengths are ints; under Python 2 a bare `doclen / avg_doclen` would floor.
    length_ratio = 1.0 * doclen / avg_doclen
    K = k1 * (1 - b + b * length_ratio)
    return idf(N, df) * ((tf * (k1 + 1)) / (tf + K))

In [33]:
## From Terrier
# double K = this.k_1 * (1.0D - this.b + this.b * docLength / this.averageDocumentLength);
# return WeightingModelLibrary.log((this.numberOfDocuments - this.documentFrequency + 0.5D)
#         / (this.documentFrequency + 0.5D)) * ((this.k_1 + 1.0D) * tf / (K + tf)) *
#         ((this.k_3 + 1.0D) * this.keyFrequency / (this.k_3 + this.keyFrequency));

def bm25Terrier(tf, df, doclen, N, avg_doclen, k1=1.2, k3=8, b=0.75, key_frequency=1.0):
    """Return the BM25 score of one term, following the Terrier formula quoted above.

    Args:
        tf: (possibly weighted) term frequency in the document.
        df: document frequency of the term.
        doclen: length of the document.
        N: number of documents in the collection.
        avg_doclen: average document length in the collection.
        k1: term-frequency saturation parameter.
        k3: query-term-frequency saturation parameter.
        b: length-normalisation parameter.
        key_frequency: frequency of the term in the query. The default of 1
            reproduces the previous hard-coded behaviour, where the
            query-term factor is exactly 1.

    Returns:
        The product of the idf factor, the document-tf factor and the
        query-tf factor.
    """
    K = k1 * (1.0 - b + b * doclen / avg_doclen)
    idf_part = math.log((N - df + 0.5) / (df + 0.5))
    tf_part = (k1 + 1.0) * tf / (K + tf)
    # (k3 + 1) * keyFrequency / (k3 + keyFrequency), as in the Terrier source.
    query_part = (k3 + 1.0) * key_frequency / (k3 + key_frequency)
    return idf_part * tf_part * query_part

#bm25terrier = bm25Terrier(new_tf, df_list[i], doclen, N, AVG_LEN)

## Read the per-query file with document and term statistics

## 1 query - all combinations

In [39]:
# Read doc txt: TAB separated
# qid,
# N,
# Arrays.toString(documentFrequencies),
# avgDocLen,
# docid,
# docno,
# doclen,
# tf_q1, tf_q2, tf_q3, tf_q4, tf_q5, tf_q6

# Score every document of query 159 under every weight combination.
# Collection-level statistics used by bm25.
N = 50220423
AVG_LEN = 963.90334

docid2docno = dict()
# docid -> {weight combination -> BM25 score summed over the query terms}.
# Note: this keeps len(all_combi) scores per document in memory.
docid2weightbm25 = dict()

start = time.time()
print("Starting %s" % start)
with io.BufferedReader(gzip.open("/home/muntean/terrier-passage/tfs-per-qid/all-matches-fields-tfs-qid-159.txt.gz", "rb")) as inputFile:
    # The document counter has its own name: reusing one variable for both the
    # counter and the per-term index below would reset the count on every line.
    for doc_count, line in enumerate(inputFile, 1):
        if doc_count % 10000 == 0:
            print("Processed %d documents" % doc_count)
        data = line.replace("\n", "").split("\t")
        qid = int(data[0])
        df_list = json.loads(data[2])
        docid = int(data[4])
        docno = data[5]
        doclen = int(data[6])
        # One row of term frequencies per query term.
        tf_list = np.array(json.loads(data[7]))

        weight2bm25 = defaultdict(float)
        for combo in all_combi:
            doc_bm25 = 0
            for term_idx, term_tf_list in enumerate(tf_list):
                new_tf = weighted_tf(combo, term_tf_list)
                doc_bm25 += bm25(new_tf, df_list[term_idx], doclen, N, AVG_LEN)
            weight2bm25[combo] = doc_bm25

        docid2docno[docid] = docno
        docid2weightbm25[docid] = weight2bm25
print("Finished,  %s" % (time.time() - start))

        

Starting 1513614203.21
Finished,  2.06184005737


## All documents - 1 combination - on the biggest file

In [37]:
# Read doc txt: TAB separated
# qid,
# N,
# Arrays.toString(documentFrequencies),
# avgDocLen,
# docid,
# docno,
# doclen,
# tf_q1, tf_q2, tf_q3, tf_q4, tf_q5, tf_q6

def alldocs1combo(qid, combo_index=20000):
    """Score every document of one query file with a single weight combination.

    Reads the gzipped, TAB-separated file for ``qid``, computes for each
    document the BM25 score summed over the query terms using the weights
    ``all_combi[combo_index]``, and prints the elapsed time and the number of
    documents collected.

    Args:
        qid: query id; selects the input file.
        combo_index: index into ``all_combi`` of the weight combination to
            use. The default reproduces the previously hard-coded choice.

    Returns:
        None; results are only summarised on stdout.
    """
    # Collection-level statistics used by bm25.
    N = 50220423
    AVG_LEN = 963.90334

    # Bound locally so that the scores are keyed by the combination that was
    # actually used, and so the function does not depend on a `combo` global
    # left behind by another cell.
    combo = all_combi[combo_index]

    docid2docno = dict()
    docid2weightbm25 = dict()

    path = "/home/muntean/terrier-passage/tfs-per-qid/all-matches-fields-tfs-qid-" + str(qid) + ".txt.gz"

    start = time.time()
    print("Starting %s" % start)
    with io.BufferedReader(gzip.open(path, "rb")) as inputFile:
        for line in inputFile:
            data = line.replace("\n", "").split("\t")
            df_list = json.loads(data[2])
            docid = int(data[4])
            docno = data[5]
            doclen = int(data[6])
            # One row of term frequencies per query term.
            tf_list = np.array(json.loads(data[7]))

            doc_bm25 = 0
            for term_idx, term_tf_list in enumerate(tf_list):
                new_tf = weighted_tf(combo, term_tf_list)
                doc_bm25 += bm25(new_tf, df_list[term_idx], doclen, N, AVG_LEN)

            weight2bm25 = defaultdict(float)
            weight2bm25[combo] = doc_bm25

            docid2docno[docid] = docno
            docid2weightbm25[docid] = weight2bm25

    print("Finished,  %s" % (time.time() - start))
    print(len(docid2docno))
    print(len(docid2weightbm25))

#### For the biggest file q: 66

Starting 1513589752.1

Finished,  3193.33556914

#### Reading time only

Starting 1513601966.22

Finished,  730.704658031

In [38]:
# q: 66 -- the biggest input file
alldocs1combo(66)

Starting 1513601966.22
Finished,  730.704658031
0
0


In [28]:
# Query 2 -- input file size: 52M, documents: 4.547.353

alldocs1combo(2)

Starting 1513593846.79
Finished,  549.572243929
4547350
4547350


In [30]:
# Query 90 -- input file size: 93k, documents: 7.425

alldocs1combo(90)

Starting 1513594475.38
Finished,  0.405613183975
7425
7425


In [31]:
# Query 168 -- input file size: 36k, documents: 2.621

alldocs1combo(168)

Starting 1513594574.81
Finished,  0.140665054321
2621
2621
