In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import ast
import os
import xml.etree.ElementTree
from collections import Counter, defaultdict
import pandas as pd
from scipy import stats
import random
import math

In [27]:
def indexOf(l, e, start=0):
    """Return the index of the first element of ``l`` matching ``e``.

    The search begins at index ``start`` and runs to the end of ``l``.
    An element matches when it is the same object as ``e`` or compares
    equal to it (the same rule ``list.index`` uses), so equal-but-distinct
    values such as large integers, strings or nested lists are found.

    Returns -1 when no element matches.
    """
    for idx in range(start, len(l)):
        candidate = l[idx]
        # Identity first keeps the previous matches (and objects that are not
        # equal to themselves); equality is what callers normally expect.
        if candidate is e or candidate == e:
            return idx
    return -1

In [2]:
def idf(N, df):
    """Inverse document frequency: log((N - df + 0.5) / (df + 0.5)).

    N  -- value used as the collection size in the formula
    df -- value used as the document frequency of the term

    The result is negative whenever df exceeds N / 2.
    """
    numerator = N - df + 0.5
    denominator = df + 0.5
    return math.log(numerator / denominator)

In [3]:
def bm25(tf, df, doclen, N, avg_doclen, k1=1.2, b=0.75):
    """BM25 contribution of a single term.

    tf         -- term frequency
    df         -- document frequency, forwarded to idf()
    doclen     -- length used for the normalisation
    N          -- collection size, forwarded to idf()
    avg_doclen -- average length that doclen is divided by
    k1, b      -- saturation and length-normalisation parameters

    Returns idf(N, df) multiplied by the saturated, length-normalised tf.
    """
    term_weight = idf(N, df)
    # b blends between no length normalisation (b=0) and full normalisation (b=1).
    length_norm = 1 - b + b * (doclen / avg_doclen)
    saturated_tf = (tf * (k1 + 1)) / (tf + k1 * length_norm)
    return term_weight * saturated_tf

In [4]:
def computePhraseFrequency(plists, window):
    """Find the windows in which every position list contributes one position.

    plists -- one list of positions per term; the sweep below assumes each
              list is sorted in ascending order (TODO confirm with callers).
    window -- a cover is reported only when (largest - smallest) position
              is strictly less than this value.

    Returns the list of the smallest positions of the reported covers, in the
    order they are found. Returns an empty list when ``plists`` is empty or
    when any of its lists is empty (previously an empty ``plists`` raised
    IndexError).

    The sweep keeps one "current" position per list. At every step it takes
    the list holding the smallest current position and looks at that list's
    next position: if it lies beyond the largest current position (or the list
    is exhausted), the current span cannot be tightened from the left, so it is
    reported when narrow enough. The sweep stops as soon as the list holding
    the smallest position has no positions left.
    """
    MAX = float("+inf")

    results = []
    if not plists:
        return results
    for l in plists:
        if len(l) < 1:
            return results

    positions = plists
    n = len(positions)
    index = [0] * n                       # per-list cursor
    curr_occ = [p[0] for p in positions]  # per-list current position

    while True:
        # min()/max() return the first extreme on ties, matching a strict
        # "<" / ">" left-to-right scan.
        lsym = min(range(n), key=curr_occ.__getitem__)
        rsym = max(range(n), key=curr_occ.__getitem__)

        if index[lsym] == len(positions[lsym]) - 1:
            lpos = MAX  # leftmost list is exhausted
        else:
            index[lsym] += 1
            lpos = positions[lsym][index[lsym]]

        # The leftmost term cannot move right without leaving the current
        # span, so [curr_occ[lsym], curr_occ[rsym]] is a minimal cover.
        if lpos > curr_occ[rsym] and curr_occ[rsym] - curr_occ[lsym] < window:
            results.append(curr_occ[lsym])

        if lpos == MAX:
            break
        curr_occ[lsym] = lpos

    return results

## Load Data

In [5]:
# qid N dfs docid docno doclen avgdl rel positions
def loadData(path="matteo_all.txt"):
    """Read the tab-separated input file into a ``qid -> [record, ...]`` mapping.

    path -- file to read; defaults to the file name the notebook has always used.

    Each non-blank line must carry nine tab-separated fields in the order
    ``qid N dfs docid docno doclen avgdl rel positions``. ``dfs`` and
    ``positions`` are Python literals parsed with ``ast.literal_eval``.

    Returns a ``defaultdict(list)`` keyed by integer qid; every record is a
    dict with the keys "N", "dfs", "docid", "docno", "doclen", "avgdl", "rel"
    and "pos". Blank lines are skipped instead of aborting the load.
    """
    qid2docs = defaultdict(list)
    with open(path) as f:
        for l in f:
            l = l.rstrip("\r\n")
            if not l.strip():
                # tolerate empty / trailing lines
                continue
            fields = l.split("\t")
            qid = int(fields[0])
            qid2docs[qid].append({
                "N": int(fields[1]),
                "dfs": ast.literal_eval(fields[2]),
                "docid": int(fields[3]),
                "docno": fields[4],
                "doclen": int(fields[5]),
                "avgdl": float(fields[6]),
                "rel": int(fields[7]),
                "pos": ast.literal_eval(fields[8]),
            })

    return qid2docs

In [6]:
# Parse the input file once; qid2docs maps each qid to its list of per-document records.
qid2docs = loadData()

## Load Query Info

In [7]:
def loadQidInfo():
    """Collect topic metadata from every ``.xml`` file in the current directory.

    For each ``topic`` element directly under a file's root, the ``number``
    attribute (as an int) is mapped to the tuple
    ``(number, type attribute, text of the topic's first child element)``.

    Returns a plain dict; a topic number seen again overwrites the earlier entry.
    """
    qidDict = {}
    xml_names = [name for name in os.listdir(os.curdir) if name.endswith(".xml")]
    for name in xml_names:
        root = xml.etree.ElementTree.parse(name).getroot()
        for topic in root.findall('topic'):
            number = int(topic.get("number"))
            qidDict[number] = (number, topic.get("type"), topic[0].text)
    return qidDict

In [8]:
# NOTE(review): later cells iterate over `qidInfo`, which is only assigned in the
# commented-out line below; re-enable it (with the topic .xml files in the working
# directory) before running those cells.
#qidInfo = loadQidInfo()
#print len(qidInfo)
#print qidInfo[6]

## 2. Build OCCURRENCE Probability MATRIX

In [32]:
def buildScores(qid2docs):
    """Compute, for every document of every query, a running BM25 score per position.

    qid2docs -- mapping ``qid -> list of records``; each record supplies the keys
                "pos", "N", "dfs", "doclen", "avgdl", "docid", "docno" and "rel".
                "pos" holds one list of positions per query term and "dfs" is
                indexed by the same term index.

    For a document, ``scores[pos]`` is the sum over the query terms of
    ``bm25(tf, df, pos + 1, N, avgdl)``, where ``tf`` counts the occurrences of
    the term at positions ``0..pos`` and ``pos + 1`` (the prefix length) is
    passed as the document length.

    Returns a ``defaultdict(list)`` mapping each qid to a list of dicts with the
    keys "rel", "docid", "docno" and "scores" (a float array of length doclen).
    Results for all queries are accumulated; the accumulator used to be
    re-created for every qid, so only the last query survived and an empty
    input raised UnboundLocalError.
    """
    qid2scores = defaultdict(list)
    for qid in qid2docs:
        print(qid)  # progress indicator
        for d in qid2docs[qid]:
            # position -> index of the query term occurring at that position
            p2t = {}
            for idx, term_positions in enumerate(d["pos"]):
                for p in term_positions:
                    p2t[p] = idx
            N = d["N"]
            dfs = d["dfs"]
            doclen = d["doclen"]
            avg_doclen = d["avgdl"]
            # Builtin int/float replace the np.int/np.float aliases, which
            # recent NumPy releases no longer provide.
            tfs = np.zeros((len(d["pos"]),), dtype=int)
            scores = np.zeros((doclen,), dtype=float)
            for pos in range(doclen):
                if pos in p2t:
                    tfs[p2t[pos]] += 1
                score = 0
                for idx in range(len(tfs)):
                    score += bm25(tfs[idx], dfs[idx], pos + 1, N, avg_doclen)
                scores[pos] = score
            qid2scores[qid].append({
                "rel": d["rel"],
                "docid": d["docid"],
                "docno": d["docno"],
                "scores": scores,
            })
    return qid2scores

In [33]:
# Run buildScores over the loaded data (the saved run of this cell was interrupted, see its output).
qid2scores = buildScores(qid2docs)

1


KeyboardInterrupt: 

In [None]:
# Per-query distributions keyed by qid: relevant-only vs. all documents,
# for the "flat" and the "prox" representation.
qid_RelevantDocsDistribFlat = {}
qid_AllDocsDistribFlat = {}
qid_RelevantDocsDistribProx = {}
qid_AllDocsDistribProx = {}

In [None]:
# Per-query distributions built from the RELEVANT documents only.
for qid, relDocidList in qid2reldoc.iteritems():
    # flat lists of this query's relevant documents
    flatlists = []
    for docid in relDocidList:
        flatlists.append(docid2flatlist[qid, docid])
    qid_RelevantDocsDistribFlat[qid] = buildProbability(flatlists)

    # prox lists of the same documents
    proxlists = []
    for docid in relDocidList:
        proxlists.append(docid2proxlist[qid, docid])
    qid_RelevantDocsDistribProx[qid] = buildProbability(proxlists)



In [None]:
# Per-query distributions built from ALL documents of each query.
# The keys of docid2flatlist / docid2proxlist are (qid, docid) pairs.
allflatlists = {}
allproxlists = {}
for qid, docid in docid2flatlist:
    allflatlists.setdefault(qid, []).append(docid2flatlist[qid, docid])
for qid, docid in docid2proxlist:
    allproxlists.setdefault(qid, []).append(docid2proxlist[qid, docid])

for qid in allflatlists:
    qid_AllDocsDistribFlat[qid] = buildProbability(allflatlists[qid])

for qid in allproxlists:
    qid_AllDocsDistribProx[qid] = buildProbability(allproxlists[qid])

## 3. KL divergence

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html

scipy.stats.entropy(pk, qk=None, base=None)

Calculate the entropy of a distribution for given probability values.
If only probabilities pk are given, the entropy is calculated as S = -sum(pk * log(pk), axis=0).
If qk is not None, then compute the Kullback-Leibler divergence S = sum(pk * log(pk / qk), axis=0).

This routine will normalize pk and qk if they don’t sum to 1.

Parameters:	

    pk : sequence
       Defines the (discrete) distribution. pk[i] is the (possibly unnormalized) probability of event i.
       
    qk : sequence, optional
        Sequence against which the relative entropy is computed. Should be in the same format as pk.
        
    base : float, optional
        The logarithmic base to use, defaults to e (natural logarithm).
Returns:	

    S : float
    The calculated entropy.


In [None]:
from sklearn import metrics

# KL divergence per query between the relevant-document distribution (pk) and
# the all-document distribution (qk); stats.entropy(pk, qk) normalises both.
kl_Rel_All_Flat = {}
kl_Rel_All_Prox = {}
for qid in qid_RelevantDocsDistribFlat:
    kl_Rel_All_Flat[qid] = stats.entropy(
        qid_RelevantDocsDistribFlat[qid],
        qid_AllDocsDistribFlat[qid],
    )
for qid in qid_RelevantDocsDistribProx:
    kl_Rel_All_Prox[qid] = stats.entropy(
        qid_RelevantDocsDistribProx[qid],
        qid_AllDocsDistribProx[qid],
    )

In [None]:
# One row of four heat maps per query, left to right:
# relevant/flat, all/flat, relevant/prox, all/prox.
panels = (
    qid_RelevantDocsDistribFlat,
    qid_AllDocsDistribFlat,
    qid_RelevantDocsDistribProx,
    qid_AllDocsDistribProx,
)
for qid in qidInfo:

    plt.figure(figsize=(16, 3))

    for slot, distrib in enumerate(panels, 1):
        plt.subplot(1, 4, slot)
        if qid in distrib:
            # each distribution is laid out as a 40 x 50 grid for display
            a = distrib[qid].reshape((40, 50))
            plt.imshow(a, cmap='hot')

    # title and explicit show only when both KL values exist for the query
    if qid in kl_Rel_All_Flat and qid in kl_Rel_All_Prox:
        plt.suptitle(str(qidInfo[qid])+ " " + str(kl_Rel_All_Flat[qid]) + " " + str(kl_Rel_All_Prox[qid]))
        plt.show()

In [None]:
# Distributions over the RELEVANT documents, pooled over all queries.
global_flat = []
for qid, relDocidList in qid2reldoc.iteritems():
    global_flat.extend([docid2flatlist[qid, docid] for docid in relDocidList])
RelevantDocsDistribFlat = buildProbability(global_flat)

global_prox = []
for qid, relDocidList in qid2reldoc.iteritems():
    global_prox.extend([docid2proxlist[qid, docid] for docid in relDocidList])
RelevantDocsDistribProx = buildProbability(global_prox)

In [None]:
# Distributions over ALL documents, pooled over all queries.
global_flat = [docid2flatlist[qid, docid] for qid, docid in docid2flatlist]
global_prox = [docid2proxlist[qid, docid] for qid, docid in docid2proxlist]

AllDocsDistribFlat = buildProbability(global_flat)
AllDocsDistribProx = buildProbability(global_prox)

In [None]:
# For every (qid, docid) pair: product over the first 2000 positions of
# P[x] where the document's flat list contains x, and (1 - P[x]) elsewhere,
# with P = qid_AllDocsDistribFlat[qid].
qid2and = {}
for qid in qidInfo:
    for qid2, docid in docid2flatlist:
        if qid != qid2:
            continue
        # 0/1 indicator of the positions listed for this document
        v = np.zeros(2000)
        for x in docid2flatlist[qid, docid]:
            if x < 2000:
                v[x] = 1
        p = 1.0
        for x in range(0, 2000):
            prob = qid_AllDocsDistribFlat[qid][x]
            p *= prob if v[x] == 1 else 1 - prob
        # report pairs whose product collapsed to exactly zero
        if p == 0:
            print("%d %d %d" % (qid, docid, docid in qid2reldoc[qid]))
        qid2and[qid, docid] = p

In [None]:
# Dump the whole (qid, docid) -> product mapping.
print(qid2and)