In [None]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import ast
import os
import xml.etree.ElementTree
from collections import Counter, defaultdict
import pandas as pd

In [None]:
def loadQidInfo():
    """Collect topic metadata from every ``.xml`` file in the current directory.

    Each file is parsed and the ``topic`` elements directly under its root are
    read.

    Returns:
        dict mapping the integer ``number`` attribute of a topic to the tuple
        ``(number, type attribute, text of the topic's first child element)``.
        When the same number occurs more than once, the entry read last wins.
    """
    topics_by_number = {}
    xml_names = [name for name in os.listdir(os.curdir) if name.endswith(".xml")]
    for xml_name in xml_names:
        root = xml.etree.ElementTree.parse(xml_name).getroot()
        for topic in root.findall('topic'):
            number = int(topic.get("number"))
            # topic[0] is the first child element (presumably the query text).
            topics_by_number[number] = (number, topic.get("type"), topic[0].text)
    return topics_by_number

In [None]:
qidInfo = loadQidInfo()
# Parenthesised single-argument print runs on both Python 2 and Python 3;
# the bare `print x` statement is a syntax error on a Python 3 kernel.
print(len(qidInfo))
print(qidInfo[6])

In [None]:
# qid docid docno doclen rel positions
def get2Distrib(paragraph_len=100, path="wtall_qrels_pos.clean.txt"):
    """Bucket term positions into paragraph indices, split by relevance.

    Every non-blank line of ``path`` is tab-separated with the columns
    ``qid docid docno doclen rel positions``; ``positions`` is a Python literal
    list of lists of integer offsets.

    Args:
        paragraph_len: paragraph size used to bucket the offsets.
        path: qrels-with-positions file to read (defaults to the file the
            notebook has always used).

    Returns:
        ``(qid2rel_points, qid2irr_points)``: plain dicts mapping a query id to
        a list with one entry per document, each entry being the 1-based
        paragraph indices of that document's flattened offsets. Documents with
        ``rel > 0`` go into the first dict, all others into the second.
    """
    qid2rel_points = {}
    qid2irr_points = {}
    with open(path) as f:
        for l in f:
            if not l.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = l.split("\t")

            qid = int(fields[0])
            rel = int(fields[4])
            # literal_eval only accepts Python literals, so it is safe on file input.
            flat_list = [item for sublist in ast.literal_eval(fields[5]) for item in sublist]

            qid2points = qid2rel_points if rel > 0 else qid2irr_points
            qid2points.setdefault(qid, []).append(
                [item // paragraph_len + 1 for item in flat_list]
            )

    return qid2rel_points, qid2irr_points

In [None]:
# qid docid docno doclen rel positions
def get2Distrib2(paragraph_len=100, path="wtall_qrels_pos.clean.txt"):
    """Bucket term positions into paragraphs and count documents per paragraph.

    Every non-blank line of ``path`` is tab-separated with the columns
    ``qid docid docno doclen rel positions``; ``positions`` is a Python literal
    list of lists of integer offsets.

    Args:
        paragraph_len: paragraph size used to bucket offsets and to split a
            document of length ``doclen`` into paragraphs.
        path: qrels-with-positions file to read (defaults to the file the
            notebook has always used).

    Returns:
        ``(qid2rel_points, qid2rel_numpar, qid2irr_points, qid2irr_numpar)``.
        The ``*_points`` values are ``defaultdict(list)`` mapping a query id to
        one list of 1-based paragraph indices per document. The ``*_numpar``
        values are dicts mapping a query id to ``{paragraph index: number of
        documents that contain that paragraph}``. Documents with ``rel > 0``
        feed the ``rel`` pair, all others the ``irr`` pair.
    """
    qid2rel_points = defaultdict(list)
    qid2irr_points = defaultdict(list)
    qid2rel_numpar = {}
    qid2irr_numpar = {}

    with open(path) as f:
        for l in f:
            if not l.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = l.split("\t")

            qid = int(fields[0])
            doclen = int(fields[3])
            rel = int(fields[4])
            # literal_eval only accepts Python literals, so it is safe on file input.
            flat_list = [item for sublist in ast.literal_eval(fields[5]) for item in sublist]

            # Ceiling division, written with integer arithmetic only. The old
            # int(round(doclen/paragraph_len)) floored on Python 2 and rounded
            # on Python 3, and in both cases could drop the trailing partial
            # paragraph, so an occurrence bucketed there had no document count
            # to be normalised by.
            # NOTE(review): assumes offsets are 0-based and < doclen - confirm.
            num_par = (doclen + paragraph_len - 1) // paragraph_len

            if rel > 0:
                qid2points, qid2numpar = qid2rel_points, qid2rel_numpar
            else:
                qid2points, qid2numpar = qid2irr_points, qid2irr_numpar

            per_paragraph = qid2numpar.setdefault(qid, defaultdict(int))
            for p in range(1, num_par + 1):
                per_paragraph[p] += 1

            qid2points[qid].append([item // paragraph_len + 1 for item in flat_list])

    return qid2rel_points, qid2rel_numpar, qid2irr_points, qid2irr_numpar

In [None]:
# Bucket positions into paragraphs of length 250 (the function default is 100).
qid2rel_points, qid2rel_numpar, qid2irr_points, qid2irr_numpar = get2Distrib2(paragraph_len=250)

In [None]:
# Number of relevant / non-relevant documents recorded for query 1.
# .get() is used instead of [] because both mappings are defaultdicts: indexing
# a missing query id would silently insert it and change later `in` checks.
print(len(qid2rel_points.get(1, [])))
print(len(qid2irr_points.get(1, [])))

type(qid2rel_numpar)

In [None]:
def countRatio(queryID, qid2rel_points, qid2irr_points):
    """Tally paragraph statistics for one query, for relevant and other documents.

    Args:
        queryID: key looked up in both mappings.
        qid2rel_points: mapping of query id to a list of documents, each
            document being a list of hashable paragraph indices.
        qid2irr_points: same structure for the non-relevant documents.

    Returns:
        ``(par2numdoc, par2totfreq, ipar2numdoc, ipar2totfreq)``, four
        ``defaultdict(int)`` objects keyed by paragraph index. ``*numdoc``
        counts the documents in which the paragraph appears at least once;
        ``*totfreq`` counts every appearance. The first pair comes from
        ``qid2rel_points``, the second from ``qid2irr_points``.
    """
    def tally(documents):
        docs_per_paragraph = defaultdict(int)
        hits_per_paragraph = defaultdict(int)
        for paragraphs in documents:
            for paragraph in paragraphs:
                hits_per_paragraph[paragraph] += 1
            # A set collapses repeats so each document counts once per paragraph.
            for paragraph in set(paragraphs):
                docs_per_paragraph[paragraph] += 1
        return docs_per_paragraph, hits_per_paragraph

    par2numdoc, par2totfreq = tally(qid2rel_points[queryID])
    ipar2numdoc, ipar2totfreq = tally(qid2irr_points[queryID])

    return par2numdoc, par2totfreq, ipar2numdoc, ipar2totfreq

In [None]:
def countRatio2(queryID, qid2rel_points, qid2irr_points):
    """Count paragraph occurrences for one query, for relevant and other documents.

    Args:
        queryID: key looked up in both mappings.
        qid2rel_points: mapping of query id to a list of documents, each
            document being a list of hashable paragraph indices.
        qid2irr_points: same structure for the non-relevant documents.

    Returns:
        ``(par2totfreq, ipar2totfreq)``, two ``defaultdict(int)`` objects
        mapping a paragraph index to the number of times it appears across all
        documents of the respective mapping.
    """
    def total_frequency(documents):
        frequency = defaultdict(int)
        for paragraphs in documents:      # one entry per document
            for paragraph in paragraphs:  # one paragraph index per occurrence
                frequency[paragraph] += 1
        return frequency

    par2totfreq = total_frequency(qid2rel_points[queryID])
    ipar2totfreq = total_frequency(qid2irr_points[queryID])

    return par2totfreq, ipar2totfreq

In [None]:
# Paragraph occurrence counts for query 2 (relevant vs. other documents).
par2totfreq, ipar2totfreq = countRatio2(
    queryID=2,
    qid2rel_points=qid2rel_points,
    qid2irr_points=qid2irr_points,
)

In [None]:
# One row per paragraph index after the transpose. Columns 0..3 follow the
# order of the dicts: relevant occurrences, relevant document counts,
# other occurrences, other document counts.
per_paragraph_counts = [
    par2totfreq,
    qid2rel_numpar[2],
    ipar2totfreq,
    qid2irr_numpar[2],
]
df = pd.DataFrame(per_paragraph_counts).T

# Occurrences divided by the number of documents containing that paragraph.
df['RelRation'] = df[0].div(df[1])
df['IRelRation'] = df[2].div(df[3])

df

In [None]:
# 2 bar plot

def plot4D(df, title, maxnumpoints=100):
    """Plot per-paragraph ratios as bars with document counts as overlaid lines.

    Args:
        df: frame indexed by paragraph index with columns ``1`` and ``3``
            (document counts) and ``RelRation`` / ``IRelRation`` (ratios).
            It is not modified.
        title: title for the figure.
        maxnumpoints: maximum number of rows (paragraphs) to draw.

    Side effects:
        Switches matplotlib to the ``ggplot`` style and shows the figure.
    """
    plt.style.use('ggplot')

    # pandas draws bars at positions 0..n-1 in row order, so the line series
    # must use the same positions. The previous x values were `index - 1`,
    # which misaligned lines and bars whenever the index was unsorted or had
    # gaps. Sorting (on a copy) makes the x axis read in paragraph order.
    shown = df.sort_index().iloc[:maxnumpoints]
    positions = list(range(len(shown)))

    fig = plt.figure(figsize=(20, 10))
    ax = fig.add_subplot(111)
    # Second y axis: counts and ratios are on different scales.
    barax = ax.twinx()

    barax.plot(positions, shown[1], "ko-", label="RelNumDoc")
    barax.plot(positions, shown[3], "g*-", label="IRelNumDoc")

    width = 0.4

    # position=1 / position=0 place the two bars on either side of each tick.
    shown.RelRation.plot(kind='bar', color='red', width=width, position=1, ax=ax, label="RelRatio")
    shown.IRelRation.plot(kind='bar', color='blue', width=width, position=0, ax=ax, label="IRelRatio")

    barax.legend(loc=2)
    ax.legend(loc=1)
    plt.title(title)

    plt.show()

In [None]:
# Render the query-2 frame built in the previous cell.
plot4D(df, title="Query 2")

In [None]:
# put everything together
# One figure per topic that has at least one relevant document.
# .items() runs on both Python 2 and Python 3 (iteritems() is Python-2-only);
# sorting by query id makes the order of the figures deterministic.
for queryId, queryInfo in sorted(qidInfo.items()):
    if queryId in qid2rel_points:

        par2totfreq, ipar2totfreq = countRatio2(queryId, qid2rel_points, qid2irr_points)

        # .get(..., {}) keeps a query that lacks one of the two document
        # classes from raising KeyError; its columns simply come out as NaN.
        df = pd.DataFrame([
            par2totfreq,
            qid2rel_numpar.get(queryId, {}),
            ipar2totfreq,
            qid2irr_numpar.get(queryId, {}),
        ]).T
        df['RelRation'] = df[0] / df[1]
        df['IRelRation'] = df[2] / df[3]

        plot4D(df, queryInfo)