In [2]:
%load_ext autoreload
%autoreload 2

from time import time
import pandas as pd
import numpy as np
import os
from collections import Counter, defaultdict

In [3]:
import pyterrier as pt
# Initialise PyTerrier (the cell output reports which Terrier version was loaded).
pt.init()
# Alternative initialisation that additionally requests the terrier-prf boot package (disabled):
# pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

PyTerrier 0.7.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
from jnius import autoclass
# Load Terrier's Java Tokeniser class through the jnius bridge and ask it for a
# tokeniser instance; terrier_tokenizer() uses this module-level object.
tokeniser = autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()

In [5]:
def terrier_tokenizer(raw_utterance):
    """Tokenise an utterance with the Terrier tokeniser and re-join the tokens.

    Args:
        raw_utterance: The raw text to tokenise.

    Returns:
        One string containing the tokens returned by the module-level
        ``tokeniser``, separated by single spaces.
    """
    tokens = tokeniser.getTokens(raw_utterance)
    return " ".join(tokens)

# Retrieve docs per query

In [6]:
# Open the Terrier index described by the data.properties file below.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable constant.
# NOTE(review): the index directory is named CAST2020 while the qrels/topics loaded in this
# notebook are 2019 files — confirm this combination is intended.
index_ref = pt.IndexRef.of("/data3/muntean/conversational-cache/indexes/CAST2020-stemmed/data.properties")
index = pt.IndexFactory.of(index_ref)

# Handles to the direct index, document index and lexicon of the opened index.
# They are not used by any of the cells visible in this part of the notebook.
di = index.getDirectIndex()
doi = index.getDocumentIndex()
lex = index.getLexicon()

# Print the collection statistics (documents, terms, postings, tokens, ...).
print(index.getCollectionStatistics())

14:42:50.397 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading lookup file directly from disk (SLOW) - try index.meta.index-source=fileinmem in the index properties file. 294.8 MiB of memory would be required.
14:42:50.431 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 2 GiB of memory would be required.
Number of documents: 38636520
Number of terms: 9333281
Number of postings: 1052145306
Number of fields: 0
Number of tokens: 1300558544
Field names: []
Positions:   false



In [7]:
# Load the qrels file (space-separated, no header row): cast column 3 to int,
# drop column 1, and name the three remaining columns.
qrel_path = "../data/CAST_qrels/qrels-docs.2019.txt"
qrels_df = (
    pd.read_csv(qrel_path, delimiter=" ", header=None)
    .astype({3: int})
    .drop(columns=[1])
)
qrels_df.columns = ["qid", "docno", "label"]
qrels = qrels_df

In [8]:
topics_path = '../data/CAST-2019/test_manual_utterance.tsv'  # manual

# The topics file is tab-separated and has no header row.
topics_df = pd.read_csv(topics_path, delimiter="\t", header=None)

# Store the tokenised form of column 1 in a new column 2, then discard the raw
# text so that only the id and the tokenised query remain.
topics_df[2] = topics_df[1].apply(terrier_tokenizer)
topics_df = topics_df.drop(columns=[1])
topics_df.columns = ["qid", "query"]

topics = topics_df
topics.head()

Unnamed: 0,qid,query
0,31_1,what is throat cancer
1,31_2,is throat cancer treatable
2,31_3,tell me about lung cancer
3,31_4,what are lung cancer s symptoms
4,31_5,can lung cancer spread to the throat


In [9]:
%%time
# Retrieval with the DPH weighting model, asking for 2000 results per query.
DPH_2000 = pt.BatchRetrieve(index, wmodel="DPH", num_results=2000)  
# Run the retrieval for every topic; the recorded wall time for this cell is several minutes.
res_2000 = DPH_2000.transform(topics)

CPU times: user 1min 54s, sys: 3.45 s, total: 1min 57s
Wall time: 3min 24s


In [None]:
# %%time
# pt.Experiment([DPH_2000], topics, qrels, names=["DPH_2000"], 
#               eval_metrics=["map", "recip_rank", "recall_200", "P_3", "P_1", "ndcg_cut_3"]) 

In [10]:
# Series.astype returns a new Series, so the result has to be assigned back
# for the integer cast of the rank column to take effect.
res_2000['rank'] = res_2000['rank'].astype(int)

# Build {qid: [docno, ...]} for the qids present in the topics. Every retrieved
# docno is kept (no top-k cut-off here); within each qid the list follows the
# row order of res_2000.
qid_list = list(topics["qid"])
qid_retrieve_docids_df = res_2000.loc[res_2000['qid'].isin(qid_list)]
qid_retrieve_docids_df = qid_retrieve_docids_df.groupby('qid')['docno'].apply(list)
qid_retrieve_docids_dict = qid_retrieve_docids_df.to_dict()

In [11]:
# Sanity check: number of docnos collected for utterance 31_1.
len(qid_retrieve_docids_dict["31_1"])

2000

In [12]:
### REMEMBER to check the YEAR

# Count, for every utterance id, the judged documents whose relevance label is
# positive. Utterances without any positive judgement get no entry at all.
qid_rel_dict = defaultdict(int)
path = "../data/CAST_qrels/"
with open(path+"qrels-docs.2019.txt", 'r') as f:
    for line in f:
        # split() with no argument tolerates repeated spaces/tabs and the
        # trailing newline, and yields [] for a blank line.
        line_fields = line.split()
        if not line_fields:
            # A blank line (e.g. at the end of the file) carries no judgement.
            continue
        utt_id = line_fields[0]
        relevance = int(line_fields[3])
        if relevance > 0:
            qid_rel_dict[utt_id] += 1

In [13]:
# Display the number of positively judged documents per utterance id.
qid_rel_dict

defaultdict(int,
            {'31_1': 89,
             '31_2': 77,
             '31_3': 171,
             '31_4': 98,
             '31_5': 58,
             '31_6': 63,
             '31_7': 58,
             '31_8': 68,
             '31_9': 55,
             '32_1': 96,
             '32_2': 67,
             '32_3': 98,
             '32_4': 17,
             '32_5': 43,
             '32_6': 17,
             '32_7': 88,
             '32_8': 17,
             '32_9': 20,
             '32_10': 16,
             '32_11': 9,
             '33_1': 46,
             '33_2': 17,
             '33_3': 16,
             '33_4': 4,
             '33_5': 35,
             '33_6': 24,
             '33_7': 9,
             '33_8': 31,
             '34_1': 71,
             '34_2': 32,
             '34_3': 30,
             '34_4': 52,
             '34_5': 25,
             '34_6': 30,
             '34_7': 32,
             '34_8': 21,
             '37_1': 58,
             '37_2': 22,
             '37_3': 37,
        

In [14]:
# Functions to compute the overlap between the top-k results of the current turn and the top-k results of the previous turns

def intersection(lst1, lst2):
    """Return the set of elements that occur in both ``lst1`` and ``lst2``.

    Duplicates are collapsed because the result is a set.
    """
    common = set(lst1)
    common.intersection_update(lst2)
    return common

def compute_overapping_documents_set(conv_id, curr_turn, f, qid_rel_dict, utt_docs=None):
    """Write the overlap between a turn's documents and those of all earlier turns.

    One tab-separated line is written to ``f``:
    ``<utt_id>\\t<overlap>\\t<#relevant>\\t<normalized overlap>``, where the
    overlap is the number of distinct documents of the current turn that also
    appear in any previous turn of the same conversation. For every previous
    turn that has no entry in the mapping, a ``"<utt_id> not found"`` line is
    written first.

    Args:
        conv_id: Conversation identifier (the part before ``_`` in the utterance id).
        curr_turn: 1-based turn number (int) of the utterance to analyse.
        f: Writable text file-like object receiving the output lines.
        qid_rel_dict: Mapping utterance id -> number of relevant documents.
            It is only read, never modified.
        utt_docs: Mapping utterance id -> iterable of document ids. When omitted,
            the notebook-level ``map_utt_docs`` is used.
            NOTE(review): ``map_utt_docs`` is not defined in the cells visible
            here — confirm where it comes from or pass ``utt_docs`` explicitly.

    Raises:
        KeyError: If the current utterance id is missing from the mapping.
    """
    if utt_docs is None:
        # Backward-compatible fallback to the global used by the original code.
        utt_docs = map_utt_docs

    utt_id = str(conv_id)+"_"+str(curr_turn)
    docs_current_turn = utt_docs[utt_id]

    # Union of the documents of all previous turns of this conversation.
    docs_so_far = set()
    for turn_id in range(1, curr_turn):
        prev_utt_id = str(conv_id)+"_"+str(turn_id)
        if prev_utt_id in utt_docs:
            docs_so_far.update(utt_docs[prev_utt_id])
        else:
            f.write(prev_utt_id + " not found\n")

    total_overlapping = len(docs_so_far.intersection(docs_current_turn))

    # Use .get() rather than indexing: indexing a defaultdict inserts the missing
    # key, which would make a later call for the same utterance report
    # "0 relevants" instead of "not in qrel".
    num_relevant = qid_rel_dict.get(utt_id, 0)
    if utt_id not in qid_rel_dict:
        normalized_overlapping = "not in qrel"
    elif num_relevant > 0:
        normalized_overlapping = total_overlapping/num_relevant
    else:
        normalized_overlapping = "0 relevants"

    f.write(utt_id+"\t"+str(total_overlapping)+"\t"+str(num_relevant)+"\t"+str(normalized_overlapping)+"\n")

In [15]:
def compute_overapping_with_first_utt_set(conv_id, curr_turn, f, qid_rel_dict, qid_retrieve_docids_dict):
    """Write the overlap between a turn's documents and the first turn's documents.

    One tab-separated line is written to ``f``:
    ``<utt_id>\\t<overlap>\\t<#relevant>\\t<normalized overlap>``, where the
    overlap is the number of distinct documents shared by the current turn and
    turn 1 of the same conversation. The normalized value is the overlap divided
    by the number of relevant documents of the current utterance,
    ``"not in qrel"`` when the utterance has no entry in ``qid_rel_dict``, or
    ``"0 relevants"`` when its entry is not positive.

    Args:
        conv_id: Conversation identifier (the part before ``_`` in the utterance id).
        curr_turn: Turn number of the utterance to analyse.
        f: Writable text file-like object receiving the output line.
        qid_rel_dict: Mapping utterance id -> number of relevant documents.
            It is only read, never modified.
        qid_retrieve_docids_dict: Mapping utterance id -> iterable of document ids.

    Raises:
        KeyError: If the current utterance or the conversation's first
            utterance is missing from ``qid_retrieve_docids_dict``.
    """
    utt_id = str(conv_id)+"_"+str(curr_turn)
    docs_current_turn = qid_retrieve_docids_dict[utt_id]
    docs_first_turn = qid_retrieve_docids_dict[str(conv_id)+"_1"]

    # Set semantics: duplicated document ids are counted once.
    total_overlapping = len(set(docs_first_turn).intersection(docs_current_turn))

    # Use .get() rather than indexing: indexing a defaultdict inserts the missing
    # key, which would make a later call for the same utterance report
    # "0 relevants" instead of "not in qrel".
    num_relevant = qid_rel_dict.get(utt_id, 0)
    if utt_id not in qid_rel_dict:
        normalized_overlapping = "not in qrel"
    elif num_relevant > 0:
        normalized_overlapping = total_overlapping/num_relevant
    else:
        normalized_overlapping = "0 relevants"

    f.write(utt_id+"\t"+str(total_overlapping)+"\t"+str(num_relevant)+"\t"+str(normalized_overlapping)+"\n")

In [16]:
# main (with sets, no duplicates): to run after loading data

# Write one overlap line per retrieved utterance, comparing each turn with the
# first turn of its conversation.
path = "../data/proof_of_concepts_preliminary_results/retrieved/top2000/"
out_name = "normalized-overalapping-sets-with-first-utt-docs.res_2019_manual.txt"
with open(path + out_name, 'w') as f:
    for el in qid_retrieve_docids_dict:
        # Utterance ids have the form "<conversation>_<turn>".
        id_parts = el.split("_")
        conv_id, curr_turn = int(id_parts[0]), int(id_parts[1])
        compute_overapping_with_first_utt_set(
            conv_id, curr_turn, f, qid_rel_dict, qid_retrieve_docids_dict
        )