In [1]:
!pip install python-terrier
#!pip install --upgrade git+https://github.com/terrier-org/pyterrier.git#egg=python-terrier

Collecting python-terrier
  Downloading python-terrier-0.7.2.tar.gz (95 kB)
[?25l[K     |███▍                            | 10 kB 26.7 MB/s eta 0:00:01[K     |██████▉                         | 20 kB 9.5 MB/s eta 0:00:01[K     |██████████▎                     | 30 kB 8.0 MB/s eta 0:00:01[K     |█████████████▊                  | 40 kB 7.3 MB/s eta 0:00:01[K     |█████████████████▏              | 51 kB 4.3 MB/s eta 0:00:01[K     |████████████████████▋           | 61 kB 4.5 MB/s eta 0:00:01[K     |████████████████████████        | 71 kB 4.4 MB/s eta 0:00:01[K     |███████████████████████████▌    | 81 kB 5.0 MB/s eta 0:00:01[K     |███████████████████████████████ | 92 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████████████| 95 kB 2.3 MB/s 
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Collecting pyjnius~=1.3.0
  Downloading pyjnius-1.3.0-cp37-cp37m-manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 25.9 MB/s 
[?25hCollecting

In [2]:
import pyterrier as pt
# Initialise PyTerrier only when it has not been started yet, so re-running this
# cell in a live kernel does not call pt.init() a second time.
if not pt.started():
  pt.init()

terrier-assemblies 5.6 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.6 jar not found, downloading to /root/.pyterrier...
Done
PyTerrier 0.7.2 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


In [3]:
import pandas as pd
import shutil

# Remove any index left over from a previous run so indexing starts from an
# empty directory. shutil.rmtree replaces the `rm -rf` shell-out: it is portable
# and, with ignore_errors=True, is likewise a no-op when the directory is absent.
shutil.rmtree("./pd_index", ignore_errors=True)
pd_indexer = pt.DFIndexer("./pd_index")

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data files read below live there.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import torch
import numpy as np
import random
from tqdm import tqdm
import re

def same_seeds(seed):
    """Seed the torch, NumPy and stdlib ``random`` generators with ``seed``."""
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)


same_seeds(42)

def mean_average_precision(df, ans, k=50, verbose=True):
    """Compute mean average precision at cut-off ``k`` (MAP@k).

    Queries are matched through the index: every index label of ``df`` is
    looked up in ``ans`` via ``ans.loc[label, "doc"]``, so both frames must
    share the same index labels.

    Args:
        df: DataFrame whose "doc" column holds, per query, a whitespace-separated
            ranked list of document ids (best first).
        ans: DataFrame whose "doc" column holds, per query, the
            whitespace-separated ids of the relevant documents.
        k: Rank cut-off; only the first ``k`` predictions are scored and the
            average precision is normalised by ``min(#relevant, k)``.
        verbose: When true, print the hit count and ground-truth size per query.

    Returns:
        The MAP@k as a float; ``0.0`` when ``df`` has no rows.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    if len(df) == 0:
        # Nothing to average; the unguarded division would raise ZeroDivisionError.
        return 0.0

    total_ap = 0.0
    for query_id, row in df.iterrows():
        ranked_docs = row["doc"].split()[:k]
        relevant = set(ans.loc[query_id, "doc"].split())
        hits = 0
        precision_sum = 0.0
        for rank, doc in enumerate(ranked_docs, start=1):
            if doc in relevant:
                hits += 1
                # Precision at this rank, accumulated only at relevant positions.
                precision_sum += hits / rank
        if verbose:
            print(f'predict:{hits}, ground:{len(relevant)}')
        normaliser = min(len(relevant), k)
        # A query without any relevant document contributes an AP of 0
        # instead of dividing by zero.
        if normaliser > 0:
            total_ap += precision_sum / normaliser
    return total_ap / len(df)

In [6]:
# Ground-truth file for the training topics (used below as `ans` when computing MAP).
# The path is relative, so it only resolves to the Drive mount under /content/drive
# while the working directory is /content.
train_ans = pd.read_csv('drive/My Drive/data/IR2021/train_ans.csv')

In [7]:
# Load the document collection and the train/test query sets in one pass.
document, train_query, test_query = map(pd.read_csv, (
    'drive/My Drive/data/IR2021/document.csv',
    'drive/My Drive/data/IR2021/train_query.csv',
    'drive/My Drive/data/IR2021/test_query.csv',
))

In [8]:
# Cast the id and text columns of each frame to the generic object dtype.
document = document.astype(dict.fromkeys(["doc", "document"], object))
train_query = train_query.astype(dict.fromkeys(["topic", "train_query"], object))
test_query = test_query.astype(dict.fromkeys(["topic", "test_query"], object))

In [9]:
# Document ids as strings, plus a synthetic "url<id>" label for each document.
docno = [str(doc_id) for doc_id in document['doc']]
url = [f'url{int(doc_id)}' for doc_id in docno]

In [10]:
# Frame handed to the indexer: one row per document with its id, url label and text.
df = pd.DataFrame(dict(
    docno=docno,
    url=url,
    text=document['document'].values,
))

In [11]:
# Index the document text; docno and url are passed along as keyword
# metadata fields for each document.
meta_fields = dict(docno=docno, url=url)
indexref2 = pd_indexer.index(df["text"], **meta_fields)

In [12]:
def _strip_punctuation(queries):
    """Return ``queries`` with every character that is neither a word character
    nor whitespace removed (nothing is inserted in its place)."""
    return [re.sub(r'[^\w\s]', '', text) for text in queries]


new_train_query = _strip_punctuation(train_query['train_query'])
new_test_query = _strip_punctuation(test_query['test_query'])

In [13]:
# Topic frames with `qid` and `query` columns, built from the cleaned query strings.
train_topics = pd.DataFrame(dict(
    qid=train_query['topic'].values,
    query=new_train_query,
))

test_topics = pd.DataFrame(dict(
    qid=test_query['topic'].values,
    query=new_test_query,
))

In [14]:
# The retrievers and the pipeline do not depend on the query, so build them once
# instead of on every loop iteration.
bm25 = pt.BatchRetrieve(indexref2, wmodel="BM25")
dph = pt.BatchRetrieve(indexref2, wmodel="DPH")
dirichlet_lm = pt.BatchRetrieve(indexref2, wmodel="DirichletLM")
# BM25 retrieves the candidates; `**` (feature union) then attaches the DPH and
# DirichletLM scores to them.
# NOTE(review): no re-ranker follows the feature union, so the final order is
# presumably still the BM25 order — confirm this is intended.
pipeline = bm25 >> (dph ** dirichlet_lm)

pyterrier_result = []
for query in tqdm(train_topics['query'].values):
    # Keep the top 50 documents, the same cut-off mean_average_precision applies.
    top_docnos = pipeline.search(query)['docno'].values[:50]
    pyterrier_result.append(' '.join(top_docnos))

# NOTE(review): results follow the row order of train_query, while the topic
# labels and the ground truth come from train_ans; this assumes both files list
# the topics in the same order — confirm.
pyt_df = pd.DataFrame({'topic':train_ans['topic'], 'doc':pyterrier_result})
print()
print('MAP:', mean_average_precision(pyt_df, train_ans))

100%|██████████| 15/15 [00:16<00:00,  1.13s/it]


predict:15, ground:34
predict:1, ground:18
predict:21, ground:64
predict:11, ground:51
predict:12, ground:56
predict:10, ground:50
predict:15, ground:54
predict:13, ground:50
predict:8, ground:62
predict:2, ground:8
predict:6, ground:52
predict:10, ground:58
predict:12, ground:73
predict:22, ground:56
predict:14, ground:34
MAP: 0.11617588963502402





In [15]:
# Build the retrievers and the weighted score fusion once; they are the same
# for every query.
bm25 = pt.BatchRetrieve(indexref2, wmodel="BM25")
dph = pt.BatchRetrieve(indexref2, wmodel="DPH")
pl2 = pt.BatchRetrieve(indexref2, wmodel="PL2")
# NOTE(review): this fusion differs from the pipeline evaluated on the training
# topics, so the MAP reported there does not describe these runs — confirm.
pipeline = 0.2*bm25 + 0.5*dph + 0.5*pl2

test_pyterrier_result = []
for query in tqdm(test_topics['query'].values):
    results = pipeline.search(query)
    # Do not rely on the row order of the fused frame: order explicitly by the
    # combined score (stable sort, so ties keep their incoming order).
    ranked = results.sort_values('score', ascending=False, kind='mergesort')
    # NOTE(review): unlike the training run, the list is not cut to 50 documents
    # — confirm the submission format expects the full ranking.
    test_pyterrier_result.append(' '.join(ranked['docno'].values))
pyt_df_test = pd.DataFrame({'topic':test_query['topic'], 'doc':test_pyterrier_result})

100%|██████████| 15/15 [00:11<00:00,  1.30it/s]


In [16]:
# Write the test-topic rankings to test_pyterrier.csv, omitting the DataFrame index.
pyt_df_test.to_csv('test_pyterrier.csv', index=False)

In [17]:
# Preview the first rows of the frame that was written out.
pyt_df_test.head()

Unnamed: 0,topic,doc
0,1,3767862 3457840 2515291 3471405 2813652 448018...
1,3,3169500 4173523 4666222 3354941 3230995 448175...
2,5,4050073 3206111 3920447 4514331 2876706 420691...
3,7,4332755 2841416 2867990 1560153 3269105 387851...
4,8,4332755 3139186 3193777 3876300 2571050 443090...
