In [1]:
import json
import pandas as pd
import numpy as np
from collections import Counter
import pyterrier as pt
import os
from sklearn.model_selection import train_test_split
import re

In [2]:
from sklearn.ensemble import RandomForestRegressor
import fastrank

In [3]:
import tqdm

In [4]:
# Tell PyTerrier which JDK to use. A raw string keeps the Windows backslashes
# literal; in a normal string "\P" and "\J" are invalid escape sequences that
# newer Python versions warn about.
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-19"

In [5]:
# Initialise PyTerrier only when it has not been started yet, so that
# re-running this cell in a live kernel does not call pt.init() twice.
if not pt.started():
    pt.init()

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



## Read the Original Data

In [6]:
# Load the pre-selected arXiv article metadata.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the file is
# decoded the same way on every platform instead of using the locale default.
with open("selected_articles.json", 'r', encoding="utf-8") as file_read:
    selected_articles = json.load(file_read)

In [7]:
# Put the loaded article metadata into a DataFrame; the bare name on the last
# line makes the notebook display it.
articles_df = pd.DataFrame(selected_articles)
articles_df 

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,0704.0046,Denes Petz,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"LATEX file, 11 pages","J. Math. Phys. 48(2007), 092102.",10.1063/1.2779138,,quant-ph cs.IT math.IT,,"In a quantum mechanical model, Diosi, Feldma...","[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2009-11-13,"[[Csiszar, I., ], [Hiai, F., ], [Petz, D., ]]"
1,0704.0047,Igor Grabec,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,"5 pages, 5 eps figures, uses IEEEtran.cls",,,,cs.NE cs.AI,,The intelligent acoustic emission locator is...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2009-09-29,"[[Kosel, T., ], [Grabec, I., ]]"
2,0704.0050,Igor Grabec,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,"5 pages, 7 eps figures, uses IEEEtran.cls",,,,cs.NE cs.AI,,Part I describes an intelligent acoustic emi...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2007-05-23,"[[Kosel, T., ], [Grabec, I., ]]"
3,0704.0098,Jack Raymond,"Jack Raymond, David Saad",Sparsely-spread CDMA - a statistical mechanics...,"23 pages, 5 figures, figure 1 amended since pu...",J. Phys. A: Math. Theor. 40 No 41 (12 October ...,10.1088/1751-8113/40/41/004,,cs.IT math.IT,,"Sparse Code Division Multiple Access (CDMA),...","[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2009-11-13,"[[Raymond, Jack, ], [Saad, David, ]]"
4,0704.0217,Wiroonsak Santipach,Wiroonsak Santipach and Michael L. Honig,Capacity of a Multiple-Antenna Fading Channel ...,,"IEEE Trans. Inf. Theory, vol. 55, no. 3, pp. 1...",10.1109/TIT.2008.2011437,,cs.IT math.IT,http://arxiv.org/licenses/nonexclusive-distrib...,Given a multiple-input multiple-output (MIMO...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2010-08-27,"[[Santipach, Wiroonsak, ], [Honig, Michael L., ]]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219356,quant-ph/0703112,Martin Roetteler,"Markus Grassl, Andreas Klappenecker, Martin Ro...","Graphs, Quadratic Forms, and Quantum Codes","5 pages, 2 figures, paper presented at the 200...",Proceedings 2002 IEEE International Symposium ...,10.1109/ISIT.2002.1023317,,quant-ph cs.IT math.IT,,We show that any stabilizer code over a fini...,"[{'version': 'v1', 'created': 'Tue, 13 Mar 200...",2009-05-24,"[[Grassl, Markus, ], [Klappenecker, Andreas, ]..."
219357,quant-ph/0703113,Martin Roetteler,"Salah A. Aly, Markus Grassl, Andreas Klappenec...",Quantum Convolutional BCH Codes,"4 pages, minor changes, accepted for publicati...",Proceedings 2007 Canadian Workshop on Informat...,10.1109/CWIT.2007.375730,,quant-ph cs.IT math.IT,,Quantum convolutional codes can be used to p...,"[{'version': 'v1', 'created': 'Tue, 13 Mar 200...",2009-05-24,"[[Aly, Salah A., ], [Grassl, Markus, ], [Klapp..."
219358,quant-ph/0703181,Martin Roetteler,"Markus Grassl, Martin Roetteler",Quantum Block and Convolutional Codes from Sel...,"5 pages, paper presented at the 2005 IEEE Inte...",Proceedings 2005 IEEE International Symposium ...,10.1109/ISIT.2005.1523493,,quant-ph cs.IT math.IT,,We present a construction of self-orthogonal...,"[{'version': 'v1', 'created': 'Mon, 19 Mar 200...",2009-05-24,"[[Grassl, Markus, ], [Roetteler, Martin, ]]"
219359,quant-ph/0703182,Martin Roetteler,"Markus Grassl, Martin Roetteler",Constructions of Quantum Convolutional Codes,"5 pages, to appear in the Proceedings of the 2...",Proceedings 2007 IEEE International Symposium ...,10.1109/ISIT.2007.4557325,,quant-ph cs.IT math.IT,,We address the problems of constructing quan...,"[{'version': 'v1', 'created': 'Mon, 19 Mar 200...",2009-05-24,"[[Grassl, Markus, ], [Roetteler, Martin, ]]"


## Read the Annotated Data

In [8]:
# Read the annotated query/document judgements and discard the first column of
# the CSV, keeping every other column as-is.
annotations_with_first_col = pd.read_csv("annotate.csv")
data_df = annotations_with_first_col.drop(columns=annotations_with_first_col.columns[0])
data_df

Unnamed: 0,qid,docid,docno,query,authors,title,abstract,label
0,12,191618,2204.01848,COVID-19 and social media,"Ekagra Ranjan, Naman Poddar",Multilingual Abusiveness Identification on Cod...,Social Media platforms have been seeing adop...,1.0
1,18,57973,1709.07952,infomation retrieval time complexity,Julien Lavauzelle,Private Information Retrieval from Transversal...,Private information retrieval (PIR) protocol...,3.0
2,10,87914,1905.03836,web archive,"Mohamed Aturban, Michael L. Nelson, Michele C....",Collecting 16K archived web pages from 17 publ...,We document the creation of a data set of 16...,5.0
3,18,27576,1407.1065,infomation retrieval time complexity,"Emmanuel Candes, Xiaodong Li, Mahdi Soltanolko...",Phase Retrieval via Wirtinger Flow: Theory and...,We study the problem of recovering the phase...,0.0
4,5,28542,1409.0203,matrix completion,"Mohammad J. Taghizadeh, Reza Parhizkar, Philip...",Ad Hoc Microphone Array Calibration: Euclidean...,This paper addresses the problem of ad hoc m...,3.0
...,...,...,...,...,...,...,...,...
1003,17,212698,2210.05577,robustness of neural networks,"Nikolaos Tsilivis, Julia Kempe",What Can the Neural Tangent Kernel Tell Us Abo...,The adversarial vulnerability of neural nets...,5.0
1004,18,7182,1012.5208,information retrieval time complexity,"Nadia Baaziz, Omar Abahmane and Rokia Missaoui",Texture feature extraction in the spatial-freq...,The advent of large scale multimedia databas...,2.0
1005,18,28575,1409.0749,information retrieval time complexity,Vikas Verma,Image Retrieval And Classification Using Local...,Content Based Image Retrieval(CBIR) is one o...,3.0
1006,18,45997,1609.07027,information retrieval time complexity,"Simon R. Blackburn, Tuvi Etzion, Maura B. Pate...",PIR schemes with small download complexity and...,In the classical model for (information theo...,3.0


In [9]:
# Because of a formatting problem, some docno values in the annotated frame
# differ from the original article ids. Repair them by looking the article up
# by its title.

# Both lookups are built once up front: testing membership against
# articles_df.id.values and masking articles_df by title for every annotated
# row rescans the whole article table each time.
_known_article_ids = set(articles_df.id.values)

_article_id_by_title = {}
for _title, _article_id in zip(articles_df.title.values, articles_df.id.values):
    # setdefault keeps the first article seen for a title, which is the same
    # row that `.values[0]` on a title mask would pick.
    _article_id_by_title.setdefault(_title, _article_id)


def modify_docno(row):
    """Return a docno for ``row`` that is guaranteed to be an id of ``articles_df``.

    Parameters
    ----------
    row : a row of ``data_df`` exposing ``docno`` and ``title`` attributes.

    Returns
    -------
    ``row.docno`` unchanged when it already is a known article id; otherwise
    the id of the first article whose title equals ``row.title``.

    Raises
    ------
    IndexError
        If the docno is unknown and no article has the row's title.
    """
    if row.docno in _known_article_ids:
        return row.docno
    try:
        return _article_id_by_title[row.title]
    except KeyError:
        # Same exception type as indexing an empty match, but with context.
        raise IndexError(
            "no article matches docno %r or title %r" % (row.docno, row.title)
        ) from None


data_df["docno"] = data_df.apply(modify_docno, axis = 1)

In [10]:
# Relevance judgements: one (qid, docno, label) row per annotation, with the
# query id stored as a string and the label as an integer.
qrels = (
    data_df
    .loc[:, ["qid", "docno", "label"]]
    .astype({'qid': 'string', 'label': 'int'})
)
qrels

Unnamed: 0,qid,docno,label
0,12,2204.01848,1
1,18,1709.07952,3
2,10,1905.03836,5
3,18,1407.1065,0
4,5,1409.0203,3
...,...,...,...
1003,17,2210.05577,5
1004,18,1012.5208,2
1005,18,1409.0749,3
1006,18,1609.07027,3


In [11]:
# Documents to index: the distinct (docno, abstract) pairs among the annotations.
docno_abstract_pairs = data_df.loc[:, ["docno", "abstract"]]
document_df = docno_abstract_pairs.drop_duplicates()
document_df

Unnamed: 0,docno,abstract
0,2204.01848,Social Media platforms have been seeing adop...
1,1709.07952,Private information retrieval (PIR) protocol...
2,1905.03836,We document the creation of a data set of 16...
3,1407.1065,We study the problem of recovering the phase...
4,1409.0203,This paper addresses the problem of ad hoc m...
...,...,...
1003,2210.05577,The adversarial vulnerability of neural nets...
1004,1012.5208,The advent of large scale multimedia databas...
1005,1409.0749,Content Based Image Retrieval(CBIR) is one o...
1006,1609.07027,In the classical model for (information theo...


In [12]:
pt_index_path = "F:/2022Fall/SI650/Project/arxiv_index"
_index_properties_file = pt_index_path + "/data.properties"

# Reuse the index on disk when its properties file is present; otherwise build
# a new index (with block/position information) from the abstracts.
if os.path.exists(_index_properties_file):
    index_ref = pt.IndexRef.of(_index_properties_file)
else:
    indexer = pt.DFIndexer(pt_index_path, overwrite=True, blocks = True)
    index_ref = indexer.index(document_df["abstract"], document_df["docno"])

index = pt.IndexFactory.of(index_ref)


In [13]:
# Print the index's collection statistics as a quick check of what was indexed.
print(index.getCollectionStatistics().toString())

Number of documents: 992
Number of terms: 5695
Number of postings: 65766
Number of fields: 0
Number of tokens: 100128
Field names: []
Positions:   true



In [14]:
# The evaluation queries. Order matters: the position of a query in this list
# becomes its qid when query_df is built from the list's index.
queries = ["learning to rank with partitioned preference",
           "advanced search engine",
           "random forests",
           "database management system",
           "pre-trained language model",
           "matrix completion",
           "query expansion for information retrieval",
           "gradient boosting",
           "node embedding for graph",
           "language model for long documents",
           "web archive",
           "PageRank for web search",
           "COVID-19 and social media",
           "social network analysis with natural language processing",
           "text summarization model",
           "relevance feedback for information retrieval",
           "activation function in neural networks",
           "robustness of neural networks",
           "information retrieval time complexity",
           "artificial intelligence for low carbon"
]

In [15]:
# Topic table: one row per query, with qid taken from the query's position in
# `queries` and stored as a string.
topics_with_int_qid = pd.DataFrame({"query": queries})
topics_with_int_qid["qid"] = topics_with_int_qid.index
query_df = topics_with_int_qid.astype({'qid': 'string'})
query_df

Unnamed: 0,query,qid
0,learning to rank with partitioned preference,0
1,advanced search engine,1
2,random forests,2
3,database management system,3
4,pre-trained language model,4
5,matrix completion,5
6,query expansion for information retrieval,6
7,gradient boosting,7
8,node embedding for graph,8
9,language model for long documents,9


## Define and Test Baseline Models

In [16]:
# Fixed seed: reused for the topic split here and for the random forests later.
SEED=42
# Hold out 8 of the queries as test topics; the remaining queries are used for training.
train_topics, test_topics = train_test_split(query_df, test_size= 8 , random_state=SEED)
train_topics

Unnamed: 0,query,qid
18,information retrieval time complexity,18
16,activation function in neural networks,16
13,social network analysis with natural language ...,13
2,random forests,2
9,language model for long documents,9
19,artificial intelligence for low carbon,19
4,pre-trained language model,4
12,COVID-19 and social media,12
7,gradient boosting,7
10,web archive,10


In [17]:
# Baseline retrievers over the same index, differing only in the weighting model.
tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [18]:
# Evaluate the two baselines on the held-out test topics against the annotated
# judgements, reporting MAP and nDCG (full, @5 and @10).
pt.Experiment(
    [tfidf, bm25],
    test_topics,
    qrels,
    eval_metrics=["map", "ndcg", "ndcg_cut_5", "ndcg_cut_10"],
    names=["TF_IDF", "BM25"],
)

Unnamed: 0,name,map,ndcg,ndcg_cut_5,ndcg_cut_10
0,TF_IDF,0.937215,0.925268,0.832472,0.798361
1,BM25,0.929768,0.924852,0.827465,0.787191


## New Model

In [19]:
# Position feature: how early in the document the term first occurs.
def position_score(keyFreq, posting, entryStats, collStats):
    """Score one posting as ``1 - first_position / document_length``.

    The first position is parsed out of the posting's string form: the text
    inside the first ``[...]`` group, up to the first comma. A term at position
    0 scores 1, and the score falls as the first occurrence moves later.

    ``keyFreq``, ``entryStats`` and ``collStats`` are accepted because the
    function is used as a weighting-model callback, but they are not read.
    """
    doc_len = posting.getDocumentLength()
    bracket_groups = re.findall(r'\[(.*?)\]', posting.toString())
    earliest_offset = int(bracket_groups[0].split(",")[0])
    return 1 - earliest_offset/(doc_len)

def custom_weighting(keyFreq, posting, entryStats, collStats):
    """BM25-style score for one posting, scaled by how early the term first appears.

    The result is the product of four factors:

    * ``idf``: ``log((N - dft + 0.5) / (dft + 0.5))``; this is negative when the
      term occurs in more than half of the documents.
    * ``tf``: term frequency saturated with ``k1`` and length-normalised with ``b``.
    * ``qtf``: query-term frequency saturated with ``k3``.
    * ``c_n``: ``1 - first_position / document_length``, where the first
      position is parsed from the first ``[...]`` group of ``posting.toString()``.

    Parameters
    ----------
    keyFreq : frequency of the term in the query.
    posting : posting for (term, document); must provide ``getFrequency()``,
        ``getDocumentLength()`` and ``toString()``.
    entryStats : term statistics; must provide ``getDocumentFrequency()``.
    collStats : collection statistics; must expose ``numberOfDocuments`` and
        ``averageDocumentLength``.

    Returns
    -------
    The score ``idf * tf * qtf * c_n``.
    """
    # Fixed model parameters.
    k1 = 1.2
    b = 0.25
    k3 = 8

    dft = entryStats.getDocumentFrequency()
    N = collStats.numberOfDocuments
    ctd = posting.getFrequency()
    dlength = posting.getDocumentLength()
    avg_length = collStats.averageDocumentLength
    # The collection-wide term frequency is intentionally not fetched: the
    # formula never uses it, and this function runs once per scored posting.
    first_pos = int(re.findall(r'\[(.*?)\]', posting.toString())[0].split(",")[0]) # get the first position

    idf = np.log(( N - dft +0.5)/(dft+0.5))
    tf = (k1+1)*ctd/(k1*(1 - b + b*dlength/avg_length) + ctd)
    qtf = (k3+1)*keyFreq/(k3+keyFreq)
    c_n = 1 - first_pos/(dlength)

    return idf*tf*qtf*c_n

In [20]:
# First-stage retriever scored with the custom_weighting function instead of a
# built-in weighting model.
new_score = pt.BatchRetrieve(index, wmodel = custom_weighting)

# NOTE(review): base_IR is not referenced by any other cell visible in this
# notebook; presumably `|` is PyTerrier's set-union of the two result lists — confirm.
base_IR = bm25 | tfidf

# define two types of the query expansion for later comparison
# Each pipeline runs left to right: retrieve with new_score, rewrite the query
# with the expansion model, then retrieve again with BM25.
bo1 = pt.rewrite.Bo1QueryExpansion(index)
begin_pip_bo1 = new_score >> bo1 >> bm25

kl = pt.rewrite.KLQueryExpansion(index)
begin_pip_kl = new_score >> kl >> bm25

In [21]:
## Add other features 
# Extra scorers used as learning-to-rank features: two more built-in weighting
# models and the position_score function.
CM = pt.BatchRetrieve(index, wmodel = "CoordinateMatch")
dlh = pt.BatchRetrieve(index, wmodel="DLH13")
pos_score = pt.BatchRetrieve(index, wmodel = position_score)

# `**` combines the five scorers so that every candidate returned by the
# query-expansion pipeline carries one feature per scorer, in the order
# bm25, tfidf, CM, dlh, pos_score.
second_pip_bo1 = begin_pip_bo1 >> (bm25**tfidf**CM**dlh**pos_score)
second_pip_kl = begin_pip_kl >> (bm25**tfidf**CM**dlh**pos_score)

In [22]:
# Slot of each tracked arXiv category inside a document's category vector.
category_dict = {"cs.AI": 0, "cs.DB":1, "cs.DL":2, "cs.IR":3, "cs.IT":4, "cs.LG":5, "cs.SI":6}

# Build an id -> categories lookup in a single pass over articles_df. Masking
# the whole article table once per document made this loop take tens of seconds.
# setdefault keeps the first row seen for an id, the same row `.values[0]` on a
# mask would pick. An id missing from articles_df now raises KeyError.
_categories_by_id = {}
for _article_id, _category_text in zip(articles_df.id.values, articles_df["categories"].values):
    _categories_by_id.setdefault(_article_id, _category_text)

# docno -> 0/1 vector with a 1 in the slot of every tracked category the
# article is filed under; categories outside category_dict are ignored.
category_doc = {}
for dn in tqdm.tqdm(document_df.docno):
    category_vector = np.zeros(len(category_dict))
    categories = _categories_by_id[dn].split(" ")
    for c in categories:
        if c in category_dict:
            category_vector[category_dict[c]] = 1

    category_doc[dn] = category_vector
    
# Category flags per query: row i belongs to the query with qid i (the matrix
# is indexed with int(qid) when the category feature is computed), and the
# seven columns follow the slot order of category_dict
# (cs.AI, cs.DB, cs.DL, cs.IR, cs.IT, cs.LG, cs.SI).
# NOTE(review): the values look hand-assigned per query — confirm their source.
category_query = np.array([[1, 0, 0, 1, 0, 1, 0],
                          [0, 0, 0, 1, 0, 0, 0],
                          [1, 0, 0, 0, 0, 1, 0],
                          [0, 1, 1, 0, 0, 0, 0],
                          [1, 0, 0, 1, 0, 1, 0],
                          [1, 0, 0, 0, 0, 1, 0],
                          [0, 0, 0, 0, 1, 1, 0],
                          [1, 0, 0, 0, 0, 1, 0],
                          [1, 0, 0, 0, 1, 1, 0],
                          [1, 0, 0, 0, 0, 1, 0],
                          [0, 0, 1, 1, 0, 0, 1],
                          [0, 0, 0, 1, 0, 0, 0],
                          [0, 0, 0, 0, 1, 0, 1],
                          [1, 0, 0, 0, 0, 1, 1],
                          [1, 0, 0, 0, 0, 1, 0],
                          [0, 0, 0, 1, 1, 0, 0],
                          [1, 0, 0, 0, 0, 1, 0],
                          [1, 0, 0, 0, 0, 1, 0],
                          [0, 0, 0, 1, 1, 0, 0],
                          [1, 0, 0, 0, 1, 1, 1]])

100%|████████████████████████████████████████████████████████████████████████████████| 992/992 [00:27<00:00, 36.67it/s]


In [23]:
# Category-agreement feature, appended after the retrieval-score features.

def add_catgory(row):
    """Return ``row.features`` with one extra value appended.

    The extra value is the fraction of category slots in which the document's
    category vector and the query's category vector hold the same flag (slots
    where both are 0 count as agreement, just like slots where both are 1).
    """
    existing_features = row.features
    doc_flags = category_doc[row.docno]
    query_flags = category_query[int(row.qid)]
    agreement = (doc_flags == query_flags).mean()
    return np.append(existing_features, agreement)

# Extend both feature pipelines with the category-agreement feature.
third_pip_bo1 = second_pip_bo1 >> pt.apply.doc_features(add_catgory)
third_pip_kl = second_pip_kl >> pt.apply.doc_features(add_catgory)

In [24]:
# Random-forest rankers on top of the full feature pipelines (Bo1 and KL variants).
# Both forests use the same hyper-parameters and seed.
_forest_settings = dict(n_estimators=400, verbose=1, random_state=SEED, n_jobs=2)

rf_bo1 = RandomForestRegressor(**_forest_settings)
rf_kl = RandomForestRegressor(**_forest_settings)

rf_pipe_bo1 = third_pip_bo1 >> pt.ltr.apply_learned_model(rf_bo1)
rf_pipe_kl = third_pip_kl >> pt.ltr.apply_learned_model(rf_kl)

# Train each pipeline on the training topics only.
rf_pipe_bo1.fit(train_topics, qrels)
rf_pipe_kl.fit(train_topics, qrels)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    1.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    1.0s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    2.1s finished


In [29]:
def _seeded_coordinate_ascent():
    """Create a coordinate-ascent train request with this notebook's settings.

    Returns the request together with its params object, on which random
    initialisation, normalisation and the fixed seed have been set.
    """
    request = fastrank.TrainRequest.coordinate_ascent()
    settings = request.params
    settings.init_random = True
    settings.normalize = True
    settings.seed = 1234567
    return request, settings


# Coordinate-ascent ranker on the Bo1 feature pipeline.
BO1_train_request, BO1_params = _seeded_coordinate_ascent()
ca_pipe_bo1 = third_pip_bo1 >> pt.ltr.apply_learned_model(BO1_train_request, form='fastrank')
ca_pipe_bo1.fit(train_topics, qrels)

# Coordinate-ascent ranker on the KL feature pipeline.
KL_train_request, KL_params = _seeded_coordinate_ascent()
ca_pipe_kl = third_pip_kl >> pt.ltr.apply_learned_model(KL_train_request, form='fastrank')
ca_pipe_kl.fit(train_topics, qrels)

In [30]:
# Compare the two baselines with the four learned pipelines (random forest and
# coordinate ascent, each with Bo1 and KL query expansion) on the test topics.
pt.Experiment(
    [tfidf, bm25, rf_pipe_bo1, rf_pipe_kl, ca_pipe_bo1, ca_pipe_kl],
    test_topics,
    qrels,
    eval_metrics=["map", "ndcg", "ndcg_cut_5", "ndcg_cut_10"],
    names=["TF_IDF", "BM25", "RF_BO1", "RF_KL", "CA_BO1", "CA_KL"],
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.0s finished


Unnamed: 0,name,map,ndcg,ndcg_cut_5,ndcg_cut_10
0,TF_IDF,0.937215,0.925268,0.832472,0.798361
1,BM25,0.929768,0.924852,0.827465,0.787191
2,RF_BO1,0.855165,0.870023,0.671123,0.671994
3,RF_KL,0.856116,0.86077,0.605083,0.649868
4,CA_BO1,0.914211,0.920907,0.82572,0.801409
5,CA_KL,0.907968,0.920566,0.833395,0.813251


## Ablation

In [31]:
# Ablation: random-forest rankers trained on the second-stage pipelines, i.e.
# on the retrieval-score features only. Same hyper-parameters and seed for both.
_ablation_forest_settings = dict(n_estimators=400, verbose=1, random_state=SEED, n_jobs=2)

rf_bo1_abl = RandomForestRegressor(**_ablation_forest_settings)
rf_kl_abl = RandomForestRegressor(**_ablation_forest_settings)

rf_pipe_bo1_abl = second_pip_bo1 >> pt.ltr.apply_learned_model(rf_bo1_abl)
rf_pipe_kl_abl = second_pip_kl >> pt.ltr.apply_learned_model(rf_kl_abl)

# Train each ablation pipeline on the training topics only.
rf_pipe_bo1_abl.fit(train_topics, qrels)
rf_pipe_kl_abl.fit(train_topics, qrels)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.8s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    1.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    2.0s finished


In [33]:
def _ablation_coordinate_ascent():
    """Create a coordinate-ascent train request for the ablation runs.

    Returns the request together with its params object, on which random
    initialisation, normalisation and the fixed seed have been set.
    """
    request = fastrank.TrainRequest.coordinate_ascent()
    settings = request.params
    settings.init_random = True
    settings.normalize = True
    settings.seed = 1234567
    return request, settings


# Ablation: coordinate ascent on the Bo1 second-stage pipeline.
BO1_train_request_abl, BO1_params_abl = _ablation_coordinate_ascent()
ca_pipe_bo1_abl = second_pip_bo1 >> pt.ltr.apply_learned_model(BO1_train_request_abl, form='fastrank')
ca_pipe_bo1_abl.fit(train_topics, qrels)

# Ablation: coordinate ascent on the KL second-stage pipeline.
KL_train_request_abl, KL_params_abl = _ablation_coordinate_ascent()
ca_pipe_kl_abl = second_pip_kl >> pt.ltr.apply_learned_model(KL_train_request_abl, form='fastrank')
ca_pipe_kl_abl.fit(train_topics, qrels)

In [35]:
# Evaluate the baselines and the four ablation pipelines on the test topics.
# NOTE(review): the learned systems reuse the labels of the full-feature run
# ("RF_BO1", ...) although these are the *_abl pipelines, and the first label is
# spelled "TF-IDF" here but "TF_IDF" in the other experiments — consider
# distinct, consistent labels so the result tables cannot be confused.
pt.Experiment(
    [tfidf, bm25, rf_pipe_bo1_abl, rf_pipe_kl_abl, ca_pipe_bo1_abl, ca_pipe_kl_abl],
    test_topics,
    qrels,
    eval_metrics=["map", "ndcg", "ndcg_cut_5", "ndcg_cut_10"],
    names=["TF-IDF", "BM25", "RF_BO1", "RF_KL", "CA_BO1", "CA_KL"],
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.4s finished


Unnamed: 0,name,map,ndcg,ndcg_cut_5,ndcg_cut_10
0,TF-IDF,0.937215,0.925268,0.832472,0.798361
1,BM25,0.929768,0.924852,0.827465,0.787191
2,RF_BO1,0.876173,0.885839,0.755464,0.736696
3,RF_KL,0.868766,0.869415,0.665746,0.674122
4,CA_BO1,0.915412,0.927415,0.875383,0.823951
5,CA_KL,0.902083,0.924053,0.868452,0.816679
