In [1]:
import json
import pandas as pd
from collections import Counter

In [2]:
import pyterrier as pt
import os

In [3]:
# Tell the Java-backed tooling started below where the local JDK lives.
# A raw string keeps the Windows backslashes literal; the plain-string form
# relied on "\P" and "\J" being invalid escapes, which Python warns about.
# NOTE(review): machine-specific absolute path; adjust per environment.
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-19"

In [4]:
# Initialise PyTerrier only when this kernel has not already done so, which
# keeps the cell safe to re-run.
terrier_running = pt.started()
if not terrier_running:
    pt.init()

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [6]:
# Load the pre-selected article records from disk.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale code page (the default when `encoding` is omitted).
with open("selected_articles.json", "r", encoding="utf-8") as file_read:
    selected_articles = json.load(file_read)

In [7]:
# Tabulate the loaded records, keep only the fields used later in the
# notebook, and drop every article that is missing any of them.
KEPT_FIELDS = ["id", "authors", "title", "abstract"]
articles_df = pd.DataFrame(selected_articles)
modified_df = articles_df.loc[:, KEPT_FIELDS].dropna()

In [8]:
# Relabel the columns so the identifier is called "docno", the name the
# indexing and merge cells use to key documents.
modified_df = modified_df.set_axis(
    labels=["docno", "authors", "title", "abstract"],
    axis=1,
)
modified_df

Unnamed: 0,docno,authors,title,abstract
0,0704.0046,"I. Csiszar, F. Hiai and D. Petz",A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldma..."
1,0704.0047,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is...
2,0704.0050,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emi...
3,0704.0098,"Jack Raymond, David Saad",Sparsely-spread CDMA - a statistical mechanics...,"Sparse Code Division Multiple Access (CDMA),..."
4,0704.0217,Wiroonsak Santipach and Michael L. Honig,Capacity of a Multiple-Antenna Fading Channel ...,Given a multiple-input multiple-output (MIMO...
...,...,...,...,...
219356,quant-ph/0703112,"Markus Grassl, Andreas Klappenecker, Martin Ro...","Graphs, Quadratic Forms, and Quantum Codes",We show that any stabilizer code over a fini...
219357,quant-ph/0703113,"Salah A. Aly, Markus Grassl, Andreas Klappenec...",Quantum Convolutional BCH Codes,Quantum convolutional codes can be used to p...
219358,quant-ph/0703181,"Markus Grassl, Martin Roetteler",Quantum Block and Convolutional Codes from Sel...,We present a construction of self-orthogonal...
219359,quant-ph/0703182,"Markus Grassl, Martin Roetteler",Constructions of Quantum Convolutional Codes,We address the problems of constructing quan...


In [None]:
# Directory holding the on-disk index, and the properties file whose presence
# is used as the "index already built" marker.
# NOTE(review): machine-specific absolute path; adjust per environment.
pt_index_path = "F:/2022Fall/SI650/Project/pre_arxiv_index"
index_properties = f"{pt_index_path}/data.properties"

if os.path.exists(index_properties):
    # An index from an earlier run exists: just reference it.
    index_ref = pt.IndexRef.of(index_properties)
else:
    # No index yet: build one from the abstracts, keyed by docno.
    indexer = pt.DFIndexer(pt_index_path, overwrite=True)
    index_ref = indexer.index(modified_df["abstract"], modified_df["docno"])

# Open the index (freshly built or pre-existing) for retrieval.
index = pt.IndexFactory.of(index_ref)




In [None]:
# Print the index's collection statistics as a quick check on the opened index.
collection_stats = index.getCollectionStatistics()
print(collection_stats.toString())

In [None]:
# Number of results each weighting model is asked to return per query.
POOL_DEPTH = 70


def _retriever(wmodel):
    """Build a BatchRetrieve over `index` for `wmodel`, limited to POOL_DEPTH results."""
    return pt.BatchRetrieve(index, wmodel=wmodel, num_results=POOL_DEPTH)


# One retriever per weighting model.
tfidf = _retriever("TF_IDF")
bm25 = _retriever("BM25")
pl2 = _retriever("PL2")
dlh13 = _retriever("DLH13")
XSqrA_M = _retriever("XSqrA_M")

# Combine the five retrievers with PyTerrier's `|` operator.
# NOTE(review): `|` is documented as a set union of the retrieved sets (no
# score/rank kept) -- confirm that pooling, not score fusion, is intended.
pipline = tfidf | bm25 | pl2 | dlh13 | XSqrA_M

In [None]:
# Try the combined retriever on a single ad-hoc query before the full batch.
sample_query = "random forests"
pipline.search(sample_query)

In [None]:
# Free-text queries to run against the index; list position later becomes the
# query id.
queries = [
    "learning to rank with partitioned preference",
    "advanced search engine",
    "random forests",
    "database management system",
    "pre-trained language model",
    "matrix completion",
    "query expansion for information retrieval",
    "gradient boosting",
    "node embedding for graph",
    "language model for long documents",
    "web archive",
    "PageRank for web search",
    "COVID-19 and social media",
    "social network analysis with natural language processing",
    "text summarization model",
    "relevance feedback for information retrieval",
    "activation function in neural networks",
    "robustness of neural networks",
    "information retrieval time complexity",
    "artificial intelligence for low carbon",
]

In [None]:
# Build the topics frame: one row per query, with its list position as the
# query id.
query_df = pd.DataFrame(queries, columns=["query"])
# Store qids as strings rather than integers: PyTerrier's data model uses
# string qids, and the later filter compares against the string "19".
query_df["qid"] = query_df.index.astype(str)
query_df

In [None]:
# Run every query in the topics frame through the combined retriever.
raw_results = pipline.transform(query_df)

In [None]:
# Display the frame returned by running the pipeline over all queries.
raw_results

In [None]:
# Attach authors, title and abstract to each retrieved row by joining on the
# document identifier (default inner join).
results = pd.merge(raw_results, modified_df, on="docno")

In [None]:
# Show only the rows belonging to the query whose qid is "19".
is_qid_19 = results["qid"] == "19"
results[is_qid_19]

In [None]:
# Persist the merged query/document rows in their current order. The frame's
# index is written as well, since `index` is left at its default.
results.to_csv(path_or_buf="raw_doc_query.csv")

In [None]:
# Write a shuffled copy of the merged rows (frac=1 samples every row once).
# The shuffle is seeded so a Restart & Run All regenerates the same file.
results.sample(frac=1, random_state=42).to_csv("doc_query.csv")

In [None]:
# Display the merged query/document frame.
results