In [1]:
import pandas as pd

print("Step 1: Loading Dataset...")
path = 'data.json'
df = pd.read_json(path, lines=True)
df_sample = df.sample(n=500, random_state=42)
print(f"Loaded {len(df_sample)} papers.")

Step 1: Loading Dataset...
Loaded 500 papers.


In [2]:
df.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Bal치zs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
df['publication_year'] = pd.to_datetime(df['update_date']).dt.year
print(df[['update_date', 'publication_year']].head())


  update_date  publication_year
0  2008-11-26              2008
1  2008-12-13              2008
2  2008-01-13              2008
3  2007-05-23              2007
4  2013-10-15              2013


In [5]:
category_map = {
'acc-phys': 'Accelerator Physics',
'adap-org': 'Not available',
'q-bio': 'Not available',
'cond-mat': 'Not available',
'chao-dyn': 'Not available',
'patt-sol': 'Not available',
'dg-ga': 'Not available',
'solv-int': 'Not available',
'bayes-an': 'Not available',
'comp-gas': 'Not available',
'alg-geom': 'Not available',
'funct-an': 'Not available',
'q-alg': 'Not available',
'ao-sci': 'Not available',
'atom-ph': 'Atomic Physics',
'chem-ph': 'Chemical Physics',
'plasm-ph': 'Plasma Physics',
'mtrl-th': 'Not available',
'cmp-lg': 'Not available',
'supr-con': 'Not available',
'econ.GN': 'General Economics',
'econ.TH': 'Theoretical Economics',
'eess.SY': 'Systems and Control',

'astro-ph': 'Astrophysics',
'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
'astro-ph.EP': 'Earth and Planetary Astrophysics',
'astro-ph.GA': 'Astrophysics of Galaxies',
'astro-ph.HE': 'High Energy Astrophysical Phenomena',
'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
'astro-ph.SR': 'Solar and Stellar Astrophysics',
'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
'cond-mat.mtrl-sci': 'Materials Science',
'cond-mat.other': 'Other Condensed Matter',
'cond-mat.quant-gas': 'Quantum Gases',
'cond-mat.soft': 'Soft Condensed Matter',
'cond-mat.stat-mech': 'Statistical Mechanics',
'cond-mat.str-el': 'Strongly Correlated Electrons',
'cond-mat.supr-con': 'Superconductivity',
'cs.AI': 'Artificial Intelligence',
'cs.AR': 'Hardware Architecture',
'cs.CC': 'Computational Complexity',
'cs.CE': 'Computational Engineering, Finance, and Science',
'cs.CG': 'Computational Geometry',
'cs.CL': 'Computation and Language',
'cs.CR': 'Cryptography and Security',
'cs.CV': 'Computer Vision and Pattern Recognition',
'cs.CY': 'Computers and Society',
'cs.DB': 'Databases',
'cs.DC': 'Distributed, Parallel, and Cluster Computing',
'cs.DL': 'Digital Libraries',
'cs.DM': 'Discrete Mathematics',
'cs.DS': 'Data Structures and Algorithms',
'cs.ET': 'Emerging Technologies',
'cs.FL': 'Formal Languages and Automata Theory',
'cs.GL': 'General Literature',
'cs.GR': 'Graphics',
'cs.GT': 'Computer Science and Game Theory',
'cs.HC': 'Human-Computer Interaction',
'cs.IR': 'Information Retrieval',
'cs.IT': 'Information Theory',
'cs.LG': 'Machine Learning',
'cs.LO': 'Logic in Computer Science',
'cs.MA': 'Multiagent Systems',
'cs.MM': 'Multimedia',
'cs.MS': 'Mathematical Software',
'cs.NA': 'Numerical Analysis',
'cs.NE': 'Neural and Evolutionary Computing',
'cs.NI': 'Networking and Internet Architecture',
'cs.OH': 'Other Computer Science',
'cs.OS': 'Operating Systems',
'cs.PF': 'Performance',
'cs.PL': 'Programming Languages',
'cs.RO': 'Robotics',
'cs.SC': 'Symbolic Computation',
'cs.SD': 'Sound',
'cs.SE': 'Software Engineering',
'cs.SI': 'Social and Information Networks',
'cs.SY': 'Systems and Control',
'econ.EM': 'Econometrics',
'eess.AS': 'Audio and Speech Processing',
'eess.IV': 'Image and Video Processing',
'eess.SP': 'Signal Processing',
'gr-qc': 'General Relativity and Quantum Cosmology',
'hep-ex': 'High Energy Physics - Experiment',
'hep-lat': 'High Energy Physics - Lattice',
'hep-ph': 'High Energy Physics - Phenomenology',
'hep-th': 'High Energy Physics - Theory',
'math.AC': 'Commutative Algebra',
'math.AG': 'Algebraic Geometry',
'math.AP': 'Analysis of PDEs',
'math.AT': 'Algebraic Topology',
'math.CA': 'Classical Analysis and ODEs',
'math.CO': 'Combinatorics',
'math.CT': 'Category Theory',
'math.CV': 'Complex Variables',
'math.DG': 'Differential Geometry',
'math.DS': 'Dynamical Systems',
'math.FA': 'Functional Analysis',
'math.GM': 'General Mathematics',
'math.GN': 'General Topology',
'math.GR': 'Group Theory',
'math.GT': 'Geometric Topology',
'math.HO': 'History and Overview',
'math.IT': 'Information Theory',
'math.KT': 'K-Theory and Homology',
'math.LO': 'Logic',
'math.MG': 'Metric Geometry',
'math.MP': 'Mathematical Physics',
'math.NA': 'Numerical Analysis',
'math.NT': 'Number Theory',
'math.OA': 'Operator Algebras',
'math.OC': 'Optimization and Control',
'math.PR': 'Probability',
'math.QA': 'Quantum Algebra',
'math.RA': 'Rings and Algebras',
'math.RT': 'Representation Theory',
'math.SG': 'Symplectic Geometry',
'math.SP': 'Spectral Theory',
'math.ST': 'Statistics Theory',
'math-ph': 'Mathematical Physics',
'nlin.AO': 'Adaptation and Self-Organizing Systems',
'nlin.CD': 'Chaotic Dynamics',
'nlin.CG': 'Cellular Automata and Lattice Gases',
'nlin.PS': 'Pattern Formation and Solitons',
'nlin.SI': 'Exactly Solvable and Integrable Systems',
'nucl-ex': 'Nuclear Experiment',
'nucl-th': 'Nuclear Theory',
'physics.acc-ph': 'Accelerator Physics',
'physics.ao-ph': 'Atmospheric and Oceanic Physics',
'physics.app-ph': 'Applied Physics',
'physics.atm-clus': 'Atomic and Molecular Clusters',
'physics.atom-ph': 'Atomic Physics',
'physics.bio-ph': 'Biological Physics',
'physics.chem-ph': 'Chemical Physics',
'physics.class-ph': 'Classical Physics',
'physics.comp-ph': 'Computational Physics',
'physics.data-an': 'Data Analysis, Statistics and Probability',
'physics.ed-ph': 'Physics Education',
'physics.flu-dyn': 'Fluid Dynamics',
'physics.gen-ph': 'General Physics',
'physics.geo-ph': 'Geophysics',
'physics.hist-ph': 'History and Philosophy of Physics',
'physics.ins-det': 'Instrumentation and Detectors',
'physics.med-ph': 'Medical Physics',
'physics.optics': 'Optics',
'physics.plasm-ph': 'Plasma Physics',
'physics.pop-ph': 'Popular Physics',
'physics.soc-ph': 'Physics and Society',
'physics.space-ph': 'Space Physics',
'q-bio.BM': 'Biomolecules',
'q-bio.CB': 'Cell Behavior',
'q-bio.GN': 'Genomics',
'q-bio.MN': 'Molecular Networks',
'q-bio.NC': 'Neurons and Cognition',
'q-bio.OT': 'Other Quantitative Biology',
'q-bio.PE': 'Populations and Evolution',
'q-bio.QM': 'Quantitative Methods',
'q-bio.SC': 'Subcellular Processes',
'q-bio.TO': 'Tissues and Organs',
'q-fin.CP': 'Computational Finance',
'q-fin.EC': 'Economics',
'q-fin.GN': 'General Finance',
'q-fin.MF': 'Mathematical Finance',
'q-fin.PM': 'Portfolio Management',
'q-fin.PR': 'Pricing of Securities',
'q-fin.RM': 'Risk Management',
'q-fin.ST': 'Statistical Finance',
'q-fin.TR': 'Trading and Market Microstructure',
'quant-ph': 'Quantum Physics',
'stat.AP': 'Applications',
'stat.CO': 'Computation',
'stat.ME': 'Methodology',
'stat.ML': 'Machine Learning',
'stat.OT': 'Other Statistics',
'stat.TH': 'Statistics Theory'
}

def get_category_names(categories):
    if pd.isna(categories):
        return ""

    category_codes = categories.split()

    category_names = []
    for code in category_codes:
        if code in category_map:
            if category_map[code] != "Not available":
                category_names.append(category_map[code])
        else:
            category_names.append(code)

    return ", ".join(category_names)

df['keywords'] = df['categories'].apply(get_category_names)

def clean_text(x):
    new_text = x.replace("\n", " ")
    new_text = new_text.strip()

    return new_text

df['title'] = df['title'].apply(clean_text)
df['publication_year'] = df.loc[df_sample.index,'publication_year']

In [15]:
df_sample['keywords'] = df.loc[df_sample.index,'keywords']

In [22]:
df_sample['publication_year'] = df.loc[df_sample.index,'publication_year']

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,publication_year,keywords
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Bal치zs, C., ], [Berger, E. L., ], [Nadolsky,...",,
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]",,
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]",,
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]",,
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]",,


In [6]:
def combine_text(row):
    title = row.get('title', '')
    publication_year = row.get('publication_year', '')
    keywords = row.get('keywords', '')

    if isinstance(keywords, list):
        keywords = ', '.join(keywords)
    else:
        keywords = str(keywords) if pd.notnull(keywords) else ''

    return f"{title}. {publication_year}. {keywords}"

In [7]:
df_sample['combined_text'] = df_sample.apply(combine_text, axis=1)

In [23]:
df_sample.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,combined_text,vector_embedding,keywords,publication_year
2395581,cond-mat/0606114,Maikel Rheinstadter,"Maikel C. Rheinstadter, Wolfgang Haussler, Tim...",Dispersion relation of lipid membrane shape fl...,,"Phys. Rev. Lett. 97, 048103 (2006)",10.1103/PhysRevLett.97.048103,,cond-mat.soft,,We have studied the mesoscopic shape fluctua...,"[{'version': 'v1', 'created': 'Mon, 5 Jun 2006...",2007-05-23,"[[Rheinstadter, Maikel C., ], [Haussler, Wolfg...",Dispersion relation of lipid membrane shape fl...,"[-0.054421708, -0.15462224, -0.026906788, 0.00...",Soft Condensed Matter,2007.0
206606,1008.1847,Priyam Das Dr.,Priyam Das and Prasanta K. Panigrahi,Chirp Control of Sinusoidal Lattice Modes in B...,This Paper has been withdrawn by the authors. ...,,,,cond-mat.quant-gas,http://arxiv.org/licenses/nonexclusive-distrib...,A parametrically forced Bose-Einstein conden...,"[{'version': 'v1', 'created': 'Wed, 11 Aug 201...",2015-06-04,"[[Das, Priyam, ], [Panigrahi, Prasanta K., ]]",Chirp Control of Sinusoidal Lattice Modes in B...,"[-0.04300152, -0.04268448, -0.040876452, 0.064...",Quantum Gases,2015.0
1237143,2001.11730,A. M. Rucklidge,"Jennifer K. Castelino, Daniel J. Ratliff, Alas...",Spatiotemporal chaos and quasipatterns in coup...,"35 pages, 11, figures. For associated data fil...",Physica D 409 (2020) 132475,10.1016/j.physd.2020.132475,,nlin.PS,http://arxiv.org/licenses/nonexclusive-distrib...,"In coupled reaction-diffusion systems, modes...","[{'version': 'v1', 'created': 'Fri, 31 Jan 202...",2020-04-14,"[[Castelino, Jennifer K., ], [Ratliff, Daniel ...",Spatiotemporal chaos and quasipatterns in coup...,"[0.02043986, -0.060283806, 0.05019444, -0.0131...",Pattern Formation and Solitons,2020.0
214198,1009.4042,Enno Lenzmann,Rupert L. Frank and Enno Lenzmann,Uniqueness and Nondegeneracy of Ground States ...,45 pages,"Acta Math. 210 (2013), no. 2, 261-318",10.1007/s11511-013-0095-9,,math.AP math-ph math.MP,http://arxiv.org/licenses/nonexclusive-distrib...,We prove uniqueness of ground state solution...,"[{'version': 'v1', 'created': 'Tue, 21 Sep 201...",2015-03-24,"[[Frank, Rupert L., ], [Lenzmann, Enno, ]]",Uniqueness and Nondegeneracy of Ground States ...,"[-0.053714212, -0.040661618, -0.028220592, 0.0...","Analysis of PDEs, Mathematical Physics, Mathem...",2015.0
515763,1404.2710,Wolfdieter Lang,Wolfdieter Lang,"On Collatz' Words, Sequences and Trees","13 pages, 1 figure","Journal of Integer Sequences, Vol. 17 (2014) 1...",,,math.NT,http://arxiv.org/licenses/nonexclusive-distrib...,"Motivated by a recent work of Tr\""umper we c...","[{'version': 'v1', 'created': 'Thu, 10 Apr 201...",2015-02-04,"[[Lang, Wolfdieter, ]]","On Collatz' Words, Sequences and Trees. .","[-0.04481096, -0.026074953, 0.024543585, 0.034...",Number Theory,2015.0


In [24]:
df_sample['vector_embedding'] = df_sample['combined_text'].apply(lambda text: model.encode(text))
paper_documents = df_sample[['title', 'abstract', 'authors', 'publication_year', 'vector_embedding', 'keywords', 'doi']].copy()
display(paper_documents.head())

Unnamed: 0,title,abstract,authors,publication_year,vector_embedding,keywords,doi
2395581,Dispersion relation of lipid membrane shape fl...,We have studied the mesoscopic shape fluctua...,"Maikel C. Rheinstadter, Wolfgang Haussler, Tim...",2007.0,"[-0.054421708, -0.15462224, -0.026906788, 0.00...",Soft Condensed Matter,10.1103/PhysRevLett.97.048103
206606,Chirp Control of Sinusoidal Lattice Modes in B...,A parametrically forced Bose-Einstein conden...,Priyam Das and Prasanta K. Panigrahi,2015.0,"[-0.04300152, -0.04268448, -0.040876452, 0.064...",Quantum Gases,
1237143,Spatiotemporal chaos and quasipatterns in coup...,"In coupled reaction-diffusion systems, modes...","Jennifer K. Castelino, Daniel J. Ratliff, Alas...",2020.0,"[0.02043986, -0.060283806, 0.05019444, -0.0131...",Pattern Formation and Solitons,10.1016/j.physd.2020.132475
214198,Uniqueness and Nondegeneracy of Ground States ...,We prove uniqueness of ground state solution...,Rupert L. Frank and Enno Lenzmann,2015.0,"[-0.053714212, -0.040661618, -0.028220592, 0.0...","Analysis of PDEs, Mathematical Physics, Mathem...",10.1007/s11511-013-0095-9
515763,"On Collatz' Words, Sequences and Trees","Motivated by a recent work of Tr\""umper we c...",Wolfdieter Lang,2015.0,"[-0.04481096, -0.026074953, 0.024543585, 0.034...",Number Theory,


In [18]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
uri = "mongodb+srv://Duch:Alika123@paperrecsys.gs99u.mongodb.net/?retryWrites=true&w=majority&appName=PaperRecSys"

client = MongoClient(uri, server_api=ServerApi('1'))

try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [20]:
db = client["papersv2"]
collection = db["papers2"]

search_index_command = {
    "createSearchIndexes": "papers2",
    "indexes": [
        {
            "name": "embedding_vector_index",
            "definition": {
                "mappings": {
                    "dynamic": False,
                    "fields": {
                        "vector_embedding": {
                            "type": "knnVector",
                            "dimensions": 384,
                            "similarity": "cosine"
                        }
                    }
                }
            }
        }
    ]
}

try:
    result = db.command(search_index_command)
    print("Vector search index created successfully!")
    print(result)
except Exception as e:
    print("Error creating vector index:", e)


Vector search index created successfully!
{'indexesCreated': [{'id': '67cc2c22551c644756ac8a8e', 'name': 'embedding_vector_index'}], 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1741433890, 2), 'signature': {'hash': b'\x99#\xabb\xe4\xfa2\x9faXyH\xe0R/muZ\x89\x8d', 'keyId': 7475778976875544580}}, 'operationTime': Timestamp(1741433890, 2)}


In [25]:
for _, row in paper_documents.iterrows():
    doc = {
        "title": row['title'],
        "abstract": row['abstract'],
        "authors": row['authors'],
        "vector_embedding": row['vector_embedding'].tolist(),
        "keywords": row['keywords'],
        "doi": row['doi'],
        "publication_year": row['publication_year']
    }
    collection.insert_one(doc)

print("All documents uploaded successfully!")


All documents uploaded successfully!


In [31]:
def search_papers(query, top_k=5):
    query_embedding = model.encode(query).tolist()

    pipeline = [
        {
            "$search": {
                "index": "embedding_vector_index",
                "knnBeta": {
                    "vector": query_embedding,
                    "path": "vector_embedding",
                    "k": top_k
                }
            }
        },
        {"$limit": top_k}
    ]

    results = collection.aggregate(pipeline)
    for result in results:
        print(f"Title: {result['title']}\nAbstract: {result['abstract']}\n {result['keywords']}\n Year: {result['publication_year']}")

search_papers("2022 publications")

Title: Publication bias and the canonization of false facts
Abstract:   In the process of scientific inquiry, certain claims accumulate enough
support to be established as facts. Unfortunately, not every claim accorded the
status of fact turns out to be true. In this paper, we model the dynamic
process by which claims are canonized as fact through repeated experimental
confirmation. The community's confidence in a claim constitutes a Markov
process: each successive published result shifts the degree of belief, until
sufficient evidence accumulates to accept the claim as fact or to reject it as
false. In our model, publication bias --- in which positive results are
published preferentially over negative ones --- influences the distribution of
published results. We find that when readers do not know the degree of
publication bias and thus cannot condition on it, false claims often can be
canonized as facts. Unless a sufficient fraction of negative results are
published, the scientific pr