In [1]:
import pandas as pd

print("Step 1: Loading Dataset...")
path = 'data.json'
# The file is read as JSON Lines (one JSON record per line).
df = pd.read_json(path, lines=True)

# Work on a reproducible random subset of at most 500 papers. Clamping to
# len(df) avoids the ValueError that DataFrame.sample raises when asked for
# more rows than exist (sampling is without replacement).
sample_size = min(500, len(df))
df_sample = df.sample(n=sample_size, random_state=42)
print(f"Loaded {len(df_sample)} papers.")

Step 1: Loading Dataset...
Loaded 500 papers.


In [2]:
# Preview the first rows of the full (unsampled) frame to see which columns are available.
df.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [25]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-transformers model named 'all-MiniLM-L6-v2'.
# Other cells call model.encode(...) on both the paper text and the search query,
# so the same model is used for indexing and for querying.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [18]:
# Add a year column to the full frame, taken from each record's `update_date`.
# NOTE(review): `update_date` looks like the last-modified date rather than the
# original submission date (the `versions` column carries a separate 'created'
# timestamp) - confirm this is the intended meaning of "publication year".
update_timestamps = pd.to_datetime(df['update_date'])
df['publication_year'] = update_timestamps.dt.year
print(df.loc[:, ['update_date', 'publication_year']].head())


  update_date  publication_year
0  2008-11-26              2008
1  2008-12-13              2008
2  2008-01-13              2008
3  2007-05-23              2007
4  2013-10-15              2013


In [21]:
def _text_or_empty(value):
    """Return ``str(value)``, or an empty string when the scalar is missing (None/NaN)."""
    return str(value) if pd.notnull(value) else ''


def combine_text(row):
    """Build the single text string that is embedded for one paper.

    Args:
        row: A mapping-like record exposing ``.get`` (a pandas Series when used
            with ``DataFrame.apply(axis=1)``, or a plain dict). The keys
            'title', 'abstract' and 'keywords' are read; each may be absent.

    Returns:
        str: ``"<title>. <abstract>. <keywords>"``. A list/tuple of keywords is
        joined with ", ". Any missing part (absent key, None or NaN) contributes
        an empty string, so the literal text "None"/"nan" never reaches the
        embedding input.
    """
    title = _text_or_empty(row.get('title', ''))
    abstract = _text_or_empty(row.get('abstract', ''))
    keywords = row.get('keywords', '')

    # Sequences are handled before the null check: pd.notnull on a list is
    # element-wise and cannot be used as a single truth value.
    if isinstance(keywords, (list, tuple)):
        # str() each item so a non-string keyword does not make join() raise.
        keywords = ', '.join(str(keyword) for keyword in keywords)
    else:
        keywords = _text_or_empty(keywords)

    return f"{title}. {abstract}. {keywords}"

In [22]:
# The loaded data has a space-separated `categories` column but no `keywords`
# column, so on a fresh kernel the column selection below would fail with a KeyError.
# NOTE(review): the displayed keyword lists look like `categories` split on
# whitespace - confirm this matches how `keywords` was originally produced.
if 'keywords' not in df_sample.columns:
    df_sample['keywords'] = df_sample['categories'].str.split()

# Text that gets embedded for every sampled paper.
df_sample['combined_text'] = df_sample.apply(combine_text, axis=1)

# Encode all texts in one batched call instead of one model call per row;
# iterating the returned 2-D array yields one 1-D vector per paper.
embeddings = model.encode(df_sample['combined_text'].tolist())
df_sample['vector_embedding'] = list(embeddings)

# Keep only the fields that are stored per paper document.
paper_documents = df_sample[['title', 'abstract', 'authors', 'vector_embedding', 'keywords', 'doi']].copy()


In [23]:
# Rich-display the first rows of the per-paper documents built from the sample.
display(paper_documents.head())

Final sample of paper documents:


Unnamed: 0,title,abstract,authors,vector_embedding,keywords,doi
2395581,Dispersion relation of lipid membrane shape fl...,We have studied the mesoscopic shape fluctua...,"Maikel C. Rheinstadter, Wolfgang Haussler, Tim...","[-0.113512814, -0.1746171, 0.02478824, 0.00834...",[cond-mat.soft],10.1103/PhysRevLett.97.048103
206606,Chirp Control of Sinusoidal Lattice Modes in B...,A parametrically forced Bose-Einstein conden...,Priyam Das and Prasanta K. Panigrahi,"[-0.014262573, -0.03486597, -0.06049868, 0.061...",[cond-mat.quant-gas],
1237143,Spatiotemporal chaos and quasipatterns in coup...,"In coupled reaction-diffusion systems, modes...","Jennifer K. Castelino, Daniel J. Ratliff, Alas...","[-0.0055501787, -0.12504546, 0.054105412, -0.0...",[nlin.PS],10.1016/j.physd.2020.132475
214198,Uniqueness and Nondegeneracy of Ground States ...,We prove uniqueness of ground state solution...,Rupert L. Frank and Enno Lenzmann,"[-0.061425075, -0.03604232, -0.022594946, 0.00...","[math.AP, math-ph, math.MP]",10.1007/s11511-013-0095-9
515763,"On Collatz' Words, Sequences and Trees","Motivated by a recent work of Tr\""umper we c...",Wolfdieter Lang,"[-0.08144374, -0.02624556, 0.0032431656, -0.00...",[math.NT],


In [24]:
# `publication_year` lives on the full frame, so look it up by the sampled
# rows' index labels and attach it to the document frame.
sampled_years = df.loc[df_sample.index, 'publication_year']
paper_documents['publication_year'] = sampled_years
display(paper_documents.head())

Unnamed: 0,title,abstract,authors,vector_embedding,keywords,doi,publication_year
2395581,Dispersion relation of lipid membrane shape fl...,We have studied the mesoscopic shape fluctua...,"Maikel C. Rheinstadter, Wolfgang Haussler, Tim...","[-0.113512814, -0.1746171, 0.02478824, 0.00834...",[cond-mat.soft],10.1103/PhysRevLett.97.048103,2007
206606,Chirp Control of Sinusoidal Lattice Modes in B...,A parametrically forced Bose-Einstein conden...,Priyam Das and Prasanta K. Panigrahi,"[-0.014262573, -0.03486597, -0.06049868, 0.061...",[cond-mat.quant-gas],,2015
1237143,Spatiotemporal chaos and quasipatterns in coup...,"In coupled reaction-diffusion systems, modes...","Jennifer K. Castelino, Daniel J. Ratliff, Alas...","[-0.0055501787, -0.12504546, 0.054105412, -0.0...",[nlin.PS],10.1016/j.physd.2020.132475,2020
214198,Uniqueness and Nondegeneracy of Ground States ...,We prove uniqueness of ground state solution...,Rupert L. Frank and Enno Lenzmann,"[-0.061425075, -0.03604232, -0.022594946, 0.00...","[math.AP, math-ph, math.MP]",10.1007/s11511-013-0095-9,2015
515763,"On Collatz' Words, Sequences and Trees","Motivated by a recent work of Tr\""umper we c...",Wolfdieter Lang,"[-0.08144374, -0.02624556, 0.0032431656, -0.00...",[math.NT],,2015


In [29]:
import os

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# The connection string embeds the database user and password, so it must not
# live in the notebook. It is read from the MONGODB_URI environment variable.
# SECURITY: a credential was previously hard-coded in this cell; treat it as
# leaked and rotate it.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to your MongoDB connection "
        "string before running this cell."
    )

client = MongoClient(uri, server_api=ServerApi('1'))

# Round-trip a ping so connection problems surface here rather than in a later cell.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [31]:
# Handles for the database and collection used by the upload and search cells.
db = client["research_papers"]
collection = db["papers"]

# Only `vector_embedding` is indexed (dynamic mapping is off); it is declared
# as a 384-dimensional vector compared by cosine similarity.
vector_field_mapping = {
    "type": "knnVector",
    "dimensions": 384,
    "similarity": "cosine",
}

index_definition = {
    "mappings": {
        "dynamic": False,
        "fields": {"vector_embedding": vector_field_mapping},
    }
}

# Database command that creates the named search index on the "papers" collection.
search_index_command = {
    "createSearchIndexes": "papers",
    "indexes": [
        {"name": "embedding_vector_index", "definition": index_definition},
    ],
}

# Any failure is reported and swallowed so the notebook keeps running.
try:
    result = db.command(search_index_command)
    print("Vector search index created successfully!")
    print(result)
except Exception as e:
    print("Error creating vector index:", e)


Vector search index created successfully!
{'indexesCreated': [{'id': '67c9a8761592432ec2f083a5', 'name': 'embedding_vector_index'}], 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1741269110, 13), 'signature': {'hash': b'\xb1\x85\xc1EtS\x08j\x92_\x170\x8f^z\x18\xe9\xa9kB', 'keyId': 7475778976875544580}}, 'operationTime': Timestamp(1741269110, 13)}


In [32]:
# Build every MongoDB document up front so they can be sent in one bulk write
# instead of one network round trip per paper.
documents = [
    {
        "title": row['title'],
        "abstract": row['abstract'],
        "authors": row['authors'],
        # Convert the embedding to a plain list of floats before storing it.
        "vector_embedding": row['vector_embedding'].tolist(),
        "keywords": row['keywords'],
        "doi": row['doi'],
        "publication_year": row['publication_year']
    }
    for _, row in paper_documents.iterrows()
]

# NOTE(review): no `_id` is supplied, so re-running this cell presumably
# inserts the same papers again - confirm whether duplicates are acceptable.
if documents:
    # insert_many rejects an empty list, hence the guard.
    collection.insert_many(documents)
    print("All documents uploaded successfully!")
else:
    print("No documents to upload.")


All documents uploaded successfully!


In [35]:
def search_papers(query, top_k=5):
    """Print the papers whose stored embeddings are closest to a text query.

    The query is encoded with the notebook-level ``model`` and matched against
    the ``vector_embedding`` field through the "embedding_vector_index" search
    index on the notebook-level ``collection``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of papers to print (default 5).

    Returns:
        None. The title and abstract of each hit are printed.
    """
    query_embedding = model.encode(query).tolist()

    pipeline = [
        {
            "$search": {
                "index": "embedding_vector_index",
                "knnBeta": {
                    "vector": query_embedding,
                    "path": "vector_embedding",
                    "k": top_k
                }
            }
        },
        {"$limit": top_k},
        # Only title and abstract are printed, so do not ship each hit's
        # stored embedding vector (and other unused fields) back to the client.
        {"$project": {"_id": 0, "title": 1, "abstract": 1}}
    ]

    for result in collection.aggregate(pipeline):
        print(f"Title: {result['title']}\nAbstract: {result['abstract']}\n")

# Example query to exercise the search end to end (default top_k of 5).
search_papers("Mathmatic Search")

Title: On Collatz' Words, Sequences and Trees
Abstract:   Motivated by a recent work of Tr\"umper we consider the general Collatz word
(up-down pattern) and the sequences following this pattern. The recurrences for
the first and last sequence entries are given, obtained from repeated
application of the general solution of a binary linear inhomogeneous
Diophantine equation. These recurrences are then solved. The Collatz tree is
also discussed.


Title: Calculus and digital natives in rendezvous: wxMaxima impact
Abstract:   This article covers how a computer algebra system (CAS) wxMaxima can be
explored for teaching and learning Single-Variable and Multivariable Calculus
for Korean digital natives. We present several examples where \emph{wxMaxima}
can handle Calculus problems easily, not straightforwardly but still
successfully solved with some human intervention, and unsuccessfully. By
soliciting qualitative feedback on students' experience in exploiting the CAS,
we gathered a mixed rea