In [2]:
import pandas as pd

# Size of the working sample and the seed that makes the draw repeatable.
SAMPLE_SIZE = 500
RANDOM_SEED = 42

print("Step 1: Loading Dataset...")
path = 'data.json'
# lines=True: the file is parsed as one JSON record per line.
df = pd.read_json(path, lines=True)
# Cap the request at the number of rows actually loaded: DataFrame.sample
# raises ValueError when n exceeds the frame length (sampling without
# replacement), which would break this cell on a small input file.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)
print(f"Loaded {len(df_sample)} papers.")

Step 1: Loading Dataset...
Loaded 500 papers.


In [3]:
# Preview the first rows of the full frame `df` (not the `df_sample` subset
# drawn in the loading cell).
df.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [None]:
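# One-time setup: uncomment to install the embedding library into the kernel's
# environment, then re-comment so a full re-run does not reinstall it.
# %pip install sentence-transformers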

In [7]:
import re
from ast import literal_eval
from sentence_transformers import SentenceTransformer

def extract_publication_year(versions):
    """Return the year of a paper's first listed version, or None.

    Parameters
    ----------
    versions : list | tuple | str | None
        The version history. Either an already-parsed sequence of dicts that
        carry a 'created' date string, or the Python-literal string form of
        such a sequence. Missing values (None, NaN) are accepted.

    Returns
    -------
    int | None
        The first standalone four-digit number found in the 'created' field
        of the first entry, or None when the input is missing, malformed,
        empty, or contains no such number.
    """
    if isinstance(versions, str):
        # Stringified history: parse it back into Python objects.
        # literal_eval only evaluates literals, never arbitrary code.
        try:
            versions = literal_eval(versions)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None

    # Anything that is not a non-empty sequence (None, NaN, dict, ...) has no
    # first version to inspect. Checking the type directly avoids calling
    # pd.notna on a list, which yields an array whose truth value is ambiguous.
    if not isinstance(versions, (list, tuple)) or not versions:
        return None

    first_version = versions[0]
    if not isinstance(first_version, dict):
        return None

    created = first_version.get('created')
    if not isinstance(created, str):
        return None

    year_match = re.search(r'\b(\d{4})\b', created)
    return int(year_match.group(1)) if year_match else None

# Shared sentence-embedding model ('all-MiniLM-L6-v2'), created once here and
# used by generate_embedding below via this module-level name.
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embedding(title, abstract):
    """Embed a paper's title and abstract as a single piece of text.

    Parameters
    ----------
    title, abstract : str | None
        The two text fields. A missing field (None, NaN, pd.NA) is left out
        of the text instead of being embedded as the literal word "None" or
        "nan". When both are present the text is exactly
        "<title> <abstract>".

    Returns
    -------
    list
        The output of the module-level ``model.encode`` for the combined
        text, converted to a plain Python list with ``.tolist()``.
    """
    present = []
    for field in (title, abstract):
        # pd.isna is only called on floats, so a non-scalar value can never
        # produce an array with an ambiguous truth value here.
        is_missing = (
            field is None
            or field is pd.NA
            or (isinstance(field, float) and pd.isna(field))
        )
        if not is_missing:
            present.append(str(field))

    text = " ".join(present)
    return model.encode(text).tolist()

In [None]:
# NOTE(review): everything in this cell runs over the full frame `df`, not the
# `df_sample` subset drawn in the loading cell. Confirm which frame is
# intended, since one embedding is computed per row.

# Turn the whitespace-separated 'categories' string into a list of keywords;
# rows without categories get an empty list.
df['keywords'] = df['categories'].map(
    lambda cats: [] if pd.isna(cats) else cats.split()
)

# One embedding per paper, built from its title and abstract. The Series is
# given df's own index so each value lands on its original row.
df['vector_embedding'] = pd.Series(
    [
        generate_embedding(paper_title, paper_abstract)
        for paper_title, paper_abstract in zip(df['title'], df['abstract'])
    ],
    index=df.index,
    dtype=object,
)

# Column selection for the final document frame. Left disabled in this cell
# because 'publication_year' has not been added to `df` at this point.
# paper_documents = df[[
#     'title',
#     'abstract',
#     'authors',
#     'publication_year',
#     'vector_embedding',
#     'keywords',
#     'doi'
# ]]

In [None]:
# Year of each paper's first version, derived from the 'versions' column.
df['publication_year'] = df['versions'].apply(extract_publication_year)

# Build the document frame here, once every column it selects exists on `df`
# ('keywords' and 'vector_embedding' come from the previous cell). Without
# this definition the assignments below fail with NameError on a fresh kernel.
# .copy() makes it an independent frame, so the column assignments below do
# not write into a slice of `df`.
paper_documents = df[[
    'title',
    'abstract',
    'authors',
    'publication_year',
    'vector_embedding',
    'keywords',
    'doi'
]].copy()

# Represent a missing DOI as None. The column is cast to object first so the
# None replacement is kept regardless of the column's original dtype.
paper_documents['doi'] = paper_documents['doi'].astype(object).where(
    paper_documents['doi'].notna(),
    None
)

# Guarantee that every keywords cell is a list.
paper_documents['keywords'] = paper_documents['keywords'].apply(
    lambda x: x if isinstance(x, list) else []
)