In [1]:
# Importing necessary libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

In [190]:
# Queries
# Fetch 50 articles in Mathematics from 2022-03-01 to 2022-03-07
# query = 'https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=&terms-0-field=title&classification-mathematics=y&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2022-03-01&date-to_date=2022-03-07&date-date_type=submitted_date&abstracts=show&order=-announced_date_first'
# Fetch 200 (this is the maximum) articles in Mathematics from 2022-03-01 to 2022-03-07
query = 'https://arxiv.org/search/advanced?advanced=1&terms-0-term=&terms-0-operator=AND&terms-0-field=title&classification-mathematics=y&classification-physics_archives=all&classification-include_cross_list=include&date-filter_by=date_range&date-year=&date-from_date=2022-03-01&date-to_date=2022-03-07&date-date_type=submitted_date&abstracts=show&size=200&order=-announced_date_first'
# Fetch next 200 articles in Mathematics from 2022-03-01 to 2022-03-01 (same date)
# query = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=&terms-0-field=title&classification-mathematics=y&classification-physics_archives=all&classification-include_cross_list=include&date-filter_by=past_12&date-year=&date-from_date=2022-03-01&date-to_date=2022-03-01&date-date_type=submitted_date&abstracts=show&size=200&order=-announced_date_first&start=200'

In [191]:
page = requests.get(query)
soup = BeautifulSoup(page.content, 'html.parser')

In [248]:
arxiv_id_list = []
title_list = []
authors_list = []
classification_list = []
classification_detailed_list= []
for article in soup.find_all('li',{'class':"arxiv-result"}):
    arxiv_id = article.find_all('p',{'class':"list-title is-inline-block"}) 
    arxiv_id = arxiv_id[0].find_all('a',href=True)[0].get_text()
    title = article.find_all('p',{'class':"title is-5 mathjax"})
    title = title[0].get_text()
    title = title.strip()
    authors=article.find_all('p',{'class':"authors"})
    authors = [x.get_text().strip() for x in authors[0].find_all('a')]
    comments = article.find_all('p',{'class':"comments is-size-7"})
    classification_primary = article.find_all('span',{'class':'tag is-small is-link tooltip is-tooltip-top'})[0].attrs.get('data-tooltip')
    classification_secondary =  article.find_all('span',{'class':'tag is-small is-grey tooltip is-tooltip-top'})
    if len(classification_secondary)>0:
        classification_secondary = classification_secondary[0].attrs.get('data-tooltip')
        classification = [classification_primary,classification_secondary]
    else:
        classification = [classification_primary]
    if len(comments)>0 :
        comments = comments[0].get_text()
        classification_detailed = re.search('MSC Class:\n(.*)\n',comments)
        if classification_detailed != None:
            classification_detailed = classification_detailed.group(1)
            classification_detailed = classification_detailed.strip().split(sep=';')
        else :
            classification_detailed = ['N/A']
        #print(title)
        #print(authors)
        #print(classification)
        #print(100*'-')
    arxiv_id_list.append(arxiv_id)
    title_list.append(title)
    authors_list.append(authors)
    classification_list.append(classification)
    classification_detailed_list.append(classification_detailed)

In [250]:
df = pd.DataFrame({'airxv_id':arxiv_id_list,
                  'title':title_list,
                  'authors':authors_list,
                  'classification':classification_list,
                  'classification_detailed':classification_detailed_list})

In [251]:
df.head()

Unnamed: 0,airxv_id,title,authors,classification,classification_detailed
0,arXiv:2203.07873,"Asymptotic Fermat for signatures $(p,p,2)$ and...",[Diana Mocanu],[Number Theory],"[11F80, 11G05, 11D41]"
1,arXiv:2203.05018,Application of neural-network hybrid models in...,"[Chentong Li, Zhou Changsheng, Junmin Liu, Yao...","[Dynamical Systems, Physics and Society]","[11F80, 11G05, 11D41]"
2,arXiv:2203.05017,Asymmetric Duffing oscillator: jump manifold a...,"[Jan Kyzioł, Andrzej Okniński]",[Dynamical Systems],[N/A]
3,arXiv:2203.04148,Numerical solution of optimal control of ather...,"[F. Nasresfahani, M. R. Eslahchi]","[Optimization and Control, Numerical Analysis]",[N/A]
4,arXiv:2203.04145,The structure of the linearizer of a connected...,[Oleg Aristov],[Group Theory],[N/A]


In [195]:
len(df.classification)

200

In [196]:
neo4j_data=[]
for arxiv_id,title,author,classification in zip(arxiv_id_list,title_list,authors_list,classification_list):
    neo4j_data.append({'arxiv_id':arxiv_id,'title':title,'author':author,'classification':classification})

In [197]:
from neo4j import GraphDatabase
host = 'bolt://localhost:7687'
user = 'neo4j'
password = 'arxiv'
driver = GraphDatabase.driver(host,auth=(user, password))

In [198]:
def run_query(query, params={}):
    with driver.session() as session:
        result = session.run(query, params)
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [199]:
run_query("CREATE CONSTRAINT IF NOT EXISTS ON (a:Article) ASSERT a.arxiv_id IS UNIQUE;")
run_query("CREATE CONSTRAINT IF NOT EXISTS ON (a:Author) ASSERT a.name IS UNIQUE;")
run_query("CREATE CONSTRAINT IF NOT EXISTS ON (a:classification) ASSERT a.name IS UNIQUE;")

In [200]:
import_pubmed_query = """
UNWIND $data AS row
// Store article
MERGE (a:Article {arxiv_id: row.arxiv_id})
SET a.title = row.title
// Store authors    
FOREACH (author IN row.author  |
    MERGE (au:Author {name: author})
    MERGE (a)<-[:AUTHORED]-(au))
// Store classifications
FOREACH (class IN row.classification  |
    MERGE (cl:classification {name: class})
    MERGE (a)-[:BELONGS]->(cl))
"""

In [201]:
result = run_query(import_pubmed_query, {'data': neo4j_data})

In [202]:
neo4j_data[0]

{'arxiv_id': 'arXiv:2203.07873',
 'title': 'Asymptotic Fermat for signatures $(p,p,2)$ and $(p,p,3)$ over totally real fields',
 'author': ['Diana Mocanu'],
 'classification': ['11F80', ' 11G05', ' 11D41']}

In [246]:
 classification_secondary =  article.find_all('span',{'class':'tag is-small is-blue tooltip is-tooltip-top'})[0].attrs.get('data-tooltip')

IndexError: list index out of range

In [245]:
len(classification_secondary)

13