In [38]:
import requests

# Fetch one page of publication records from the OpenAIRE Graph API.
url = "https://api-beta.openaire.eu/graph/researchProducts"
params = {
    "page": 1,
    "pageSize": 100,
    "sortBy": "relevance DESC",
    "type": "publication"
}
headers = {
    "accept": "application/json"
}

# requests has no default timeout, so a stalled connection would block the
# kernel forever. The limit is generous because the API itself can take
# several seconds to answer.
response = requests.get(url, headers=headers, params=params, timeout=60)

if response.status_code == 200:
    # `data` is consumed by the flattening cell further down.
    data = response.json()
    print(data)
else:
    # NOTE: `data` is not (re)assigned on this path, so later cells will
    # either fail or silently reuse a value from an earlier run.
    print(f"Failed {response.status_code}")

{'header': {'numFound': 375839206, 'maxScore': 1.0, 'queryTime': 5810, 'page': 1, 'pageSize': 100}, 'results': [{'openAccessColor': None, 'publiclyFunded': False, 'type': 'publication', 'language': {'code': 'und', 'label': 'Undetermined'}, 'mainTitle': 'Atrial caval shunting', 'publicationDate': '1987-11-01', 'publisher': 'Elsevier BV', 'source': ['Crossref'], 'bestAccessRight': {'code': 'c_14cb', 'label': 'CLOSED', 'scheme': 'http://vocabularies.coar-repositories.org/documentation/access_rights/'}, 'container': {'name': 'Injury', 'issnPrinted': '0020-1383', 'issnOnline': None, 'issnLinking': None, 'ep': None, 'iss': None, 'sp': '433', 'vol': '18', 'edition': None, 'conferencePlace': None, 'conferenceDate': None}, 'id': 'doi_________::5425833b523ea99312dbe62360a960fd', 'originalId': ['0020138387903214', '10.1016/0020-1383(87)90321-4', '50|doiboost____|5425833b523ea99312dbe62360a960fd'], 'pid': [{'scheme': 'doi', 'value': '10.1016/0020-1383(87)90321-4'}], 'indicators': {'citationImpact'

In [31]:
# Install the Neo4j Python driver, which a later cell imports (`from neo4j import GraphDatabase`).
# NOTE(review): this installs into whichever `python3.11` is on PATH, which may not
# be the interpreter running this kernel — confirm.
!python3.11 -m pip install neo4j



In [40]:
import json
import csv

def flatten_main(record):
    """Flatten one research-product record into the row written to main.csv.

    Args:
        record: dict for a single entry of the API response's ``results`` list.

    Returns:
        dict with the keys ``id``, ``title``, ``description``, ``type``,
        ``citationCount``, ``influence``, ``popularity`` and ``url``.
        Nested fields that are absent, null or empty fall back to
        ``''`` / ``0`` / ``None`` instead of raising.
    """
    # dict.get's default only applies when the key is absent; the `or`
    # fallbacks additionally cover keys that are present but null or empty.
    indicators = (record.get('indicators') or {}).get('citationImpact') or {}
    instance = (record.get('instance') or [{}])[0] or {}
    urls = instance.get('url') or [None]
    return {
        'id': record.get('id', ''),
        'title': record.get('mainTitle', ''),
        'description': ' '.join(record.get('description') or []),
        'type': record.get('type', ''),
        'citationCount': indicators.get('citationCount', 0),
        'influence': indicators.get('influence', 0),
        'popularity': indicators.get('popularity', 0),
        # Only the first URL of the first instance is kept.
        'url': urls[0],
    }

def flatten_language(record):
    """Build the languages.csv row(s) for one record.

    Args:
        record: dict for a single entry of the API response's ``results`` list.

    Returns:
        A one-element list holding ``{'id': ..., 'language': ...}``, where
        ``language`` is the label of the record's language, or ``''`` when
        the language entry is absent or null.
    """
    # `or {}` covers a language key that is present but null, which the
    # default argument of .get would not.
    language = record.get('language') or {}
    return [{
        'id': record.get('id', ''),
        'language': language.get('label', ''),
    }]

def flatten_publicationDate(record):
    """Build the publicationDate.csv row(s) for one record.

    Returns a one-element list pairing the record id with its
    ``publicationDate`` value; each falls back to ``''`` when the key is
    absent.
    """
    row = {
        'id': record.get('id', ''),
        'publicationDate': record.get('publicationDate', ''),
    }
    return [row]

def flatten_publisher(record):
    """Build the publisher.csv row(s) for one record.

    Returns a one-element list pairing the record id with its ``publisher``
    value; each falls back to ``''`` when the key is absent.
    """
    record_id = record.get('id', '')
    publisher_name = record.get('publisher', '')
    return [{'id': record_id, 'publisher': publisher_name}]

def flatten_authors(record):
    """Build the authors.csv rows for one record.

    Args:
        record: dict for a single entry of the API response's ``results`` list.

    Returns:
        One ``{'id', 'fullName', 'rank'}`` dict per entry of the record's
        ``author`` list, in the original order. An absent, null or empty
        author list yields ``[]``.
    """
    record_id = record.get('id', '')
    return [
        {
            'id': record_id,
            'fullName': author.get('fullName', ''),
            'rank': author.get('rank', ''),
        }
        # `or []` covers an author key that is present but null.
        for author in record.get('author') or []
    ]

def flatten_keywords(record):
    """Build the keywords.csv rows for one record.

    Args:
        record: dict for a single entry of the API response's ``results`` list.

    Returns:
        One ``{'id', 'keyword'}`` dict per entry of the record's ``subjects``
        list that carries a ``subject.value``. An absent or null subjects
        list yields ``[]``; entries without a value are skipped.
    """
    # Same id fallback as the sibling flatten_* helpers, instead of a hard
    # record['id'] lookup.
    record_id = record.get('id', '')
    keywords = []
    # `or []` covers a subjects key that is present but null.
    for subject in record.get('subjects') or []:
        value = ((subject or {}).get('subject') or {}).get('value')
        if value is None:
            continue
        keywords.append({'id': record_id, 'keyword': value})
    return keywords

# One accumulator per CSV table that is exported below.
main_records, authors_records, keywords_records = [], [], []
languanges_records, publicationDate_records, publisher_records = [], [], []

# Flatten every API result once, routing its rows to the matching table.
for result in data['results']:
    main_records += [flatten_main(result)]
    authors_records += flatten_authors(result)
    keywords_records += flatten_keywords(result)
    languanges_records += flatten_language(result)
    publicationDate_records += flatten_publicationDate(result)
    publisher_records += flatten_publisher(result)

def write_csv(filename, fieldnames, records):
    """Write ``records`` to ``filename`` as a UTF-8 CSV with a header row.

    Args:
        filename: path of the file to create or overwrite.
        fieldnames: column names, in output order.
        records: iterable of dicts keyed by ``fieldnames``.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        for record in records:
            dict_writer.writerow(record)

# (file name, header columns, rows) for every table exported to CSV.
csv_exports = [
    ('main.csv',
     ['id', 'title', 'description', 'type', 'citationCount', 'influence', 'popularity', 'url'],
     main_records),
    ('authors.csv', ['id', 'fullName', 'rank'], authors_records),
    ('keywords.csv', ['id', 'keyword'], keywords_records),
    ('languages.csv', ['id', 'language'], languanges_records),
    ('publicationDate.csv', ['id', 'publicationDate'], publicationDate_records),
    ('publisher.csv', ['id', 'publisher'], publisher_records),
]
for csv_name, csv_columns, csv_rows in csv_exports:
    write_csv(csv_name, csv_columns, csv_rows)


In [41]:
import pandas as pd

# Reload the exported main table and preview its first six rows as a sanity check.
df = pd.read_csv("./main.csv")
df.head(6)

Unnamed: 0,id,title,description,type,citationCount,influence,popularity,url
0,doi_________::5425833b523ea99312dbe62360a960fd,Atrial caval shunting,,publication,0.0,1.88158e-09,1.560339e-10,https://doi.org/10.1016/0020-1383(87)90321-4
1,doi_________::575e5eb26ef7c09a7f4827eab8e947d3,Network Expansion For Practical Training Accel...,,publication,0.0,1.88158e-09,2.489088e-09,https://doi.org/10.1109/cvpr52729.2023.01941
2,doi_________::57da457a8910961374d00f8d86b64550,Second Language Acquisition in Childhood,,publication,0.0,1.88158e-09,1.560339e-10,https://doi.org/10.2307/326760
3,doi_________::5886cde8283d7a4eb66f9afcdea34284,Thermo-mechanical characterization of on-chip ...,We report on the thermomechanical and thermal ...,publication,0.0,1.88158e-09,7.993854e-10,https://doi.org/10.48550/arxiv.1510.07766
4,doi_________::58c3ba20c750294e09d37b0e3a2d2fe8,Microsoft MB-210 Dumps - Accurate MB-210 Exam ...,ExamsSpy offer reliable Microsoft MB-210 dumps...,publication,0.0,1.88158e-09,2.143041e-09,https://doi.org/10.5281/zenodo.6075908
5,doi_________::5941d55c147ba1e56e3377f9d58b4445,Trenched microwave resonator integrated with p...,,publication,0.0,1.88158e-09,1.48659e-10,https://doi.org/10.1016/j.jhazmat.2024.134553


In [43]:
# Clean the keywords because they contain a lot of noise (bracketed tags, standalone numbers, extra whitespace).
import re
# Load the keyword rows from keywords.csv so they can be cleaned in place.
df = pd.read_csv("./keywords.csv")

def clean_keyword(keyword):
    """Strip noise from a single keyword string.

    Removes bracketed segments (``[...]``), standalone runs of digits, and
    redundant whitespace.

    Args:
        keyword: the raw keyword. Non-string values (e.g. the NaN that pandas
            produces for empty cells or tokens such as "NA") are returned
            unchanged so that a later ``dropna`` can discard them.

    Returns:
        The cleaned string, possibly empty, or the input itself when it is
        not a string.
    """
    if not isinstance(keyword, str):
        # re.sub would raise TypeError on NaN/None; leave missing values as-is.
        return keyword
    keyword = re.sub(r"\[.*?\]", "", keyword)   # drop [bracketed] tags
    keyword = re.sub(r"\b\d+\b", "", keyword)   # drop standalone numbers, keep e.g. "H2O"
    keyword = re.sub(r"\s+", " ", keyword).strip()
    return keyword

# Run the cleaner over every value of the "keyword" column.
df["keyword"] = df["keyword"].apply(clean_keyword)

In [44]:
import numpy as np

# Cells that are an empty string or a lone "." carry no information: turn
# them into NaN and drop every row that has a missing value.
df = df.replace(['', '.'], np.nan).dropna()

In [45]:
# Overwrite keywords.csv with the cleaned rows; index=False keeps the pandas index out of the file.
df.to_csv("keywords.csv", index=False)

In [46]:
# Drop publisher rows that have any missing value, then overwrite
# publisher.csv without the pandas index.
df = pd.read_csv("./publisher.csv").dropna()
df.to_csv("publisher.csv", index=False)

In [52]:
from neo4j import GraphDatabase
import csv

import os
from getpass import getpass

# Connection settings come from the environment so that no credential lives
# in the notebook. The password that used to be hardcoded here has been
# exposed and should be rotated.
uri = os.environ.get("NEO4J_URI", "neo4j+s://6bf30dc9.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
# Prompt interactively when NEO4J_PASSWORD is unset or empty.
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
driver = GraphDatabase.driver(uri, auth=(username, password))

def upload_data_from_csv(file_path, query):
    """Run ``query`` once for every data row of a CSV file.

    Uses the module-level ``driver`` to open a session. Each row, as
    produced by ``csv.DictReader`` (column name -> string value), is passed
    to ``session.run`` as the query parameters.

    Args:
        file_path: path of the UTF-8 CSV file to read.
        query: Cypher statement whose ``$parameters`` match the CSV columns.
    """
    with driver.session() as session, open(file_path, 'r', encoding='utf-8') as csv_file:
        for params in csv.DictReader(csv_file):
            session.run(query, params)

# Upsert a Record node keyed by id and overwrite its properties from one
# main.csv row.
# NOTE(review): upload_data_from_csv passes csv.DictReader rows unchanged, so
# every parameter — including citationCount, influence and popularity —
# arrives as a string; cast in Cypher if numeric properties are wanted.
query_main = """
MERGE (r:Record {id: $id})
SET r.title = $title, 
    r.description = $description, 
    r.type = $type, 
    r.citationCount = $citationCount, 
    r.influence = $influence, 
    r.popularity = $popularity,
    r.url = $url
"""

# Upsert an Author node keyed by fullName and link it to its Record.
# `rank` comes from one authors.csv row, i.e. one (record, author) pair, so it
# is stored on the HAS_AUTHOR relationship. The node-level a.rank is kept for
# backward compatibility, but it is overwritten by every row mentioning the
# same author, so it only reflects the last row uploaded.
query_authors = """
MERGE (a:Author {fullName: $fullName})
SET a.rank = $rank
WITH a
MATCH (r:Record {id: $id})
MERGE (r)-[rel:HAS_AUTHOR]->(a)
SET rel.rank = $rank
"""

# The four queries below share one shape: upsert a value node keyed by the
# CSV column, then attach it to the Record with the matching id.
# Because the value node is MERGEd before the Record is MATCHed, the node is
# created even when no Record with that id exists; only the relationship is
# skipped in that case.

# Keyword node per distinct keyword, linked via HAS_KEYWORD.
query_keywords = """
MERGE (k:Keyword {keyword: $keyword})
WITH k
MATCH (r:Record {id: $id})
MERGE (r)-[:HAS_KEYWORD]->(k)
"""

# Publisher node per distinct publisher name, linked via HAS_PUBLISHER.
query_publisher = """
MERGE (k:Publisher {publisher: $publisher})
WITH k
MATCH (r:Record {id: $id})
MERGE (r)-[:HAS_PUBLISHER]->(k)
"""

# PublicationDate node per distinct date string, linked via HAS_PUBLICATION_DATE.
query_publicationDate = """
MERGE (k:PublicationDate {publicationDate: $publicationDate})
WITH k
MATCH (r:Record {id: $id})
MERGE (r)-[:HAS_PUBLICATION_DATE]->(k)
"""

# Language node per distinct language label, linked via HAS_LANGUAGE.
query_language = """
MERGE (k:Language {language: $language})
WITH k
MATCH (r:Record {id: $id})
MERGE (r)-[:HAS_LANGUAGE]->(k)
"""

# main.csv goes first: every other query MATCHes an existing Record by id and
# creates no relationship when that Record is missing.
try:
    upload_data_from_csv('main.csv', query_main)
    upload_data_from_csv('authors.csv', query_authors)
    upload_data_from_csv('keywords.csv', query_keywords)
    upload_data_from_csv('publisher.csv', query_publisher)
    upload_data_from_csv('publicationDate.csv', query_publicationDate)
    upload_data_from_csv('languages.csv', query_language)
finally:
    # Release the driver's connections even when an upload fails part-way.
    driver.close()
