<a href="https://colab.research.google.com/github/danebencedavid/NLP-A-Agent/blob/master/graphdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import os
import requests
import time

In [None]:
# Read the curated paper metadata table from the mounted Google Drive.
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/relevant_papers_4.csv")

In [None]:
# Display the column labels of the loaded table (rendered as this cell's output).
df.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'smoking_score', 'covid_score', 'combined_score'],
      dtype='object')

In [None]:
# Google Drive directory used as the base path for the exported CSV files.
output_path = "/content/drive/MyDrive/neo4j_data/"

In [None]:
# Row accumulators, one list of dicts per node type.
paper_nodes, author_nodes, keyword_nodes = [], [], []

In [None]:
# Row accumulators, one list of dicts per relationship type.
author_wrote_paper_relationships, paper_has_keyword_relationships = [], []


In [None]:
# Identifiers already emitted as nodes; used to avoid writing duplicates.
unique_papers, unique_authors, unique_keywords = set(), set(), set()

In [None]:
# Build the node and relationship rows for the graph export.
#
# For every not-yet-seen paper in `df` this emits:
#   * one paper node,
#   * one author node per new author plus an author->paper relationship,
#   * one keyword node per new abstract word (longer than two characters)
#     plus a paper->keyword relationship.
#
# Missing cells arrive from pandas as NaN (a float), so every text field is
# normalised through `_text_or_empty` before any string method is called.

# Compiled once instead of being looked up again for every paper.
_WORD_PATTERN = re.compile(r'\b\w+\b')


def _text_or_empty(value):
    """Return ``value`` as a string, mapping missing values (None/NaN) to ""."""
    if value is None or pd.isna(value):
        return ""
    return str(value)


# Index the keyword rows by name so that `frequency` can be accumulated over
# all papers rather than frozen at the first paper mentioning the word.
keyword_node_by_name = {node["keyword_name"]: node for node in keyword_nodes}

for _, paper in df.iterrows():
    paper_id = _text_or_empty(paper.get("cord_uid"))
    if not paper_id or paper_id in unique_papers:
        continue
    unique_papers.add(paper_id)

    abstract = _text_or_empty(paper.get("abstract"))

    paper_nodes.append({
        "paper_id": paper_id,
        "title": _text_or_empty(paper.get("title")),
        "abstract": abstract,
        "doi": _text_or_empty(paper.get("doi")),
        "publish_date": _text_or_empty(paper.get("publish_time")),
    })

    # Authors are stored as a single ";"-separated string.
    authors_raw = _text_or_empty(paper.get("authors"))
    for author_name in (part.strip() for part in authors_raw.split(";")):
        if not author_name:
            continue
        if author_name not in unique_authors:
            author_nodes.append({"author_name": author_name})
            unique_authors.add(author_name)

        author_wrote_paper_relationships.append({
            "author_name": author_name,
            "paper_id": paper_id,
        })

    word_counts = Counter(_WORD_PATTERN.findall(abstract.lower()))
    for keyword, count in word_counts.items():
        # Skip one- and two-character tokens.
        if len(keyword) <= 2:
            continue

        keyword_node = keyword_node_by_name.get(keyword)
        if keyword_node is None:
            keyword_node = {"keyword_name": keyword, "frequency": 0}
            keyword_node_by_name[keyword] = keyword_node
            keyword_nodes.append(keyword_node)
            unique_keywords.add(keyword)
        # Corpus-wide total: add this paper's occurrences to the shared node.
        keyword_node["frequency"] += count

        paper_has_keyword_relationships.append({
            "paper_id": paper_id,
            "keyword_name": keyword,
        })

In [None]:
# Write every node / relationship table to its own CSV under `output_path`.
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before the first write.
os.makedirs(output_path, exist_ok=True)

csv_exports = {
    "paper_nodes.csv": paper_nodes,
    "author_nodes.csv": author_nodes,
    "keyword_nodes.csv": keyword_nodes,
    "author_wrote_paper_relationships.csv": author_wrote_paper_relationships,
    "paper_has_keyword_relationships.csv": paper_has_keyword_relationships,
}
for file_name, rows in csv_exports.items():
    pd.DataFrame(rows).to_csv(os.path.join(output_path, file_name), index=False)

print(f"CSV files saved in {output_path}")

CSV files saved in /content/drive/MyDrive/neo4j_data/


In [None]:
# DOIs of additional papers to pull into the corpus.
# Entries are kept exactly as collected, including stray whitespace.
new_papers_dois = [
    "10.1371/journal.pone.0287794",
    "10.4103/ecdt.ecdt_106_22",
    "10.1186/s12887-025-05434-w",
    "10.3390/ijerph20042768",
    "10.3390/healthcare10020303",
    "10.1371/journal.pone.0295040",
    "10.3390/covid3100109",
    "10.32920/ihtp.v4i2.2148",
    "10.18203/2320-6012.ijrms20231341",
    "10.3390/brainsci14040377",
    "10.37796/2211-8039.1429",
    "10.18502/ijdo.v15i3.13739",
    "10.1136/bmjopen-2021-052777",
    "10.21203/rs.3.rs-3849240/v1",
    "10.1017/gmh.2023.47",
    " 10.1007/s13300-024-01681-9",
]

In [None]:
# Accumulator for paper rows collected from OpenAlex.
extended_papers = list()

In [None]:
# Accumulator for the rows of the manually added papers.
new_rows = list()

In [None]:
# Look up each manually listed DOI on OpenAlex and append one metadata row
# per successfully resolved work to `new_rows`.

def _abstract_from_inverted_index(inverted_index):
    """Rebuild the abstract text from an OpenAlex ``abstract_inverted_index``.

    OpenAlex does not return a plain ``abstract`` string; it returns a mapping
    of word -> list of positions. Returns "" when the index is missing/empty.
    """
    if not inverted_index:
        return ''
    positioned_words = []
    for word, positions in inverted_index.items():
        for position in positions:
            positioned_words.append((position, word))
    positioned_words.sort()
    return " ".join(word for _, word in positioned_words)


def _author_names(work):
    """Return the non-empty author display names of an OpenAlex work."""
    names = []
    for authorship in work.get('authorships') or []:
        name = (authorship.get('author') or {}).get('display_name')
        if name:
            names.append(name)
    return names


def _paper_row(work, cord_uid):
    """Map an OpenAlex work dict onto the column layout used by the paper table."""
    # `host_venue` is deprecated in OpenAlex; prefer `primary_location.source`
    # and only fall back to the old field if it is still present.
    source = (work.get('primary_location') or {}).get('source') or work.get('host_venue') or {}
    return {
        'cord_uid': cord_uid,
        'title': work.get('title') or '',
        'doi': work.get('doi') or '',
        'abstract': _abstract_from_inverted_index(work.get('abstract_inverted_index')),
        'authors': "; ".join(_author_names(work)),
        'journal': source.get('display_name') or '',
        'publish_time': work.get('publication_date') or '',
    }


for doi in new_papers_dois:
    doi = doi.lower().strip()
    openalex_url = f"https://api.openalex.org/works/doi:{doi}"

    # A timeout keeps one stalled connection from hanging the whole cell;
    # network errors are treated like any other failed lookup.
    try:
        response = requests.get(openalex_url, timeout=30)
    except requests.RequestException:
        response = None
    if response is None or response.status_code != 200:
        print(f"Failed to fetch info for DOI: {doi}")
        continue

    data = response.json()

    # No CORD-19 id exists for these papers, so the OpenAlex id stands in for it.
    new_rows.append(_paper_row(data, data.get('id') or ''))

In [None]:
# Turn the collected row dicts into a table.
new_df = pd.DataFrame(data=new_rows)

In [None]:
# Stack the added papers under the original table and renumber the index.
full_df = pd.concat(objs=[df, new_df], ignore_index=True)

In [None]:
# Persist the combined table to Google Drive, without the index column.
full_df.to_csv(path_or_buf='/content/drive/MyDrive/fulldf_papers_5.csv', index=False)

In [None]:
# Expand the corpus through OpenAlex.
#
# For every paper in `full_df` that has a DOI: resolve it on OpenAlex, record
# its metadata, then fetch ALL works citing it (cursor-paginated) and keep the
# ones published on or after MIN_CITING_PUBLISH_DATE. The collected rows are
# written to extended_papers.csv at the end.

OPENALEX_WORKS_URL = "https://api.openalex.org/works"
REQUEST_TIMEOUT_SECONDS = 30
# OpenAlex allows at most 200 results per page; the default is only 25.
OPENALEX_PAGE_SIZE = 200
MIN_CITING_PUBLISH_DATE = "2020-01-01"
DOI_URL_PREFIXES = (
    "https://doi.org/",
    "http://doi.org/",
    "https://dx.doi.org/",
    "http://dx.doi.org/",
)


def _abstract_from_inverted_index(inverted_index):
    """Rebuild the abstract text from an OpenAlex ``abstract_inverted_index``.

    OpenAlex does not return a plain ``abstract`` string; it returns a mapping
    of word -> list of positions. Returns "" when the index is missing/empty.
    """
    if not inverted_index:
        return ''
    positioned_words = []
    for word, positions in inverted_index.items():
        for position in positions:
            positioned_words.append((position, word))
    positioned_words.sort()
    return " ".join(word for _, word in positioned_words)


def _author_names(work):
    """Return the non-empty author display names of an OpenAlex work."""
    names = []
    for authorship in work.get('authorships') or []:
        name = (authorship.get('author') or {}).get('display_name')
        if name:
            names.append(name)
    return names


def _paper_row(work, cord_uid):
    """Map an OpenAlex work dict onto the column layout used by the paper table."""
    # `host_venue` is deprecated in OpenAlex; prefer `primary_location.source`
    # and only fall back to the old field if it is still present.
    source = (work.get('primary_location') or {}).get('source') or work.get('host_venue') or {}
    return {
        'cord_uid': cord_uid,
        'title': work.get('title') or '',
        'doi': work.get('doi') or '',
        'abstract': _abstract_from_inverted_index(work.get('abstract_inverted_index')),
        'authors': "; ".join(_author_names(work)),
        'journal': source.get('display_name') or '',
        'publish_time': work.get('publication_date') or '',
    }


def _get_json(url, params=None):
    """GET ``url`` and return the decoded JSON body, or None on any failure."""
    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.json()


def _fetch_citing_works(openalex_id):
    """Return every work citing ``openalex_id``, following cursor pagination.

    Returns None when not even the first page could be fetched; if a later
    page fails, the works gathered so far are returned.
    """
    # OpenAlex ids are URLs ("https://openalex.org/W..."); the filter takes the key.
    short_id = openalex_id.rstrip('/').rsplit('/', 1)[-1]
    citing_works = []
    cursor = "*"
    while cursor:
        page = _get_json(
            OPENALEX_WORKS_URL,
            params={
                "filter": f"cites:{short_id}",
                "per-page": OPENALEX_PAGE_SIZE,
                "cursor": cursor,
            },
        )
        if page is None:
            return citing_works or None
        results = page.get('results') or []
        if not results:
            break
        citing_works.extend(results)
        cursor = (page.get('meta') or {}).get('next_cursor')
    return citing_works


def _normalize_doi(raw_doi):
    """Lower-case and trim a DOI and drop a leading doi.org resolver URL."""
    doi = str(raw_doi).lower().strip()
    for prefix in DOI_URL_PREFIXES:
        if doi.startswith(prefix):
            return doi[len(prefix):]
    return doi


def _expand_paper(cord_uid, doi):
    """Return rows for the paper behind ``doi`` and its recent citing papers."""
    rows = []

    data = _get_json(f"{OPENALEX_WORKS_URL}/doi:{doi}")
    if data is None:
        print(f"  -> Failed to fetch OpenAlex data for DOI: {doi}")
        return rows

    openalex_id = data.get('id', None)
    if openalex_id is None:
        print(f"  -> No OpenAlex ID found for DOI: {doi}")
        return rows

    rows.append(_paper_row(data, cord_uid))

    cited_by_count = data.get('cited_by_count') or 0
    print(f"  -> Found {cited_by_count} papers that cited this paper...")
    if cited_by_count <= 0:
        return rows

    citing_papers = _fetch_citing_works(openalex_id)
    if citing_papers is None:
        print(f"  -> Failed to fetch citing papers for OpenAlex ID: {openalex_id}")
        return rows
    print(f"    -> Retrieved {len(citing_papers)} citing papers.")

    for citing_paper in citing_papers:
        # ISO dates compare correctly as strings; a missing date ("") is excluded.
        publish_time = citing_paper.get('publication_date') or ''
        if publish_time >= MIN_CITING_PUBLISH_DATE:
            rows.append(_paper_row(citing_paper, citing_paper.get('id') or ''))

    return rows


total_papers = len(full_df)
for position, (_, row) in enumerate(full_df.iterrows(), start=1):
    # A missing title is NaN (a float) and cannot be sliced.
    title = row['title'] if isinstance(row['title'], str) else ''
    print(f"Processing paper {position}/{total_papers}: {title[:60]}...")

    doi = row['doi']
    if pd.isna(doi):
        continue

    doi = _normalize_doi(doi)
    if not doi:
        continue

    extended_papers.extend(_expand_paper(row['cord_uid'], doi))

    # Pause after every paper that hit the API, including failed lookups.
    time.sleep(1)


expanded_df = pd.DataFrame(extended_papers)
expanded_df.to_csv('/content/drive/MyDrive/extended_papers.csv', index=False)

Processing paper 1/44: Characterization of Angiotensin Converting Enzyme-...
  -> Found 25 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 2/44: CD-sACE2 Inclusion Compounds: An Effective Treatme...
  -> Found 44 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 3/44: Angiotensin-converting enzyme 2: implications for ...
  -> Found 66 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 4/44: Targeting tumour necrosis factor to ameliorate vir...
  -> Found 26 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 5/44: Ectodomain shedding of angiotensin converting enzy...
  -> Found 337 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 6/44: Single-cell RNA-sequencing reveals profibrotic rol...
  -> Found 37 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 7/44: Angiotensin-converting enzym