<a href="https://colab.research.google.com/github/danebencedavid/NLP-A-Agent/blob/master/graphdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import os
import requests
import time

In [8]:
df = pd.read_csv("/content/drive/MyDrive/relevant_papers_4.csv")

In [9]:
df.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'smoking_score', 'covid_score', 'combined_score'],
      dtype='object')

In [None]:
new_papers_dois = [
    '10.1371/journal.pone.0287794',
    '10.4103/ecdt.ecdt_106_22',
    '10.1186/s12887-025-05434-w',
    '10.3390/ijerph20042768',
    '10.3390/healthcare10020303',
    '10.1371/journal.pone.0295040',
    '10.3390/covid3100109',
    '10.32920/ihtp.v4i2.2148',
    '10.18203/2320-6012.ijrms20231341',
    '10.3390/brainsci14040377',
    '10.37796/2211-8039.1429',
    '10.18502/ijdo.v15i3.13739',
    '10.1136/bmjopen-2021-052777',
    '10.21203/rs.3.rs-3849240/v1',
    '10.1017/gmh.2023.47',
    '10.1007/s13300-024-01681-9'
]

In [11]:
new_rows = []

In [None]:
for doi in new_papers_dois:
    doi = doi.lower().strip()
    openalex_url = f"https://api.openalex.org/works/doi:{doi}"

    response = requests.get(openalex_url)
    if response.status_code != 200:
        print(f"Failed to fetch info for DOI: {doi}")
        continue

    data = response.json()

    new_rows.append({
        'cord_uid': data.get('id', ''),
        'title': data.get('title', ''),
        'doi': data.get('doi', ''),
        'abstract': data.get('abstract', ''),
        'authors': "; ".join([auth['author']['display_name'] for auth in data.get('authorships', [])]),
        'journal': data.get('host_venue', {}).get('display_name', ''),
        'publish_time': data.get('publication_date', ''),
    })

In [None]:
new_df = pd.DataFrame(new_rows)

In [None]:
full_df = pd.concat([df, new_df], ignore_index=True)

In [None]:
full_df.to_csv('/content/drive/MyDrive/fulldf_papers_5.csv', index=False)

In [12]:
papers = pd.read_csv('/content/drive/MyDrive/fulldf_papers.csv')

In [59]:
doi_nan_rows = papers[papers['doi'].isna()]
print(doi_nan_rows[['title','doi','cord_uid']])

Empty DataFrame
Columns: [title, doi, cord_uid]
Index: []


In [64]:
papers.head()['doi']

Unnamed: 0,doi
0,10.1111/obr.13225
1,10.1111/obr.13225
2,10.1111/obr.13225
3,10.3389/fmolb.2020.588618
4,10.3390/molecules27051740


In [68]:
all_extended_papers = []

In [65]:
def reconstruct_abstract(abstract_inverted_index):
    """Reconstructs an abstract string from an OpenAlex inverted index."""
    if not abstract_inverted_index:
        return ""
    max_index = max((idx for indices in abstract_inverted_index.values() for idx in indices), default=-1)
    abstract_words = [""] * (max_index + 1)
    for word, indices in abstract_inverted_index.items():
        for idx in indices:
            abstract_words[idx] = word
    return " ".join(abstract_words)

In [66]:
def fetch_openalex_data(doi):
    """Fetches data from OpenAlex for a given DOI."""
    doi = doi.lower().strip()
    openalex_url = f"https://api.openalex.org/works/doi:{doi}"
    response = requests.get(openalex_url)
    if response.status_code != 200:
        print(f"  -> Failed to fetch OpenAlex data for DOI: {doi}")
        return None
    return response.json()

In [67]:
def process_paper(row):
    """Processes a single paper and its citing papers."""
    extended_papers = []
    doi = row.get('doi', None)
    if pd.isna(doi) or not doi:
        print(f"  -> DOI is missing for paper: {row['cord_uid']}")
        return []

    openalex_data = fetch_openalex_data(doi)
    if not openalex_data:
        return []
    openalex_id = openalex_data.get('id', None)
    if openalex_id is None:
        print(f"  -> No OpenAlex ID found for DOI: {doi}")
        return []

    # 1. Add the original paper's data.  Crucially, cited_id is NULL here.
    paper_data = {
        'cord_uid': row.get('cord_uid', ''),
        'openalex_id': openalex_id,
        'title': openalex_data.get('title', ''),
        'doi': openalex_data.get('doi', ''),
        'abstract': reconstruct_abstract(openalex_data.get('abstract_inverted_index', {})),
        'authors': "; ".join([auth.get('author', {}).get('display_name', '') for auth in openalex_data.get('authorships', [])]),
        'journal': openalex_data.get('host_venue', {}).get('display_name', ''),
        'publish_time': openalex_data.get('publication_date', ''),
        'cited_id': None  # Original papers don't have a cited_id
    }
    extended_papers.append(paper_data)

    cited_by_count = openalex_data.get('cited_by_count', 0)
    print(f"  -> Found {cited_by_count} papers that cited this paper...")

    if cited_by_count > 0:
        citing_query_url = f"https://api.openalex.org/works?filter=cites:{openalex_id}"
        citing_response = requests.get(citing_query_url)
        if citing_response.status_code == 200:
            citing_data = citing_response.json()
            citing_papers = citing_data.get('results', [])
            print(f"    -> Retrieved {len(citing_papers)} citing papers.")
            for citing_paper in citing_papers:
                publish_time = citing_paper.get('publication_date', '')
                if publish_time and publish_time >= "2020-01-01":
                    # 2. Add the citing paper's data.  cited_id is the ORIGINAL paper's DOI.
                    citing_paper_data = {
                        'cord_uid': citing_paper.get('id', ''),
                        'openalex_id': citing_paper.get('id', ''),
                        'title': citing_paper.get('title', ''),
                        'doi': citing_paper.get('doi', ''),
                        'abstract': reconstruct_abstract(citing_paper.get('abstract_inverted_index', {})),
                        'authors': "; ".join([auth.get('author', {}).get('display_name', '') for auth in citing_paper.get('authorships', [])]),
                        'journal': citing_paper.get('host_venue', {}).get('display_name', ''),
                        'publish_time': publish_time,
                        'cited_id': openalex_data.get('doi',
                                                    '')  # <--- Store the DOI of the paper being cited
                    }
                    extended_papers.append(citing_paper_data)
        else:
            print(f"  -> Failed to fetch citing papers for OpenAlex ID: {openalex_id}")
    return extended_papers

In [69]:
print(f"Total papers to process: {len(papers)}")
for idx, row in papers.iterrows():
    print(f"Processing paper {idx + 1}/{len(papers)}: {row['title'][:65]}...")
    extended_papers = process_paper(row)
    all_extended_papers.extend(extended_papers)
    time.sleep(1)
expanded_df = pd.DataFrame(all_extended_papers)
print(expanded_df.head())

Total papers to process: 60
Processing paper 1/60: Dual role for angiotensin-converting enzyme 2 in Severe Acute Res...
  -> Found 7 papers that cited this paper...
    -> Retrieved 7 citing papers.
Processing paper 2/60: Dual role for angiotensin-converting enzyme 2 in Severe Acute Res...
  -> Found 7 papers that cited this paper...
    -> Retrieved 7 citing papers.
Processing paper 3/60: Dual role for angiotensin‐converting enzyme 2 in Severe Acute Res...
  -> Found 7 papers that cited this paper...
    -> Retrieved 7 citing papers.
Processing paper 4/60: ACE2 in the Era of SARS-CoV-2: Controversies and Novel Perspectiv...
  -> Found 98 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 5/60: Virtual Screening of Natural Chemical Databases to Search for Pot...
  -> Found 2 papers that cited this paper...
    -> Retrieved 2 citing papers.
Processing paper 6/60: Pharmacologic modulation of ACE2 expression...
  -> Found 87 papers that cited this paper...

In [72]:
expanded_df = expanded_df[expanded_df['cited_id'].notnull()]

In [73]:
len(expanded_df)

724

In [79]:
(expanded_df['abstract'] != "").sum()

np.int64(588)

In [80]:
expanded_df = expanded_df[expanded_df['abstract'] != ""]

In [81]:
len(expanded_df)

588

In [82]:
expanded_df.to_csv('/content/drive/MyDrive/extended_papers_with_diabetes.csv', index=False)

In [83]:
output_path = "/content/drive/MyDrive/neo4j_data/"

In [84]:
paper_nodes = []
author_nodes = []
keyword_nodes = []

In [85]:
author_wrote_paper_relationships = []
paper_has_keyword_relationships = []
paper_cites_paper_relationships = []

In [86]:
unique_papers = set()
unique_authors = set()
unique_keywords = set()

In [87]:
for idx, paper in expanded_df.iterrows():
    paper_id = paper.get("cord_uid") or paper.get("openalex_id")
    if not paper_id or paper_id in unique_papers:
        continue
    unique_papers.add(paper_id)

    paper_nodes.append({
        "paper_id": paper_id,
        "title": paper.get("title", ""),
        "abstract": paper.get("abstract", ""),
        "doi": paper.get("doi", ""),
        "publish_date": paper.get("publish_time", "")
    })

    authors_raw = paper.get("authors", "")
    authors = [a.strip() for a in authors_raw.split(";") if a.strip()]
    for author_name in authors:
        if author_name not in unique_authors:
            author_nodes.append({
                "author_name": author_name
            })
            unique_authors.add(author_name)

        author_wrote_paper_relationships.append({
            "author_name": author_name,
            "paper_id": paper_id
        })

    abstract_text = str(paper.get("abstract", "")).lower()
    words = re.findall(r'\b\w+\b', abstract_text)
    word_counts = Counter(words)

    for keyword, count in word_counts.items():
        if len(keyword) <= 2:
            continue
        if keyword not in unique_keywords:
            keyword_nodes.append({
                "keyword_name": keyword,
                "frequency": count
            })
            unique_keywords.add(keyword)

        paper_has_keyword_relationships.append({
            "paper_id": paper_id,
            "keyword_name": keyword
        })

    cited_paper_id = paper.get("cited_id")
    if paper_id and cited_paper_id:
        paper_cites_paper_relationships.append({
            "citing_paper_id": paper_id,
            "cited_paper_id": cited_paper_id
        })

In [88]:
pd.DataFrame(paper_nodes).to_csv(os.path.join(output_path, "paper_nodes.csv"), index=False)
pd.DataFrame(author_nodes).to_csv(os.path.join(output_path, "author_nodes.csv"), index=False)
pd.DataFrame(keyword_nodes).to_csv(os.path.join(output_path, "keyword_nodes.csv"), index=False)

pd.DataFrame(author_wrote_paper_relationships).to_csv(os.path.join(output_path, "author_wrote_paper_relationships.csv"), index=False)
pd.DataFrame(paper_has_keyword_relationships).to_csv(os.path.join(output_path, "paper_has_keyword_relationships.csv"), index=False)
pd.DataFrame(paper_cites_paper_relationships).to_csv(os.path.join(output_path, "paper_cites_paper_relationships.csv"), index=False)

print(f"CSV files saved in {output_path}")

CSV files saved in /content/drive/MyDrive/neo4j_data/
