<a href="https://colab.research.google.com/github/danebencedavid/NLP-A-Agent/blob/master/graphdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import os
import requests
import time
import html
import urllib.parse
from nltk.corpus import stopwords

In [64]:
import nltk
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
df = pd.read_csv("/content/drive/MyDrive/relevant_papers_4.csv")

In [None]:
df.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'smoking_score', 'covid_score', 'combined_score'],
      dtype='object')

In [None]:
new_papers_dois = [
    '10.1371/journal.pone.0287794',
    '10.4103/ecdt.ecdt_106_22',
    '10.1186/s12887-025-05434-w',
    '10.3390/ijerph20042768',
    '10.3390/healthcare10020303',
    '10.1371/journal.pone.0295040',
    '10.3390/covid3100109',
    '10.32920/ihtp.v4i2.2148',
    '10.18203/2320-6012.ijrms20231341',
    '10.3390/brainsci14040377',
    '10.37796/2211-8039.1429',
    '10.18502/ijdo.v15i3.13739',
    '10.1136/bmjopen-2021-052777',
    '10.21203/rs.3.rs-3849240/v1',
    '10.1017/gmh.2023.47',
    '10.1007/s13300-024-01681-9'
]

In [None]:
new_rows = []

In [None]:
for doi in new_papers_dois:
    doi = doi.lower().strip()
    openalex_url = f"https://api.openalex.org/works/doi:{doi}"

    response = requests.get(openalex_url)
    if response.status_code != 200:
        print(f"Failed to fetch info for DOI: {doi}")
        continue

    data = response.json()

    new_rows.append({
        'cord_uid': data.get('id', ''),
        'title': data.get('title', ''),
        'doi': data.get('doi', ''),
        'abstract': data.get('abstract', ''),
        'authors': "; ".join([auth['author']['display_name'] for auth in data.get('authorships', [])]),
        'journal': data.get('host_venue', {}).get('display_name', ''),
        'publish_time': data.get('publication_date', ''),
    })

In [None]:
new_df = pd.DataFrame(new_rows)

In [None]:
full_df = pd.concat([df, new_df], ignore_index=True)

In [None]:
full_df.to_csv('/content/drive/MyDrive/fulldf_papers_5.csv', index=False)

In [24]:
papers = pd.read_csv('/content/drive/MyDrive/fulldf_papers.csv')

In [25]:
def clean_cord_uid(uid):
  if isinstance(uid, str) and "https://openalex.org/" in uid:
      return uid.split("/")[-1]
  return uid

In [26]:
papers['cord_uid'] = papers['cord_uid'].apply(clean_cord_uid)

In [27]:
duplicate_title_count = papers['title'].duplicated().sum()

In [28]:
print(f"Number of duplicate titles: {duplicate_title_count}")

Number of duplicate titles: 9


In [29]:
papers = papers.sort_values(by='doi',na_position='first')

In [30]:
papers = papers.drop_duplicates(subset='title', keep='last')

In [31]:
duplicate_title_count = papers['title'].duplicated().sum()
print(f"Number of duplicate titles: {duplicate_title_count}")

Number of duplicate titles: 0


In [32]:
len(papers['doi'])

51

In [33]:
papers = papers[papers['doi'].notnull()]

In [34]:
len(papers['doi'])

42

In [35]:
all_extended_papers = []

In [36]:
papers.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'smoking_score', 'covid_score', 'combined_score',
       'diabetes_score'],
      dtype='object')

In [37]:
def reconstruct_abstract(abstract_inverted_index):
    """Reconstructs an abstract string from an OpenAlex inverted index."""
    if not abstract_inverted_index:
        return ""
    max_index = max((idx for indices in abstract_inverted_index.values() for idx in indices), default=-1)
    abstract_words = [""] * (max_index + 1)
    for word, indices in abstract_inverted_index.items():
        for idx in indices:
            abstract_words[idx] = word
    return " ".join(abstract_words)

In [38]:
def fetch_openalex_data(doi):
    """Fetches data from OpenAlex for a given DOI."""
    doi = doi.lower().strip()
    openalex_url = f"https://api.openalex.org/works/doi:{doi}"
    response = requests.get(openalex_url)
    if response.status_code != 200:
        print(f"  -> Failed to fetch OpenAlex data for DOI: {doi}")
        return None
    return response.json()

In [39]:
def process_paper(row):
    """Processes a single paper and its citing papers."""
    extended_papers = []
    doi = row.get('doi', None)
    if pd.isna(doi) or not doi:
        print(f"  -> DOI is missing for paper: {row['cord_uid']}")
        return []

    openalex_data = fetch_openalex_data(doi)
    if not openalex_data:
        return []
    openalex_id = openalex_data.get('id', None)
    if openalex_id is None:
        print(f"  -> No OpenAlex ID found for DOI: {doi}")
        return []

    paper_data = {
        'cord_uid': row.get('cord_uid', ''),
        'openalex_id': openalex_id,
        'title': openalex_data.get('title', ''),
        'doi': openalex_data.get('doi', ''),
        'abstract': reconstruct_abstract(openalex_data.get('abstract_inverted_index', {})),
        'authors': "; ".join([auth.get('author', {}).get('display_name', '') for auth in openalex_data.get('authorships', [])]),
        'journal': openalex_data.get('host_venue', {}).get('display_name', ''),
        'publish_time': openalex_data.get('publication_date', ''),
        'cited_id': None
    }
    extended_papers.append(paper_data)

    cited_by_count = openalex_data.get('cited_by_count', 0)
    print(f"  -> Found {cited_by_count} papers that cited this paper...")

    if cited_by_count > 0:
        citing_query_url = f"https://api.openalex.org/works?filter=cites:{openalex_id}"
        citing_response = requests.get(citing_query_url)
        if citing_response.status_code == 200:
            citing_data = citing_response.json()
            citing_papers = citing_data.get('results', [])
            print(f"    -> Retrieved {len(citing_papers)} citing papers.")
            for citing_paper in citing_papers:
                publish_time = citing_paper.get('publication_date', '')
                if publish_time and publish_time >= "2022-01-01":
                    citing_paper_data = {
                        'cord_uid': citing_paper.get('id', ''),
                        'openalex_id': citing_paper.get('id', ''),
                        'title': citing_paper.get('title', ''),
                        'doi': citing_paper.get('doi', ''),
                        'abstract': reconstruct_abstract(citing_paper.get('abstract_inverted_index', {})),
                        'authors': "; ".join([auth.get('author', {}).get('display_name', '') for auth in citing_paper.get('authorships', [])]),
                        'journal': citing_paper.get('host_venue', {}).get('display_name', ''),
                        'publish_time': publish_time,
                        'cited_id': openalex_data.get('doi',
                                                    '')
                    }
                    extended_papers.append(citing_paper_data)
        else:
            print(f"  -> Failed to fetch citing papers for OpenAlex ID: {openalex_id}")
    return extended_papers

In [40]:
print(f"Total papers to process: {len(papers)}")
for idx, row in papers.iterrows():
    print(f"Processing paper {idx + 1}/{len(papers)}: {row['title'][:41]}...")
    extended_papers = process_paper(row)
    all_extended_papers.extend(extended_papers)
    time.sleep(1)
expanded_df = pd.DataFrame(all_extended_papers)
print(expanded_df.head())

Total papers to process: 42
Processing paper 7/42: CD-sACE2 Inclusion Compounds: An Effectiv...
  -> Found 44 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 15/42: Measurement of Angiotensin Converting Enz...
  -> Found 42 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 28/42: What’s new in the renin-angiotensin syste...
  -> Found 147 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 59/42: The interplay of DAMPs, TLR4, and proinfl...
  -> Found 71 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 10/42: The soluble catalytic ectodomain of ACE2 ...
  -> Found 15 papers that cited this paper...
    -> Retrieved 15 citing papers.
Processing paper 14/42: Characterization of Angiotensin Convertin...
  -> Found 25 papers that cited this paper...
    -> Retrieved 25 citing papers.
Processing paper 6/42: Pharmacologic modulation of ACE2 expressi...
  ->

In [41]:
expanded_df = expanded_df[expanded_df['cited_id'].notnull()]

In [42]:
len(expanded_df)

216

In [43]:
(expanded_df['abstract'] != "").sum()

np.int64(177)

In [44]:
expanded_df = expanded_df[expanded_df['abstract'] != ""]

In [45]:
len(expanded_df)

177

In [47]:
expanded_df.to_csv('/content/drive/MyDrive/extended_papers_with_diabetes.csv', index=False)

In [2]:
output_path = "/content/drive/MyDrive/neo4j_data/"

In [3]:
expanded_df = pd.read_csv("/content/drive/MyDrive/extended_papers_with_diabetes.csv")

In [77]:
paper_nodes = []
author_nodes = []
keyword_nodes = []
author_wrote_paper_relationships = []
paper_has_keyword_relationships = []
paper_cites_paper_relationships = []

unique_papers = set()
unique_authors = set()
unique_keywords = set()

In [90]:
def normalize_doi(doi):
    """Normalize DOI formats"""
    if not doi or pd.isna(doi) or str(doi).lower() == 'nan':
        return None
    doi = str(doi).lower().strip()
    for prefix in ['https://doi.org/', 'http://doi.org/', 'doi:', 'doi.org/']:
        doi = doi.replace(prefix, '')
    return doi.split('?')[0].split('#')[0]

def process_paper(paper, is_citing=False):
    """Process paper with original citation mapping preserved"""
    paper_id = paper.get("cord_uid") or paper.get("openalex_id") or paper.get("mag_id")
    if not paper_id or paper_id in unique_papers:
        return None, None

    unique_papers.add(paper_id)

    source = "citing" if is_citing else "original"
    cited_doi = normalize_doi(paper.get("cited_id")) if is_citing else None

    paper_nodes.append({
        "paper_id": paper_id,
        "title": paper.get("title", ""),
        "abstract": paper.get("abstract", ""),
        "doi": normalize_doi(paper.get("doi")),
        "publish_date": paper.get("publish_time", ""),
        "journal": paper.get("journal", ""),
        "source": source,
        "cited_paper_doi": cited_doi
    })

    authors_raw = paper.get("authors", "")
    authors = [a.strip() for a in authors_raw.split(";") if a.strip()]
    for author_name in authors:
        decoded_author_name = html.unescape(author_name)
        if decoded_author_name not in unique_authors:
            author_nodes.append({
                "author_id": f"auth_{len(unique_authors)+1}",
                "author_name": decoded_author_name
            })
            unique_authors.add(decoded_author_name)
        author_wrote_paper_relationships.append({
            "author_name": decoded_author_name,
            "paper_id": paper_id
        })

    abstract_text = str(paper.get("abstract", "")).lower()
    words = re.findall(r'\b[a-z]{3,}\b', abstract_text)
    filtered_words = [word for word in words if word not in stop_words]
    word_counts = Counter(filtered_words)

    for keyword, count in word_counts.most_common(10):
        if keyword not in unique_keywords:
            keyword_nodes.append({
                "keyword_name": keyword,
                "frequency": count
            })
            unique_keywords.add(keyword)
        paper_has_keyword_relationships.append({
            "paper_id": paper_id,
            "keyword_name": keyword
        })

    return paper_id, cited_doi

In [92]:
doi_to_id = {}
for paper in paper_nodes:
    if paper['doi']:
        doi_to_id[paper['doi']] = paper['paper_id']

In [91]:
print("Processing original papers...")
for _, paper in papers.iterrows():
    process_paper(paper, is_citing=False)

Processing original papers...


In [93]:
print("Processing citing papers...")
citation_issues = []
for _, paper in expanded_df.iterrows():
    paper_id, cited_doi = process_paper(paper, is_citing=True)

    if cited_doi and paper_id:
        cited_paper_id = doi_to_id.get(cited_doi)
        if cited_paper_id:
            paper_cites_paper_relationships.append({
                "citing_paper_id": paper_id,
                "cited_paper_id": cited_paper_id,
                "source_doi": normalize_doi(paper.get("doi")),
                "target_doi": cited_doi
            })
        else:
            citation_issues.append(f"Cited paper not found for DOI: {cited_doi}")

Processing citing papers...


In [94]:
print(f"\nFinal Statistics:")
print(f"Total papers: {len(paper_nodes)}")
print(f"  - Original: {len([p for p in paper_nodes if p['source'] == 'original'])}")
print(f"  - Citing: {len([p for p in paper_nodes if p['source'] == 'citing'])}")
print(f"Total authors: {len(author_nodes)}")
print(f"Total keywords: {len(keyword_nodes)}")
print(f"Successful citation relationships: {len(paper_cites_paper_relationships)}")
print(f"Citation issues: {len(citation_issues)}")

if citation_issues:
    print("\nSample citation issues:")
    for issue in citation_issues[:5]:
        print(issue)


Final Statistics:
Total papers: 218
  - Original: 42
  - Citing: 176
Total authors: 1751
Total keywords: 861
Successful citation relationships: 176
Citation issues: 0


In [95]:
pd.DataFrame(paper_nodes).to_csv(os.path.join(output_path, "paper_nodes.csv"), index=False)
pd.DataFrame(author_nodes).to_csv(os.path.join(output_path, "author_nodes.csv"), index=False)
pd.DataFrame(keyword_nodes).to_csv(os.path.join(output_path, "keyword_nodes.csv"), index=False)
pd.DataFrame(author_wrote_paper_relationships).to_csv(
    os.path.join(output_path, "author_wrote_paper_relationships.csv"), index=False)
pd.DataFrame(paper_has_keyword_relationships).to_csv(
    os.path.join(output_path, "paper_has_keyword_relationships.csv"), index=False)
pd.DataFrame(paper_cites_paper_relationships).to_csv(
    os.path.join(output_path, "paper_cites_paper_relationships.csv"), index=False)
