<a href="https://colab.research.google.com/github/danebencedavid/NLP-A-Agent/blob/master/graphdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import os

In [3]:
# Load the paper metadata table that the rest of the notebook works on.
papers_csv_path = "/content/drive/MyDrive/relevant_papers_4.csv"
df = pd.read_csv(papers_csv_path)

In [4]:
# Show the column names of the loaded frame.
df.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'smoking_score', 'covid_score', 'combined_score'],
      dtype='object')

In [8]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,smoking_score,covid_score,combined_score
0,0rmuvb5i,dc2348230cbe22e999d329c77b6b2183dd585ad0,PMC,Characterization of Angiotensin Converting Enz...,10.1007/s10989-006-9031-6,PMC7102017,32288695.0,no-cc,Angiotensin converting enzyme-2 (ACE2) is a re...,2006-05-05,...,,,,document_parses/pdf_json/dc2348230cbe22e999d32...,document_parses/pmc_json/PMC7102017.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,,0.660726,0.713468,0.679186
1,xkdmj1wd,,Medline,CD-sACE2 Inclusion Compounds: An Effective Tre...,10.1002/jmv.25804,,32232976.0,unk,ACE2 is a metalloproteinase and a homolog of c...,2020-03-31,...,,,,,,https://doi.org/10.1002/jmv.25804; https://www...,214751272.0,0.640098,0.740078,0.675091
2,yb3oohic,,Medline,Angiotensin-converting enzyme 2: implications ...,10.1097/mnh.0b013e32831b70ad,,19077694.0,unk,PURPOSE OF REVIEW Angiotensin-converting enzym...,2009,...,,,,,,https://doi.org/10.1097/mnh.0b013e32831b70ad; ...,32244185.0,0.661366,0.738021,0.688195
3,yqfujq75,,Medline,Targeting tumour necrosis factor to ameliorate...,10.1111/febs.15782,,33624419.0,unk,Pneumonia is a serious complication associated...,2021-02-23,...,,,,,,https://doi.org/10.1111/febs.15782; https://ww...,232036625.0,0.655438,0.669623,0.660402
4,sub4i9lr,,Medline,Ectodomain shedding of angiotensin converting ...,10.1152/ajplung.00071.2009,,19411314.0,unk,Angiotensin-converting enzyme 2 (ACE2) is a te...,2009,...,,,,,,https://doi.org/10.1152/ajplung.00071.2009; ht...,11581374.0,0.655254,0.669101,0.6601


In [9]:
def create_paper_node(paper):
    """Build the dict describing a paper node.

    Args:
        paper: Mapping-like row (e.g. a DataFrame row) that provides the
            "cord_uid", "title", "abstract", "doi" and "publish_time" entries.

    Returns:
        dict with the keys "paper_id", "title", "abstract", "doi" and
        "publish_date", in that order.
    """
    # (node key, source column) pairs; the order fixes the key order of the result.
    field_map = (
        ("paper_id", "cord_uid"),
        ("title", "title"),
        ("abstract", "abstract"),
        ("doi", "doi"),
        ("publish_date", "publish_time"),
    )
    return {node_key: paper[column] for node_key, column in field_map}

In [None]:
def create_author_node(author_name, institution_name=None):
    """Build the dict describing an author node.

    Args:
        author_name: Stored under the "name" key.
        institution_name: Optional; stored under "institution" only when truthy.

    Returns:
        dict with "name" and, when an institution was given, "institution".
    """
    # NOTE: the following cell defines create_author_node again; whichever
    # definition is executed last is the one in effect.
    if not institution_name:
        return {"name": author_name}
    return {"name": author_name, "institution": institution_name}

In [10]:
def create_author_node(author_name, institution_name=None):
    """Build the dict describing an author node.

    Args:
        author_name: Stored under the "name" key.
        institution_name: Optional; stored under "institution" only when truthy.

    Returns:
        dict with "name" and, when an institution was given, "institution".
    """
    node = dict(name=author_name)
    if institution_name:
        node.update(institution=institution_name)
    return node

def create_keyword_node(keyword_name, frequency):
    """Build the dict describing a keyword node.

    Args:
        keyword_name: Stored under the "name" key.
        frequency: Stored under the "frequency" key.

    Returns:
        dict with the keys "name" and "frequency".
    """
    return dict(name=keyword_name, frequency=frequency)

def create_institution_node(institution_name):
    """Build the dict describing an institution node.

    Args:
        institution_name: Stored under the "name" key.

    Returns:
        dict with the single key "name".
    """
    return dict(name=institution_name)

def create_paper_cites_paper_relationship(citing_paper_id, cited_paper_id):
    """Build the edge record for "paper cites paper".

    Args:
        citing_paper_id: Stored under "source".
        cited_paper_id: Stored under "target".

    Returns:
        dict with the keys "source" and "target".
    """
    return dict(source=citing_paper_id, target=cited_paper_id)

def create_author_wrote_paper_relationship(author_name, paper_id):
    """Build the edge record for "author wrote paper".

    Args:
        author_name: Stored under "source".
        paper_id: Stored under "target".

    Returns:
        dict with the keys "source" and "target".
    """
    return dict(source=author_name, target=paper_id)

def create_author_affiliated_with_institution_relationship(author_name, institution_name):
    """Build the edge record for "author affiliated with institution".

    Args:
        author_name: Stored under "source".
        institution_name: Stored under "target".

    Returns:
        dict with the keys "source" and "target".
    """
    return dict(source=author_name, target=institution_name)

def create_paper_has_keyword_relationship(paper_id, keyword_name):
    """Build the edge record for "paper has keyword".

    Args:
        paper_id: Stored under "source".
        keyword_name: Stored under "target".

    Returns:
        dict with the keys "source" and "target".
    """
    return dict(source=paper_id, target=keyword_name)

In [11]:
# Accumulators for node records (one dict per node).
paper_nodes, author_nodes, keyword_nodes, institution_nodes = [], [], [], []

# Accumulators for relationship records (one dict per edge).
(
    paper_cites_paper_relationships,
    author_wrote_paper_relationships,
    author_affiliated_with_institution_relationships,
    paper_has_keyword_relationships,
) = [], [], [], []

In [12]:
# Names already turned into nodes, used to avoid emitting duplicate nodes.
unique_authors, unique_keywords, unique_institutions = set(), set(), set()

In [13]:
# Walk every row of `df` and fill the node / relationship accumulators
# (the lists and "seen" sets created in the cells above).

# Empty the accumulators first so that re-running this cell rebuilds the
# records instead of appending a second copy of everything.
for _records in (
    paper_nodes,
    author_nodes,
    keyword_nodes,
    institution_nodes,
    paper_cites_paper_relationships,
    author_wrote_paper_relationships,
    author_affiliated_with_institution_relationships,
    paper_has_keyword_relationships,
):
    _records.clear()
for _seen in (unique_authors, unique_keywords, unique_institutions):
    _seen.clear()

# keyword -> its node dict, so the node's frequency can be increased in place
# when the keyword shows up again in a later abstract.
keyword_node_by_name = {}

for index, paper in df.iterrows():

    paper_nodes.append(create_paper_node(paper))

    # --- Authors and their "institutions" -------------------------------
    # str() of a missing (NaN) cell is the literal "nan", which would become a
    # bogus author / institution / keyword; treat missing cells as empty.
    authors = [] if pd.isna(paper["authors"]) else str(paper["authors"]).split(";")
    # NOTE(review): institutions are taken from the ";"-separated `journal`
    # column and matched to authors by position — confirm this is intended.
    if "journal" in paper and not pd.isna(paper["journal"]):
        affiliations = str(paper["journal"]).split(";")
    else:
        affiliations = []
    for i, author in enumerate(authors):
        author = author.strip()
        if not author:
            continue
        # `i` indexes the unfiltered author list, so positions stay aligned
        # with `affiliations` even when empty author entries are skipped.
        institution_name = affiliations[i].strip() if i < len(affiliations) else None
        author_node = create_author_node(author, institution_name)
        if author_node["name"] not in unique_authors:
            author_nodes.append(author_node)
            unique_authors.add(author_node["name"])
        author_wrote_paper_relationships.append(create_author_wrote_paper_relationship(author, paper["cord_uid"]))

        if institution_name:
            if institution_name not in unique_institutions:
                institution_nodes.append(create_institution_node(institution_name))
                unique_institutions.add(institution_name)
            author_affiliated_with_institution_relationships.append(
                create_author_affiliated_with_institution_relationship(author, institution_name)
            )

    # --- Keywords: every distinct lower-cased word of the abstract ---------
    abstract_text = "" if pd.isna(paper["abstract"]) else str(paper["abstract"]).lower()
    words = re.findall(r'\b\w+\b', abstract_text)
    word_counts = Counter(words)
    for keyword, count in word_counts.items():
        keyword_node = keyword_node_by_name.get(keyword)
        if keyword_node is None:
            keyword_node = create_keyword_node(keyword, count)
            keyword_node_by_name[keyword] = keyword_node
            keyword_nodes.append(keyword_node)
            unique_keywords.add(keyword)
        else:
            # Accumulate occurrences over all abstracts rather than keeping
            # only the count from the first abstract that used the word.
            keyword_node["frequency"] += count
        paper_has_keyword_relationships.append(create_paper_has_keyword_relationship(paper["cord_uid"], keyword))

    # --- "Cites" edges -----------------------------------------------------
    # NOTE(review): the targets come from `pubmed_id`, which looks like this
    # paper's own identifier rather than a list of cited papers — confirm.
    # Numeric ids are stringified as-is (e.g. 32288695.0 -> "32288695.0").
    if "pubmed_id" in paper and not pd.isna(paper["pubmed_id"]):
        cited_list = str(paper["pubmed_id"]).split(";")
        for cited_paper_id in cited_list:
            cited_paper_id = cited_paper_id.strip()
            if cited_paper_id:
                paper_cites_paper_relationships.append(create_paper_cites_paper_relationship(paper["cord_uid"], cited_paper_id))


In [14]:
# Node tables: one DataFrame per list of node dicts.
paper_nodes_df, author_nodes_df, keyword_nodes_df, institution_nodes_df = (
    pd.DataFrame(records)
    for records in (paper_nodes, author_nodes, keyword_nodes, institution_nodes)
)

# Relationship tables: one DataFrame per list of edge dicts.
(
    paper_cites_paper_relationships_df,
    author_wrote_paper_relationships_df,
    author_affiliated_with_institution_relationships_df,
    paper_has_keyword_relationships_df,
) = (
    pd.DataFrame(records)
    for records in (
        paper_cites_paper_relationships,
        author_wrote_paper_relationships,
        author_affiliated_with_institution_relationships,
        paper_has_keyword_relationships,
    )
)

In [17]:
# Write every node / relationship table to its own CSV file under `output_path`.
output_path = "/content/drive/MyDrive/neo4j_data/"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(output_path, exist_ok=True)

# file name -> table to write; files are written in this order, without the index.
csv_exports = {
    "paper_nodes.csv": paper_nodes_df,
    "author_nodes.csv": author_nodes_df,
    "keyword_nodes.csv": keyword_nodes_df,
    "institution_nodes.csv": institution_nodes_df,
    "paper_cites_paper_relationships.csv": paper_cites_paper_relationships_df,
    "author_wrote_paper_relationships.csv": author_wrote_paper_relationships_df,
    "author_affiliated_with_institution_relationships.csv": author_affiliated_with_institution_relationships_df,
    "paper_has_keyword_relationships.csv": paper_has_keyword_relationships_df,
}
for file_name, table in csv_exports.items():
    table.to_csv(os.path.join(output_path, file_name), index=False)

print("CSVs files are saved in the following location:")
print(output_path)


CSVs files are saved in the following location:
/content/drive/MyDrive/neo4j_data/
