# SDM - Neo4j Testing

In [1]:
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
#from ..src.neo4j_conn import Neo4jConnection

# change cwd to project root
os.chdir('../')

# Load secrets from .env
load_dotenv(dotenv_path='./env/.env')

# instantiate neo4j credentials
URI = os.environ['NEO4J_URI']
AUTH = (os.environ['NEO4J_USERNAME'], os.environ['NEO4J_PASSWORD'])
DB_NAME = os.environ['DB_NAME']

In [86]:
import os

directory = os.getcwd()
print(directory)

### Neo4j Connectivity Verification

In [2]:
# main() testing
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    try:
        driver.verify_connectivity()
        print('Neo4j Driver Connectivity Verified!')
    except Exception as e:
        print('Neo4j Driver Unavailable!', e)

Neo4j Driver Connectivity Verified!


### Custom write transaction

In [None]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        with 

### Executing Cypher Queries

In [None]:
# Sample query from neo4j docs
'''def match_person_nodes(tx, age):
    result = tx.run(
        "MATCH (p:Person {age: $age}) RETURN p.name AS name",
        age=age)
    records = list(result)
    summary = result.consume()
    return records, summary

with driver.session(database="neo4j") as session:
    records, summary = session.execute_read(match_person_nodes, age=42)

# Summary information
print("The query `{query}` returned {records_count} records in {time} ms.".format(
    query=summary.query, records_count=len(records),
    time=summary.result_available_after,
))

# Loop through results and do something with them
for person in records:
    print(person)'''

# Synthetic Data Generation

In [34]:
import pandas as pd
import numpy as np
import random

authors_df = pd.read_csv('../data/synthetic/authors.csv', delimiter=';')
papers_df = pd.read_csv('../data/synthetic/papers.csv', delimiter=';')
conferences_df = pd.read_csv('../data/synthetic/conferences.csv', delimiter=';')
journals_df = pd.read_csv('../data/synthetic/journals.csv', delimiter=';')


In [3]:
papers_df.head()

Unnamed: 0,id,title,abstract,keywords,topic
0,1,Effects of Sleep Deprivation on Cognitive Func...,This paper explores the effects of sleep depri...,"sleep, deprivation, cognitive function, attent...",Psychology
1,2,The Role of Exercise in Managing Chronic Pain,This paper examines the role of exercise in ma...,"exercise, chronic pain, pain, perception, phys...",Physical Therapy
2,3,Impact of Social Media on Adolescent Mental He...,This paper investigates the impact of social m...,"social media, adolescent, mental health, anxie...",Psychiatry
3,4,Development of a Machine Learning Algorithm fo...,This paper describes the development of a mach...,"machine learning algorithm, heart disease, pre...",Cardiology
4,5,Assessment of Greenhouse Gas Emissions from Ur...,This paper assesses greenhouse gas emissions f...,"greenhouse, gas emissions, urban transportatio...",Environmental Science


# Creating Node Relationships

To Do:
- create keywords.csv
    - contains unique keyword name along with id
- create topics.csv
    - contains unique topic name along with id
- add institution to authors.csv
    - random university or company along with id
- add random reviewer decision as edge attribute
    - this will be included in reviewed_by.csv


## Paper Relationships

In [33]:
# written_by
paper_id_list = [paper for paper in papers_df['id']]
author_id_list = [authors_df.sample(n=1)['id'].iloc[0] for i in range(len(paper_id_list))]

# coAuthored_by
co_author_id_list = []
for i in range(len(paper_id_list)):
    author_sample = authors_df.sample(n=1)['id'].iloc[0]
    while author_sample == author_id_list[i]:
        author_sample = authors_df.sample(n=1)['id'].iloc[0]

    co_author_id_list.append(author_sample)

# reviewer1
reviewer1_id_list = []
for i in range(len(paper_id_list)):
    reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]
    while reviewer_sample in [author_id_list[i], co_author_id_list[i]]:
        reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]

    reviewer1_id_list.append(reviewer_sample)

# reviewer2
reviewer2_id_list = []
for i in range(len(paper_id_list)):
    reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]
    while reviewer_sample in [author_id_list[i], co_author_id_list[i], reviewer1_id_list[i]]:
        reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]

    reviewer2_id_list.append(reviewer_sample)

# reviewer3
reviewer3_id_list = []
for i in range(len(paper_id_list)):
    reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]
    while reviewer_sample in [author_id_list[i], co_author_id_list[i], reviewer1_id_list[i], reviewer2_id_list[i]]:
        reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]

    reviewer3_id_list.append(reviewer_sample)

# reviewer4
reviewer4_id_list = []
for i in range(len(paper_id_list)):
    reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]
    while reviewer_sample in [author_id_list[i], co_author_id_list[i], reviewer1_id_list[i], reviewer2_id_list[i], reviewer3_id_list[i]]:
        reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]

    reviewer4_id_list.append(reviewer_sample)
papers_df['reviewer4_id'] = reviewer4_id_list

# reviewer5
reviewer5_id_list = []
for i in range(len(paper_id_list)):
    reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]
    while reviewer_sample in [author_id_list[i], co_author_id_list[i], reviewer1_id_list[i], reviewer2_id_list[i], reviewer3_id_list[i], reviewer4_id_list[i]]:
        reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]

    reviewer5_id_list.append(reviewer_sample)
papers_df['reviewer5_id'] = reviewer5_id_list

# reviewer6
reviewer6_id_list = []
for i in range(len(paper_id_list)):
    reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]
    while reviewer_sample in [author_id_list[i], co_author_id_list[i], reviewer1_id_list[i], reviewer2_id_list[i], reviewer3_id_list[i], reviewer4_id_list[i], reviewer5_id_list[i]]:
        reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]

    reviewer6_id_list.append(reviewer_sample)
papers_df['reviewer6_id'] = reviewer6_id_list

# reviewer7
reviewer7_id_list = []
for i in range(len(paper_id_list)):
    reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]
    while reviewer_sample in [author_id_list[i], co_author_id_list[i], reviewer1_id_list[i], reviewer2_id_list[i], reviewer3_id_list[i], reviewer4_id_list[i], reviewer5_id_list[i], reviewer6_id_list[i]]:
        reviewer_sample = authors_df.sample(n=1)['id'].iloc[0]

    reviewer7_id_list.append(reviewer_sample)
papers_df['reviewer7_id'] = reviewer7_id_list

# publishers
publishersDF = pd.concat([conferences_df, journals_df], ignore_index=True)
publishersDF['type'] = np.where(publishersDF['id'] < 5000000, 'conference', 'journal')
publishersDF

publisher_id_list = []
publisher_type_list = []
for i in range(len(paper_id_list)):
    publisher_sample = publishersDF.sample(n=1)
    publisher_id_list.append(publisher_sample['id'].iloc[0])
    publisher_type_list.append(publisher_sample['type'].iloc[0])

# 
paper_dict = {
    'paper_id':paper_id_list,
    'author_id':author_id_list,
    'co_author_id':co_author_id_list,
    'reviewer1_id':reviewer1_id_list,
    'reviewer2_id':reviewer2_id_list,
    'reviewer3_id':reviewer3_id_list,
    'reviewer4_id':reviewer4_id_list,
    'reviewer5_id':reviewer5_id_list,
    'reviewer6_id':reviewer6_id_list,
    'reviewer7_id':reviewer7_id_list,
    'publisher_id':publisher_id_list,
    'publisher_type':publisher_type_list
}

paper_relationships = pd.DataFrame(paper_dict)
paper_relationships

Unnamed: 0,paper_id,author_id,co_author_id,reviewer1_id,reviewer2_id,reviewer3_id,publisher_id,publisher_type
0,1,9742917,9742913,9742920,9742925,9742908,3329838,conference
1,2,9742928,9742925,9742929,9742926,9742920,13024837,journal
2,3,9742911,9742908,9742923,9742917,9742926,3329835,conference
3,4,9742914,9742928,9742917,9742922,9742909,13024840,journal
4,5,9742912,9742916,9742928,9742923,9742925,13024839,journal
...,...,...,...,...,...,...,...,...
164,165,9742909,9742914,9742923,9742918,9742924,13024836,journal
165,166,9742928,9742913,9742925,9742917,9742929,3329942,conference
166,167,9742921,9742925,9742924,9742918,9742929,13024838,journal
167,168,9742927,9742910,9742923,9742926,9742913,13024836,journal


## Paper ---> Keyword

In [142]:
papers_df

Unnamed: 0,id,title,abstract,keywords,topic
0,1,Effects of Sleep Deprivation on Cognitive Func...,This paper explores the effects of sleep depri...,"sleep, deprivation, cognitive function, attent...",Psychology
1,2,The Role of Exercise in Managing Chronic Pain,This paper examines the role of exercise in ma...,"exercise, chronic pain, pain, perception, phys...",Physical Therapy
2,3,Impact of Social Media on Adolescent Mental He...,This paper investigates the impact of social m...,"social media, adolescent, mental health, anxie...",Psychiatry
3,4,Development of a Machine Learning Algorithm fo...,This paper describes the development of a mach...,"machine learning algorithm, heart disease, pre...",Cardiology
4,5,Assessment of Greenhouse Gas Emissions from Ur...,This paper assesses greenhouse gas emissions f...,"greenhouse, gas emissions, urban transportatio...",Environmental Science
...,...,...,...,...,...
164,165,Cloud-based Big Data Storage and Processing,This paper explores the use of cloud computing...,"Big data, Cloud computing, Data management, Da...",Cloud Computing
165,166,Big Data Visualization Techniques,This paper provides an overview of various vis...,"Big data visualization, Data analysis, Informa...",Data Visualization
166,167,Parallel Computing for Big Data Processing,This paper proposes a parallel computing frame...,"Big data processing, Parallel computing, Distr...",Data Processing
167,168,Big Data Applications in Healthcare,This paper explores the applications of big da...,"Big data, Healthcare, Data management, Data an...",Healthcare Analytics


In [145]:
paper_TO_keyword = {
    'paper_id':[],
    'keyword_name':[]
}

for i in range(len(papers_df)):
    for word in str(papers_df.iloc[i]['keywords']).split(','):
        paper_TO_keyword['keyword_name'].append(word.lower().strip())
        paper_TO_keyword['paper_id'].append(papers_df.iloc[i]['id'])

paper_TO_keyword_DF = pd.DataFrame(paper_TO_keyword)
paper_TO_keyword_DF.drop_duplicates(inplace=True) 
paper_TO_keyword_DF.reset_index(inplace=True, drop=True)


keywords_df = pd.read_csv('../data/synthetic/keywords.csv', sep=';')
# mergin to get keyword and topic id
paper_TO_keyword_DF = pd.merge(paper_TO_keyword_DF, keywords_df, on="keyword_name")
paper_TO_keyword_DF.to_csv('../data/synthetic/relations/paper_TO_keyword.csv', sep=';', index=False)

## Keyword ---> Topic

In [126]:
# creating keywords_df
unique_keywords = []
for keyword in papers_df['keywords']:
    for word in keyword.split(','):
        if word.lower().strip() not in unique_keywords:
            unique_keywords.append(word.lower().strip())

keywords_dict = {
    'keyword_id':['469{}'.format(i) for i in range(len(unique_keywords))],
    'keyword_name':unique_keywords
}

keywords_df = pd.DataFrame(keywords_dict)
keywords_df.to_csv('../data/synthetic/keywords.csv', sep=';', index=False)

In [127]:
# creating topics_df
unique_topics = []
for topic in papers_df['topic']:
    if topic.lower().strip() not in unique_topics:
        unique_topics.append(topic.lower().strip())

topics_dict = {
    'topic_id':['420{}'.format(i) for i in range(len(unique_topics))],
    'topic_name':unique_topics
}

topics_df = pd.DataFrame(topics_dict)
topics_df.to_csv('../data/synthetic/topics.csv', sep=';', index=False)

In [133]:
keyword_TO_topic = {
    'keyword_name':[],
    'topic_name':[]
}

for i in range(len(papers_df)):
    for word in str(papers_df.iloc[i]['keywords']).split(','):
        keyword_TO_topic['keyword_name'].append(word.lower().strip())
        keyword_TO_topic['topic_name'].append(papers_df.iloc[i]['topic'].lower().strip())

keyword_TO_topic_DF = pd.DataFrame(keyword_TO_topic)
keyword_TO_topic_DF.drop_duplicates(inplace=True) # drop 72 duplicates
keyword_TO_topic_DF.reset_index(inplace=True, drop=True)

# mergin to get keyword and topic id
interim_DF = pd.merge(keyword_TO_topic_DF, keywords_df, on="keyword_name")
keyword_TO_topic_DF = pd.merge(interim_DF, topics_df, on="topic_name")
keyword_TO_topic_DF

Unnamed: 0,keyword_name,topic_name,keyword_id,topic_id
0,sleep,psychology,4690,4200
1,deprivation,psychology,4691,4200
2,cognitive function,psychology,4692,4200
3,attention,psychology,4693,4200
4,working,psychology,4694,4200
...,...,...,...,...
624,data representation,data visualization,469474,42070
625,big data security,data security,469468,42069
626,privacy,data security,469469,42069
627,data protection,data security,469470,42069


In [134]:
keyword_TO_topic_DF.to_csv('../data/synthetic/relations/keyword_TO_topic.csv', index=False, sep=';')

## Paper ---> Publisher

In [150]:
ddd = pd.read_csv('../data/synthetic/relations/paper_TO_publisherType.csv', sep=';')
ddd = pd.merge(ddd, conferences_df, on="topic_name")

Unnamed: 0,paper_id,publisher_id,publisher_type
0,1,3329676,conference
1,2,3329984,conference
2,3,13024838,journal
3,4,13024837,journal
4,5,3329942,conference
...,...,...,...
164,165,3329676,conference
165,166,3329984,conference
166,167,3329748,conference
167,168,13024835,journal


In [167]:
xxx = pd.merge(ddd, publishersDF, on="publisher_id")
xxx = xxx[['paper_id', 'publisherType_id']]
xxx.to_csv('../data/synthetic/relations/paper_TO_publisherType.csv', index=False, sep=';')

## Paper ---> Paper (citedBy)

In [11]:
import pandas as pd
import numpy as np

papers_df = pd.read_csv('../data/synthetic/papers.csv', sep=';')
papers_df

Unnamed: 0,id,title,abstract,keywords,topic
0,1,Effects of Sleep Deprivation on Cognitive Func...,This paper explores the effects of sleep depri...,"sleep, deprivation, cognitive function, attent...",Psychology
1,2,The Role of Exercise in Managing Chronic Pain,This paper examines the role of exercise in ma...,"exercise, chronic pain, pain, perception, phys...",Physical Therapy
2,3,Impact of Social Media on Adolescent Mental He...,This paper investigates the impact of social m...,"social media, adolescent, mental health, anxie...",Psychiatry
3,4,Development of a Machine Learning Algorithm fo...,This paper describes the development of a mach...,"machine learning algorithm, heart disease, pre...",Cardiology
4,5,Assessment of Greenhouse Gas Emissions from Ur...,This paper assesses greenhouse gas emissions f...,"greenhouse, gas emissions, urban transportatio...",Environmental Science
...,...,...,...,...,...
164,165,Cloud-based Big Data Storage and Processing,This paper explores the use of cloud computing...,"Big data, Cloud computing, Data management, Da...",Cloud Computing
165,166,Big Data Visualization Techniques,This paper provides an overview of various vis...,"Big data visualization, Data analysis, Informa...",Data Visualization
166,167,Parallel Computing for Big Data Processing,This paper proposes a parallel computing frame...,"Big data processing, Parallel computing, Distr...",Data Processing
167,168,Big Data Applications in Healthcare,This paper explores the applications of big da...,"Big data, Healthcare, Data management, Data an...",Healthcare Analytics


In [23]:
paper_TO_paperDF = papers_df[['id']]
paper_TO_paperDF.columns = ['cited_paper_id']
paper_TO_paperDF

Unnamed: 0,cited_paper_id
0,1
1,2
2,3
3,4
4,5
...,...
164,165
165,166
166,167
167,168


In [None]:
import random 

citing_paper_id = []

for paper in paper_TO_paperDF['cited_paper_id']:
    temp_list = []
    for id in paper_TO_paperDF.sample(n=random.randint(1,13))['cited_paper_id']:
        temp_list.append(id)
    citing_paper_id.append(temp_list)

paper_TO_paperDF['citing_paper_id'] = citing_paper_id
paper_TO_paperDF

In [72]:
paper_TO_paper = {
    'cited_paper_id':[],
    'citing_paper_id':[]
}

for i in range(len(paper_TO_paperDF)):
    for id in str(paper_TO_paperDF.iloc[i]['citing_paper_id']).replace('[', '').replace(']', '').split(','):
        paper_TO_paper['citing_paper_id'].append(id.strip())
        paper_TO_paper['cited_paper_id'].append(paper_TO_paperDF.iloc[i]['cited_paper_id'])

paper_TO_paper_DF = pd.DataFrame(paper_TO_paper)
paper_TO_paper_DF.to_csv('../data/synthetic/relations/paper_TO_paper.csv', sep=';', index=False)

## Paper ---> Edition

In [83]:
ppp = pd.read_csv('../data/synthetic/relations/paper_TO_publisherType.csv', sep=';')
paper_TO_edition = ppp[ppp['publisherType_id'].astype(str).str[0]=='e']
paper_TO_edition.reset_index(inplace=True, drop=True)
paper_TO_edition.rename(columns={'publisherType_id':'edition_id'}, inplace=True)
paper_TO_edition.to_csv('../data/synthetic/relations/paper_TO_edition.csv', sep=';', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paper_TO_edition.rename(columns={'publisherType_id':'edition_id'}, inplace=True)


## Paper ---> Volume

In [82]:
paper_TO_volume = ppp[ppp['publisherType_id'].astype(str).str[0]=='v']
paper_TO_volume.reset_index(inplace=True, drop=True)
paper_TO_volume.rename(columns={'publisherType_id':'volume_id'}, inplace=True)
paper_TO_volume.to_csv('../data/synthetic/relations/paper_TO_volume.csv', sep=';', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paper_TO_volume.rename(columns={'publisherType_id':'volume_id'}, inplace=True)


## Paper ---> Reviewer (Revisited)

In [18]:
paper_TO_reviewerDF = paper_relationships[['paper_id']]
cols = ['reviewer1_id', 'reviewer2_id', 'reviewer3_id', 'reviewer4_id', 'reviewer5_id', 'reviewer6_id', 'reviewer7_id']
paper_TO_reviewerDF['reviewers'] = paper_relationships[cols].apply(lambda row: ','.join(row.values.astype(str)), axis=1)
paper_TO_reviewerDF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paper_TO_reviewerDF['reviewers'] = papers_df[cols].apply(lambda row: ','.join(row.values.astype(str)), axis=1)


Unnamed: 0,paper_id,reviewers
0,1,"9742915,9742919,9742911,9742907,9742929,974292..."
1,2,"9742926,9742912,9742914,9742908,9742921,974292..."
2,3,"9742929,9742926,9742916,9742921,9742907,974290..."
3,4,"9742921,9742929,9742907,9742928,9742910,974291..."
4,5,"9742922,9742920,9742924,9742912,9742907,974291..."
...,...,...
164,165,"9742919,9742923,9742907,9742910,9742909,974292..."
165,166,"9742910,9742920,9742915,9742923,9742916,974292..."
166,167,"9742922,9742926,9742915,9742912,9742913,974292..."
167,168,"9742924,9742920,9742927,9742929,9742910,974291..."


In [22]:
paper_TO_reviewer = {
    'paper_id':[],
    'reviewer_id':[],
    'decision':[]
}

for i in range(len(paper_TO_reviewerDF)):
    for id in str(paper_TO_reviewerDF.iloc[i]['reviewers']).split(','):
        paper_TO_reviewer['reviewer_id'].append(id.strip())
        paper_TO_reviewer['paper_id'].append(paper_TO_reviewerDF.iloc[i]['paper_id'])
        paper_TO_reviewer['decision'].append(random.randint(0,1))

paper_TO_reviewer = pd.DataFrame(paper_TO_reviewer)
#paper_TO_keyword_DF.drop_duplicates(inplace=True) 
#paper_TO_keyword_DF.reset_index(inplace=True, drop=True)

paper_TO_reviewer

Unnamed: 0,paper_id,reviewer_id,decision
0,1,9742915,1
1,1,9742919,0
2,1,9742911,0
3,1,9742907,1
4,1,9742929,0
...,...,...,...
1178,169,9742914,1
1179,169,9742923,0
1180,169,9742915,0
1181,169,9742929,0


## Paper ---> Paper (Revisited)

In [2]:
import pandas as pd

paper_TO_paper_DF = pd.read_csv('../data/synthetic/relations/paper_TO_paper.csv', sep=';')
paper_TO_paper_DF

Unnamed: 0,cited_paper_id,citing_paper_id
0,1,50
1,1,15
2,1,65
3,1,162
4,1,110
...,...,...
1184,169,115
1185,169,67
1186,169,3
1187,169,160


In [6]:
ppp = paper_TO_paper_DF[paper_TO_paper_DF['cited_paper_id']!=paper_TO_paper_DF['citing_paper_id']]
ppp.to_csv('../data/synthetic/relations/paper_TO_paper.csv', sep=';', index=False)

## Review metadata (revisisted)

In [9]:
import pandas as pd

reviews_df = pd.read_csv('../data/synthetic/relations/paper_TO_reviewer.csv', sep=';')
reviews_df['review_text'] = 'This is placeholder text for each review...'
reviews_df.to_csv('../data/synthetic/relations/paper_TO_reviewer.csv', sep=';', index=False)