# SDM - Neo4j Testing

In [2]:
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
#from ..src.neo4j_conn import Neo4jConnection

# Change cwd to the project root, where env/.env with the secrets lives.
# Guarded so that re-running this cell does not climb one directory higher
# every time: once the .env file is reachable from cwd we are already there.
ENV_FILE = './env/.env'
if not os.path.isfile(ENV_FILE):
    os.chdir('../')

# Load secrets from .env into os.environ
load_dotenv(dotenv_path=ENV_FILE)

# Instantiate neo4j credentials; a missing variable raises KeyError right here
URI = os.environ['NEO4J_URI']
AUTH = (os.environ['NEO4J_USERNAME'], os.environ['NEO4J_PASSWORD'])
DB_NAME = os.environ['DB_NAME']

### Neo4j Connectivity Verification

In [11]:
# Smoke test: open a driver, call verify_connectivity() and report the outcome.
# Any exception is printed instead of raised so the notebook keeps running.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    try:
        driver.verify_connectivity()
    except Exception as err:
        print('Neo4j Driver Unavailable!', err)
    else:
        print('Neo4j Driver Connectivity Verified!')

Neo4j Driver Connectivity Verified!


### Custom write transaction

In [None]:
# TODO: the custom write transaction has not been written yet. The cell is kept
# syntactically valid so that "Run All" does not stop here with a SyntaxError.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    # Use the database name loaded from .env rather than a hard-coded one
    with driver.session(database=DB_NAME) as session:
        pass  # TODO: run the write transaction on `session`

### Executing Cypher Queries

In [None]:
# Sample query from neo4j docs, disabled by wrapping it in a string literal
# (nothing inside the quotes is executed).
# NOTE(review): the snippet uses `driver` at top level, while the cells above only
# bind it inside `with` blocks - confirm an open driver exists before re-enabling.
'''def match_person_nodes(tx, age):
    result = tx.run(
        "MATCH (p:Person {age: $age}) RETURN p.name AS name",
        age=age)
    records = list(result)
    summary = result.consume()
    return records, summary

with driver.session(database="neo4j") as session:
    records, summary = session.execute_read(match_person_nodes, age=42)

# Summary information
print("The query `{query}` returned {records_count} records in {time} ms.".format(
    query=summary.query, records_count=len(records),
    time=summary.result_available_after,
))

# Loop through results and do something with them
for person in records:
    print(person)'''

# Synthetic Data Generation

In [34]:
import pandas as pd
import numpy as np
import random

# Load the four synthetic source tables (semicolon-separated CSV files).
# NOTE(review): these paths are relative to the current working directory, which
# the setup cell changes with os.chdir('../') - confirm they resolve after it runs.
authors_df, papers_df, conferences_df, journals_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        '../data/synthetic/authors.csv',
        '../data/synthetic/papers.csv',
        '../data/synthetic/conferences.csv',
        '../data/synthetic/journals.csv',
    )
)


In [3]:
# Preview the first five rows of the papers table
papers_df.head(n=5)

Unnamed: 0,id,title,abstract,keywords,topic
0,1,Effects of Sleep Deprivation on Cognitive Func...,This paper explores the effects of sleep depri...,"sleep, deprivation, cognitive function, attent...",Psychology
1,2,The Role of Exercise in Managing Chronic Pain,This paper examines the role of exercise in ma...,"exercise, chronic pain, pain, perception, phys...",Physical Therapy
2,3,Impact of Social Media on Adolescent Mental He...,This paper investigates the impact of social m...,"social media, adolescent, mental health, anxie...",Psychiatry
3,4,Development of a Machine Learning Algorithm fo...,This paper describes the development of a mach...,"machine learning algorithm, heart disease, pre...",Cardiology
4,5,Assessment of Greenhouse Gas Emissions from Ur...,This paper assesses greenhouse gas emissions f...,"greenhouse, gas emissions, urban transportatio...",Environmental Science


# Creating Node Relationships

To Do:
- create keywords.csv
    - contains unique keyword name along with id
- create topics.csv
    - contains unique topic name along with id
- add institution to authors.csv
    - random university or company along with id
- add random reviewer decision as edge attribute
    - this will be included in reviewed_by.csv


## Paper Relationships

In [33]:
# Seeded RNG so the synthetic assignments are identical on every re-run.
RANDOM_SEED = 42
_rng = np.random.RandomState(RANDOM_SEED)


def _sample_excluding(id_pool, excluded, random_state=None):
    """Draw one id at random from `id_pool`, never returning an id in `excluded`.

    The pool is filtered before drawing (instead of re-drawing until a fresh id
    shows up), so the call cannot loop forever when the pool is too small.

    Args:
        id_pool: pandas Series of candidate ids.
        excluded: list of ids that must not be returned.
        random_state: forwarded to `Series.sample`.

    Returns:
        A single id taken from `id_pool`.

    Raises:
        ValueError: if every id in `id_pool` is excluded.
    """
    candidates = id_pool[~id_pool.isin(excluded)]
    if candidates.empty:
        raise ValueError('Not enough distinct ids in the pool to fill every role.')
    return candidates.sample(n=1, random_state=random_state).iloc[0]


paper_id_list = papers_df['id'].tolist()
n_papers = len(paper_id_list)

# written_by, coAuthored_by, reviewer1, reviewer2, reviewer3:
# the roles are filled in this order and every draw excludes the people already
# picked for the same paper, so the five ids of a paper are all different.
ROLE_COLUMNS = ['author_id', 'co_author_id', 'reviewer1_id', 'reviewer2_id', 'reviewer3_id']
author_pool = authors_df['id']
role_ids = {role: [] for role in ROLE_COLUMNS}
for _ in range(n_papers):
    already_picked = []
    for role in ROLE_COLUMNS:
        person_id = _sample_excluding(author_pool, already_picked, random_state=_rng)
        already_picked.append(person_id)
        role_ids[role].append(person_id)

# Keep the per-role lists available under their original names
author_id_list = role_ids['author_id']
co_author_id_list = role_ids['co_author_id']
reviewer1_id_list = role_ids['reviewer1_id']
reviewer2_id_list = role_ids['reviewer2_id']
reviewer3_id_list = role_ids['reviewer3_id']

# publishers: conferences and journals stacked into one frame. Each row is
# labelled by the frame it came from rather than by comparing ids to a threshold.
publishersDF = pd.concat([conferences_df, journals_df], ignore_index=True)
publishersDF['type'] = ['conference'] * len(conferences_df) + ['journal'] * len(journals_df)

# One publisher per paper, drawn independently (the same publisher can repeat)
publisher_sample = publishersDF.sample(n=n_papers, replace=True, random_state=_rng)
publisher_id_list = publisher_sample['id'].tolist()
publisher_type_list = publisher_sample['type'].tolist()

# Assemble one row per paper holding all of its relationships
paper_dict = {
    'paper_id': paper_id_list,
    'author_id': author_id_list,
    'co_author_id': co_author_id_list,
    'reviewer1_id': reviewer1_id_list,
    'reviewer2_id': reviewer2_id_list,
    'reviewer3_id': reviewer3_id_list,
    'publisher_id': publisher_id_list,
    'publisher_type': publisher_type_list
}

paper_relationships = pd.DataFrame(paper_dict)
paper_relationships

Unnamed: 0,paper_id,author_id,co_author_id,reviewer1_id,reviewer2_id,reviewer3_id,publisher_id,publisher_type
0,1,9742917,9742913,9742920,9742925,9742908,3329838,conference
1,2,9742928,9742925,9742929,9742926,9742920,13024837,journal
2,3,9742911,9742908,9742923,9742917,9742926,3329835,conference
3,4,9742914,9742928,9742917,9742922,9742909,13024840,journal
4,5,9742912,9742916,9742928,9742923,9742925,13024839,journal
...,...,...,...,...,...,...,...,...
164,165,9742909,9742914,9742923,9742918,9742924,13024836,journal
165,166,9742928,9742913,9742925,9742917,9742929,3329942,conference
166,167,9742921,9742925,9742924,9742918,9742929,13024838,journal
167,168,9742927,9742910,9742923,9742926,9742913,13024836,journal


## Keyword/Topic Relationships

In [126]:
# Collect every distinct keyword (lower-cased, trimmed) in first-seen order.
# dict.fromkeys de-duplicates in one pass instead of scanning a list per word;
# missing keyword cells and empty fragments (e.g. from a trailing comma) are skipped.
unique_keywords = list(dict.fromkeys(
    word.lower().strip()
    for keyword_cell in papers_df['keywords'].dropna()
    for word in str(keyword_cell).split(',')
    if word.strip()
))

# Keyword ids are the prefix '469' followed by the keyword's position in the list
keywords_dict = {
    'keyword_id': [f'469{i}' for i in range(len(unique_keywords))],
    'keyword_name': unique_keywords
}

keywords_df = pd.DataFrame(keywords_dict)
keywords_df.to_csv('../data/synthetic/keywords.csv', sep=';', index=False)

In [127]:
# Collect every distinct topic (lower-cased, trimmed) in first-seen order.
# dict.fromkeys de-duplicates in one pass instead of scanning a list per topic;
# papers without a topic are skipped rather than crashing on a missing value.
unique_topics = list(dict.fromkeys(
    topic.lower().strip() for topic in papers_df['topic'].dropna()
))

# Topic ids are the prefix '420' followed by the topic's position in the list
topics_dict = {
    'topic_id': [f'420{i}' for i in range(len(unique_topics))],
    'topic_name': unique_topics
}

topics_df = pd.DataFrame(topics_dict)
topics_df.to_csv('../data/synthetic/topics.csv', sep=';', index=False)

In [133]:
# Pair every keyword of a paper with that paper's topic (both lower-cased, trimmed).
# Papers missing either field are skipped: they cannot yield a valid pair.
papers_with_labels = papers_df.dropna(subset=['keywords', 'topic'])

keyword_TO_topic = {
    'keyword_name': [],
    'topic_name': []
}

# Walk the two columns in lockstep instead of looking each row up with .iloc
for keywords_cell, topic in zip(papers_with_labels['keywords'], papers_with_labels['topic']):
    topic_name = topic.lower().strip()
    for word in str(keywords_cell).split(','):
        keyword_TO_topic['keyword_name'].append(word.lower().strip())
        keyword_TO_topic['topic_name'].append(topic_name)

# The same (keyword, topic) pair can come from several papers; keep each pair once
keyword_TO_topic_DF = (
    pd.DataFrame(keyword_TO_topic)
    .drop_duplicates()
    .reset_index(drop=True)
)

# merging to get keyword and topic id (default inner joins: pairs whose name has
# no match in keywords_df / topics_df are dropped)
interim_DF = pd.merge(keyword_TO_topic_DF, keywords_df, on="keyword_name")
keyword_TO_topic_DF = pd.merge(interim_DF, topics_df, on="topic_name")
keyword_TO_topic_DF

Unnamed: 0,keyword_name,topic_name,keyword_id,topic_id
0,sleep,psychology,4690,4200
1,deprivation,psychology,4691,4200
2,cognitive function,psychology,4692,4200
3,attention,psychology,4693,4200
4,working,psychology,4694,4200
...,...,...,...,...
624,data representation,data visualization,469474,42070
625,big data security,data security,469468,42069
626,privacy,data security,469469,42069
627,data protection,data security,469470,42069


In [134]:
# Export the keyword -> topic edges. The relations/ folder is created first so the
# export also works when it does not exist yet (e.g. on a fresh checkout).
os.makedirs('../data/synthetic/relations', exist_ok=True)
keyword_TO_topic_DF.to_csv('../data/synthetic/relations/keyword_TO_topic.csv', index=False, sep=';')