In [34]:
import pandas as pd
from py2neo import Graph
import os
import json
from Classes import Neo4jConnection

# Read the Neo4j connection settings (uri / user / pwd) from the local config file.
with open("config.json") as json_data_file:
    data = json.load(json_data_file)

# Connection object from the project's Classes module, built from those settings.
conn = Neo4jConnection(uri=data["uri"], user=data["user"], pwd=data["pwd"])


# Second client, created through py2neo, used as the benchmark counterpart.
# Any failure while creating it is reported and then stops execution.
try:
    credentials = (data["user"], data["pwd"])
    graph = Graph(data["uri"], auth=credentials)
    print('SUCCESS: Connected to the Neo4j Database.')
except Exception as e:
    print('ERROR: Could not connect to the Neo4j Database. See console for details.')
    raise SystemExit(e)


# Directory holding the ADaM transport files.
directory = './phuse-scripts/data/adam/TDF_ADaM_v1.0/'
# File extension of the SAS transport (XPORT) files to load.
ext = '.xpt'
# Names of the datasets that were loaded, in load order.
dataset_list = list()

# Load every transport file found in the directory. Sorting makes the load
# order (and therefore dataset_list) the same on every run, because the order
# returned by os.listdir is arbitrary.
for file_name in sorted(os.listdir(directory)):
    if not file_name.endswith(ext):
        continue
    # The dataset name is the part of the file name before the first dot.
    dataset_name = file_name.split(sep='.')[0]
    if not dataset_name:
        # A file named just ".xpt" has no usable dataset name; skip it.
        continue
    dataset_list.append(dataset_name)
    # Publish each DataFrame as a module-level variable named after its
    # dataset (e.g. adsl). Not a good practice, but the code below refers to
    # the datasets by these names.
    globals()[dataset_name] = pd.read_sas(os.path.join(directory, file_name), format='xport')

# Fix the data types of the text columns: each value that can be decoded as
# UTF-8 is replaced by its decoded form; wherever decoding yields a missing
# value, the original value is kept (fillna with the untouched column).
for text_column in ('USUBJID', 'SEX', 'ARM'):
    raw_values = adsl[text_column]
    adsl[text_column] = raw_values.str.decode("utf-8").fillna(raw_values)

# Subset of subject-level columns used to build the Patient nodes.
patient_columns = ['USUBJID', 'AGE', 'SEX', 'ARM', 'BMIBL']
adsl_df = adsl[patient_columns]


Driver successfully created
SUCCESS: Connected to the Neo4j Database.


In [10]:
def create_patients_nodes(df):
    """Create one Patient node per row of ``df`` with a single query.

    The rows are passed to Cypher as the ``$rows`` parameter and expanded
    with UNWIND, so all nodes are created by one call to ``conn.query``.

    Parameters
    ----------
    df : object exposing ``to_dict('records')`` (a pandas DataFrame here)
        The query reads the fields USUBJID, AGE, ARM, SEX and BMIBL from
        each record.

    Returns
    -------
    Whatever ``conn.query`` returns for the statement.
    """
    patient_rows = df.to_dict('records')
    cypher = '''
    UNWIND $rows as row
    CREATE (p:Patient {USUBJID: row.USUBJID, AGE: row.AGE, ARM:row.ARM, SEX: row.SEX, BMI: row.BMIBL})
    '''
    return conn.query(cypher, parameters={'rows': patient_rows})
%timeit create_patients_nodes(adsl_df)

19 ms ± 4.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit 
# Row-by-row variant of the benchmark: one CREATE statement is sent through
# the py2neo graph object for every row of adsl_df.
single_patient_query = '''
        CREATE (:Patient {USUBJID: $USUBJID, AGE: $AGE, ARM:$ARM, SEX: $SEX, BMI: $BMI})
        '''
for patient in adsl_df.itertuples():
    node_properties = {
        'USUBJID': patient.USUBJID,
        'AGE': patient.AGE,
        'ARM': patient.ARM,
        'SEX': patient.SEX,
        'BMI': patient.BMIBL,
    }
    graph.run(single_patient_query, parameters=node_properties)

717 ms ± 69.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
%%timeit
# LOAD CSV variant of the benchmark: write adsl_df to a CSV file and let the
# database read it from there.
# NOTE(review): the absolute Windows path is machine-specific and presumably
# has to be readable by the Neo4j server (import settings) - confirm.
adsl_df.to_csv('C:/Temp/dropzone/adsl.csv', encoding='utf-8', index=False)
# LOAD CSV hands every field to Cypher as a string, so the numeric fields are
# converted with toFloat() to store AGE and BMI as numbers, like the other
# load variants do, instead of as text.
query = '''
    LOAD CSV WITH HEADERS FROM "file:///C:/Temp/dropzone/adsl.csv" AS row
    CREATE (p:Patient {USUBJID: row.USUBJID, AGE: toFloat(row.AGE), ARM:row.ARM, SEX: row.SEX, BMI: toFloat(row.BMIBL)})
    '''
conn.query(query, parameters = {})

22.3 ms ± 994 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
def create_patients_nodes(df):
    """Add a Patient node to the Neo4j graph for every record of ``df``.

    All records are sent in one statement: they are bound to ``$rows`` and
    UNWIND turns each of them into a CREATE of a Patient node carrying
    USUBJID, AGE, ARM, SEX and BMI (taken from the BMIBL field).

    Args:
        df: Object exposing ``to_dict('records')``; presumably a pandas
            DataFrame with the columns named above.

    Returns:
        The result of ``conn.query`` for the statement.
    """
    statement = '''
    UNWIND $rows as row
    CREATE (p:Patient {USUBJID: row.USUBJID, AGE: row.AGE, ARM:row.ARM, SEX: row.SEX, BMI: row.BMIBL})
    '''
    bound_parameters = {'rows': df.to_dict('records')}
    return conn.query(statement, parameters=bound_parameters)

def create_treatment_nodes(treatment_set):
    """Create one Treatment node for every value in ``treatment_set``.

    The previous implementation returned from inside its loop, so only the
    first value ever produced a node. All values are now sent in a single
    UNWIND statement, which still yields one ``conn.query`` result.

    Args:
        treatment_set: Iterable of unique treatment names (a Python set in
            this notebook).

    Returns:
        The result of ``conn.query``, or None when ``treatment_set`` is
        empty (no query is sent in that case).
    """
    # The values are passed as a list: a Python set cannot be sent as a
    # query parameter.
    names = list(treatment_set)
    if not names:
        return None
    query = '''
    UNWIND $names AS name
    CREATE (t:Treatment {Name: name})
    '''
    return conn.query(query, parameters={'names': names})
 
def create_adverseevent_nodes(event_set):
    """Create one AdverseEvent node for every value in ``event_set``.

    The previous implementation returned from inside its loop, so only the
    first term ever produced a node. All terms are now sent in a single
    UNWIND statement, which still yields one ``conn.query`` result.

    Args:
        event_set: Iterable of unique adverse event terms (a Python set in
            this notebook).

    Returns:
        The result of ``conn.query``, or None when ``event_set`` is empty
        (no query is sent in that case).
    """
    # The values are passed as a list: a Python set cannot be sent as a
    # query parameter.
    terms = list(event_set)
    if not terms:
        return None
    query = '''
    UNWIND $terms AS term
    CREATE (ae:AdverseEvent {Term: term})
    '''
    return conn.query(query, parameters={'terms': terms})
    
def create_visit_nodes(visits_set):
    """Create one Visit node for every value in ``visits_set``.

    The previous implementation returned from inside its loop, so only the
    first visit ever produced a node. All visits are now sent in a single
    UNWIND statement, which still yields one ``conn.query`` result.

    Args:
        visits_set: Iterable of unique visit names (a Python set in this
            notebook).

    Returns:
        The result of ``conn.query``, or None when ``visits_set`` is empty
        (no query is sent in that case).
    """
    # The values are passed as a list: a Python set cannot be sent as a
    # query parameter.
    names = list(visits_set)
    if not names:
        return None
    query = '''
    UNWIND $names AS name
    CREATE (v:Visit {Name: name})
    '''
    return conn.query(query, parameters={'names': names})
    
def create_chemlab_nodes(df):
    """Create one Parameter node per chemistry laboratory record of ``df``.

    Every node is tagged with Dataset 'adlbc' and takes its properties from
    the record fields USUBJID, VISIT, PARCAT1, PARAM, AVAL and LBNRIND.

    Args:
        df: Object exposing ``to_dict('records')``; presumably a pandas
            DataFrame with the columns named above.

    Returns:
        The result of ``conn.query`` for the single UNWIND statement.
    """
    lab_records = df.to_dict('records')
    cypher = '''
    UNWIND $rows as row
    CREATE (p:Parameter {USUBJID: row.USUBJID, VISIT: row.VISIT, Laboratory: row.PARCAT1, Parameter: row.PARAM, Value: row.AVAL, Reference: row.LBNRIND, Dataset: 'adlbc'})
    '''
    return conn.query(cypher, parameters={'rows': lab_records})

def create_hemlab_nodes(df):
    """Create one Parameter node per hematology laboratory record of ``df``.

    This function was previously also named ``create_chemlab_nodes`` and so
    silently replaced the chemistry loader defined before it; it now has
    its own name.

    Every node is tagged with Dataset 'adlbh' and takes its properties from
    the record fields USUBJID, VISIT, PARCAT1, PARAM, AVAL and LBNRIND.

    Args:
        df: Object exposing ``to_dict('records')``; presumably a pandas
            DataFrame with the columns named above.

    Returns:
        The result of ``conn.query`` for the single UNWIND statement.
    """
    query = '''
    UNWIND $rows as row
    CREATE (p:Parameter {USUBJID: row.USUBJID, VISIT: row.VISIT, Laboratory: row.PARCAT1, Parameter: row.PARAM, Value: row.AVAL, Reference: row.LBNRIND, Dataset: 'adlbh'})
    '''
    return conn.query(query, parameters = {'rows': df.to_dict('records')})

def create_vitalsign_nodes(df):
    """Create one VitalSign node per record of ``df``.

    This function was previously named ``create_hemlab_nodes`` although it
    creates VitalSign nodes; the name now matches what it does.

    Every node gets Laboratory 'VS', an empty Reference and Dataset 'adlvs',
    plus USUBJID, VISIT, PARAM and AVAL from the record.
    NOTE(review): 'adlvs' may be a typo for the dataset name 'advs' - confirm.

    Args:
        df: Object exposing ``to_dict('records')``; presumably a pandas
            DataFrame with the columns named above.

    Returns:
        The result of ``conn.query`` for the single UNWIND statement.
    """
    query = '''
    UNWIND $rows as row
    CREATE (vs:VitalSign {USUBJID: row.USUBJID, VISIT: row.VISIT, Laboratory: 'VS', Parameter: row.PARAM, Value: row.AVAL, Reference: '', Dataset: 'adlvs'})
    '''
    return conn.query(query, parameters = {'rows': df.to_dict('records')})

def create_adadas_nodes(df):
    """Create one Endpoint node per ADADAS record of ``df``.

    Every node gets EndpointName 'ADAS-Cog', an empty Reference and Dataset
    'adadas', plus USUBJID, VISIT, PARAM and AVAL from the record.

    Args:
        df: Object exposing ``to_dict('records')``; presumably a pandas
            DataFrame with the columns named above.

    Returns:
        The result of ``conn.query`` for the single UNWIND statement.
    """
    endpoint_records = df.to_dict('records')
    cypher = '''
    UNWIND $rows as row
    CREATE (ep:Endpoint {USUBJID: row.USUBJID, VISIT: row.VISIT, EndpointName: 'ADAS-Cog', Parameter: row.PARAM, Value: row.AVAL, Reference: '', Dataset: 'adadas'})
    '''
    return conn.query(cypher, parameters={'rows': endpoint_records})

def create_cibic_nodes(df):
    """Create one Endpoint node per CIBIC record of ``df``.

    Every node gets EndpointName 'CIBIC Score', an empty Reference and
    Dataset 'adcibc', plus USUBJID, VISIT, PARAM and AVAL from the record.

    Args:
        df: Object exposing ``to_dict('records')``; presumably a pandas
            DataFrame with the columns named above.

    Returns:
        The result of ``conn.query`` for the single UNWIND statement.
    """
    statement = '''
    UNWIND $rows as row
    CREATE (ep:Endpoint {USUBJID: row.USUBJID, VISIT: row.VISIT, EndpointName: 'CIBIC Score', Parameter: row.PARAM, Value: row.AVAL, Reference: '', Dataset: 'adcibc'})
    '''
    bound_parameters = {'rows': df.to_dict('records')}
    return conn.query(statement, parameters=bound_parameters)


# Unique values that feed the Visit / Treatment / AdverseEvent node builders.
# VISIT and AETERM get the same decode-or-keep treatment that the adsl text
# columns receive when they are cleaned, so the sets hold text rather than
# raw bytes. Values that are already str are left unchanged by the fillna.
visits = set(adlbc['VISIT'].str.decode("utf-8").fillna(adlbc['VISIT']))
# ARM is decoded in place on adsl by the cleaning step after loading.
treatments = set(adsl['ARM'])
adverse_event = set(adae['AETERM'].str.decode("utf-8").fillna(adae['AETERM']))
