In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
%%capture   
%pwd


In [3]:
%%capture
cd ..

In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
import psycopg2 as ps2
import os
import pandas as pd
import numpy as np
import csv
from src.funcs.utils import *

from dotenv import load_dotenv
load_dotenv()

True

In [6]:
# pull in db credentials from .env file
db_name=os.getenv('DB_NAME')
db_user=os.getenv('DB_USER')
db_host=os.getenv('DB_HOST_IP')
db_pwd=os.getenv('DB_PWD')

In [7]:
# conn.close()
# conn.rollback()

In [7]:
# define db session
conn = ps2.connect(
    host = db_host,
    database = db_name,
    user = db_user,
    password = db_pwd)

## Relationships

### Query for data

### Directional

In [8]:
# define query
sql="SELECT \
control.id, inkey[1], controltype,  string_agg(distinct(effect), ', '), string_agg(distinct(mechanism), ', '), \
num_refs, outkey[1] , reference.id, string_agg(distinct(biomarkertype), ', ') , string_agg(distinct(celllinename), ', '), \
string_agg(distinct(celltype), ', '), string_agg(distinct(changetype), ', '), string_agg(distinct(organ), ', '), \
string_agg(distinct(organism), ', '), string_agg(distinct(quantitativetype), ', '), string_agg(distinct(tissue), ', '),  \
string_agg(distinct(nct_id), ', '),  string_agg(distinct(phase), ', ') \
FROM resnet.control, resnet.reference \
WHERE control.id = reference.id and inkey[1] is not null and outkey[1] is not null \
GROUP BY control.id, inkey[1], controltype, num_refs, outkey[1], reference.id"

In [9]:
%%time
# run query and save to file
with conn.cursor() as cur:
    with open('./data/processed/relations_query_csl.txt', 'w', encoding="utf-8") as f:
        cur.execute(sql)
        csv_writer=csv.writer(f, delimiter= '|')
        for record in cur.fetchall():
            line='|'.join(map(str, record))+'\n'
            f.write(line)

Wall time: 14min 40s


### Bi-derectional

In [10]:
# sql="SELECT DISTINCT \
sql="SELECT \
control.id, inkey[1], inoutkey, controltype, relationship, string_agg(distinct(effect), ', '), string_agg(distinct(mechanism), ', '), num_refs, outkey[1], \
reference.id, string_agg(distinct(biomarkertype), ', '), string_agg(distinct(celllinename), ', '), string_agg(distinct(celltype), ', '), string_agg(distinct(changetype), ', '), \
string_agg(distinct(organ), ', '), string_agg(distinct(organism), ', '), string_agg(distinct(quantitativetype), ', '), string_agg(distinct(tissue), ', ') \
FROM resnet.control, resnet.reference \
WHERE control.id = reference.id and inkey[1] is null and outkey[1] is null \
GROUP BY control.id, controltype, reference.id"

In [11]:
%%time
# run query and save to file
with conn.cursor() as cur:
    with open('./data/processed/bidirectional_relations_query_csl.txt', 'w', encoding="utf-8") as f:
        cur.execute(sql)
        csv_writer=csv.writer(f, delimiter= '|')
        for record in cur.fetchall():
            line='|'.join(map(str, record))+'\n'
            f.write(line)

Wall time: 59min 9s


### Attributes

In [12]:
sql="SELECT  \
id, inkey[1], attributes, relationship, outkey[1] from resnet.control \
WHERE (control.id = control.attributes)";

In [13]:
%%time
# run query and save to file
with conn.cursor() as cur:
    with open('./data/processed/attributes_relations.txt', 'w', encoding="utf-8") as f:
        cur.execute(sql)
        csv_writer=csv.writer(f, delimiter= '|')
        for record in cur.fetchall():
            line='|'.join(map(str, record))+'\n'
            f.write(line)

Wall time: 5min 45s


## Process Data

### Bi-directional

In [14]:
%%time

df_birect=pd.read_csv('./data/processed/bidirectional_relations_query_csl.txt', sep='|', header=None, encoding='utf-8')

df_birect.info(); df_birect.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4718418 entries, 0 to 4718417
Data columns (total 18 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   0       int64 
 1   1       object
 2   2       object
 3   3       object
 4   4       object
 5   5       object
 6   6       object
 7   7       int64 
 8   8       object
 9   9       int64 
 10  10      object
 11  11      object
 12  12      object
 13  13      object
 14  14      object
 15  15      object
 16  16      object
 17  17      object
dtypes: int64(3), object(15)
memory usage: 648.0+ MB
Wall time: 13.4 s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,-9223367021478908826,,"[5747547533469447186, 6031869671835980567]",FunctionalAssociation,,,,1,,-9223367021478908826,,,,,liver,,,
1,-9223365481314032826,,"[5445426109658919917, -2285403010002775161]",FunctionalAssociation,,,,1,,-9223365481314032826,,,,,leg,,,
2,-9223357200775258657,,"[1596328558605610184, -8708891732207432195]",CellExpression,,,,11,,-9223357200775258657,,MDA-MB-231,,,breast,"Homo sapiens, Oryctolagus cuniculus",,
3,-9223356961959519979,,"[4541556823765750071, 3096071376414863663]",CellExpression,,,,1,,-9223356961959519979,,A2058,,,,,,
4,-9223351835527058603,,"[-2894717825806059007, -5196628532512894922]",FunctionalAssociation,,,,2,,-9223351835527058603,,,,,thoracolumbar spine,,,


In [15]:
df_birect.columns=['id1', ':START_ID', 'inOutkey', 'type:TYPE', 'relationship', 'effect', 'mechanism', 'ref_count:int', ':END_ID' , 'id2', 'biomarkertype', 'celllinename', 'celltype',
      'changetype', 'organ', 'organism', 'quantitativetype', 'tissue']

df_birect.head()

Unnamed: 0,id1,:START_ID,inOutkey,type:TYPE,relationship,effect,mechanism,ref_count:int,:END_ID,id2,biomarkertype,celllinename,celltype,changetype,organ,organism,quantitativetype,tissue
0,-9223367021478908826,,"[5747547533469447186, 6031869671835980567]",FunctionalAssociation,,,,1,,-9223367021478908826,,,,,liver,,,
1,-9223365481314032826,,"[5445426109658919917, -2285403010002775161]",FunctionalAssociation,,,,1,,-9223365481314032826,,,,,leg,,,
2,-9223357200775258657,,"[1596328558605610184, -8708891732207432195]",CellExpression,,,,11,,-9223357200775258657,,MDA-MB-231,,,breast,"Homo sapiens, Oryctolagus cuniculus",,
3,-9223356961959519979,,"[4541556823765750071, 3096071376414863663]",CellExpression,,,,1,,-9223356961959519979,,A2058,,,,,,
4,-9223351835527058603,,"[-2894717825806059007, -5196628532512894922]",FunctionalAssociation,,,,2,,-9223351835527058603,,,,,thoracolumbar spine,,,


In [16]:
# create separate columns for inOutkeys
first_ids, second_ids = inOutkeys_to_lists(df_birect)

len(first_ids); len(second_ids)
first_ids[:5]; second_ids[:5]

4718418

4718418

['5747547533469447186',
 '5445426109658919917',
 '1596328558605610184',
 '4541556823765750071',
 '-2894717825806059007']

['6031869671835980567',
 '-2285403010002775161',
 '-8708891732207432195',
 '3096071376414863663',
 '-5196628532512894922']

In [17]:
df_birect_1=df_birect.copy()

df_birect_1.drop(columns=['inOutkey', 'id1', 'id2', 'relationship'], inplace=True)

In [18]:
df_birect_1[':START_ID']=first_ids
df_birect_1[':END_ID']=second_ids
df_birect_1.info(); df_birect_1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4718418 entries, 0 to 4718417
Data columns (total 14 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   :START_ID         object
 1   type:TYPE         object
 2   effect            object
 3   mechanism         object
 4   ref_count:int     int64 
 5   :END_ID           object
 6   biomarkertype     object
 7   celllinename      object
 8   celltype          object
 9   changetype        object
 10  organ             object
 11  organism          object
 12  quantitativetype  object
 13  tissue            object
dtypes: int64(1), object(13)
memory usage: 504.0+ MB


Unnamed: 0,:START_ID,type:TYPE,effect,mechanism,ref_count:int,:END_ID,biomarkertype,celllinename,celltype,changetype,organ,organism,quantitativetype,tissue
0,5747547533469447186,FunctionalAssociation,,,1,6031869671835980567,,,,,liver,,,
1,5445426109658919917,FunctionalAssociation,,,1,-2285403010002775161,,,,,leg,,,
2,1596328558605610184,CellExpression,,,11,-8708891732207432195,,MDA-MB-231,,,breast,"Homo sapiens, Oryctolagus cuniculus",,
3,4541556823765750071,CellExpression,,,1,3096071376414863663,,A2058,,,,,,
4,-2894717825806059007,FunctionalAssociation,,,2,-5196628532512894922,,,,,thoracolumbar spine,,,


In [19]:
df_birect_1[':START_ID']=df_birect_1[':START_ID'].astype('int64')
df_birect_1[':END_ID']=df_birect_1[':END_ID'].astype('int64')
df_birect_1['ref_count:int']=df_birect_1['ref_count:int'].astype('int16')

df_birect_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4718418 entries, 0 to 4718417
Data columns (total 14 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   :START_ID         int64 
 1   type:TYPE         object
 2   effect            object
 3   mechanism         object
 4   ref_count:int     int16 
 5   :END_ID           int64 
 6   biomarkertype     object
 7   celllinename      object
 8   celltype          object
 9   changetype        object
 10  organ             object
 11  organism          object
 12  quantitativetype  object
 13  tissue            object
dtypes: int16(1), int64(2), object(11)
memory usage: 477.0+ MB


In [20]:
# convert to category to save memory
df_birect_1=convert_object_to_category(df_birect_1)
df_birect_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4718418 entries, 0 to 4718417
Data columns (total 14 columns):
 #   Column            Dtype   
---  ------            -----   
 0   :START_ID         int64   
 1   type:TYPE         category
 2   effect            category
 3   mechanism         category
 4   ref_count:int     int16   
 5   :END_ID           int64   
 6   biomarkertype     category
 7   celllinename      category
 8   celltype          category
 9   changetype        category
 10  organ             category
 11  organism          category
 12  quantitativetype  category
 13  tissue            category
dtypes: category(11), int16(1), int64(2)
memory usage: 208.9 MB


In [21]:
df_birect_1.to_pickle('./data/processed/bidirectional_df.pkl')

### Directional

In [22]:
%%time

df_direct=pd.read_csv('./data/processed/relations_query_csl.txt', sep='|', header=None, encoding='utf-8')

df_direct.info(); df_direct.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10191129 entries, 0 to 10191128
Data columns (total 18 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   0       int64 
 1   1       int64 
 2   2       object
 3   3       object
 4   4       object
 5   5       int64 
 6   6       int64 
 7   7       int64 
 8   8       object
 9   9       object
 10  10      object
 11  11      object
 12  12      object
 13  13      object
 14  14      object
 15  15      object
 16  16      object
 17  17      object
dtypes: int64(5), object(13)
memory usage: 1.4+ GB
Wall time: 33.2 s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,-9223365986387833915,-4129340824583713637,Regulation,,,1,-2983484129356197270,-9223365986387833915,,,,,intestine,Rattus norvegicus,,,,
1,-9223365556180770043,7111680438658927661,Regulation,,,1,-3121142714757276759,-9223365556180770043,,,,,,,,,,
2,-9223365466130767478,-8447327320654612427,Regulation,,,10,-1646880163763186214,-9223365466130767478,,TOV-112D,"B-cell, endothelial cell",,peripheral nervous system,,,,,
3,-9223362118268724280,3670031740130349683,Regulation,positive,,1,-4650140568030604438,-9223362118268724280,,,plasma cell,,,,,plasma,,
4,-9223361192663839079,-6414782697893422681,Expression,positive,,1,-296331965523681380,-9223361192663839079,,,T-cell,,,,,,,


In [23]:
df_direct.columns=['id1', ':START_ID', 'type:TYPE', 'effect', 'mechanism', 'ref_count:int', ':END_ID' , 'id2', \
            'biomarkertype', 'celllinename', 'celltype', 'changetype', 'organ', 'organism', 'quantitativetype', \
            'tissue', 'nct_id', 'phase']

In [24]:
df_direct.info(); df_direct.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10191129 entries, 0 to 10191128
Data columns (total 18 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   id1               int64 
 1   :START_ID         int64 
 2   type:TYPE         object
 3   effect            object
 4   mechanism         object
 5   ref_count:int     int64 
 6   :END_ID           int64 
 7   id2               int64 
 8   biomarkertype     object
 9   celllinename      object
 10  celltype          object
 11  changetype        object
 12  organ             object
 13  organism          object
 14  quantitativetype  object
 15  tissue            object
 16  nct_id            object
 17  phase             object
dtypes: int64(5), object(13)
memory usage: 1.4+ GB


Unnamed: 0,id1,:START_ID,type:TYPE,effect,mechanism,ref_count:int,:END_ID,id2,biomarkertype,celllinename,celltype,changetype,organ,organism,quantitativetype,tissue,nct_id,phase
0,-9223365986387833915,-4129340824583713637,Regulation,,,1,-2983484129356197270,-9223365986387833915,,,,,intestine,Rattus norvegicus,,,,
1,-9223365556180770043,7111680438658927661,Regulation,,,1,-3121142714757276759,-9223365556180770043,,,,,,,,,,
2,-9223365466130767478,-8447327320654612427,Regulation,,,10,-1646880163763186214,-9223365466130767478,,TOV-112D,"B-cell, endothelial cell",,peripheral nervous system,,,,,
3,-9223362118268724280,3670031740130349683,Regulation,positive,,1,-4650140568030604438,-9223362118268724280,,,plasma cell,,,,,plasma,,
4,-9223361192663839079,-6414782697893422681,Expression,positive,,1,-296331965523681380,-9223361192663839079,,,T-cell,,,,,,,


In [25]:
df_direct=df_direct.drop(columns=['id1', 'id2'])
df_direct.head()

Unnamed: 0,:START_ID,type:TYPE,effect,mechanism,ref_count:int,:END_ID,biomarkertype,celllinename,celltype,changetype,organ,organism,quantitativetype,tissue,nct_id,phase
0,-4129340824583713637,Regulation,,,1,-2983484129356197270,,,,,intestine,Rattus norvegicus,,,,
1,7111680438658927661,Regulation,,,1,-3121142714757276759,,,,,,,,,,
2,-8447327320654612427,Regulation,,,10,-1646880163763186214,,TOV-112D,"B-cell, endothelial cell",,peripheral nervous system,,,,,
3,3670031740130349683,Regulation,positive,,1,-4650140568030604438,,,plasma cell,,,,,plasma,,
4,-6414782697893422681,Expression,positive,,1,-296331965523681380,,,T-cell,,,,,,,


In [26]:
df_direct['phase']=df_direct['phase'].fillna('None')

In [27]:
# convert to category dtype to save memory
df_direct=convert_object_to_category(df_direct)
    
df_direct.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10191129 entries, 0 to 10191128
Data columns (total 16 columns):
 #   Column            Dtype   
---  ------            -----   
 0   :START_ID         int64   
 1   type:TYPE         category
 2   effect            category
 3   mechanism         category
 4   ref_count:int     int64   
 5   :END_ID           int64   
 6   biomarkertype     category
 7   celllinename      category
 8   celltype          category
 9   changetype        category
 10  organ             category
 11  organism          category
 12  quantitativetype  category
 13  tissue            category
 14  nct_id            category
 15  phase             category
dtypes: category(13), int64(3)
memory usage: 583.5 MB


In [28]:
df_direct['ref_count:int']=df_direct['ref_count:int'].astype('int16')

In [29]:
df_direct.isnull().sum()

:START_ID           0
type:TYPE           0
effect              0
mechanism           0
ref_count:int       0
:END_ID             0
biomarkertype       0
celllinename        0
celltype            0
changetype          0
organ               0
organism            0
quantitativetype    0
tissue              0
nct_id              0
phase               0
dtype: int64

In [30]:
df_direct['type:TYPE'].value_counts()

Regulation               6310394
Expression                972328
DirectRegulation          780719
QuantitativeChange        503905
GeneticChange             466177
MolTransport              282980
MolSynthesis              179200
StateChange               168537
Biomarker                 145749
ClinicalTrial             128920
ProtModification           80243
miRNAEffect                67012
ChemicalReaction           60486
PromoterBinding            44478
FunctionalAssociation          1
Name: type:TYPE, dtype: int64

In [31]:
df_direct.to_pickle('./data/processed/directional_df.pkl')

### Attributes

In [32]:
%%time

df_att=pd.read_csv('./data/processed/attributes_relations.txt', sep='|', header=None, encoding='utf-8')

df_att.info(); df_att.head()

  exec(code, glob, local_ns)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16009643 entries, 0 to 16009642
Data columns (total 5 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   0       int64 
 1   1       object
 2   2       int64 
 3   3       object
 4   4       object
dtypes: int64(2), object(3)
memory usage: 610.7+ MB
Wall time: 17.6 s


Unnamed: 0,0,1,2,3,4
0,4119317064376552970,-8.095522484733303e+18,4119317064376552970,,4.939089232446321e+18
1,4449108270427552176,-7.954854151829985e+18,4449108270427552176,,5.356386876242299e+18
2,-6549874513341864871,,-6549874513341864871,,
3,-5257562996174351644,,-5257562996174351644,,
4,-5055366494525421779,,-5055366494525421779,,


In [33]:
df_att.columns=['id1', ':START_ID', 'id2', 'type:TYPE', ':END_ID']
df_att.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16009643 entries, 0 to 16009642
Data columns (total 5 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   id1        int64 
 1   :START_ID  object
 2   id2        int64 
 3   type:TYPE  object
 4   :END_ID    object
dtypes: int64(2), object(3)
memory usage: 610.7+ MB


In [34]:
df_att['type:TYPE'].value_counts()

None             14917159
is-a              1062691
similar-to          25740
part-of              3325
derivative-of         728
Name: type:TYPE, dtype: int64

In [35]:
df_att=df_att[df_att['type:TYPE']!='None']
df_att.reset_index(drop=True, inplace=True)
df_att. info(); df_att.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092484 entries, 0 to 1092483
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   id1        1092484 non-null  int64 
 1   :START_ID  1092484 non-null  object
 2   id2        1092484 non-null  int64 
 3   type:TYPE  1092484 non-null  object
 4   :END_ID    1092484 non-null  object
dtypes: int64(2), object(3)
memory usage: 41.7+ MB


Unnamed: 0,id1,:START_ID,id2,type:TYPE,:END_ID
0,-9211345763361301943,6239307973426633202,-9211345763361301943,is-a,3282417126387566677
1,-9199385581024420801,3431286749236869131,-9199385581024420801,similar-to,4166951674999260658
2,-9171611659504516599,1782891673412044594,-9171611659504516599,is-a,4571827019359806331
3,-9167253756875171418,-3646445504188635301,-9167253756875171418,similar-to,8441324521048175572
4,-9022844312547271757,1866838472195476209,-9022844312547271757,is-a,-781694405486375532


In [36]:
df_att=df_att.drop(columns=['id1', 'id2'])
df_att.info(); df_att.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092484 entries, 0 to 1092483
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   :START_ID  1092484 non-null  object
 1   type:TYPE  1092484 non-null  object
 2   :END_ID    1092484 non-null  object
dtypes: object(3)
memory usage: 25.0+ MB


Unnamed: 0,:START_ID,type:TYPE,:END_ID
0,6239307973426633202,is-a,3282417126387566677
1,3431286749236869131,similar-to,4166951674999260658
2,1782891673412044594,is-a,4571827019359806331
3,-3646445504188635301,similar-to,8441324521048175572
4,1866838472195476209,is-a,-781694405486375532


In [37]:
# remove any unwanted white space
for col in list(df_att.columns):
    df_att[col]=df_att[col].apply(lambda x: str(x).strip())

In [38]:
# set dtypes
df_att[':START_ID']=df_att[':START_ID'].astype('int64')
df_att[':END_ID']=df_att[':END_ID'].astype('int64')
df_att['type:TYPE']=df_att['type:TYPE'].astype('category')

df_att.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092484 entries, 0 to 1092483
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype   
---  ------     --------------    -----   
 0   :START_ID  1092484 non-null  int64   
 1   type:TYPE  1092484 non-null  category
 2   :END_ID    1092484 non-null  int64   
dtypes: category(1), int64(2)
memory usage: 17.7 MB


In [39]:
df_att.to_pickle('./data/processed/attributes_df.pkl')

### Concatenate Relationships

In [40]:
df_directional=pd.read_pickle('./data/processed/directional_df.pkl')
df_directional.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10191129 entries, 0 to 10191128
Data columns (total 16 columns):
 #   Column            Dtype   
---  ------            -----   
 0   :START_ID         int64   
 1   type:TYPE         category
 2   effect            category
 3   mechanism         category
 4   ref_count:int     int16   
 5   :END_ID           int64   
 6   biomarkertype     category
 7   celllinename      category
 8   celltype          category
 9   changetype        category
 10  organ             category
 11  organism          category
 12  quantitativetype  category
 13  tissue            category
 14  nct_id            category
 15  phase             category
dtypes: category(13), int16(1), int64(2)
memory usage: 478.3 MB


In [41]:
df_bidirect=pd.read_pickle('./data/processed/bidirectional_df.pkl')
df_bidirect.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4718418 entries, 0 to 4718417
Data columns (total 14 columns):
 #   Column            Dtype   
---  ------            -----   
 0   :START_ID         int64   
 1   type:TYPE         category
 2   effect            category
 3   mechanism         category
 4   ref_count:int     int16   
 5   :END_ID           int64   
 6   biomarkertype     category
 7   celllinename      category
 8   celltype          category
 9   changetype        category
 10  organ             category
 11  organism          category
 12  quantitativetype  category
 13  tissue            category
dtypes: category(11), int16(1), int64(2)
memory usage: 193.5 MB


In [42]:
df_att=pd.read_pickle('./data/processed/attributes_df.pkl')
df_att.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092484 entries, 0 to 1092483
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype   
---  ------     --------------    -----   
 0   :START_ID  1092484 non-null  int64   
 1   type:TYPE  1092484 non-null  category
 2   :END_ID    1092484 non-null  int64   
dtypes: category(1), int64(2)
memory usage: 17.7 MB


In [43]:
%%time

df_concat=pd.concat([df_directional, df_bidirect, df_att])

Wall time: 6.18 s


In [44]:
df_concat.info(); df_concat.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16002031 entries, 0 to 1092483
Data columns (total 16 columns):
 #   Column            Dtype   
---  ------            -----   
 0   :START_ID         int64   
 1   type:TYPE         object  
 2   effect            category
 3   mechanism         object  
 4   ref_count:int     float64 
 5   :END_ID           int64   
 6   biomarkertype     object  
 7   celllinename      object  
 8   celltype          object  
 9   changetype        object  
 10  organ             object  
 11  organism          object  
 12  quantitativetype  object  
 13  tissue            object  
 14  nct_id            category
 15  phase             category
dtypes: category(3), float64(1), int64(2), object(10)
memory usage: 1.8+ GB


Unnamed: 0,:START_ID,type:TYPE,effect,mechanism,ref_count:int,:END_ID,biomarkertype,celllinename,celltype,changetype,organ,organism,quantitativetype,tissue,nct_id,phase
0,-4129340824583713637,Regulation,,,1.0,-2983484129356197270,,,,,intestine,Rattus norvegicus,,,,
1,7111680438658927661,Regulation,,,1.0,-3121142714757276759,,,,,,,,,,
2,-8447327320654612427,Regulation,,,10.0,-1646880163763186214,,TOV-112D,"B-cell, endothelial cell",,peripheral nervous system,,,,,
3,3670031740130349683,Regulation,positive,,1.0,-4650140568030604438,,,plasma cell,,,,,plasma,,
4,-6414782697893422681,Expression,positive,,1.0,-296331965523681380,,,T-cell,,,,,,,


In [45]:
df_concat.isnull().sum()

:START_ID                 0
type:TYPE                 0
effect              1092484
mechanism           1092484
ref_count:int       1092484
:END_ID                   0
biomarkertype       1092484
celllinename        1092484
celltype            1092484
changetype          1092484
organ               1092484
organism            1092484
quantitativetype    1092484
tissue              1092484
nct_id              5810902
phase               5810902
dtype: int64

In [46]:
df_concat['ref_count:int'].fillna(0, inplace=True)

In [47]:
df_concat=df_concat.fillna('None')

In [48]:
# set ref count dtype
df_concat['ref_count:int']=df_concat['ref_count:int'].astype('int16')

In [49]:
# change object to category to save memory
df_concat=convert_object_to_category(df_concat)

df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16002031 entries, 0 to 1092483
Data columns (total 16 columns):
 #   Column            Dtype   
---  ------            -----   
 0   :START_ID         int64   
 1   type:TYPE         category
 2   effect            category
 3   mechanism         category
 4   ref_count:int     int16   
 5   :END_ID           int64   
 6   biomarkertype     category
 7   celllinename      category
 8   celltype          category
 9   changetype        category
 10  organ             category
 11  organism          category
 12  quantitativetype  category
 13  tissue            category
 14  nct_id            category
 15  phase             category
dtypes: category(13), int16(1), int64(2)
memory usage: 932.8 MB


In [50]:
df_concat.reset_index(drop=True, inplace=True)
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16002031 entries, 0 to 16002030
Data columns (total 16 columns):
 #   Column            Dtype   
---  ------            -----   
 0   :START_ID         int64   
 1   type:TYPE         category
 2   effect            category
 3   mechanism         category
 4   ref_count:int     int16   
 5   :END_ID           int64   
 6   biomarkertype     category
 7   celllinename      category
 8   celltype          category
 9   changetype        category
 10  organ             category
 11  organism          category
 12  quantitativetype  category
 13  tissue            category
 14  nct_id            category
 15  phase             category
dtypes: category(13), int16(1), int64(2)
memory usage: 810.7 MB


In [51]:
df_concat.drop_duplicates(inplace=True)
df_concat.reset_index(drop=True, inplace=True)

df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16001720 entries, 0 to 16001719
Data columns (total 16 columns):
 #   Column            Dtype   
---  ------            -----   
 0   :START_ID         int64   
 1   type:TYPE         category
 2   effect            category
 3   mechanism         category
 4   ref_count:int     int16   
 5   :END_ID           int64   
 6   biomarkertype     category
 7   celllinename      category
 8   celltype          category
 9   changetype        category
 10  organ             category
 11  organism          category
 12  quantitativetype  category
 13  tissue            category
 14  nct_id            category
 15  phase             category
dtypes: category(13), int16(1), int64(2)
memory usage: 810.7 MB


In [52]:
df_concat['type:TYPE'].value_counts()
df_concat['organ'].value_counts()

Regulation               6310394
FunctionalAssociation    2068008
CellExpression           1492256
Binding                  1157797
is-a                     1062397
Expression                972328
DirectRegulation          780719
QuantitativeChange        503905
GeneticChange             466177
MolTransport              283319
MolSynthesis              179200
StateChange               168537
Biomarker                 145749
ClinicalTrial             128920
ProtModification           80243
miRNAEffect                67012
ChemicalReaction           60489
PromoterBinding            44478
similar-to                 25740
part-of                     3325
derivative-of                727
Name: type:TYPE, dtype: int64

None                                                                                                                     9662169
liver                                                                                                                     263333
brain                                                                                                                     246320
heart                                                                                                                     156469
skin                                                                                                                      128663
                                                                                                                          ...   
airway, alveolus, colon, esophagus, gastrointestinal tract, intestine, lung, mammary gland, nerve, nose, pleura, skin          1
blood vessel, neural tube, placenta                                                              

In [53]:
df_concat=df_concat.replace('None', '_')
df_concat.info(); df_concat.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16001720 entries, 0 to 16001719
Data columns (total 16 columns):
 #   Column            Dtype   
---  ------            -----   
 0   :START_ID         int64   
 1   type:TYPE         category
 2   effect            category
 3   mechanism         category
 4   ref_count:int     int16   
 5   :END_ID           int64   
 6   biomarkertype     category
 7   celllinename      category
 8   celltype          category
 9   changetype        category
 10  organ             category
 11  organism          category
 12  quantitativetype  category
 13  tissue            category
 14  nct_id            category
 15  phase             category
dtypes: category(13), int16(1), int64(2)
memory usage: 810.7 MB


Unnamed: 0,:START_ID,type:TYPE,effect,mechanism,ref_count:int,:END_ID,biomarkertype,celllinename,celltype,changetype,organ,organism,quantitativetype,tissue,nct_id,phase
0,-4129340824583713637,Regulation,_,_,1,-2983484129356197270,_,_,_,_,intestine,Rattus norvegicus,_,_,_,_
1,7111680438658927661,Regulation,_,_,1,-3121142714757276759,_,_,_,_,_,_,_,_,_,_
2,-8447327320654612427,Regulation,_,_,10,-1646880163763186214,_,TOV-112D,"B-cell, endothelial cell",_,peripheral nervous system,_,_,_,_,_
3,3670031740130349683,Regulation,positive,_,1,-4650140568030604438,_,_,plasma cell,_,_,_,_,plasma,_,_
4,-6414782697893422681,Expression,positive,_,1,-296331965523681380,_,_,T-cell,_,_,_,_,_,_,_


In [54]:

df_concat['type:TYPE']=df_concat['type:TYPE'].apply(lambda x: x.upper())
df_concat.info(); df_concat.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16001720 entries, 0 to 16001719
Data columns (total 16 columns):
 #   Column            Dtype   
---  ------            -----   
 0   :START_ID         int64   
 1   type:TYPE         category
 2   effect            category
 3   mechanism         category
 4   ref_count:int     int16   
 5   :END_ID           int64   
 6   biomarkertype     category
 7   celllinename      category
 8   celltype          category
 9   changetype        category
 10  organ             category
 11  organism          category
 12  quantitativetype  category
 13  tissue            category
 14  nct_id            category
 15  phase             category
dtypes: category(13), int16(1), int64(2)
memory usage: 810.7 MB


Unnamed: 0,:START_ID,type:TYPE,effect,mechanism,ref_count:int,:END_ID,biomarkertype,celllinename,celltype,changetype,organ,organism,quantitativetype,tissue,nct_id,phase
0,-4129340824583713637,REGULATION,_,_,1,-2983484129356197270,_,_,_,_,intestine,Rattus norvegicus,_,_,_,_
1,7111680438658927661,REGULATION,_,_,1,-3121142714757276759,_,_,_,_,_,_,_,_,_,_
2,-8447327320654612427,REGULATION,_,_,10,-1646880163763186214,_,TOV-112D,"B-cell, endothelial cell",_,peripheral nervous system,_,_,_,_,_
3,3670031740130349683,REGULATION,positive,_,1,-4650140568030604438,_,_,plasma cell,_,_,_,_,plasma,_,_
4,-6414782697893422681,EXPRESSION,positive,_,1,-296331965523681380,_,_,T-cell,_,_,_,_,_,_,_


In [55]:
df_concat['type:TYPE'].value_counts()

REGULATION               6310394
FUNCTIONALASSOCIATION    2068008
CELLEXPRESSION           1492256
BINDING                  1157797
IS-A                     1062397
EXPRESSION                972328
DIRECTREGULATION          780719
QUANTITATIVECHANGE        503905
GENETICCHANGE             466177
MOLTRANSPORT              283319
MOLSYNTHESIS              179200
STATECHANGE               168537
BIOMARKER                 145749
CLINICALTRIAL             128920
PROTMODIFICATION           80243
MIRNAEFFECT                67012
CHEMICALREACTION           60489
PROMOTERBINDING            44478
SIMILAR-TO                 25740
PART-OF                     3325
DERIVATIVE-OF                727
Name: type:TYPE, dtype: int64

In [56]:
cols=list(df_directional.columns)
cols

[':START_ID',
 'type:TYPE',
 'effect',
 'mechanism',
 'ref_count:int',
 ':END_ID',
 'biomarkertype',
 'celllinename',
 'celltype',
 'changetype',
 'organ',
 'organism',
 'quantitativetype',
 'tissue',
 'nct_id',
 'phase']

In [57]:
df_concat=df_concat[cols]
df_concat.head(5)

Unnamed: 0,:START_ID,type:TYPE,effect,mechanism,ref_count:int,:END_ID,biomarkertype,celllinename,celltype,changetype,organ,organism,quantitativetype,tissue,nct_id,phase
0,-4129340824583713637,REGULATION,_,_,1,-2983484129356197270,_,_,_,_,intestine,Rattus norvegicus,_,_,_,_
1,7111680438658927661,REGULATION,_,_,1,-3121142714757276759,_,_,_,_,_,_,_,_,_,_
2,-8447327320654612427,REGULATION,_,_,10,-1646880163763186214,_,TOV-112D,"B-cell, endothelial cell",_,peripheral nervous system,_,_,_,_,_
3,3670031740130349683,REGULATION,positive,_,1,-4650140568030604438,_,_,plasma cell,_,_,_,_,plasma,_,_
4,-6414782697893422681,EXPRESSION,positive,_,1,-296331965523681380,_,_,T-cell,_,_,_,_,_,_,_


In [58]:
%%time
df_concat.to_csv('./data/processed/relations.txt', sep='|', index=False, header=True)

Wall time: 4min 12s


In [59]:
# list of column names for header file
cols=list(df_concat.columns)
cols

[':START_ID',
 'type:TYPE',
 'effect',
 'mechanism',
 'ref_count:int',
 ':END_ID',
 'biomarkertype',
 'celllinename',
 'celltype',
 'changetype',
 'organ',
 'organism',
 'quantitativetype',
 'tissue',
 'nct_id',
 'phase']

In [60]:
# create headers file
df_rel_headers=pd.DataFrame(columns=cols)
df_rel_headers
df_rel_headers.to_csv('./data/processed/relations_header.txt', sep= '|', index=False)

Unnamed: 0,:START_ID,type:TYPE,effect,mechanism,ref_count:int,:END_ID,biomarkertype,celllinename,celltype,changetype,organ,organism,quantitativetype,tissue,nct_id,phase


## Nodes

In [61]:
# define query
sql='select id, name, nodetype from resnet.node where id is not null and name is not null and nodetype is not null'

In [62]:
%%time

# save query results to file
with conn.cursor() as cur:
    with open('./data/raw/nodes_.txt', 'w', encoding="utf-8") as f:
        cur.execute(sql)
        csv_writer=csv.writer(f, delimiter='|')
        for record in cur.fetchall():
            line="|".join(map(str, record))+'\n'
            
            f.write(line)

Wall time: 27.2 s


### Create Node files

In [64]:
df_node=pd.read_csv('./data/raw/nodes_.txt', delimiter='|', header=None, encoding='utf-8')
df_node.info(); df_node.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470805 entries, 0 to 1470804
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   0       1470805 non-null  int64 
 1   1       1470805 non-null  object
 2   2       1470805 non-null  object
dtypes: int64(1), object(2)
memory usage: 33.7+ MB


Unnamed: 0,0,1,2
0,1000020775393047472,r_LOC108352567,Protein
1,-1000022879358845591,Rx26449263,SmallMol
2,-1000024917554356200,GO: extracellular matrix of synaptic cleft,SemanticConcept
3,1000026386881640747,rs876660037,GeneticVariant
4,-1000028657865034959,m_Fbxw19,Protein


In [65]:
# create column headers
df_node.columns=[':ID', 'name', ':LABEL']
df_node.head()

Unnamed: 0,:ID,name,:LABEL
0,1000020775393047472,r_LOC108352567,Protein
1,-1000022879358845591,Rx26449263,SmallMol
2,-1000024917554356200,GO: extracellular matrix of synaptic cleft,SemanticConcept
3,1000026386881640747,rs876660037,GeneticVariant
4,-1000028657865034959,m_Fbxw19,Protein


In [66]:
# df cleanup
df_node[':LABEL']=df_node[':LABEL'].apply(lambda x: x.upper())
df_node[':LABEL']=df_node[':LABEL'].apply(lambda x: x.strip())
df_node['name']=df_node['name'].apply(lambda x: x.strip())
df_node[':ID']=df_node[':ID'].apply(lambda x: str(x).strip())  #convert to string and strip()
df_node[':ID']=df_node[':ID'].astype('int64')   # convert back to int
df_node.info(); df_node.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470805 entries, 0 to 1470804
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   :ID     1470805 non-null  int64 
 1   name    1470805 non-null  object
 2   :LABEL  1470805 non-null  object
dtypes: int64(1), object(2)
memory usage: 33.7+ MB


Unnamed: 0,:ID,name,:LABEL
0,1000020775393047472,r_LOC108352567,PROTEIN
1,-1000022879358845591,Rx26449263,SMALLMOL
2,-1000024917554356200,GO: extracellular matrix of synaptic cleft,SEMANTICCONCEPT
3,1000026386881640747,rs876660037,GENETICVARIANT
4,-1000028657865034959,m_Fbxw19,PROTEIN


In [67]:
# check for name strings that are too long
df_node['name_len']=df_node['name'].apply(lambda x: len(x))

In [68]:
df_node.sort_values(by='name_len', ascending=False)

Unnamed: 0,:ID,name,:LABEL,name_len
1107832,-7235442027224814239,1-(4-(4-(6-fluorobenzoisoxazol-3-yl)piperidin-...,SMALLMOL,283582
600508,-4382490639255431495,"D-Glucose, O-4,6-dideoxy-4-((4-(alpha-D-glucop...",SMALLMOL,254
7778,1043483757283375332,"2,4a-Dihydroxy-1-methyl-8-methylenegibb-3-ene-...",SMALLMOL,248
1229720,7946627188054156171,"Cholan-24-oic acid, 3,7,12-trioxo-, (5beta)-, ...",SMALLMOL,243
377137,-3119824026281474218,"1,3,5-Triazine-2,4,6-triamine, N,N'''-1,2-etha...",SMALLMOL,216
...,...,...,...,...
205050,2153331758759806501,HN,CELLTYPE,2
1086727,-7115660023470151832,c,SMALLMOL,1
1273144,-8190334285627176943,X,SMALLMOL,1
432989,-3437747827402286845,\,SMALLMOL,1


In [69]:
# Issue is leading quote with no ending quote in sql db for id -7235442027224814239 above
# the following is a temp fix unitil resolved in sql db
# process long name field

x=df_node[df_node[':ID']==-7235442027224814239]['name']
long_index=df_node[df_node[':ID']==-7235442027224814239].index[0] #capture index to delete later
long_index
x=x.to_frame().reset_index()
x.info(); x.head()
y=x.loc[0,'name']
len(y)

1107832

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   1 non-null      int64 
 1   name    1 non-null      object
dtypes: int64(1), object(1)
memory usage: 144.0+ bytes


Unnamed: 0,index,name
0,1107832,1-(4-(4-(6-fluorobenzoisoxazol-3-yl)piperidin-...


283582

In [71]:
# parse bad field
lst=y.split('\r\n')
# lst=y.split('\n')

lst[:5]
lst[0]='-7235442027224814239|' + lst[0]
lst[:5]

['1-(4-(4-(6-fluorobenzoisoxazol-3-yl)piperidin-1-yl)butyl)-4-(4-fluorophenyl)piperidine-2,6-dione|SmallMol',
 '7235446264322303181|Rx25148080|SmallMol',
 '7235446778373378569|6-chloro-2-fluoropurine|SmallMol',
 '-7235449901734945744|2,2,7-trimethyl-4H-1,4-benzoxazin-3-one|SmallMol',
 '-7235456322816637391|N-(p-Tolyl)-p-toluenesulphonamide|SmallMol']

['-7235442027224814239|1-(4-(4-(6-fluorobenzoisoxazol-3-yl)piperidin-1-yl)butyl)-4-(4-fluorophenyl)piperidine-2,6-dione|SmallMol',
 '7235446264322303181|Rx25148080|SmallMol',
 '7235446778373378569|6-chloro-2-fluoropurine|SmallMol',
 '-7235449901734945744|2,2,7-trimethyl-4H-1,4-benzoxazin-3-one|SmallMol',
 '-7235456322816637391|N-(p-Tolyl)-p-toluenesulphonamide|SmallMol']

In [72]:
# create list of lists from long name field
new_list=[]
for i in range(len(lst)):
    z=lst[i].split('|')
    new_list.append(z)
    

In [73]:
len(new_list)
new_list[:5]

3855

[['-7235442027224814239',
  '1-(4-(4-(6-fluorobenzoisoxazol-3-yl)piperidin-1-yl)butyl)-4-(4-fluorophenyl)piperidine-2,6-dione',
  'SmallMol'],
 ['7235446264322303181', 'Rx25148080', 'SmallMol'],
 ['7235446778373378569', '6-chloro-2-fluoropurine', 'SmallMol'],
 ['-7235449901734945744',
  '2,2,7-trimethyl-4H-1,4-benzoxazin-3-one',
  'SmallMol'],
 ['-7235456322816637391', 'N-(p-Tolyl)-p-toluenesulphonamide', 'SmallMol']]

In [74]:
# create df from list of lists
df_temp=pd.DataFrame(new_list, columns=[':ID', 'name', ':LABEL'])
df_temp.info(); df_temp.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3855 entries, 0 to 3854
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   :ID     3855 non-null   object
 1   name    3855 non-null   object
 2   :LABEL  3854 non-null   object
dtypes: object(3)
memory usage: 90.5+ KB


Unnamed: 0,:ID,name,:LABEL
0,-7235442027224814239,1-(4-(4-(6-fluorobenzoisoxazol-3-yl)piperidin-...,SmallMol
1,7235446264322303181,Rx25148080,SmallMol
2,7235446778373378569,6-chloro-2-fluoropurine,SmallMol
3,-7235449901734945744,"2,2,7-trimethyl-4H-1,4-benzoxazin-3-one",SmallMol
4,-7235456322816637391,N-(p-Tolyl)-p-toluenesulphonamide,SmallMol


In [75]:
df_temp.dropna(inplace=True)
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3854 entries, 0 to 3853
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   :ID     3854 non-null   object
 1   name    3854 non-null   object
 2   :LABEL  3854 non-null   object
dtypes: object(3)
memory usage: 120.4+ KB


In [76]:
df_temp[':LABEL']=df_temp[':LABEL'].apply(lambda x: x.upper())
df_temp.head()

Unnamed: 0,:ID,name,:LABEL
0,-7235442027224814239,1-(4-(4-(6-fluorobenzoisoxazol-3-yl)piperidin-...,SMALLMOL
1,7235446264322303181,Rx25148080,SMALLMOL
2,7235446778373378569,6-chloro-2-fluoropurine,SMALLMOL
3,-7235449901734945744,"2,2,7-trimethyl-4H-1,4-benzoxazin-3-one",SMALLMOL
4,-7235456322816637391,N-(p-Tolyl)-p-toluenesulphonamide,SMALLMOL


In [77]:
# combine dfs
df_node.drop(int(f'{long_index}'), inplace = True) #drop row with long name field
df_node.drop(columns=['name_len'], inplace=True)

df_node_new=pd.concat([df_node, df_temp])
df_node_new.reset_index(inplace=True, drop=True)
df_node_new.info(); df_node_new.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1474658 entries, 0 to 1474657
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   :ID     1474658 non-null  object
 1   name    1474658 non-null  object
 2   :LABEL  1474658 non-null  object
dtypes: object(3)
memory usage: 33.8+ MB


Unnamed: 0,:ID,name,:LABEL
0,1000020775393047472,r_LOC108352567,PROTEIN
1,-1000022879358845591,Rx26449263,SMALLMOL
2,-1000024917554356200,GO: extracellular matrix of synaptic cleft,SEMANTICCONCEPT
3,1000026386881640747,rs876660037,GENETICVARIANT
4,-1000028657865034959,m_Fbxw19,PROTEIN


In [78]:
# check for presence of ';;' & ';'
df_node_new[df_node_new['name'].str.contains(";")]
df_node_new[df_node_new['name'].str.contains(";;")]

Unnamed: 0,:ID,name,:LABEL
94,-1000445907920769387,Ac-FRPX(Hle; D)F(3-Cl)F-NH2,SMALLMOL
103,1000511471434219628,(S)-3-chloro-4-(N-(3-fluoropropyl)pyrrolidin-3...,SMALLMOL
1038,10055389442503362,Carbamoyl-C(D;bridge C11)KFFW(D)F(4-(Isopropyl...,SMALLMOL
1301,-1006961062563094732,Ac-ED(lactam K7)HX(2-Nal; D)RWK-NH2,SMALLMOL
1503,1008036727165805856,X(Iqc; D)C(bridge X7)YW(D)RTX(Pen)T-NH2,SMALLMOL
...,...,...,...
1473596,7251653840688617755,X(Inp)F(D;4-(Chloro))X(2-Nal;D)F-Amino,SMALLMOL
1473919,-7253500055656412392,m_T(3;10)61H (Gene ID 21159),PROTEIN
1474113,7254604792324956166,YF(D)GW(D)X(Nle; N-Me)DF-NH2,SMALLMOL
1474252,725545692487216447,"2(S)2-{[6-(5-chloro-1H-pyrrolo[2,3-b]pyridin-3...",SMALLMOL


Unnamed: 0,:ID,name,:LABEL
8757,-1048773164875225821,"Y(2,6-Me; N-CO)X(Orn(D)modifixfffd;; lactam G5...",SMALLMOL
946734,6327831653333845569,X(Orn(D)modifixfffd;; lactam G4)X(2-Nal)P(D)G,SMALLMOL


In [79]:
# convert single and double semi-colons to colons
df_node_new['name']=df_node_new['name'].str.replace(';;', ':')
df_node_new['name']=df_node_new['name'].str.replace(';', ':')


In [80]:
df_node_new.to_csv('./data/processed/nodes.txt', sep='|', index=False, header=True)

In [81]:
# create list of column names for header file
cols=list(df_node.columns)
cols

[':ID', 'name', ':LABEL']

In [82]:
# create and save header file
node_headers=pd.DataFrame(columns=cols)
node_headers
node_headers.to_csv('./data/processed/nodes_header.txt', sep='|', index=False)

Unnamed: 0,:ID,name,:LABEL


In [83]:
conn.close()

**To load data into Neo4j**, click on 'Add' in the project pane. Name the new project, click on the '...' by the 'Open' button for the project, and choose 'Terminal'. Take note of the dbms number at the cursor. Copy the nodes.txt, nodes_header.txt, relations.txt, and realations_header.txt files pepared above. Drop them in the import file found here: C:\Users\\[user_name]\\.Neo4jDesktop\relate-data\dbmss\dbms-##(number at cursor)\import

At the Neo4j terminal, change to the bin directory (type 'cd bin') and then paste the following at the cursor:
neo4j-admin import --delimiter="|" --nodes=import/nodes_header.txt, import/nodes.txt --relationships=import/relations_header.txt, import/relations.txt --skip-bad-relationships=true

Go back to the project pane, press 'Start'. The active database will now be shown in a pane at the top. Press open to use in the Neo4j browser.