In [45]:
import pandas as pd
import os
import numpy as np
from Dataset import processFiles
from copy import deepcopy

# Make our Knowledge Graph

Import knowledge graph

In [46]:
# The data folder lives in the parent of the current working directory.
path = os.path.dirname(os.getcwd()) + '/data'
# The raw file is tab-separated and has no header row. A literal '\t' is used
# instead of the regex r'\t': separators longer than one character are treated
# as regular expressions and force pandas onto the much slower Python engine.
# NOTE(review): the C parser honours double-quote quoting, which the regex
# path ignores -- confirm the raw file has no stray '"' characters.
ourKG = pd.read_csv(path+'/ourKg_raw.txt', header=None, sep='\t')
ourKG.head()

Unnamed: 0,0,1,2,3,4
0,disease_protein,gene_protein,SNRNP48,disease,Minor spliceosome
1,disease_protein,gene_protein,SNRNP35,disease,Minor spliceosome
2,disease_protein,gene_protein,SNRNP25,disease,Minor spliceosome
3,disease_protein,gene_protein,RNPC3,disease,Minor spliceosome
4,disease_protein,gene_protein,PDCD7,disease,Minor spliceosome


Rename the columns and replace "/" with "_" in the relation and node-type labels

In [47]:
# Give the five positional columns descriptive names.
ourKG = ourKG.rename(columns={0: 'relation', 1: 'x_type', 2: 'x_name', 3: 'y_type', 4: 'y_name'})
# Replace "/" with "_" in the node-type and relation labels. The vectorised
# .str accessor with regex=False performs a literal replacement and lets
# missing values pass through instead of raising as a Python-level lambda would.
for label_col in ('x_type', 'y_type', 'relation'):
    ourKG[label_col] = ourKG[label_col].str.replace('/', '_', regex=False)
ourKG.head()

Unnamed: 0,relation,x_type,x_name,y_type,y_name
0,disease_protein,gene_protein,SNRNP48,disease,Minor spliceosome
1,disease_protein,gene_protein,SNRNP35,disease,Minor spliceosome
2,disease_protein,gene_protein,SNRNP25,disease,Minor spliceosome
3,disease_protein,gene_protein,RNPC3,disease,Minor spliceosome
4,disease_protein,gene_protein,PDCD7,disease,Minor spliceosome


Notice the node and edge types

In [48]:
# Print the distinct edge types first, then the distinct node types found on
# the x side and on the y side of the edges.
for column in ('relation', 'x_type', 'y_type'):
    print(ourKG[column].unique())

['disease_protein' 'protein_protein' 'drug_protein' 'drug_drug'
 'disease_disease']
['gene_protein' 'drug' 'disease']
['disease' 'gene_protein' 'drug']


Make the graph undirected by adding reverse edges for the 'MIG' and 'Minor spliceosome' nodes

In [67]:
# NOTE(review): tqdm is no longer used by this cell; the import is kept in case
# other cells rely on it -- consider moving it to the import cell at the top.
from tqdm.notebook import tqdm

# Only edges that point at these nodes receive a reverse copy.
reverse_targets = ['MIG', 'Minor spliceosome']

# Select the edges to mirror in one vectorised pass. This replaces a Python
# loop over every row that appended to the frame one row at a time, which is
# extremely slow on a graph with millions of edges.
edges_to_mirror = ourKG[ourKG['y_name'].isin(reverse_targets)]

# Swap the x and y endpoints (type and name) and keep the relation unchanged,
# then restore the original column order so the frames line up on concat.
mirrored_edges = edges_to_mirror.rename(columns={
    'x_type': 'y_type', 'x_name': 'y_name',
    'y_type': 'x_type', 'y_name': 'x_name',
})[['relation', 'x_type', 'x_name', 'y_type', 'y_name']]

# Append the mirrored edges after the original ones with a fresh 0..n-1 index,
# then remove exact duplicate rows.
newKG = pd.concat([ourKG, mirrored_edges], ignore_index=True)
newKG = newKG.drop_duplicates()

3627261


0it [00:00, ?it/s]

Save graph

In [86]:
# Write the processed graph into the data folder without the index column.
ourkg_csv = f'{path}/ourkg.csv'
newKG.to_csv(ourkg_csv, index=False)

# Combine with Main Dataset

Get main dataset

In [79]:
# processFiles receives the parent of the current working directory.
# NOTE(review): judging by its log output it reads data/kg.csv below that
# directory -- confirm against Dataset.processFiles.
project_root = os.path.dirname(os.getcwd())
kg = processFiles(project_root)

Loading data from path: /opt/scratch/labs/wuc/Drug-Repurposing/data/kg.csv


Get our dataset

In [87]:
# Rebuild the data-folder path (parent of the working directory + '/data') and
# load the graph that was saved as ourkg.csv.
parent_dir = os.path.dirname(os.getcwd())
path = parent_dir + '/data'
ourKG = pd.read_csv(f'{path}/ourkg.csv')

Drop unimportant columns and nodes

In [81]:
# Remove the columns that are not used in the combined graph. A new frame is
# bound instead of mutating in place.
kg = kg.drop(columns=['display_relation', 'x_id', 'x_source', 'y_id', 'y_source'])
removed_nodes = ['biological_process','molecular_function','cellular_component','pathway','anatomy']
# Flag every edge with at least one endpoint of a removed node type and keep
# the rest. A boolean mask is used because DataFrame.drop() expects index
# labels, whereas np.where(...) returns positions; the two only coincide when
# the frame has a default RangeIndex.
mask = kg['x_type'].isin(removed_nodes) | kg['y_type'].isin(removed_nodes)
kg = kg.loc[~mask]

Build a name-to-index mapping from the main dataset and apply it to our data

In [88]:
# Map each node name seen on the x side of the main graph to its index; when a
# name occurs more than once, the last occurrence wins.
name_to_idx = {name: idx for name, idx in zip(kg['x_name'], kg['x_index'])}
# Look up both endpoints of our edges; names missing from the mapping give NaN.
ourKG = ourKG.assign(
    x_index=ourKG['x_name'].map(name_to_idx),
    y_index=ourKG['y_name'].map(name_to_idx),
)

Assign new indices to the nodes in our data that do not appear in the main dataset

In [90]:
# New node indices start right after the largest x-side index of the main graph.
# NOTE(review): this assumes no larger index exists only on the y side of kg -- confirm.
max_idx = kg['x_index'].max()

# Collect every name that is still unmapped, looking at BOTH endpoints. Checking
# only x_name would leave a NaN in y_index for a node that appears solely as
# y_name, and the int64 cast below would then fail. x-side names are listed
# first, so names that were already handled keep the same numbering.
unmapped_x = ourKG.loc[ourKG['x_index'].isna(), 'x_name']
unmapped_y = ourKG.loc[ourKG['y_index'].isna(), 'y_name']
new_names = pd.concat([unmapped_x, unmapped_y], ignore_index=True).unique()

for new_idx, name in enumerate(new_names, start=int(max_idx) + 1):
    name_to_idx[name] = new_idx

# Re-apply the completed mapping and store the indices as integers.
ourKG['x_index'] = ourKG['x_name'].map(name_to_idx)
ourKG['y_index'] = ourKG['y_name'].map(name_to_idx)
ourKG = ourKG.astype({'x_index': 'int64', 'y_index': 'int64'})
ourKG

Unnamed: 0,relation,x_type,x_name,y_type,y_name,x_index,y_index
0,disease_protein,gene_protein,SNRNP48,disease,Minor spliceosome,57102,133212
1,disease_protein,gene_protein,SNRNP35,disease,Minor spliceosome,965,133212
2,disease_protein,gene_protein,SNRNP25,disease,Minor spliceosome,10101,133212
3,disease_protein,gene_protein,RNPC3,disease,Minor spliceosome,5436,133212
4,disease_protein,gene_protein,PDCD7,disease,Minor spliceosome,8992,133212
...,...,...,...,...,...,...,...
3627221,disease_protein,disease,MIG,gene_protein,GMFB,133213,9599
3627222,disease_protein,disease,MIG,gene_protein,TCTN1,133213,6232
3627223,disease_protein,disease,MIG,gene_protein,DERL3,133213,7244
3627224,disease_protein,disease,MIG,gene_protein,SPG11,133213,35216


Combine knowledge graphs

In [91]:
# Stack the main graph on top of ours, then discard rows that are exact duplicates.
totalKG = pd.concat([kg, ourKG]).drop_duplicates()

Save total knowledge graph

In [92]:
# Write the combined graph into the data folder without the index column.
totalkg_csv = f'{path}/totalkg.csv'
totalKG.to_csv(totalkg_csv, index=False)