# <center>Data Pre-processing </center>

This notebook contains the code to filter, pre-process, and normalize the raw benchmark data sets for training knwoledge graph embedding models (KGEMs).

## System configurations

In [1]:
import getpass
import sys
import time
sys.path.insert(0, '..')

In [2]:
getpass.getuser()

'yojana'

In [3]:
sys.version

'3.8.13 (default, Mar 28 2022, 06:16:26) \n[Clang 12.0.0 ]'

In [4]:
time.asctime()

'Tue Jun 14 14:53:44 2022'

## Import modules

In [5]:
import os
import logging
import pandas as pd
from tqdm import tqdm

# Graph creation
from src.utils import create_graph_from_df
import networkx as nx

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns 

# Splitting libraries
import numpy as np

In [6]:
logger = logging.getLogger('__name__')

In [7]:
# Make directories to store the data
os.makedirs('../data/plots', exist_ok=True)
os.makedirs('../data/kg', exist_ok=True)

### Loading the BioKG and filtering data

In [8]:
biokg_df = pd.read_csv('../data/networks/biokg.links.tsv', sep='\t', names=['source', 'relation', 'target'])
biokg_df = biokg_df[biokg_df['relation'].isin([
    'PROTEIN_DISEASE_ASSOCIATION',
    'DRUG_DISEASE_ASSOCIATION',
    'PPI',  # protein-protein
    'DPI',  # drug-protein
])]

# Rearrange columns 
biokg_df = biokg_df[['source', 'target', 'relation']]

In [9]:
# Replace relations to custom ones
biokg_df['relation'].replace(to_replace='PROTEIN_DISEASE_ASSOCIATION', value='associates', inplace=True)
biokg_df['relation'].replace(to_replace='DPI', value='associates', inplace=True)
biokg_df['relation'].replace(to_replace='PPI', value='associates', inplace=True)
biokg_df['relation'].replace(to_replace='DRUG_DISEASE_ASSOCIATION', value='treats', inplace=True)
biokg_df.head()

Unnamed: 0,source,target,relation
28046,DB00907,D006261,treats
28047,DB01050,D041781,treats
28048,DB12117,D000303,treats
28049,DB00697,D008107,treats
28050,DB00916,D014786,treats


In [10]:
# Add namespace prefix in front of ids 
for idx, row in tqdm(biokg_df.iterrows(), total=biokg_df.shape[0]):
    (
        source,
        target,
        relation
    ) = row
    
    if source.startswith("DB"):
        biokg_df.loc[idx, 'source'] = f'drugbank:{source}'
    elif source.startswith("D"):
        biokg_df.loc[idx, 'source'] = f'mesh:{source}'
    else:
        biokg_df.loc[idx, 'source'] = f'uniprot:{source}'
        
    if target.startswith("DB"):
        biokg_df.loc[idx, 'target'] = f'drugbank:{target}'
    elif target.startswith("D"):
        biokg_df.loc[idx, 'target'] = f'mesh:{target}'
    else:
        biokg_df.loc[idx, 'target'] = f'uniprot:{target}'

biokg_df.head()

100%|██████████| 317993/317993 [01:10<00:00, 4491.05it/s]


Unnamed: 0,source,target,relation
28046,drugbank:DB00907,mesh:D006261,treats
28047,drugbank:DB01050,mesh:D041781,treats
28048,drugbank:DB12117,mesh:D000303,treats
28049,drugbank:DB00697,mesh:D008107,treats
28050,drugbank:DB00916,mesh:D014786,treats


In [11]:
biokg_df['relation'].value_counts()

associates    251126
treats         66867
Name: relation, dtype: int64

### Filtering not connected chemicals not connected to genes

In [12]:
filtered_data = biokg_df[biokg_df['relation'] == 'associates']
known_sources = set(filtered_data['source'].tolist())
known_target = set(filtered_data['target'].tolist())
len(known_sources), len(known_target)

(44684, 27151)

In [13]:
drug_disease_data = biokg_df[biokg_df['relation'] == 'treats']
_drugs = set(drug_disease_data['source'].tolist())
_diseases = set(drug_disease_data['target'].tolist())
len(_drugs), len(_diseases)

(3314, 3074)

In [14]:
## Are there novel drugs with "treats" relation?
novel_drugs = _drugs - known_sources
len(novel_drugs)

973

In [15]:
novel_disease = _diseases - known_target
len(novel_disease)

1275

Thus, there are certain drug-disease pairs that have just one direct edge connecting them, unlike the other where there is a path from chemical-gene-disease. In the next steps, we will remove such pairs by their idex position in the mail dataframe.

In [16]:
idx_to_remove = []

for idx, row in tqdm(drug_disease_data.iterrows(), total=drug_disease_data.shape[0]):
    (
        source,
        target,
        rel
    ) = row
    
    if source in novel_drugs or target in novel_disease:
        idx_to_remove.append(idx)
        
len(idx_to_remove)

100%|██████████| 66867/66867 [00:04<00:00, 16490.31it/s]


14807

In [17]:
print(biokg_df.shape)
biokg_df.drop(idx_to_remove, inplace=True)
biokg_df.shape

(317993, 3)


(303186, 3)

In [18]:
biokg_df['relation'].value_counts()

associates    251126
treats         52060
Name: relation, dtype: int64

In [19]:
biokg_df.to_csv('../data/networks/biokg_processed.tsv', sep='\t', index=False)