# Custom network generation

This notebook generates a custom pharmacological network with drug-gene, gene-gene and gene-disease causal interactions.

In [1]:
import getpass
import sys
import time

import pandas as pd

In [2]:
getpass.getuser()

'sarahbeenie'

In [3]:
sys.version

'3.7.5 (default, Nov  1 2019, 02:16:32) \n[Clang 11.0.0 (clang-1100.0.33.8)]'

In [4]:
time.asctime()

'Fri May  8 12:44:23 2020'

Load networks and mapping

In [5]:
# load interaction network
df = pd.read_csv('processed_graph_30_apr.tsv', sep='\t', index_col=0)
# load OpenBioLink network
openbiolink_df = pd.read_csv('openbiolink_network.tsv', sep='\t')

# load entrez-hgnc ID mappings
entrez_hgnc_mapping_df = pd.read_csv('entrez_to_hgnc_mappings.tsv', sep='\t', dtype=object)

In [6]:
df.head()

Unnamed: 0,source_identifier,target_identifier,bel_relation,source_database
0,drugbank:DB00087,HGNC:3620,association,drugbank
1,drugbank:DB00818,HGNC:4083,increases,drugbank
2,drugbank:DB00139,HGNC:408,decreases,drugbank
3,drugbank:DB01142,HGNC:278,decreases,drugbank
4,drugbank:DB00173,HGNC:626,association,drugbank


In [7]:
df.shape

(253157, 4)

In [8]:
# Contributing resources
list(df.source_database.unique())

['drugbank',
 'INTACT',
 'BIOGRID',
 'PATHWAYCOMMONS',
 'KEGG',
 'REACTOME',
 'WIKIPATHWAYS',
 'DISGENET',
 'CLINICALTRIALS']

### Relation types 

In [9]:
# Unique relation types and their counts in network
df['bel_relation'].value_counts()

association     176014
increases        37774
regulates        21296
decreases        11288
hasComponent      6785
Name: bel_relation, dtype: int64

Map regulate relation types as increases relations

In [10]:
df.loc[df['bel_relation'] == 'regulates', 'bel_relation'] = 'increases'

Map association relationships between genes and disease from DisGeNet to increases relationships

In [11]:
df.loc[df['source_database'] == 'DISGENET', 'bel_relation'] = 'increases'

### Remove duplicate interactions

If the interaction is duplicated, keep the first occurrence of the interaction and remove all subsequent, duplicated rows.

In [12]:
df_duplicate_relations = df[['source_identifier', 'target_identifier', 'bel_relation']].copy()
duplicates = df_duplicate_relations[df_duplicate_relations.duplicated()]
df_without_duplicates = df.drop_duplicates(subset=['source_identifier', 'target_identifier', 'bel_relation'], keep='first')

print(f'{len(df.index)}: total number of interactions')
print(f'{len(duplicates.index)}: number of duplicates')
print(f'{len(df_without_duplicates.index)}: total number of interactions after removal of duplicates')

253157: total number of interactions
0: number of duplicates
253157: total number of interactions after removal of duplicates


### Filter dataset to include: 

1. drugs (DrugBank ID) and genes (HGNC ID)
2. genes (HGNC ID) and genes (HGNC ID)
3. genes (HGNC ID) and diseases/phenotypes (UMLS concept ID) 

In [13]:
drug_protein_df = df_without_duplicates.loc[
    (df_without_duplicates['source_identifier'].str.startswith('drugbank:')) & 
    (df_without_duplicates['target_identifier'].str.startswith('HGNC:'))]

protein_protein_df = df_without_duplicates.loc[
    (df_without_duplicates['source_identifier'].str.startswith('HGNC:')) & 
    (df_without_duplicates['target_identifier'].str.startswith('HGNC:'))]

protein_disease_df = df_without_duplicates.loc[
    (df_without_duplicates['source_identifier'].str.startswith('HGNC:')) & 
    (df_without_duplicates['target_identifier'].str.startswith('UMLS:'))]

### Remove association, hasComponent and NaN relations and retain increases and decreases relations

In [14]:
# Unique relation types in drug-protein interactions
list(drug_protein_df.bel_relation.unique())

['association', 'increases', 'decreases', 'hasComponent']

In [15]:
# Unique relation types in protein-protein interactions
list(protein_protein_df.bel_relation.unique())

['decreases', 'association', 'increases', 'hasComponent']

In [16]:
# Unique relation types in protein-disease interactions
list(protein_disease_df.bel_relation.unique())

['increases']

In [17]:
# Only retain increases and decreases relation types between drug-protein and protein-protein interactions
drug_protein_df = drug_protein_df[(drug_protein_df['bel_relation'] == 'increases') | (drug_protein_df['bel_relation'] == 'decreases')]
protein_protein_df = protein_protein_df[(protein_protein_df['bel_relation'] == 'increases') | (protein_protein_df['bel_relation'] == 'decreases')]

In [18]:
frames = [drug_protein_df, protein_protein_df, protein_disease_df]
concatenated_df = pd.concat(frames)

concatenated_df.shape

(62274, 4)

### Remove drug-protein and gene-disease pairs that are not connected to the network

In [19]:
# Protein that are a target of a drug
protein_targets = {
    target
    for source, target, relation, source_database in concatenated_df.values
    if source.startswith("drugbank")
}

# Protein targets that are not source nodes themselves
proteins_without_connection = protein_targets.difference(set(concatenated_df.source_identifier.unique()))

# # Update network removing the drug-protein pairs whose targets do not have any further connections
df_no_isolated_drug_targets = pd.DataFrame([
    {'source_identifier': source, 'target_identifier': target, 'bel_relation': relation, 'source_database': source_database}
    for source, target, relation, source_database in concatenated_df.values
    if (source.startswith("drugbank") and target not in proteins_without_connection)
])

In [20]:
# Protein that are associated to a disease
disease_associated_proteins = {
    source
    for source, target, relation, source_database in concatenated_df.values
    if target.startswith("UMLS")
}

# Disease-associated proteins that are not target nodes of any node (cannot be reached)
proteins_without_connection = disease_associated_proteins.difference(set(concatenated_df.target_identifier.unique()))

# Update network removing the drug-protein pairs whose targets do not have any further connections
df_no_isolated_disease_genes = pd.DataFrame([
    {'source_identifier': source, 'target_identifier': target, 'bel_relation': relation, 'source_database': source_database}
    for source, target, relation, source_database in concatenated_df.values
    if (target.startswith("UMLS") and source not in proteins_without_connection)
])

In [21]:
frames = [df_no_isolated_drug_targets, protein_protein_df, df_no_isolated_disease_genes]
concatenated_df = pd.concat(frames)

concatenated_df.shape

(59925, 4)

### Map increases and decreases relationships to 1 and -1, respectively

In [22]:
concatenated_df.loc[concatenated_df['bel_relation'] == 'increases', 'bel_relation'] = 1
concatenated_df.loc[concatenated_df['bel_relation'] == 'decreases', 'bel_relation'] = -1

### Remove conflicting interactions and self edges

After removing duplicate interactions, if duplicated entities remain, this implies the occurrence of conflicting interactions. If this is the case, remove all occurrences of these interactions.

In [23]:
df_duplicate_entities = concatenated_df[['source_identifier', 'target_identifier']].copy()
duplicate_entities = df_duplicate_entities[df_duplicate_entities.duplicated()]
df_clean = df_duplicate_entities.drop_duplicates(subset=['source_identifier', 'target_identifier'], keep=False)

merged_df = pd.merge(df_clean, 
                  concatenated_df,  
                  how='left', 
                  on=['source_identifier','target_identifier'], 
                 )

print(f'{len(concatenated_df.index)}: # of causal interactions')
print(f'{len(duplicate_entities.index)}: # of conflicting interactions')
print(f'{len(merged_df.index)}: # of causal interactions after removal of conflicting interactions')

59925: # of causal interactions
0: # of conflicting interactions
59925: # of causal interactions after removal of conflicting interactions


Remove interactions in which entities have edges with themselves

In [24]:
count = 0
for row in merged_df.itertuples(index=False):
    if row[0] == row[1]:
        count+=1

In [25]:
network_df = merged_df.drop(merged_df[(merged_df['source_identifier'] == merged_df['target_identifier'])].index)

print(f'{count}: # of self edges')
print(f'{len(network_df.index)}: # of causal interactions in network after removal of self edges and duplicates')

524: # of self edges
59401: # of causal interactions in network after removal of self edges and duplicates


In [26]:
network_df.rename(columns={'source_identifier': 'source', 'target_identifier': 'target', 'bel_relation': 'relation'}, inplace=True)

### Get gene-phenotype edges from OpenBioLink network

In [27]:
gene_phenotype_df = openbiolink_df.loc[
                             (openbiolink_df['source'].str.startswith('NCBIGENE:')) & 
                             (openbiolink_df['target'].str.startswith('HP:'))
]

gene_phenotype_df.shape

(161003, 3)

### Map entrez IDs in OpenBioLink network to HGNC IDs 

In [28]:
entrez_hgnc_mapping_df.head()

Unnamed: 0,entrez,hgnc_id
0,1,5
1,503538,37133
2,29974,24086
3,2,7
4,144571,27057


In [29]:
# Add NCBIGENE: prefix to entrez Ids
entrez_hgnc_mapping_df['entrez'] = 'NCBIGENE:' + entrez_hgnc_mapping_df['entrez'].astype(str)
#Add HGNC: prefix to hgnc Ids
entrez_hgnc_mapping_df['hgnc_id'] = 'HGNC:' + entrez_hgnc_mapping_df['hgnc_id'].astype(str)

# Merge entrez ids in openbiolink network with Ids in entrez to hgnc mapping dataFrame
merged_ids_df = pd.merge(gene_phenotype_df, entrez_hgnc_mapping_df, left_on = 'source', right_on = 'entrez', how='inner')
hgnc_hp_df = merged_ids_df[['hgnc_id', 'target', 'relation']]

# Add openbiolink as source database to gene-phenotype relations
hgnc_hp_df['source_database'] = 'openbiolink'
hgnc_hp_df.rename(columns={'hgnc_id':'source'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [30]:
frames = [network_df, hgnc_hp_df]
inhouse_network_df = pd.concat(frames)

inhouse_network_df.shape

(220369, 4)

In [31]:
inhouse_network_df.to_csv('custom_network.tsv', sep='\t', index=False)