In [None]:
import csv
import os
import json
import gensim
import pickle
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.preprocessing import MinMaxScaler

import networkx as nx
import networkx
import community
from yellowbrick.cluster import KElbowVisualizer
from community import community_louvain
from sknetwork.clustering import Louvain, BiLouvain, modularity, bimodularity
import matplotlib.pyplot as plt


In [None]:
# Directory prefix used by later cells to build paths of the form f'{MAINDIR}/<file>'.
# NOTE(review): with the empty string those paths start with '/', i.e. they point
# at the filesystem root — set this to the real data directory before running.
# Several cells read or write files of the same name *without* this prefix
# (e.g. 'DDS.pkl', 'selected_uniport_seq.csv'), so it is presumably meant to be a
# directory different from the working directory — confirm.
MAINDIR = ''

## Mapping 
MeSH ID (CTD) --> DrugBank ID --> ChEMBL ID (used to look up the SMILES string)

In [None]:
## Lookup tables between DrugBank IDs and CTD chemical IDs (both directions).
## Chemical vocabulary: http://ctdbase.org/downloads/
# .dropna() removes every row that has a missing value in any column.
# NOTE(review): a 'DrugBankIDs' cell is used as one key as-is; if the column can
# hold several IDs in one cell they are not split here — verify against the file.
chem_DrugBank_ID = pd.read_csv("CTD_chemicals.csv").dropna()
DrugBankID2ChemicalID = chem_DrugBank_ID.set_index('DrugBankIDs')['ChemicalID'].to_dict()
# Inverse map; when several DrugBank IDs share a chemical ID the last one wins.
ChemicalID2DrugBankID = {
    chemical_id: drugbank_id
    for drugbank_id, chemical_id in DrugBankID2ChemicalID.items()
}
print(len(DrugBankID2ChemicalID))

In [None]:
# ChEMBL -> DrugBank map taken from the tab-separated 'drugbank.csv' table.
ChEMBL = pd.read_csv("drugbank.csv", sep='\t')
ChEMBL_DrugbankID = ChEMBL[['DrugbankID','CHEMBL']].dropna()
ChEMBL2DrugbankID_temp = ChEMBL_DrugbankID.set_index('CHEMBL')['DrugbankID'].to_dict()

# A 'CHEMBL' cell may list several IDs separated by '|'; give each ID its own
# entry pointing at the same DrugBank ID. split('|') on a key without '|'
# yields the key itself, so single-ID cells need no special case.
new_ChEMBL2DrugbankID = {
    single_id: drugbank_id
    for joined_ids, drugbank_id in ChEMBL2DrugbankID_temp.items()
    for single_id in joined_ids.split('|')
}

In [None]:
## Second ChEMBL -> DrugBank mapping source:
## https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi
# Rows with any missing value are discarded before the dictionary is built.
Chembl_DB_ID = pd.read_csv("Chembl_DB_ID.csv", sep='\t').dropna()
ChEMBL2DrugbankID_new = Chembl_DB_ID.set_index('CHEMBL_ID')['DB_ID'].to_dict()

In [None]:
## Merge the two mapping sources; on a shared ChEMBL key the entry from
## ChEMBL2DrugbankID_new overrides the one from new_ChEMBL2DrugbankID.
ChEMBL2DrugbankID = dict(new_ChEMBL2DrugbankID)
ChEMBL2DrugbankID.update(ChEMBL2DrugbankID_new)
# Inverse map; if several ChEMBL IDs point at one DrugBank ID, the last one wins.
DrugbankID2ChEMBL = {
    drugbank_id: chembl_id
    for chembl_id, drugbank_id in ChEMBL2DrugbankID.items()
}
DrugbankID2ChEMBL

In [None]:
# Protein sequences; rows without a 'Sequence' value are discarded.
target_seq = pd.read_csv('uniprot-yourlist.csv').dropna(subset=['Sequence'])
# Convert 'Target' to a numeric dtype when every value parses. If any value
# does not, the column is deliberately left unchanged — this is what the
# deprecated pd.to_numeric(..., errors='ignore') used to do.
try:
    target_seq['Target'] = pd.to_numeric(target_seq['Target'])
except (ValueError, TypeError):
    pass

In [None]:
# ChEMBL IDs listed in 'phase_4_drug.csv' (column 'Molecule ChEMBL ID'); used by
# later cells to keep only these drugs.
list_phase4 = list(pd.read_csv('phase_4_drug.csv', index_col=0)['Molecule ChEMBL ID'])

## 1- Drug-Target interactions

In [None]:
## Read the CTD chemical-gene interactions and keep the rows whose OrganismID
## is 9606 (the "Homo sapiens rows" of the original comment).
# Columns are renamed to the edge-list convention used in this notebook:
# chemical -> 'from', gene -> 'to'.
chem_gene = pd.read_csv("CTD_chem_gene_ixns_new.csv").rename(
    columns={'ChemicalID':'from','GeneID':'to'}
)
chem_gene_homo = chem_gene.loc[chem_gene['OrganismID'] == 9606.0]
chem_gene_homo.head()

In [None]:
# Display the filtered interaction table.
chem_gene_homo

In [None]:
# Unique (chemical, gene, action) triples, tagged as drug-target interactions.
df_interaction = (
    chem_gene_homo[['from','to','InteractionActions']]
    .drop_duplicates()
    .assign(Type_Interaction='DTI')
)
df_interaction

In [None]:
# Map the CTD chemical ID in 'from' to a DrugBank ID and then to a ChEMBL ID
# (stored in 'Chembl'); rows for which either lookup fails are dropped.
df_interaction['Chembl']= df_interaction['from'].map(ChemicalID2DrugBankID).map(DrugbankID2ChEMBL)
df_interaction= df_interaction.dropna(subset=['Chembl'])
df_interaction

In [None]:
# ChEMBL ID --> SMILES
# The file is read without a header row; column 1 is used as the key (ChEMBL ID)
# and column 0 as the value (SMILES).
smile = pd.read_csv(f'{MAINDIR}/Chmbel2smile.csv', header=None)
ChEMBL2Smiles = pd.Series(smile[0].values, index=smile[1]).to_dict()
df_interaction['smiles'] = df_interaction['Chembl'].map(ChEMBL2Smiles)
df_interaction.head()

In [None]:
## ChEMBL IDs that occur in df_interaction but not in the SMILES table; these
## drugs are removed in the next cell.
no_smileId = list(set(df_interaction['Chembl'])-set(smile[1]))
no_smileId

In [None]:
# Keep only interactions whose drug has an entry in the SMILES table.
df_interactions = df_interaction[~df_interaction['Chembl'].isin(no_smileId)]
df_interactions.head()

### Final DTI


In [None]:
# Persist the filtered DTI table both as CSV and as a pickle under MAINDIR.
df_interactions.to_csv(f'{MAINDIR}/df_interactions.csv')
with open(f'{MAINDIR}/df_interactions.pkl', 'wb') as handle:
    pickle.dump(df_interactions, handle)

In [None]:
# Summary of the DTI table: distinct drugs ('from'), distinct targets ('to'),
# number of rows, and the five most frequent 'InteractionActions' values.
num_drug = len(set(df_interactions['from']))
num_target = len(set(df_interactions['to']))
print (f'number of taregt: {num_target}')
print (f'number of drug: {num_drug}')
print (f'number of interactions: {len(df_interactions)}')
print ('\n\n type of interactions:')
df_interactions['InteractionActions'].value_counts().head()

In [None]:
# Export the frequency of every 'InteractionActions' value.
pd.DataFrame(df_interactions['InteractionActions'].value_counts()).to_csv('type_of_DTI.csv')
# NOTE(review): 'type_DTI.csv' is a different file from the one written above.
# It is read with columns 'num' and 'S1'..'S6'; presumably it is a hand-split
# version of 'type_of_DTI.csv' (one action per S-column) — confirm how it is made.
type_DTI = pd.read_csv('type_DTI.csv')

# Distinct action types over S1..S6. Empty cells (NaN) are dropped explicitly:
# the earlier `list(set(...))[1:]` discarded whichever element the set iterated
# first, which is not guaranteed to be NaN and could silently drop a real type.
type_columns = [f'S{col}' for col in range(1, 7)]
type_DTI_list = list(pd.concat([type_DTI[name] for name in type_columns]).dropna().unique())
type2num = dict.fromkeys(type_DTI_list, 0)
print(f'Number of unique type of interactions: {len(type_DTI_list)}')

# Counting the number of each interactions: for every type, add up 'num' over
# all rows in which the type appears in any of the S-columns.
dic_type = {}
for T in type_DTI_list:
    dic_type[T] = sum(
        type_DTI.loc[type_DTI[name] == T, 'num'].sum() for name in type_columns
    )
df_sum_type = pd.DataFrame(dic_type.items()).sort_values(by=1, ascending=False)
df_sum_type.head(10)

### Selected interactions

In [None]:
# The six interaction types kept for the rest of the analysis, followed by
# their totals from df_sum_type (column 0 = type, column 1 = summed count).
selected_type = ['increases^expression','decreases^expression','decreases^reaction','increases^reaction', 'increases^activity','decreases^activity']
df_sum_type[df_sum_type[0].isin(selected_type)]

In [None]:
# Interaction table with the action string split over columns 'S1'..'S6'.
df_interactions_sepType = pd.read_csv(f'{MAINDIR}/df_interactions_sepType.csv').drop(columns=['Unnamed: 6'])

# For each S-column collect the rows whose value is one of the selected types,
# then stack the six selections (S1 first, S6 last) in a single concat and drop
# the leftover CSV index column once. The result matches the former
# concat-and-drop-inside-the-loop code without relying on concat re-adding
# 'Unnamed: 0' on every pass.
# NOTE(review): a row that matches in more than one S-column appears once per
# matching column — confirm the duplication is wanted.
matches_per_column = [
    df_interactions_sepType[df_interactions_sepType[f'S{col}'].isin(selected_type)]
    for col in range(1, 7)
]
selected_DTI = pd.concat(matches_per_column).drop(columns=['Unnamed: 0'])
selected_DTI.head()

In [None]:
# Save the distinct target IDs of the selected interactions and report how
# many there are.
pd.DataFrame(set(selected_DTI['to'])).to_csv(f'{MAINDIR}/protein_EntrezID.csv')
num_pro = len(set(selected_DTI['to']))
print(f'number of proteins: {num_pro}')

# Read the protein sequences from the saved UniProt file.
# NOTE(review): presumably this file is fetched from UniProt using the IDs
# written above — the notebook does not show that step.
uniport_seq = pd.read_csv(f'{MAINDIR}/uniprot-ids.csv')
uniport_seq.head(3)

In [None]:
# id -> sequence lookup; the ids are converted to int so they can be compared
# with the 'to' column of the interaction table.
id2Sequence = pd.Series(uniport_seq['Sequence'].values, index=uniport_seq['id']).to_dict()
print(f'Number of proteins has sequence: {len(id2Sequence)}')
list_proteins = list(map(int, list(id2Sequence.keys())))
# Keep only interactions whose target has a sequence.
selected_DTI_final = selected_DTI[selected_DTI['to'].isin(list_proteins)]

# One sequence row per id, saved under MAINDIR.
selected_uniport_seq = uniport_seq[uniport_seq['id'].isin(list_proteins)].drop_duplicates(subset=['id'])
selected_uniport_seq.to_csv(f'{MAINDIR}/selected_uniport_seq.csv')

In [None]:
# Keep an interaction only if its drug is in the phase-4 list AND its target
# has a sequence in target_seq.
is_phase4_drug = selected_DTI_final['Chembl'].isin(list_phase4)
has_target_sequence = selected_DTI_final['to'].isin(list(target_seq['Target']))
selected_DTI_final = selected_DTI_final[is_phase4_drug & has_target_sequence]
selected_DTI_final

In [None]:
# Number of interaction rows per drug.
interactions_per_drug = selected_DTI_final['from'].value_counts()
count_drug_inter = pd.DataFrame(interactions_per_drug)
# Drugs with at most 100 interaction rows (despite the "less_than" in the name,
# the bound is inclusive). The filter is applied to the Series itself because
# the column name of the value_counts frame differs between pandas versions
# ('from' before 2.0, 'count' from 2.0 on).
drug_with_less_than_100target = list(interactions_per_drug[interactions_per_drug <= 100].index)
drug_with_less_than_100target

In [None]:
# Keep only the drugs selected in the previous cell, then show how many
# interaction rows each remaining target has.
selected_DTI_final = selected_DTI_final[selected_DTI_final['from'].isin(drug_with_less_than_100target)]
selected_DTI_final['to'].value_counts()

In [None]:
# Save the final positive DTI table and print its summary statistics.
selected_DTI_final.to_csv(f'{MAINDIR}/selected_DTI.csv')
num_drug = len(set(selected_DTI_final['from']))
num_target = len(set(selected_DTI_final['to']))
print (f'number of taregt: {num_target}')
print (f'number of drug: {num_drug}')
print (f'number of interactions: {len(selected_DTI_final)}')
print (f'number of interaction type: {len(selected_type)}')
selected_type

#### Negative interactions

In [None]:
# Rebuild both forward maps by inverting their inverse maps, so that each pair
# of dictionaries is strictly one-to-one.
# NOTE(review): this overwrites the earlier, larger ChEMBL2DrugbankID and
# DrugBankID2ChemicalID; any key that was lost when the inverse map was built
# (several keys sharing one value) is no longer present — confirm intended.
ChEMBL2DrugbankID = {v: k for k, v in DrugbankID2ChEMBL.items()}
DrugBankID2ChemicalID = {v: k for k, v in ChemicalID2DrugBankID.items()}

In [None]:
# ChEMBL_id to Entrez_id (from uniprot)
with open('ChEMBL2Entrez.pkl', 'rb') as f:
    ChEMBL2Entrez = pickle.load(f)
ChEMBL2Entrez

In [None]:
# Known ChEMBL drug/target pairs; rows with weight 0 are used as negative
# (non-interacting) examples.
ChEMBL_inter = pd.read_csv('DTI_known_ChEMBLid_originalID.csv', index_col=0)
ChEMBL_inter.columns = ['Chembl','T_Chembl','weight']
# .copy() makes the filtered rows an independent frame, so the column
# assignments below do not write into a slice of ChEMBL_inter
# (SettingWithCopyWarning).
negative_inter = ChEMBL_inter[ChEMBL_inter['weight']==0].copy()
# Translate the identifiers into the ones used by the positive DTI table:
# drug ChEMBL -> DrugBank -> CTD chemical ID, target ChEMBL -> Entrez ID.
negative_inter['from'] = negative_inter['Chembl'].map(ChEMBL2DrugbankID).map(DrugBankID2ChemicalID)
negative_inter['to'] = negative_inter['T_Chembl'].map(ChEMBL2Entrez)
negative_inter['smiles'] = negative_inter['Chembl'].map(ChEMBL2Smiles)
# Drop rows where any lookup failed, then the columns that are no longer needed.
negative_inter = negative_inter.dropna().drop(columns= ['T_Chembl','weight'])
negative_inter['Type_Interaction'] = 'neg_DTI'
negative_inter['S1'] = 'negative_DTI'
# Same restrictions as for the positive set: phase-4 drugs, targets with a sequence.
negative_inter = negative_inter[negative_inter['Chembl'].isin(list_phase4)]
negative_inter = negative_inter[negative_inter['to'].isin(list(target_seq['Target']))]
negative_inter.to_csv('negative_int.csv')
negative_inter

In [None]:
# Number of rows left in the negative interaction table.
print(f'Number of negative interactions: {len(negative_inter)}')

In [None]:
# Positive and negative interactions in one table (positives first).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# sort=False keeps the column order of the first frame.
final_P_N_DTI = pd.concat([selected_DTI_final, negative_inter], sort=False)
final_P_N_DTI

## 2- Diseases-Drug
- http://ctdbase.org/downloads/#cd

In [None]:
# Chemical-disease associations, restricted to rows whose DirectEvidence is
# exactly 'therapeutic'.
DisD_tmp = pd.read_csv("CTD_chemicals_diseases.csv").sort_values(by=['DiseaseID'])
# .copy() so the new columns are added to an independent frame rather than to
# a slice of DisD_tmp (SettingWithCopyWarning).
DisD = DisD_tmp[DisD_tmp['DirectEvidence']=='therapeutic'].copy()
# CTD chemical ID -> DrugBank ID -> ChEMBL ID; rows without a mapping are dropped.
DisD['Cheml_D'] = DisD['ChemicalID'].map(ChemicalID2DrugBankID).map(DrugbankID2ChEMBL)
DisD = DisD.dropna(subset=['Cheml_D'])
# Keep only drugs with a SMILES entry that are also in the phase-4 list.
DisD['smile'] = DisD['Cheml_D'].map(ChEMBL2Smiles)
DisD = DisD.dropna(subset=['smile'])
DisD = DisD[DisD['Cheml_D'].isin(list_phase4)]
DisD

In [None]:
# Number of drug associations per disease name.
DisD['DiseaseName'].value_counts()

## 3- Diseases-Target 
- http://ctdbase.org/downloads/#gd

In [None]:
# Gene-disease associations loaded from a pickle; the column labels found in
# the file are saved in DisT_first_col and then replaced by the nine names below.
# NOTE(review): DisT_first_col is not used again in the visible cells —
# presumably the pickle was written without a header row; confirm.
# NOTE: reading a pickle executes code from the file; use trusted files only.
DisT_tmp = pd.read_pickle('CTD_genes_diseases.pkl')
DisT_first_col = list(DisT_tmp)
DisT_tmp.columns = ['GeneSymbol','GeneID', 'DiseaseName','DiseaseID','DirectEvidence', 'InferenceChemicalName','InferenceScore','OmimIDs','PubMedIDs']

DisT_tmp.head()

In [None]:
# Distribution of the DirectEvidence labels.
DisT_tmp['DirectEvidence'].value_counts()

In [None]:
# Gene-disease rows whose evidence is 'therapeutic', alone or combined with
# 'marker/mechanism'. .copy() yields an independent frame, so columns can be
# added to it later without writing into a slice of DisT_tmp
# (SettingWithCopyWarning).
DisT_therapeutic = DisT_tmp[DisT_tmp['DirectEvidence'].isin(['therapeutic','marker/mechanism|therapeutic'])].copy()
len_dis_therapeutic = len(set(DisT_therapeutic['DiseaseID']))
print(f'Num of therapeutic diseases: {len_dis_therapeutic}')


DisT_therapeutic

In [None]:
# Number of distinct disease IDs in the full gene-disease table.
len_dis = len(set(DisT_tmp['DiseaseID']))
print(f'Total Num of diseases: {len_dis}')

In [None]:
# Coarser disease ID: the string form of DiseaseID with its last two
# characters removed, so IDs that differ only in those characters collapse.
DisT_tmp ['DiseaseID_new'] = DisT_tmp['DiseaseID'].map(lambda x: str(x)[:-2])

len_dis_new = len(set(DisT_tmp['DiseaseID_new']))
print(f'Num of diseases after removing 2 last digits: {len_dis_new}')

In [None]:
# Same truncation (last two characters of the ID string removed) for the
# therapeutic subset. Note that this overwrites len_dis_therapeutic with the
# count of truncated IDs.
DisT_therapeutic ['DiseaseID_new'] = DisT_therapeutic['DiseaseID'].map(lambda x: str(x)[:-2])

len_dis_therapeutic = len(set(DisT_therapeutic['DiseaseID_new']))
print(f'Num of therapeutic diseases after removing 2 last digits: {len_dis_therapeutic}')

#### Therapeutic diseases

In [None]:
# Disease-target table: only the three columns needed downstream. The full
# DiseaseID is kept here, not the truncated 'DiseaseID_new'.
DisT = DisT_therapeutic[['GeneID','DiseaseID', 'DiseaseName']]

# Selecting targets that we have in DTI
DisT = DisT[DisT['GeneID'].isin(final_P_N_DTI['to'])]
DisT

In [None]:
# Disabled: manual insertion of a single extra row (kept for reference).
#DisT.loc [86119326] = ['100174880','MESH:D000014','Abnormalities, Drug-Induced']
# Order the table by disease ID.
DisT = DisT.sort_values(by=['DiseaseID'])

In [None]:
# Keep only targets for which a sequence is available in target_seq.
DisT = DisT[DisT['GeneID'].isin(target_seq['Target'])]

In [None]:
# Number of target associations per disease name.
DisT['DiseaseName'].value_counts()

### Summary of drug/targets/diseases in 3 datasets

In [None]:
# Disease-target table: distinct targets, distinct diseases, number of rows.
num_DisT_T = len(set(DisT['GeneID']))
print(f'Number of target in DisT: {num_DisT_T}')

num_DisT_dis = len(set(DisT['DiseaseID']))
print(f'Number of diseases in DisT: {num_DisT_dis}')

print(f'Number of interactions in DisT: {len(DisT)}')

In [None]:
# Disease-drug table: distinct drugs, distinct diseases, number of rows.
num_DisD_D = len(set(DisD['ChemicalID']))
print(f'Number of drug in DisD: {num_DisD_D}')

num_DisD_dis = len(set(DisD['DiseaseID']))
print(f'Number of diseases in DisD: {num_DisD_dis}')

print(f'Number of interactions in DisD: {len(DisD)}')

In [None]:
# Combined positive+negative DTI table: distinct targets, distinct drugs,
# number of rows.
num_T = len(set(final_P_N_DTI['to']))
print(f'Number of target in DTI: {num_T}')

num_D = len(set(final_P_N_DTI['from']))
print(f'Number of drug in DTI: {num_D}')

print(f'Number of interactions in DTI: {len(final_P_N_DTI)}')

### List of drug/target/disease

In [None]:
# All drugs (CTD chemical IDs) that occur in the DTI or the disease-drug table.
list_drugs = set(list(final_P_N_DTI['from'])+list(DisD['ChemicalID']))

# save info of selected drugs for DDS
# Two-column table (0 = SMILES, 1 = ChEMBL ID) built from both sources, one row
# per ChEMBL ID (first occurrence kept), written as a header-less TSV.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
info_drugs = pd.concat([
    final_P_N_DTI[['smiles','Chembl']].rename(columns={'Chembl':1,'smiles':0}),
    DisD[['smile', 'Cheml_D']].rename(columns={'Cheml_D':1,'smile':0}),
]).drop_duplicates(subset=1, keep="first")
info_drugs.to_csv('temp_SMILES_main.txt', sep='\t', index=False, header=False)

print(f'Number of drugs for similarity: {len(list_drugs)}')

In [None]:
# All targets that occur in the DTI or the disease-target table.
list_targets = set(list(final_P_N_DTI['to'])+list(DisT['GeneID']))

# save info of selected targets for PPS
# One sequence row per target. NOTE(review): this writes
# 'selected_uniport_seq.csv' to the working directory, while an earlier cell
# writes a file of the same name under MAINDIR — confirm which one is consumed.
info_targets = target_seq[target_seq['Target'].isin(list_targets)].drop_duplicates(subset='Target', keep="first")
info_targets.to_csv("selected_uniport_seq.csv")

print(f'Number of targets for similarity: {len(list_targets)}')

In [None]:
# All disease IDs that occur in the disease-drug or the disease-target table.
list_dis = set(list (DisD['DiseaseID'])+list(DisT['DiseaseID']))
print(f'Number of diseases: {len(list_dis)}')

## 4- Drug-Drug similarities


In [None]:
# Load the drug-drug similarity table from MAINDIR.
# NOTE(review): later cells use its columns 'to', 'from' and 'weight';
# presumably it was computed from 'temp_SMILES_main.txt' — not shown here.
# NOTE: pickle.load executes code from the file; use trusted files only.
with open(f'{MAINDIR}/DDS.pkl', 'rb') as dds_file:
    DDS = pickle.load(dds_file)
DDS.head()

## 5- Protein-Protein similarities

In [None]:
# Protein-protein similarity matrix; the first CSV column becomes the row index.
PPS_matrix = pd.read_csv("PPS(seq).csv", index_col=0)

In [None]:
PPS = PPS_matrix.copy()

# Blank out (NaN) the diagonal and everything below it, i.e. every cell whose
# row position >= column position. Only the strict upper triangle survives, so
# each unordered protein pair is kept once and self-similarities are removed.
m,n = PPS.shape
PPS[:] = np.where(np.arange(m)[:,None] >= np.arange(n),np.nan,PPS)

# stack() turns the matrix into one row per remaining cell: row label -> 'to',
# column label -> 'from', cell value -> 'weight'.
# NOTE(review): the rename relies on the default level names 'level_0' and
# 'level_1'; if the CSV index column has a name, 'to' will not be created — verify.
PPS = PPS.stack().reset_index().rename(columns={'level_0':'to', 'level_1':'from', 0:'weight'}) 
PPS

## DDS and PPS Normalization

In [None]:
def scale_data(df):
    """Min-max scale the ``weight`` column of ``df`` to the range [0, 1].

    The column is overwritten on the frame that was passed in, so the caller's
    object is modified; the same frame is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``weight`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``weight`` rescaled.
    """
    df['weight'] = MinMaxScaler(feature_range=(0, 1)).fit_transform(df[['weight']])
    return df

In [None]:
def plot_similarities (DDS, PPS):
    """Draw side-by-side box plots of the DDS and PPS similarity weights.

    Parameters
    ----------
    DDS, PPS : pandas.DataFrame
        Similarity tables; only their ``weight`` column is used.

    Returns
    -------
    None
        The figure is produced as a side effect. The input frames are not
        modified.
    """
    # .assign returns new frames, so no column is written into a slice of the
    # inputs (the previous item assignment raised SettingWithCopyWarning).
    DDS_boxplot = DDS[['weight']].assign(type='DDS')
    PPS_boxplot = PPS[['weight']].assign(type='PPS')

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_boxplot = pd.concat([DDS_boxplot, PPS_boxplot])

    df_boxplot.boxplot(by='type',fontsize=15, figsize=(6,7))

In [None]:
# Compare the raw weight distributions of the two similarity tables.
plot_similarities (DDS, PPS)
# Normalisation is currently disabled; note that scale_data would overwrite
# the 'weight' column of the frame passed to it.
#DDS_nor = scale_data(DDS)
#PPS_nor = scale_data(PPS)

In [None]:
##### TO DO #####
# unique id for DDS and PPS

In [None]:
# clusters PPS network
edgeList_pps = PPS.values.tolist()
G = networkx.Graph()
weights = []

for i in range(len(edgeList_pps)):
    G.add_edge(edgeList_pps[i][0], edgeList_pps[i][1], weight=edgeList_pps[i][2])
    weights.append(edgeList_pps[i][2])
    
A = networkx.adjacency_matrix(G).A
PPS_adj = A.copy()

louvain = Louvain()
labels = louvain.fit_transform(PPS_adj)

labels_unique, counts = np.unique(labels, return_counts=True)

PPS_cluster_label= pd.DataFrame({'target':list(G.nodes()), 'label':labels})
PPS_cluster_label['label']= PPS_cluster_label['label'].astype(str)
print(labels_unique, counts)

In [None]:
# node -> cluster label (string); insertion order follows PPS_cluster_label,
# which was built from list(G.nodes()).
target2cluster = dict(zip(PPS_cluster_label.target, PPS_cluster_label.label))

In [None]:
# Colour palette for the clusters, keyed by the cluster label as a string
# ('0', '1', ...). There are 35 entries, so labels '0'..'34' are covered.
cluster_palette = [
    '#58ACFA', '#FF1493', 'yellow', 'orange', '#00CED1', '#5F9EA0', '#006400',
    '#96bf65', '#fcc808', '#7b2b48', '#e96957', '#e06000', '#173679', '#d2dd49',
    '#684a6b', '#096eb2', '#ce482a', 'red', 'lime', 'lightslategray', 'olive',
    'rosybrown', 'sienna', 'darkmagenta', 'midnightblue', 'maroon',
    'lightcoral', 'gold', 'sandybrown', 'tomato', 'lawngreen', 'lightgreen',
    'darkorchid', 'lightskyblue', 'darkgreen',
]
color_dict = {str(position): colour for position, colour in enumerate(cluster_palette)}


In [None]:
# Edge values for plotting: each similarity is multiplied by 1000 and capped at 40.
weight = [
    40 if scaled >= 40 else scaled
    for scaled in (similarity * 1000 for similarity in weights)
]

In [None]:
#partition = community_louvain.best_partition(G)
# Spring layout of the PPS graph. No seed is passed, so the layout can differ
# from run to run.
pos = nx.spring_layout(G, scale=2)

plt.figure(figsize=(10,6))
# Node colours follow the cluster label; target2cluster was built in G.nodes()
# order, so its values line up with the nodes. A label without an entry in
# color_dict raises KeyError. Edge colours are the capped weights.
nx.draw(G, pos, node_color=[color_dict[v] for v in target2cluster.values()], edge_color=weight, node_size=[20]*len(G.nodes()))

## Create edgelist

In [None]:
# Tag the protein-protein edges and cast both endpoint columns to int.
# Note: this modifies PPS in place (adds a fourth column).
PPS['Type_Interaction']='PPS'
PPS['to']= PPS['to'].astype(int)
PPS['from']= PPS['from'].astype(int)
PPS

In [None]:
# Tag the drug-drug edges and cast both endpoint columns to int.
# Note: this modifies DDS in place.
DDS['Type_Interaction']='DDS'
DDS['to']= DDS['to'].astype(int)
DDS['from']= DDS['from'].astype(int)
DDS

In [None]:
# Disease-drug edges: drug (ChEMBL ID) -> 'from', disease -> 'to'. The 'CHEMBL'
# prefix is stripped so the drug ID becomes a plain integer.
DisD_final = DisD[['Cheml_D','DiseaseID']].rename(columns={'Cheml_D':'from','DiseaseID':'to'})
DisD_final['from'] = DisD_final['from'].str.replace('CHEMBL', '').astype(int)
DisD_final['Type_Interaction'] = 'DisD'
DisD_final

In [None]:
# Disease-target edges: disease -> 'from', gene -> 'to'.
DisT_final = DisT[['GeneID','DiseaseID']].rename(columns={'DiseaseID':'from','GeneID':'to'})
DisT_final['Type_Interaction']= 'DisT'
DisT_final

In [None]:
# ChEMBL ID -> CTD chemical ID for the positive interactions.
# NOTE(review): Chembl2CTD is not used in the visible cells.
Chembl2CTD = pd.Series(selected_DTI_final['from'].values, index=selected_DTI_final['Chembl']).to_dict()
# DTI edge list: the drug ID in 'from' is replaced by the numeric part of the
# ChEMBL ID, matching the drug IDs of DisD_final; 'to' is cast to int.
selected_DTI_1 = final_P_N_DTI.copy()
selected_DTI_1['from'] = selected_DTI_1['Chembl'].str.replace('CHEMBL', '').astype(int)
selected_DTI_1['to'] = selected_DTI_1['to'].astype(int)
selected_DTI_1 = selected_DTI_1.drop(columns= ['Chembl','smiles'])

In [None]:
# Display the DTI edge list.
selected_DTI_1

In [None]:
# unique id for drug/target/disease
# Targets keep their own IDs. Drugs are renumbered consecutively starting just
# above the largest target ID, so drug and target IDs cannot collide.
target_list = list(set(list(PPS['to'])+ list(PPS['from'])+list(DisT_final['to'])+ list(selected_DTI_1['to'])))
max_target = max(target_list)

drug_list = list(set(list(DDS['to'])+ list(DDS['from'])+list(DisD_final['from'])+ list(selected_DTI_1['from'])))
drugId2numId_nod2vec = {k: v+max_target+1 for v, k in enumerate(sorted(drug_list))}

# Apply the new drug IDs everywhere a drug appears. This overwrites the columns
# in place, so re-running the cell maps already-mapped IDs again.
DDS['to']= DDS['to'].map(drugId2numId_nod2vec)
DDS['from']= DDS['from'].map(drugId2numId_nod2vec)
DisD_final['from'] = DisD_final['from'].map(drugId2numId_nod2vec)
selected_DTI_1['from'] = selected_DTI_1['from'].map(drugId2numId_nod2vec)

# Diseases are numbered after the largest drug ID (see the next cell).
max_drug = max(drugId2numId_nod2vec.values())
disease_list = list(set (list(DisT_final['from']) + list(DisD_final['to'])))


In [None]:
# Diseases are renumbered consecutively starting just above the largest drug
# ID; disease edges get a constant weight of 1.
diseaseId2numId_nod2vec = {k: v+max_drug+1 for v, k in enumerate(sorted(disease_list))}
DisT_final['from'] = DisT_final['from'].map(diseaseId2numId_nod2vec)
DisT_final['weight'] = 1
DisD_final['to'] = DisD_final['to'].map(diseaseId2numId_nod2vec)
DisD_final['weight'] = 1

In [None]:
# Pickle the five edge tables into the working directory.
# NOTE(review): 'DDS.pkl' written here holds the renumbered DDS table, whereas
# the DDS input was loaded from f'{MAINDIR}/DDS.pkl' — if MAINDIR is the
# working directory this overwrites the input file.
for pickle_name, edge_table in [
    ('DisD.pkl', DisD_final),
    ('DisT.pkl', DisT_final),
    ('DTI.pkl', selected_DTI_1),
    ('PPS.pkl', PPS),
    ('DDS.pkl', DDS),
]:
    with open(pickle_name, 'wb') as handle:
        pickle.dump(edge_table, handle)

In [None]:
# Stack the disease-drug, disease-target, protein-protein and drug-drug edges
# into one edge list (the DTI edges are not included here) and drop
# zero-weight edges. pd.concat replaces the chained DataFrame.append calls;
# append was removed in pandas 2.0.
all_nod2vec = pd.concat([DisD_final, DisT_final, PPS, DDS])
all_nod2vec= all_nod2vec[all_nod2vec['weight']!=0]
all_nod2vec['Type_Interaction'].value_counts()

In [None]:
# Keep only protein-protein edges whose weight is above the mean PPS weight
# (mean taken over the non-zero PPS edges in all_nod2vec).
avrg_pps = all_nod2vec[all_nod2vec['Type_Interaction']=='PPS']['weight'].mean()
PPS_new = PPS[PPS['weight']>avrg_pps]

# Rebuild the combined edge list with the thinned PPS edges. pd.concat replaces
# the chained DataFrame.append calls; append was removed in pandas 2.0.
all_nod2vec_new = pd.concat([DisD_final, DisT_final, PPS_new, DDS])
all_nod2vec_new = all_nod2vec_new[all_nod2vec_new['weight']!=0]
all_nod2vec_new['Type_Interaction'].value_counts()

In [None]:
# Save the combined edge list (with thinned PPS edges) to the working directory.
with open('all_nod2vec_new.pkl', 'wb') as handle:
    pickle.dump(all_nod2vec_new, handle) 

In [None]:
# Save the ID dictionaries for diseases, drugs and proteins.
with open('drugId2numId_nod2vec.pkl', 'wb') as handle:
    pickle.dump(drugId2numId_nod2vec, handle) 
    
with open('diseaseId2numId_nod2vec.pkl', 'wb') as handle:
    pickle.dump(diseaseId2numId_nod2vec, handle) 
    
# Every target ID maps to the constant string 'target' (targets keep their
# original IDs, so only the node type is recorded).
target2name= pd.Series('target', index=target_list).to_dict()

with open('target2name.pkl', 'wb') as handle:
    pickle.dump(target2name, handle) 
    