In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib as plt
from itertools import combinations
import re
from tqdm import tqdm

In [2]:

exp_id='exp1'

In [3]:
def readpkl (datapath):
    fp = gzip.open(datapath) # or fp = open(pkl_filename) for .pkl files
    df=pickle.load(fp)
    return df

### Gene mapping files

In [4]:
humangene=pd.read_csv('./AD_progect/data/NIH-gene-inf/Homo_sapiens.gene_info',sep='\t')
humangene=humangene[humangene['#tax_id']==9606]
humangene=humangene[humangene['type_of_gene']=='protein-coding']
humangene=humangene[['GeneID','Symbol','Synonyms']]
humangene['GeneID']=humangene['GeneID'].astype(str)
humangene1=humangene[['GeneID','Symbol']].groupby('Symbol').count().reset_index()
humangene1=humangene1[humangene1.GeneID==1]# remove genesym has more than one ID
humangene=humangene[humangene.Symbol.isin(humangene1.Symbol.values)]
allsymbol=[]
allsynom=[]
for i in range(len(humangene)):
    symbol=humangene.Symbol.values[i]
    synom=str(humangene.Synonyms.values[i]).split('|')
    synom.append(symbol)
    for j in synom:
        allsymbol.append(symbol)
        allsynom.append(j)
syntosym=pd.DataFrame(allsymbol,columns=['symbol'])
syntosym['synom']=allsynom
syntosym1=syntosym.groupby('synom').count().reset_index()
syntosym1=syntosym1[syntosym1.symbol==1]
syntosym=syntosym[syntosym.synom.isin(syntosym1.synom.values)]

#gene symbol to Entrez ID
codes={}
codes['gene_symbol2id']={row[1]:row[0] for idx, row in humangene[['GeneID','Symbol']].iterrows()} #gene symbol to Entrez ID
codes['gene_id2symbol']={row[0]:row[1] for idx, row in humangene[['GeneID','Symbol']].iterrows()} #Entrez ID to gene symbol
codes['gene_syno2symbol']={row[0]:row[1] for idx, row in syntosym[['synom','symbol']].iterrows()} #Synonym to gene symbol
#codes #gene_symbol2id, gene_id2symbol, gene_syno2symbol 

## ADgene

In [5]:
adgenelist=pd.read_csv('./AD_progect/data/CTD_subset/merge-genes-list.csv', index_col=0)
adgenelist[adgenelist.notnull()]

Unnamed: 0,hgnc_symbol
0,AKT1
1,APP
2,IL1B
3,IL6
4,IRS1
...,...
738,YARS
739,YIF1B
740,ZBTB16
741,ZNF536


## CTD drug-gene (evidence not specific to AD, not human-specific)

In [6]:
gene_drug=pd.read_csv('./AD_progect/data/CTD_subset/CTD/chemical-gene interactions.csv', sep=',')
gene_drug=gene_drug[gene_drug['Gene Symbol'].isin(list(codes['gene_symbol2id'].keys()))]

In [7]:
gene_drug[gene_drug['Chemical Name']=='Cadmium']['Gene Symbol'].values

array(['A1BG', 'A2M', 'AAMP', ..., 'ZSCAN26', 'ZSWIM3', 'ZXDA'],
      dtype=object)

In [8]:
codes['drugname2mesh']={row[0].upper():row[1] for idx, row in gene_drug[['Chemical Name','Chemical ID']].drop_duplicates().iterrows()}
codes['mesh2drugname']={row[0].upper():row[1] for idx, row in gene_drug[['Chemical ID','Chemical Name']].drop_duplicates().iterrows()}

drug_name_AD_ctdid = pd.DataFrame(zip(codes['drugname2mesh'].keys(), codes['drugname2mesh'].values()),columns=['name','CTD_id'])
drug_name_AD_ctdid.to_csv('drug_name_AD_ctdid.csv')

In [9]:
gene_drug=gene_drug[['Gene ID', 'Chemical ID', 'Chemical Name']].drop_duplicates()
gene_drug['Gene ID']=gene_drug['Gene ID'].apply(lambda x: 'gene_'+str(x))
gene_drug['Chemical ID']=gene_drug['Chemical ID'].apply(lambda x: 'drug_'+x)
gene_drug['Chemical Name'] = ['drug_'+i.upper() for i in gene_drug['Chemical Name']]

In [11]:
# Change column name
#gene_drug = gene_drug.rename(columns={'Gene ID': 'node1', 'Chemical ID': 'node2'})
[len(gene_drug['Chemical ID'].unique()), len(gene_drug['Gene ID'].unique())]

[6543, 16997]

## CTD drug-pathway (pathways were inferred by genes, then drug connected based on drug-gene interactions)

In [12]:
drug_tf_idf_1=pd.read_csv('./AD_progect/data/Drug-pathway-GSEA/enrich_kegg_drug_path.csv')
drug_tf_idf_2=pd.read_csv('./AD_progect/data/Drug-pathway-GSEA/enrich_ractome_drug_path.csv')
drug_tf_idf=pd.concat([drug_tf_idf_1,drug_tf_idf_2])
drug_tf_idf=drug_tf_idf.rename(columns={'Chemical Name': 'drugname','map_id':'pathway'})
drug_tf_idf=drug_tf_idf[['drugname','pathway']]
drug_tf_idf['pathway']=['pathway_'+i for i in drug_tf_idf['pathway']]
drug_tf_idf['drugname']=['drug_'+i.upper() for i in drug_tf_idf['drugname']]
drug_tf_idf=drug_tf_idf[drug_tf_idf.drugname.isin(gene_drug['Chemical Name'].values)]
drug_tf_idf=drug_tf_idf[['drugname','pathway']]
[len(drug_tf_idf['drugname'].unique()), len(drug_tf_idf['pathway'].unique())]

[4748, 1622]

## CTD AD-gene-pathway (all CTD pathways related to ADgene)

In [13]:
ADgene_pathways=pd.read_csv('/AD_progect/data/Drug-pathway-GSEA/ad_related_path.csv')
ADgene_pathways.columns=['pathway','AD_gene']
ADgene_pathways['pathway']=['pathway_'+i for i in ADgene_pathways['pathway']]
ADgene_pathways['AD_gene']=ADgene_pathways['AD_gene'].map(codes['gene_symbol2id'])
ADgene_pathways['AD_gene']=['gene_'+str(i) for i in ADgene_pathways['AD_gene']]
ADgene_pathways = ADgene_pathways.rename(columns={'pathway': 'node1', 'AD_gene': 'node2'})

[len(ADgene_pathways['pathway'].unique()), len(ADgene_pathways['AD_gene'].unique())]

[678, 82]

In [15]:
ADgene_pathways

Unnamed: 0,node1,node2
0,pathway_PC7_4688,gene_102
1,pathway_PC7_4688,gene_351
2,pathway_PC7_4688,gene_596
3,pathway_PC7_4688,gene_801
4,pathway_PC7_4688,gene_836
...,...,...
1773,pathway_PC7_8786,gene_1200
1774,pathway_PC7_8790,gene_1565
1775,pathway_PC7_8803,gene_351
1776,pathway_PC7_8833,gene_55676


## drug-drug-simlarity

In [17]:
data_path = './AD_progect/data/similarity_tables/'

drugsimpath = ['Atom-pair_similarity.csv', 'MACCS_key_similarity.csv', 'Morgan_2_similarity.csv', 'Topology_similarity.csv']


In [18]:
alldrugsim=pd.DataFrame()
for i in drugsimpath:
    temp=pd.read_csv(data_path+i)
    alldrugsim=pd.concat([alldrugsim,temp])
alldrugsim['drug_a']=[i.upper() for i in alldrugsim['drug_a']]
alldrugsim['drug_b']=[i.upper() for i in alldrugsim['drug_b']]

alldrugsim=alldrugsim[['drug_a','drug_b']].drop_duplicates()
alldrugsim=alldrugsim[alldrugsim.drug_a!=alldrugsim.drug_b]
alldrugsim['drug_a']=['drug_'+i.upper() for i in alldrugsim['drug_a']]
alldrugsim['drug_b']=['drug_'+i.upper() for i in alldrugsim['drug_b']]

alldrugsim=alldrugsim[alldrugsim.drug_a.isin(gene_drug['Chemical Name'])]
alldrugsim=alldrugsim[alldrugsim.drug_b.isin(gene_drug['Chemical Name'])]
len(gene_drug['Chemical Name'].unique()), alldrugsim.shape[0]

Unnamed: 0,drug_a,drug_b
182,drug_10-HYDROXYCAMPTOTHECIN,drug_CAMPTOTHECIN
184,drug_10-HYDROXYCAMPTOTHECIN,drug_INDOMETHACIN
186,drug_10-HYDROXYCAMPTOTHECIN,drug_TOPOTECAN
1379,"drug_3,3'-DIINDOLYLMETHANE",drug_AXITINIB
1383,"drug_3,3'-DIINDOLYLMETHANE",drug_CHRYSIN
...,...,...
55384,drug_STAUROSPORINE,drug_VINPOCETINE
55392,drug_STAUROSPORINE,drug_WORTMANNIN
55395,drug_STAUROSPORINE,drug_DOXYCYCLINE
55396,drug_STAUROSPORINE,drug_MINOCYCLINE


## protein - protein interaction

In [20]:

PP_1=pd.read_csv('./AD_progect/data/CTD_subset/biology-database/KegglinkevaluationPPPN_1', header=None,sep='\t')
PP_1.columns=['protein1','protein2', 'positive']
PP_1.replace('PP', 1, inplace=True)
PP_1.replace('PN', 0, inplace=True)
PP_1=path_sim_kegg.loc[PP_1['positive']==1, ['protein1','protein2']]
PP_1['protein1']=PP_1['protein1'].astype(str)
PP_1['protein2']=PP_1['protein2'].astype(str)
PP_1['protein1']=PP_1['protein1'].apply(lambda x: codes['gene_id2symbol'].get(x))
PP_1['protein2']=PP_1['protein2'].apply(lambda x: codes['gene_id2symbol'].get(x))
PP_1=PP_1[PP_1.gene1.isin(list(codes['gene_id2symbol'].values()))]
PP_1=PP_1[PP_1.gene2.isin(list(codes['gene_id2symbol'].values()))]

In [23]:
PP_1=PP_1[PP_1.gene1.isin(list(codes['gene_id2symbol'].values()))]
PP_1=PP_1[PP_1.gene2.isin(list(codes['gene_id2symbol'].values()))]
PP_1['protein1']=PP_1['gene1'].map(codes['gene_symbol2id'])
PP_1['protein1']=['gene_'+str(i) for i in PP_1['gene1']]
PP_1['protein2']=PP_1['gene2'].map(codes['gene_symbol2id'])
PP_1['protein2']=['gene_'+str(i) for i in PP_1['gene2']]


In [30]:
# Downloaded file was homo sapiens interactome
# please download data from https://string-db.org/cgi/download?sessionId=%24input-%3E%7BsessionId%7D&species_text=Homo+sapiens
# into ./AD_progect/data/STRING
string=readpkl('./AD_progect/data/STRING/INTERACTIONDATA/9606.protein.links.full.v11.0.pkl.gz')
# filtered by score >= 950
string=string[string.combined_score>=950]
string=string[['protein1','protein2']]

In [35]:

stringinf=readpkl('./AD_progect/data/STRING/ACCESSORYDATA/9606.protein.info.v11.0.pkl.gz')
stringinf=stringinf[['protein_external_id','preferred_name']]
stringinf=stringinf[stringinf.preferred_name.isin(list(codes['gene_symbol2id'].keys()))].drop_duplicates()
stringinf={stringinf.protein_external_id.values[i]:stringinf.preferred_name.values[i] for i in range(len(stringinf))}
string['protein1']=string['protein1'].map(stringinf)
string['protein2']=string['protein2'].map(stringinf)
string=string.drop_duplicates()
string=string[string['protein1'].isin(list(stringinf.values()))]
string=string[string['protein2'].isin(list(stringinf.values()))]  
string=string[string['protein1'].isin(list(codes['gene_symbol2id'].keys()))]
string=string[string['protein2'].isin(list(codes['gene_symbol2id'].keys()))]

In [36]:
string['protein1']=string['protein1'].map(codes['gene_symbol2id'])
string['protein1']=string['protein1'].apply(lambda x: 'gene_'+str(x))
string['protein2']=string['protein2'].map(codes['gene_symbol2id'])
string['protein2']=string['protein2'].apply(lambda x: 'gene_'+str(x))

In [None]:
PP_2=pd.concat([PP_1, string])

## Drug-Phenotypes (inferred by drugs, not AD-specific)

In [45]:
phenotypes=pd.read_csv('./AD_progect/data/CTD_subset/CTD/phenotypes.csv',sep=',')
codes['phenotype_id_to_name']={row[0]:row[1] for idx, row in phenotypes[['Phenotype Term ID','Phenotype Term Name']].drop_duplicates().iterrows()}
drug_phenotype=phenotypes['Chemical Inference Network'].dropna().apply(lambda x: x.split('|')).apply(pd.Series).merge(phenotypes['Phenotype Term ID'],left_index=True, right_index=True).melt(id_vars=['Phenotype Term ID'],value_name='drug').drop('variable', axis=1).dropna()
drug_phenotype['drug']=drug_phenotype['drug'].apply(lambda x: x.upper())
drug_phenotype.dropna(inplace=True)
drug_phenotype['Phenotype Term ID']=drug_phenotype['Phenotype Term ID'].apply(lambda x: 'phenotype_'+x)
drug_phenotype['drug']=drug_phenotype['drug'].apply(lambda x: 'drug_'+x)
drug_phenotype=drug_phenotype[['drug', 'Phenotype Term ID']]
drug_phenotype = drug_phenotype.rename(columns={'drug': 'node1', 'Phenotype Term ID': 'node2'})
[len(drug_phenotype['node1'].unique()), len(drug_phenotype['node2'].unique())]


Unnamed: 0,Phenotype Term ID,drug
0,GO:0006915,27-HYDROXYCHOLESTEROL
1,GO:0008283,27-HYDROXYCHOLESTEROL
2,GO:0008219,4-HYDROXY-2-NONENAL
3,GO:0016042,4-HYDROXY-2-NONENAL
4,GO:0006979,4-HYDROXY-2-NONENAL
...,...,...
152706,GO:0006915,TACRINE
152707,GO:0008283,ZINC
155169,GO:0006915,THIOUREA
157632,GO:0006915,VITAMIN D


## Gene-Phenotypes (inferred by genes, AD-specific)

In [50]:
gene_phenotype=phenotypes['Gene Inference Network'].dropna().apply(lambda x: x.split('|')).apply(pd.Series).merge(phenotypes['Phenotype Term ID'],left_index=True, right_index=True).melt(id_vars=['Phenotype Term ID'],value_name='gene').drop('variable', axis=1).dropna()
gene_phenotype['Phenotype Term ID']=gene_phenotype['Phenotype Term ID'].apply(lambda x: 'phenotype_'+x)

# create a drugname to mesh table
geneid_table= pd.DataFrame(codes['gene_symbol2id'].items(), columns=['gene', 'id'])
# change drugname to id 
gene_phenotype = gene_phenotype.merge(geneid_table, left_on='gene', right_on='gene', how = 'left')
gene_phenotype = gene_phenotype[gene_phenotype['id'].notnull()]
gene_phenotype = gene_phenotype.drop(columns=['gene'])
gene_phenotype = gene_phenotype.rename(columns={'id':'AD_gene'})
gene_phenotype['AD_gene']=gene_phenotype['AD_gene'].apply(lambda x: 'gene_'+str(x))
gene_phenotype = gene_phenotype.rename(columns={'AD_gene': 'node1', 'Phenotype Term ID': 'node2'})
[len(gene_phenotype['node1'].unique()), len(gene_phenotype['node2'].unique())]

Unnamed: 0,Phenotype Term ID,AD_gene
0,phenotype_GO:0006915,gene_351
1,phenotype_GO:0008283,gene_1636
2,phenotype_GO:0008219,gene_3162
3,phenotype_GO:0016042,gene_5336
4,phenotype_GO:0006979,gene_348
...,...,...
7517,phenotype_GO:0005515,gene_1200
7518,phenotype_GO:0005515,gene_54209
7519,phenotype_GO:0005515,gene_7422
7520,phenotype_GO:0005515,gene_7447


## Item2idx mapping

In [52]:
from sklearn.preprocessing import LabelEncoder

In [55]:
def bidirection(dataframe):
    df=dataframe
    newdf=dataframe.copy()
    newdf.columns=['node2','node1','type']
    df=pd.concat([df,newdf])
    return df

In [58]:
# Remove chemical ID from gene_drug
gene_drug = gene_drug[['Gene ID', 'Chemical Name']]
gene_drug.columns=['node1', 'node2']# gene, drug
# Change column names
drug_tf_idf.columns=['node1', 'node2'] #  drug, pathway
ADgene_pathways.columns=['node1', 'node2'] # pathway, Adgene

alldrugsim.columns=['node1', 'node2'] # drug_a, drug_b
ppi= PP_2
gene_phenotype.columns=['node1','node2']#gene, phenotype
drug_phenotype.columns=['node1','node2']#drug, phenotype

path_sim.columns=['node1', 'node2']# gene1, gene2

In [59]:
gene_drug['type']='gene-drug'
drug_tf_idf['type']='drug_pathway'
alldrugsim['type']='drug_sim'
ppi['type']='protein-protein'
gene_phenotype['type']='gene-phenotype'
drug_phenotype['type']='drug-phenotype'
ADgene_pathways['type']='AD_gene_pathway'


In [60]:
drug_tf_idf=bidirection(drug_tf_idf)
alldrugsim=bidirection(alldrugsim)
ppi=bidirection(ppi)
drug_phenotype=bidirection(drug_phenotype)
gene_phenotype=bidirection(gene_phenotype)
path_sim=bidirection(path_sim)

In [61]:
edge_index=pd.concat([gene_drug, ppi, drug_phenotype,gene_phenotype,drug_tf_idf, alldrugsim, ADgene_pathways,path_sim])

In [71]:
edge_index=pd.concat([gene_drug, ppi, drug_phenotype,gene_phenotype,drug_tf_idf, alldrugsim, ADgene_pathways,path_sim])
edge_index['node1']=edge_index['node1'].astype(str)
edge_index['node2']=edge_index['node2'].astype(str)
edge_index.to_csv('edge_index_kang_old.csv')

In [72]:
node_1 = edge_index['node1'].tolist()
node_2 = edge_index['node2'].tolist()
len(node_1), len(node_2)

(1149715, 1149715)

Label Encoders

In [73]:
node_name_list=list(set(list(edge_index['node1'].values)+list(edge_index['node2'].values)))
node_name_list.sort()

In [74]:
le=LabelEncoder()
le.fit(node_name_list)

LabelEncoder()

In [75]:
edge_index['node1']=le.transform(edge_index['node1'])
edge_index['node2']=le.transform(edge_index['node2'])
edge_index

Unnamed: 0,node1,node2,type
6,6544,1939,gene-drug
7,6544,2345,gene-drug
8,6544,2350,gene-drug
9,6544,3168,gene-drug
10,6544,4560,gene-drug
...,...,...,...
29503,21940,22378,protein-protein
29504,20896,22378,protein-protein
29505,17123,11204,protein-protein
29506,12484,12375,protein-protein


In [78]:
len(le.classes_)

30279

## Import pre-trained embedding

In [None]:
#!wget https://dgl-data.s3-us-west-2.amazonaws.com/dataset/DRKG/drkg.tar.gz

In [79]:
import csv

In [80]:
#Get pretrained embedding

entity_emb=np.load('/home/all_dataset/DRKG/embed/DRKG_TransE_l2_entity.npy')
#rel_emb=np.load(data_path+'DRKG/embed/DRKG_TransE_l2_relation.npy')
emb_size=entity_emb.shape[1]

entity_idmap_file = '/home/all_dataset/DRKG/embed/entities.tsv'
relation_idmap_file = '/home/all_dataset/DRKG/embed/relations.tsv'

# Get all embedding ids
entity_map = {}
entity_id_map = {}
with open(entity_idmap_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['name','id'])
    for row_val in reader:
        entity_map[row_val['name']] = int(row_val['id'])
        entity_id_map[int(row_val['id'])] = row_val['name']

# get drug, pathway, disease embedding ids
entity_drug=list(entity_map.keys())
entity_drug=[i for i in entity_drug if i.startswith('Compound::')]
entity_pathway=list(entity_map.keys())
entity_pathway=[i for i in entity_pathway if i.startswith('Pathway::')]
entity_disease=list(entity_map.keys())
entity_disease=[i for i in entity_disease if i.startswith('Disease::')]


In [81]:
# Extract DRKG drug ID-number pair
entity_drug_pair = {x: entity_map[x] for x in entity_drug}
entity_drug_pair
# Select drug entity-ID pairs in DRKG, then clean their names
entity_drug_table = pd.DataFrame(list(entity_drug_pair.items()), columns=['DRKG_index','ID'])
entity_drug_table['DRKG_index']=[i.replace('Compound::','') for i in entity_drug_table['DRKG_index']]
entity_drug_table['DRKG_index']=[i.replace('MESH:','') for i in entity_drug_table['DRKG_index']]
entity_drug_table['DRKG_index']=[i.replace('molport:','') for i in entity_drug_table['DRKG_index']]
# Remove consecutive 0 right after 'ZINC'
entity_drug_table['DRKG_index']=['ZINC'+str(int(i.replace('zinc:ZINC',''))) if i[0:5]=='zinc:' else i for i in entity_drug_table['DRKG_index']]
entity_drug_table['DRKG_index']=[i.replace('chebi:','CHEBI:') for i in entity_drug_table['DRKG_index']]
entity_drug_table['DRKG_index']=[i.replace('pubchem:','PubChem') for i in entity_drug_table['DRKG_index']]
entity_drug_table

Unnamed: 0,DRKG_index,ID
0,DB02573,2397
1,DB05105,2398
2,DB00244,2401
3,DB00684,2402
4,DB03118,2404
...,...,...
24308,CHEMBL446325,96903
24309,CHEMBL44752,96904
24310,CHEMBL482477,96905
24311,CHEMBL576739,96910


In [82]:
# Use the same way to clean DRKG drug entities
entity_drug1=[i.replace('Compound::','') for i in entity_drug]
entity_drug2=[i.replace('MESH:','') for i in entity_drug1]
entity_drug3=[i.replace('molport:','') for i in entity_drug2]
# Remove consecutive 0 right after 'ZINC'
entity_drug4=['ZINC'+str(int(i.replace('zinc:ZINC',''))) if i[0:5]=='zinc:' else i for i in entity_drug3]
entity_drug5=[i.replace('chebi:','CHEBI:') for i in entity_drug4]
entity_drug6=[i.replace('pubchem:','PubChem') for i in entity_drug5]

In [83]:
# Pubchempy drug synonyms
pubchempy = pd.read_csv('/AD_project/sym/Pubchem_query/pubchem_drug_sym_map.csv')
allsym = pubchempy
#allsym1=allsym[allsym.sym.isin(entity_drug6)]
#allsym1=allsym1.groupby('drugname').first().reset_index()
allsym1=allsym[allsym.sym.isin(entity_drug6)]
# Add 'Compound::' to those with DB, CHEMBL, CHEBI IDs 
allsym2=allsym1[(allsym1.sym.str.startswith('DB'))|(allsym1.sym.str.startswith('CHEMBL'))|(allsym1.sym.str.startswith('CHEBI'))]
allsym2['sym']=allsym1['sym'].apply(lambda x: 'Compound::'+str(x))
# Add 'Compound::MESH:' to those with MESH IDs 
allsym3=allsym1[~allsym1.drugname.isin(allsym2.drugname.values)]
allsym3['sym']=allsym3['sym'].apply(lambda x: 'Compound::MESH:'+str(x))
allsym4=pd.concat([allsym2,allsym3])
#allsym4=allsym4[['drugname','DRKGindex']]
allsym4['drugname']=[i.upper() for i in allsym4['drugname']]
allsym4['drugname']=allsym4['drugname'].apply(lambda x: 'drug_'+str(x))
allsym4 = allsym4.rename(columns={'sym':'DRKG_index'})
drugname2external1 = allsym4
drugname2external1 = drugname2external1[['drugname','DRKG_index']]

# CTD drug list
drugname2external2=pd.read_csv('/AD_project/sym/drug_name_AD_ctdid.csv')[['name','CTD_id']]
drugname2external2.columns=['drugname','DRKG_index']
drugname2external2 = drugname2external2[drugname2external2.DRKG_index.isin(entity_drug6)]
drugname2external2['DRKG_index']='Compound::MESH:'+drugname2external2['DRKG_index']
drugname2external2['drugname']=[i.upper() for i in drugname2external2['drugname']]
drugname2external2['drugname']=drugname2external2['drugname'].apply(lambda x: 'drug_'+str(x))
drugname2external2 = drugname2external2[['drugname','DRKG_index']]

# Although there are some overlaps between two dataframe, create dict for Pubchempy and CTD drugname2id
drugname2external = pd.concat([drugname2external1, drugname2external2])
drugname2id = dict(drugname2external.values)
len(drugname2id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  allsym2['sym']=allsym1['sym'].apply(lambda x: 'Compound::'+str(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  allsym3['sym']=allsym3['sym'].apply(lambda x: 'Compound::MESH:'+str(x))


3729

In [84]:
# Get drug's DRKG IDs
drug_drkg=[]

for entity in le.classes_ :
    if entity.split('_')[0]=='drug':
        drkgid=drugname2id.get(entity)
        drug_drkg.append(drkgid)
len([x for x in drug_drkg if x is not None])

3729

In [91]:
gene_drkg = ['Gene::'+entity.split('_')[1] for entity in le.classes_ if entity.split('_')[0]=='gene']
pathway_drkg=[entity.replace('pathway_','Pathway::') for entity in le.classes_ if entity.split('_')[0]=='pathway']
disease_drkg=['Disease::'+entity.split('_')[1] for entity in le.classes_ if entity.split('_')[0]=='disease']
phenotype_drkg=['Biological Process::'+entity.split('_')[1] for entity in le.classes_ if entity.split('_')[0]=='phenotype']


In [93]:
len(le.classes_)

30279

In [None]:
#Get embedding
# Get drugname/disease name to entity ID mappings

relation_map = {}

with open(relation_idmap_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['name','id'])
    for row_val in reader:
        relation_map[row_val['name']] = int(row_val['id'])
        
# handle the ID mapping
#bait_ids = []
gene_ids = []
drug_ids = []
phenotype_ids = []
pathway_ids = []
disease_ids=[]
    
#for bait in baits_drkg:
    #bait_ids.append(entity_map.get(bait))

for gene in gene_drkg:
    gene_ids.append(entity_map.get(gene))
    
for drug in drug_drkg:
    drug_ids.append(entity_map.get(drug))
    
for phenotype in phenotype_drkg:
    phenotype_ids.append(entity_map.get(phenotype))
    
for pathway in pathway_drkg:
    pathway_ids.append(entity_map.get(pathway))
    
for disease in disease_drkg:
    disease_ids.append(entity_map.get(disease)) 

In [None]:
drug_emb=np.array([entity_emb[drug_id] if drug_id is not None else np.zeros(emb_size) for drug_id in drug_ids ])
gene_emb=np.array([entity_emb[gene_id] if gene_id is not None else np.zeros(emb_size) for gene_id in gene_ids ])
disease_emb=np.array([entity_emb[disease_id] if disease_id is not None else np.zeros(emb_size) for disease_id in disease_ids ])
pathway_emb=np.array([entity_emb[pathway_id] if pathway_id is not None else np.zeros(emb_size) for pathway_id in pathway_ids ])
phenotype_emb=np.array([entity_emb[phenotype_id] if phenotype_id is not None else np.zeros(emb_size) for phenotype_id in phenotype_ids ])

In [None]:
#How many missing in drugs?
print(len(drug_ids),len([gene_id for gene_id in drug_ids if gene_id is not None]))
#How many missing in genes?
print(len(gene_ids),len([gene_id for gene_id in gene_ids if gene_id is not None]))
#How many missing in phenotypes?
print(len(phenotype_ids),len([gene_id for gene_id in phenotype_ids if gene_id is not None]))
#How many missing in pathways?
print(len(pathway_ids),len([gene_id for gene_id in pathway_ids if gene_id is not None]))

In [None]:
node_features=np.concatenate(( drug_emb, gene_emb,pathway_emb, phenotype_emb))

In [None]:
node_features.shape

# get AD_gene_indicator

In [95]:
gene_AD_gene=list(adgenelist['mesh'].values)
gene_AD_gene.sort()
gene_AD_gene_ind=[]
gene_AD_index_list=[]
for i in le.classes_:
    if i in gene_AD_gene:
        gene_AD_gene_ind.append(1)
        gene_AD_index_list.append(list(le.classes_).index(i))
    else:
        gene_AD_gene_ind.append(0)

In [96]:
print(len(gene_AD_gene_ind))

30279


# get AD_gene_mask

In [98]:
gene_index_in=[ i for i in list(le.classes_) if 'gene' in i]
gene_mask=[]
gene_index_list=[]
for i in tqdm(le.classes_):
    if i in gene_index_in:
        gene_mask.append(1)
        gene_index_list.append(list(le.classes_).index(i))
    else:
        gene_mask.append(0)

100%|██████████| 30279/30279 [08:01<00:00, 62.82it/s]  


In [101]:
non_ad_gene_list=list(set(gene_index_list).difference(set(gene_AD_index_list)))

In [102]:
print(len(non_ad_gene_list))

16575


## Save as pickle

In [115]:
edge_index=edge_index.dropna()
edge_index.to_pickle('./AD_project/pre_process/AD_edge_index_'+exp_id+'_kh.pkl')
pickle.dump(node_features, open('./AD_project/pre_process/AD_node_feature_'+exp_id+'_kh.pkl', 'wb'))
pickle.dump(codes, open('./AD_project/pre_process/AD_codes_'+exp_id+'_kh.pkl','wb'))
pickle.dump(le, open('./AD_project/pre_process/AD_LabelEncoder_'+exp_id+'_kh.pkl','wb'))
pickle.dump(gene_AD_gene_ind, open('./AD_project/pre_process/AD_gene_node_class_'+exp_id+'_kh.pkl','wb'))
pickle.dump(gene_AD_index_list, open('./AD_project/pre_process/ad_gene_index_'+exp_id+'_kh.pkl','wb'))
pickle.dump(gene_mask, open('./AD_project/pre_process/gene_mask_'+exp_id+'_kh.pkl','wb'))
pickle.dump(non_ad_gene_list, open('./AD_project/pre_process/non_ad_gene_index_'+exp_id+'_kh.pkl','wb'))