# Notebook: Create OPA2VEC associations file
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Get genes associated with diseases (disgenet) and chemicals (ctdbase), take their go functions and assign to the associated chem/dis in an associations file. Also output finalclasses.lst, a file that tells opa2vec which entities you would like the vectors for

In [33]:
import pandas as pd
import numpy as np
import subprocess

### 1. Get Uniprot ID set for each Gene 

In [34]:
# CHEMICALS
# Import CTD file to get the geneIDs we want 
df_cg = pd.read_csv('../ctd-to-nt/csvs/CTD_chem_gene_ixns.csv', skiprows=27)
df_cg = df_cg.drop(0)

# Cut down to just humans if you want to
df_cg = df_cg.loc[df_cg['OrganismID'] == 9606.0]

# Need to change float to int for the url to work
df_cg['GeneID'] = df_cg.GeneID.astype(int)

In [35]:
# DISEASES
# Import DisGeNet with disease IDs and Gene IDs
df_cgd = pd.read_csv('../disgenet-nt/input_tsvs/curated_gene_disease_associations.tsv', sep='\t')

# Limit by evidence Score 
df_cgd = df_cgd[df_cgd.score >= 0.42]

In [36]:
# # Create list of all associated GeneIDs for each of Chemicals and Diseases
cgene_ids = df_cg.GeneID.unique()
dgene_ids = df_cgd.geneId.unique()

# Combine them
all_genes = set(cgene_ids).union(set(dgene_ids))

In [37]:
df_cgd.head()

Unnamed: 0,geneId,geneSymbol,diseaseId,diseaseName,score,NofPmids,NofSnps,source
158,10002,NR2E3,C1849394,Enhanced S-Cone Syndrome,0.486319,7,7,CTD_human;UNIPROT
235,1001,CDH3,C1832162,"HYPOTRICHOSIS, CONGENITAL, WITH JUVENILE MACUL...",0.6,1,3,CTD_human;ORPHANET;UNIPROT
241,1001,CDH3,C1857041,"Ectodermal dysplasia, ectrodactyly, and macula...",0.600824,1,2,CTD_human;ORPHANET;UNIPROT
586,10020,GNE,C1853926,NONAKA MYOPATHY,0.617857,12,23,CTD_human;ORPHANET;UNIPROT
837,100506658,OCLN,C3489725,Pseudo-TORCH syndrome,0.600824,1,4,CTD_human;ORPHANET;UNIPROT


In [38]:
# Write combined gene ids to file
with open('geneIDs.txt', 'w') as f:
    for item in all_genes:
        f.write("%s\n" % item)

In [14]:
# NOTE the next step is MANUAL
# You need to go to https://www.uniprot.org/uploadlists/ and give it the created geneIDs.txt file, ask it to convert
# entrez gene to uniprot ID. Then download this as uniprotIDs.txt (as uncompressed, mapping table)

##### Import the list of uniprot IDs

In [39]:
# Import manually generated file of geneID --> uniprotID
df_uni_ids = pd.read_csv('uniprotIDs.txt', sep='\t')
df_uni_ids.columns = ['GeneID', 'UniprotID']
df_uni_ids['GeneID'] = df_uni_ids.GeneID.astype(str)

In [40]:
# Some of the GeneIds are actually a couple of IDs on one row. Split them into multiple rows
df_uni_ids = pd.concat([pd.Series(row['UniprotID'], row['GeneID'].split(','))              
                    for _, row in df_uni_ids.iterrows()]).reset_index()
df_uni_ids.columns = ['GeneID', 'UniprotID']

##### Create merged df to enable grouping by chemicalID and diseaseID

In [41]:
# Create mapping file of gene ID to chem ID... and gene ID to disease ID 
cg_dict = dict(zip(df_cg.GeneID, df_cg.ChemicalID))
cgd_dict = dict(zip(df_cgd.geneId, df_cgd.diseaseId))

In [42]:
df_uni_ids.head()

Unnamed: 0,GeneID,UniprotID
0,1,P04217
1,1,V9HWD8
2,2,P01023
3,131076,C9JQ41
4,131076,Q4VC31


In [43]:
df_uni_ids_d = df_uni_ids.copy() # make a copy for diseases

In [44]:
# Apply the mapping, thereby creating chemical column
df_uni_ids['ChemicalID'] = df_uni_ids.GeneID
df_uni_ids['ChemicalID'] = df_uni_ids['ChemicalID'].astype(int).map(cg_dict)

In [45]:
# Apply the mapping, thereby creating disease column
df_uni_ids_d['DiseaseID'] = df_uni_ids_d.GeneID
df_uni_ids_d['DiseaseID'] = df_uni_ids_d['DiseaseID'].astype(int).map(cgd_dict)

In [46]:
# # Output disease list, later used in nn notebook
# disease_list = df_uni_ids_d.DiseaseID.unique()
# len(disease_list)
# np.savetxt(r'diseases.lst', disease_list, fmt='%s')

##### Group by Chem ID

In [47]:
df_uni_ids.dropna().shape # 33381

(33381, 3)

In [48]:
df_uni_ids_d.dropna().shape #2914

(2914, 3)

### 2. Mine goa file, attaching go function for each uniprot ID to the parent chemical/gene

In [49]:
# import goa file (uniprot ID to go_functions)
go_funcs = pd.read_csv('../goa_human.gaf', header=None, skiprows=30, sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [51]:
# Cut out all cols except uniprot ids and go_funcs, rename these
go_funcs = go_funcs.rename(columns={ go_funcs.columns[1]: "UniprotID" })
go_funcs = go_funcs.rename(columns={ go_funcs.columns[4]: "gofunc" })
col_list = ['UniprotID', 'gofunc']
df_go = go_funcs[col_list]

In [52]:
# Merge the go functions into our existing chem-uniprotID and dis-uniprotID dfs
df_uni_ids_d = df_uni_ids_d.merge(df_go, on='UniprotID', how='outer').dropna()
df_uni_ids = df_uni_ids.merge(df_go, on='UniprotID', how='outer').dropna()

### 3. Prep and write to output file 

In [55]:
df_uni_ids_d.head()

Unnamed: 0,GeneID,UniprotID,DiseaseID,gofunc,go_url
174,16,P49588,C2750090,GO:0002161,<http://purl.obolibrary.org/obo/GO_0002161>
175,16,P49588,C2750090,GO:0002196,<http://purl.obolibrary.org/obo/GO_0002196>
176,16,P49588,C2750090,GO:0004813,<http://purl.obolibrary.org/obo/GO_0004813>
177,16,P49588,C2750090,GO:0004813,<http://purl.obolibrary.org/obo/GO_0004813>
178,16,P49588,C2750090,GO:0004813,<http://purl.obolibrary.org/obo/GO_0004813>


In [54]:
# Create a col with the full go url
df_uni_ids['go_url'] = '<' + 'http://purl.obolibrary.org/obo/' + df_uni_ids.gofunc.str.replace(':', '_')  + '>'
df_uni_ids_d['go_url'] =  '<' + 'http://purl.obolibrary.org/obo/' + df_uni_ids_d.gofunc.str.replace(':', '_')  + '>'

In [56]:
# Grab just the columns we want to output (diseaseID and go_url/ chemicalID and go_url)
col_list_c = ['ChemicalID', 'go_url']
col_list_d = ['DiseaseID', 'go_url']
df_c = df_uni_ids[col_list_c]
df_d = df_uni_ids_d[col_list_d]

In [57]:
df_c.head()

Unnamed: 0,ChemicalID,go_url
0,D015032,<http://purl.obolibrary.org/obo/GO_0002576>
1,D015032,<http://purl.obolibrary.org/obo/GO_0003674>
2,D015032,<http://purl.obolibrary.org/obo/GO_0005576>
3,D015032,<http://purl.obolibrary.org/obo/GO_0005576>
4,D015032,<http://purl.obolibrary.org/obo/GO_0005576>


In [60]:
# Output an association file for each of chem and dis
np.savetxt(r'associations_c.txt', df_c.values, fmt='%s')
np.savetxt(r'associations_d.txt', df_d.values, fmt='%s')

In [61]:
# Merge these two into one single file
subprocess.call('cat associations_* > myassociations', shell=True)

0

In [62]:
# Output list of entities for which we want vectors (seems to be req'd by opa2vec)
# JUST CHEMICALS
df_c.head()
entities4vec = df_c.ChemicalID.unique()
np.savetxt(r'finalclasses.lst', entities4vec, fmt='%s')

In [69]:
# # ADD DISEASES TO FINAL CLASS LST
# entities4vec = entities4vec.tolist() + df_d.DiseaseID.unique().tolist()
# np.savetxt(r'finalclasses.lst', entities4vec, fmt='%s')