# Notebook: Create OPA2VEC associations file
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Get genes associated with diseases (disgenet) and chemicals (ctdbase), take their go functions and assign to the associated chem/dis in an associations file. Also output finalclasses.lst, a file that tells opa2vec which entities you would like the vectors for

In [91]:
# %reset

In [92]:
import pandas as pd
import numpy as np
import subprocess

### 1. Get Uniprot ID set for each Gene 

In [93]:
# CHEMICALS
# Import CTD file to get the geneIDs we want 
df_cg = pd.read_csv('../ctd-to-nt/csvs/CTD_chem_gene_ixns.csv', skiprows=27)
df_cg = df_cg.drop(0)

# Cut down to just humans if you want to
df_cg = df_cg.loc[df_cg['OrganismID'] == 9606.0]

# Need to change float to int for the url to work
df_cg['GeneID'] = df_cg.GeneID.astype(int)

In [94]:
# # DISEASES
# # Import DisGeNet with disease IDs and Gene IDs
# df_cgd = pd.read_csv('../disgenet-nt/input_tsvs/curated_gene_disease_associations.tsv', sep='\t')

# # Limit by evidence Score 
# df_cgd = df_cgd[df_cgd.score >= 0.42]

In [95]:
# df_cgd.head()

In [96]:
# Alternative DISEASES --> Switching out DisGeNet for CTD (as they use UMLS/MESH respectively)
# Import preprepared g-d associations (from ctd-to-nt notebook) direct assocs only
df_cgd = pd.read_csv('../ctd-to-nt/gene-dis-pos-assocs.csv')
df_cgd['GeneID'] = df_cgd['GeneID'].astype(int)
df_cgd.head()

Unnamed: 0,GeneID,DiseaseID
0,50518,MESH:D003920
1,50518,MESH:D003924
2,50518,MESH:D008113
3,50518,MESH:D009369
4,50518,MESH:D009765


In [139]:
df_cgd.DiseaseID.nunique()

5704

In [97]:
# # Create list of all associated GeneIDs for each of Chemicals and Diseases
cgene_ids = df_cg.GeneID.unique()
dgene_ids = df_cgd.GeneID.unique()

# Combine them
all_genes = set(cgene_ids).union(set(dgene_ids))

In [138]:
len(dgene_ids)

8278

In [98]:
df_cgd.head()

Unnamed: 0,GeneID,DiseaseID
0,50518,MESH:D003920
1,50518,MESH:D003924
2,50518,MESH:D008113
3,50518,MESH:D009369
4,50518,MESH:D009765


In [99]:
# Write combined gene ids to file
with open('geneIDs.txt', 'w') as f:
    for item in all_genes:
        f.write("%s\n" % item)

In [100]:
# NOTE the next step is MANUAL
# You need to go to https://www.uniprot.org/uploadlists/ and give it the created geneIDs.txt file, ask it to convert
# entrez gene to uniprot ID. Then download this as uniprotIDs.txt (as uncompressed, mapping table)

##### Import the list of uniprot IDs

In [101]:
# Import manually generated file of geneID --> uniprotID
df_uni_ids = pd.read_csv('uniprotIDs.txt', sep='\t')
df_uni_ids.columns = ['GeneID', 'UniprotID']
df_uni_ids['GeneID'] = df_uni_ids.GeneID.astype(str)

In [102]:
# Some of the GeneIds are actually a couple of IDs on one row. Split them into multiple rows
df_uni_ids = pd.concat([pd.Series(row['UniprotID'], row['GeneID'].split(','))              
                    for _, row in df_uni_ids.iterrows()]).reset_index()
df_uni_ids.columns = ['GeneID', 'UniprotID']

##### Create merged df to enable grouping by chemicalID and diseaseID

In [103]:
# Create mapping file of gene ID to chem ID... and gene ID to disease ID 
cg_dict = dict(zip(df_cg.GeneID, df_cg.ChemicalID))
cgd_dict = dict(zip(df_cgd.GeneID, df_cgd.DiseaseID))

In [104]:
df_uni_ids.head()

Unnamed: 0,GeneID,UniprotID
0,1,P04217
1,1,V9HWD8
2,2,P01023
3,131076,C9JQ41
4,131076,Q4VC31


In [105]:
df_uni_ids_d = df_uni_ids.copy() # make a copy for diseases

In [106]:
df_uni_ids_d.shape

(33381, 2)

In [107]:
# Apply the mapping, thereby creating chemical column
df_uni_ids['ChemicalID'] = df_uni_ids.GeneID
df_uni_ids['ChemicalID'] = df_uni_ids['ChemicalID'].astype(int).map(cg_dict)

In [108]:
# Apply the mapping, thereby creating disease column
df_uni_ids_d['DiseaseID'] = df_uni_ids_d.GeneID
df_uni_ids_d['DiseaseID'] = df_uni_ids_d['DiseaseID'].astype(int).map(cgd_dict)

In [140]:
cg_dict

{4609: 'D015032',
 2052: 'D014859',
 5243: 'C095179',
 836: 'C088658',
 841: 'C088658',
 207: 'C510150',
 3091: 'C478783',
 5594: 'C088658',
 5595: 'C088658',
 5599: 'C088658',
 6513: 'C016837',
 7422: 'C088658',
 2811: 'D001335',
 2993: 'D015032',
 3674: 'D014520',
 3690: 'C009687',
 7066: 'D015032',
 6648: 'C088658',
 25828: 'C023616',
 57817: 'D024483',
 842: 'C088658',
 1026: 'C088658',
 1111: 'D015215',
 7157: 'C088658',
 563: 'D014635',
 3797: 'D014635',
 3848: 'D015032',
 7018: 'C016837',
 160428: 'D014635',
 596: 'C088658',
 598: 'C088658',
 4193: 'D019345',
 1565: 'D015032',
 1559: 'C088658',
 1576: 'C053228',
 8856: 'C029659',
 7364: 'C020549',
 581: 'C088658',
 637: 'C088658',
 54205: 'D015039',
 56616: 'C088658',
 8772: 'C088658',
 355: 'C088658',
 356: 'C088658',
 142: 'C088658',
 331: 'D015054',
 4363: 'C063449',
 218: 'D001335',
 595: 'C088658',
 1147: 'C031238',
 1956: 'D019287',
 3162: 'C088658',
 373356: 'C521487',
 3383: 'C088658',
 3551: 'C031238',
 4316: 'D015054',

In [110]:
# # Output disease list, later used in nn notebook
# disease_list = df_uni_ids_d.DiseaseID.unique()
# len(disease_list)
# np.savetxt(r'diseases.lst', disease_list, fmt='%s')

In [111]:
## This establishes that the len of a disease id is always 8 while a chem is 7 or 10
# df_uni_ids['IDlen'] = df_uni_ids.ChemicalID.map(lambda x: len(x))
# df_uni_ids_d['IDlen'] = df_uni_ids_d.DiseaseID.map(lambda x: len(x))
# print(df_uni_ids.IDlen.unique())
# print(df_uni_ids_d.IDlen.unique())

##### Group by Chem ID

In [112]:
df_uni_ids.dropna().shape # 33381

(33381, 3)

In [113]:
df_uni_ids.head()

Unnamed: 0,GeneID,UniprotID,ChemicalID
0,1,P04217,D015032
1,1,V9HWD8,D015032
2,2,P01023,D015032
3,131076,C9JQ41,D014635
4,131076,Q4VC31,D014635


In [141]:
df_uni_ids.ChemicalID.nunique()

412

In [114]:
df_uni_ids_d.dropna().shape #2914

(14385, 3)

In [143]:
df_uni_ids_d.DiseaseID.nunique()

2558

### 2. Mine goa file, attaching go function for each uniprot ID to the parent chemical/gene

In [115]:
# import goa file (uniprot ID to go_functions)
go_funcs = pd.read_csv('../goa_human.gaf', header=None, skiprows=30, sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [116]:
# Cut out all cols except uniprot ids and go_funcs, rename these
go_funcs = go_funcs.rename(columns={ go_funcs.columns[1]: "UniprotID" })
go_funcs = go_funcs.rename(columns={ go_funcs.columns[4]: "gofunc" })
col_list = ['UniprotID', 'gofunc']
df_go = go_funcs[col_list]

In [117]:
# Merge the go functions into our existing chem-uniprotID and dis-uniprotID dfs
df_uni_ids_d = df_uni_ids_d.merge(df_go, on='UniprotID', how='outer').dropna()
df_uni_ids = df_uni_ids.merge(df_go, on='UniprotID', how='outer').dropna()

#### Alternative vectors: Control for the number of disease go functions

In [118]:
# df_uni_ids.ChemicalID.value_counts()

In [119]:
len(df_uni_ids_d.DiseaseID.value_counts()[(df_uni_ids_d.DiseaseID.value_counts()>=30) & (df_uni_ids_d.DiseaseID.value_counts()<1000)])

1276

In [146]:
df_uni_ids_d.DiseaseID.nunique()

2558

### 3. Prep and write to output file 

In [121]:
df_uni_ids_d.shape

(279474, 4)

In [122]:
df_uni_ids.sample(13)

Unnamed: 0,GeneID,UniprotID,ChemicalID,gofunc
458138,57804,Q9HCU8,D024483,GO:0005654
418829,51603,Q8N6R0,D014635,GO:0005515
79169,2519,Q9BTY2,C027373,GO:0005788
53664,1633,P27707,D015032,GO:0009157
369222,23729,Q9UHJ6,D018021,GO:0016310
12765,359,P41181,D014635,GO:0015250
326844,80174,Q8NFT6,D014638,GO:0019901
452291,57162,Q96FA3,C088658,GO:0043331
154189,5156,P16234,C088658,GO:0005018
180593,5868,P20339,D014635,GO:0030670


In [123]:
df_uni_ids.ChemicalID.nunique()

412

In [124]:
df_uni_ids.shape

(464194, 4)

In [125]:
# Create a col with the full go url
df_uni_ids['go_url'] = 'http://purl.obolibrary.org/obo/' + df_uni_ids.gofunc.str.replace(':', '_') 
df_uni_ids_d['go_url'] =  'http://purl.obolibrary.org/obo/' + df_uni_ids_d.gofunc.str.replace(':', '_')

In [126]:
df_uni_ids[df_uni_ids.ChemicalID == 'C031238'].go_url.unique()

array(['http://purl.obolibrary.org/obo/GO_0002223',
       'http://purl.obolibrary.org/obo/GO_0002479',
       'http://purl.obolibrary.org/obo/GO_0002756',
       'http://purl.obolibrary.org/obo/GO_0003009',
       'http://purl.obolibrary.org/obo/GO_0004672',
       'http://purl.obolibrary.org/obo/GO_0005515',
       'http://purl.obolibrary.org/obo/GO_0005524',
       'http://purl.obolibrary.org/obo/GO_0005654',
       'http://purl.obolibrary.org/obo/GO_0005737',
       'http://purl.obolibrary.org/obo/GO_0005829',
       'http://purl.obolibrary.org/obo/GO_0006468',
       'http://purl.obolibrary.org/obo/GO_0006954',
       'http://purl.obolibrary.org/obo/GO_0006955',
       'http://purl.obolibrary.org/obo/GO_0007249',
       'http://purl.obolibrary.org/obo/GO_0007252',
       'http://purl.obolibrary.org/obo/GO_0007266',
       'http://purl.obolibrary.org/obo/GO_0008384',
       'http://purl.obolibrary.org/obo/GO_0008385',
       'http://purl.obolibrary.org/obo/GO_0009615',
       'http

In [127]:
# Create a col with the full go url
df_uni_ids['go_url'] = '<' + 'http://purl.obolibrary.org/obo/' + df_uni_ids.gofunc.str.replace(':', '_')  + '>'
df_uni_ids_d['go_url'] =  '<' + 'http://purl.obolibrary.org/obo/' + df_uni_ids_d.gofunc.str.replace(':', '_')  + '>'

In [128]:
# Grab just the columns we want to output (diseaseID and go_url/ chemicalID and go_url)
col_list_c = ['ChemicalID', 'go_url']
col_list_d = ['DiseaseID', 'go_url']
df_c = df_uni_ids[col_list_c]
df_d = df_uni_ids_d[col_list_d]

In [149]:
df_c.ChemicalID.nunique()

412

In [150]:
df_d.DiseaseID.nunique()

2558

In [153]:
df_d.head()

Unnamed: 0,DiseaseID,go_url
0,MESH:D012559,<http://purl.obolibrary.org/obo/GO_0002576>
1,MESH:D012559,<http://purl.obolibrary.org/obo/GO_0003674>
2,MESH:D012559,<http://purl.obolibrary.org/obo/GO_0005576>
3,MESH:D012559,<http://purl.obolibrary.org/obo/GO_0005576>
4,MESH:D012559,<http://purl.obolibrary.org/obo/GO_0005576>


In [151]:
# Output an association file for each of chem and dis
np.savetxt(r'associations_c.txt', df_c.values, fmt='%s')
np.savetxt(r'associations_d.txt', df_d.values, fmt='%s')

In [152]:
# Merge these two into one single file
subprocess.call('cat associations_* > myassociations', shell=True)

0

In [133]:
# Create entities.lst to inform opa2vec which entities we want vectors for
entities = df_d.DiseaseID.unique().tolist() + df_c.ChemicalID.unique().tolist()
np.savetxt(r'entities.lst', entities, fmt='%s')

## NEXT STEP in pipeline is to manually run opa2vec, though it could be automated here...

python2 runOPA2Vec.py -ontology ../ontologies/go.owl -associations ../msc-thesis/opa/myassociations  -entities ../msc-thesis/opa/entities.lst -outfile outter.lst

#### Export GoFunction counts per chem and per disease

In [135]:
# Create df of counts of chem and disease gofuncs for export, later used in assessing NN results
chem_gofunc_counts = df_uni_ids.groupby('ChemicalID')['gofunc'].nunique()
dis_gofunc_counts = df_uni_ids_d.groupby('DiseaseID')['gofunc'].nunique()

chem_gofunc_counts = pd.DataFrame(chem_gofunc_counts)
chem_gofunc_counts = chem_gofunc_counts.reset_index()
dis_gofunc_counts = pd.DataFrame(dis_gofunc_counts)
dis_gofunc_counts = dis_gofunc_counts.reset_index()

gofunc_counts = chem_gofunc_counts.merge(dis_gofunc_counts, how='outer')
gofunc_counts.to_csv('gofunc_counts.csv', index=False)