# Notebook: Create associations file of CTD elements for OPA2VEC
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Get CTD elements (e.g chemicals and diseases), take their go functions and assign to the associated chem/dis in an associations file. Also output finalclasses.lst, a file that tells opa2vec which entities you would like the vectors for


In [2]:
import pandas as pd
import numpy as np
import subprocess

In [None]:
# Get files
subprocess.call('wget http://ctdbase.org/reports/CTD_genes.csv.gz', shell = True)

subprocess.call('mv *.gz ../ctd-to-nt/csvs/', shell=True)
subprocess.call('gunzip ../ctd-to-nt/csvs/*.gz', shell=True)

In [15]:
# Get CTD gene reference file --> gives us uniprot IDs 
df_gene_ref = pd.read_csv('../ctd-to-nt/csvs/CTD_genes.csv', skiprows=27 )
df_gene_ref = df_gene_ref.drop(0)

In [16]:
df_gene_ref.shape

(508297, 8)

In [17]:
df_gene_ref = df_gene_ref.dropna(subset=['UniProtIDs', 'GeneID']) # drop if it doesn't have UniprotID

In [18]:
df_gene_ref.shape

(291192, 8)

In [21]:
# Import Genes-Pathways Associations (CTD)
df_gp = pd.read_csv('../ctd-to-nt/csvs/CTD_genes_pathways.csv', skiprows=27 )
df_gp = df_gp.drop(0)

In [27]:
df_gp = df_gp.drop(['PathwayName', '# GeneSymbol'], axis=1)

In [28]:
df_gp.head()

Unnamed: 0,GeneID,PathwayID
1,1.0,REACT:R-HSA-109582
2,1.0,REACT:R-HSA-168256
3,1.0,REACT:R-HSA-168249
4,1.0,REACT:R-HSA-6798695
5,1.0,REACT:R-HSA-76002


In [29]:
df_gene_ref.head()

Unnamed: 0,# GeneSymbol,GeneName,GeneID,AltGeneIDs,Synonyms,BioGRIDIDs,PharmGKBIDs,UniProtIDs
17,11-BETA-HSD3,11-beta-hydroxysteroid dehydrogenase type 3,100174880.0,,,,,Q6PUF2
22,128UP,upstream of RpIII128,36288.0,,128up-PA|CG8340-PA|Dmel_CG8340|GTP-bp|X71866,62072.0,,P32234
23,128UP_1,GTP-binding protein 128up,105268400.0,,,,,A0A0C9RG91
37,140UP,upstream of RpII140,41720.0,100187719.0,140up-PA|140up-PB|CG9852-PA|CG9852-PB|Dmel_CG9...,66797.0,,A0A0B4KFZ0|K7JHH0|P81928
38,140UP_1,RPII140-upstream gene protein,105268335.0,,140up_0,,,A0A0C9R514


In [37]:
# Merge in the uniprot IDs to df_gp
df_gp = pd.merge(df_gp,df_gene_ref[['GeneID','UniProtIDs']],on='GeneID', how='left')

In [38]:
# import goa file (uniprot ID to go_functions)
go_funcs = pd.read_csv('../goa_human.gaf', header=None, skiprows=30, sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [39]:
# Cut out all cols except uniprot ids and go_funcs, rename these
go_funcs = go_funcs.rename(columns={ go_funcs.columns[1]: "UniprotID" })
go_funcs = go_funcs.rename(columns={ go_funcs.columns[4]: "gofunc" })
col_list = ['UniprotID', 'gofunc']
df_go = go_funcs[col_list]

In [40]:
df_go.head()


Unnamed: 0,UniprotID,gofunc
0,A0A024R161,GO:0003924
1,A0A024R161,GO:0007186
2,A0A024RBG1,GO:0003723
3,A0A024RBG1,GO:0005829
4,A0A024RBG1,GO:0008486


In [42]:
df_gp = df_gp.rename(columns={'UniProtIDs': 'UniProtID'})

In [44]:
df_gp.head()

Unnamed: 0,GeneID,PathwayID,UniProtID
0,1.0,REACT:R-HSA-109582,A0A0A0MX79|A0A1U8C678|A0A2J8JM56|A0A2K5C3B8|A0...
1,1.0,REACT:R-HSA-168256,A0A0A0MX79|A0A1U8C678|A0A2J8JM56|A0A2K5C3B8|A0...
2,1.0,REACT:R-HSA-168249,A0A0A0MX79|A0A1U8C678|A0A2J8JM56|A0A2K5C3B8|A0...
3,1.0,REACT:R-HSA-6798695,A0A0A0MX79|A0A1U8C678|A0A2J8JM56|A0A2K5C3B8|A0...
4,1.0,REACT:R-HSA-76002,A0A0A0MX79|A0A1U8C678|A0A2J8JM56|A0A2K5C3B8|A0...


In [None]:
# Split the UniProtID into separate UniProtIDs RUN THIS ONLY ONCE
s = df_gp['UniProtID'].str.split('|').apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'UniProtID'
df_gp = df_gp.join(s.apply(lambda x: pd.Series(x.split('|'))))

In [None]:
df_gp.head()

In [43]:
# Merge the go functions into our existing df_gp
df_gp = df_gp.merge(df_go, on='UniprotID', how='outer').dropna()

KeyError: 'UniprotID'