# Notebook: Create OPA2VEC associations file - GoFuncs
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Get the genes associated with diseases (originally DisGeNET; the active code below uses CTD gene-disease associations instead) and with chemicals (CTD), take the GO functions of those genes and assign them to the associated chemical/disease in an associations file. Also output entities.lst, a file that tells opa2vec which entities you would like the vectors for.

In [1]:
import pandas as pd
import numpy as np
import subprocess

### 1. Get Uniprot ID set for each Gene 

In [2]:
# CHEMICALS
# Import the CTD chemical-gene interactions file to get the GeneIDs we want

# Column dtypes intended to reduce memory use on import.
# NOTE(review): col_types is defined here but is not passed to read_csv below
# (the call has no dtype= argument), so it currently has no effect — confirm
# whether dtype=col_types was intended.
col_types = {   
    'GeneID': 'category',
    'ChemicalID': 'category',
    'InteractionActions': 'category',
}
cols=['ChemicalID', 'GeneID', 'InteractionActions', 'OrganismID', 'GeneSymbol']


# skiprows=27 skips the first 27 lines so that the following line is used as the
# header; presumably those lines are the file's preamble — verify against the file.
df_cg = pd.read_csv('../ctd-to-nt/csvs/CTD_chem_gene_ixns.csv', usecols=cols, skiprows=27)
# Drop the row labelled 0, i.e. the first row read after the header.
# NOTE(review): presumably a non-data separator line — confirm.
df_cg = df_cg.drop(0)

# Cut down to just humans if you want to
# df_cg = df_cg.loc[df_cg['OrganismID'] == 9606.0]

# Need to change float to int for the later url to work
df_cg['GeneID'] = df_cg.GeneID.astype(int)

In [3]:
# Preview the first rows of the chemical-gene table
df_cg.head()

Unnamed: 0,ChemicalID,GeneSymbol,GeneID,OrganismID,InteractionActions
1,C534883,MAX,4149,,affects^binding|affects^folding|decreases^acti...
2,C534883,MAX,4149,,affects^binding|decreases^reaction
3,C534883,MYC,4609,9606.0,decreases^expression
4,C534883,MYC,4609,9606.0,decreases^activity
5,C534883,MYC,4609,9606.0,decreases^expression


In [4]:
## Split the chem-gene relationships into pos/neg correlation, with neutral for unclear
# (further info is likely available to split the unclear better)

# InteractionActions holds one or more '|'-separated actions per row, e.g.
# 'affects^binding|decreases^reaction'. Give every action its own row in a new
# 'Predicate' column, repeating the other columns and keeping the original index
# label on each copy. DataFrame.explode replaces the older
# split / apply(pd.Series, 1) / stack / join idiom, which relied on a stray
# positional argument to Series.apply.
df_cg = df_cg.assign(Predicate=df_cg['InteractionActions'].str.split('|'))
df_cg = df_cg.explode('Predicate')

# Make the new column prettier: 'decreases^expression' -> 'decreases_expression'.
# regex=False makes '^' a literal caret instead of a start-of-string anchor,
# independent of the pandas version's default for `regex`.
df_cg['Predicate'] = df_cg['Predicate'].str.replace('^', '_', regex=False)
df_cg['Predicate'] = df_cg['Predicate'].str.replace(' ', '_', regex=False)


In [5]:
# Direction of each chemical-gene predicate: True means a positive correlation,
# False a negative one. Predicates that are not listed have no entry.

map_to_corr_direction = {
    # positive correlation
    **dict.fromkeys(
        (
            'increases_expression',
            'increases_activity',
            'increases_stability',
            'increases_abundance',
            'decreases_degradation',
            'increases_chemical_synthesis',
            'increases_reaction',
        ),
        True,
    ),
    # negative correlation
    **dict.fromkeys(
        (
            'decreases_activity',
            'decreases_expression',
            'decreases_stability',
            'decreases_abundance',
            'increases_degradation',
            'decreases_reaction',
        ),
        False,
    ),
}

# Apply map. Predicates that are not in map_to_corr_direction map to NaN, which
# becomes the string 'nan' after astype(str).
df_cg['dir_corr'] = df_cg['Predicate'].map(map_to_corr_direction).astype(str)

# One 0/1 indicator column per direction. Building them by direct comparison
# (instead of get_dummies followed by a rename) guarantees that Neg, Pos and
# direction_unknown all exist even when one of the directions never occurs in
# the data, so later column accesses cannot fail on a missing column.
df_cg = df_cg.assign(
    Neg=(df_cg['dir_corr'] == 'False').astype(int),
    Pos=(df_cg['dir_corr'] == 'True').astype(int),
    direction_unknown=(df_cg['dir_corr'] == 'nan').astype(int),
)

In [6]:
# Keep only the rows flagged as a positive correlation
df_cg = df_cg.loc[df_cg['Pos'] == 1]

In [7]:
# # DISEASES
# # Import DisGeNet with disease IDs and Gene IDs
# df_cgd = pd.read_csv('../disgenet-nt/input_tsvs/curated_gene_disease_associations.tsv', sep='\t')

# # Limit by evidence Score 
# df_cgd = df_cgd[df_cgd.score >= 0.42]

In [8]:
# df_cgd.head()

In [9]:
# Alternative DISEASES --> CTD replaces DisGeNet here (DisGeNet uses UMLS IDs, CTD uses MESH)
# Load the gene-disease associations prepared by the ctd-to-nt notebook (direct
# associations only) and make sure GeneID is an int column
df_cgd = pd.read_csv('../ctd-to-nt/gene-dis-pos-assocs.csv').astype({'GeneID': int})
df_cgd.head()

Unnamed: 0,GeneID,DiseaseID
0,50518,MESH:D003920
1,50518,MESH:D003924
2,50518,MESH:D008113
3,50518,MESH:D009369
4,50518,MESH:D009765


In [10]:
# Unique GeneIDs on the chemical side and on the disease side
cgene_ids = df_cg['GeneID'].unique()
dgene_ids = df_cgd['GeneID'].unique()

# Union of the two, as a set
all_genes = set(cgene_ids) | set(dgene_ids)

In [11]:
# Preview the first rows of the gene-disease table
df_cgd.head()

Unnamed: 0,GeneID,DiseaseID
0,50518,MESH:D003920
1,50518,MESH:D003924
2,50518,MESH:D008113
3,50518,MESH:D009369
4,50518,MESH:D009765


In [12]:
# Write the combined gene IDs to geneIDs.txt, one ID per line
with open('geneIDs.txt', 'w') as gene_file:
    gene_file.writelines("%s\n" % gene_id for gene_id in all_genes)

### NOTE the next step is MANUAL
You need to go to https://www.uniprot.org/uploadlists/ and upload the geneIDs.txt file created above, asking it to convert
Entrez Gene IDs to UniProt IDs. Then download the result as uniprotIDs.txt (uncompressed, in the "mapping table" format).

##### Import the list of uniprot IDs

In [13]:
# Load the manually generated GeneID --> UniprotID table (first two tab-separated columns)
df_uni_ids = pd.read_csv('uniprotIDs.txt', sep='\t', usecols=[0, 1])
df_uni_ids.columns = ['GeneID', 'UniprotID']
# Hold GeneID as text
df_uni_ids = df_uni_ids.astype({'GeneID': str})

In [14]:
# Some of the GeneID entries are actually several comma-separated IDs on one row.
# Split them so that every (GeneID, UniprotID) pair gets its own row, in the
# original row order. A vectorised split + explode replaces building one Series
# per row with iterrows(); it also copes with an empty table, where pd.concat
# on an empty list would raise.
df_uni_ids = (
    df_uni_ids[['GeneID', 'UniprotID']]
    .assign(GeneID=df_uni_ids['GeneID'].str.split(','))
    .explode('GeneID')
    .reset_index(drop=True)
)

##### Create merged df to enable grouping by chemicalID and diseaseID

In [15]:
# Lookup tables: GeneID -> ChemicalID and GeneID -> DiseaseID.
# NOTE(review): a dict holds a single value per key, so when a GeneID appears on
# several rows only the ID from the last such row is kept — confirm that one
# chemical/disease per gene is really what is wanted here.
cg_dict = df_cg.set_index('GeneID')['ChemicalID'].to_dict()
cgd_dict = df_cgd.set_index('GeneID')['DiseaseID'].to_dict()

In [16]:
# Preview the GeneID / UniprotID table
df_uni_ids.head()

Unnamed: 0,GeneID,UniprotID
0,1,P04217
1,1,V9HWD8
2,2,P01023
3,131076,C9JQ41
4,131076,Q4VC31


In [17]:
# Independent copy of the GeneID / UniprotID table, to be annotated with diseases
df_uni_ids_d = df_uni_ids.copy(deep=True)

In [18]:
# Create the ChemicalID column by looking each GeneID (cast to int) up in cg_dict;
# GeneIDs without an entry in the dict get NaN
df_uni_ids['ChemicalID'] = df_uni_ids['GeneID'].astype(int).map(cg_dict)

In [19]:
# Create the DiseaseID column by looking each GeneID (cast to int) up in cgd_dict;
# GeneIDs without an entry in the dict get NaN
df_uni_ids_d['DiseaseID'] = df_uni_ids_d['GeneID'].astype(int).map(cgd_dict)

In [20]:
# # Output disease list, later used in nn notebook
# disease_list = df_uni_ids_d.DiseaseID.unique()
# len(disease_list)
# np.savetxt(r'diseases.lst', disease_list, fmt='%s')

In [21]:
## This establishes that the len of a disease id is always 8 while a chem is 7 or 10
# df_uni_ids['IDlen'] = df_uni_ids.ChemicalID.map(lambda x: len(x))
# df_uni_ids_d['IDlen'] = df_uni_ids_d.DiseaseID.map(lambda x: len(x))
# print(df_uni_ids.IDlen.unique())
# print(df_uni_ids_d.IDlen.unique())

##### Group by Chem ID

In [22]:
# Shape of the chemical table after dropping rows with any missing value
# (e.g. rows whose GeneID had no ChemicalID in the lookup)
df_uni_ids.dropna().shape # 33381 presumably from an earlier run; the output recorded below shows 47326 rows

(47326, 3)

In [23]:
# Preview the table now that it carries a ChemicalID column
df_uni_ids.head()

Unnamed: 0,GeneID,UniprotID,ChemicalID
0,1,P04217,D014635
1,1,V9HWD8,D014635
2,2,P01023,D014800
3,131076,C9JQ41,D014635
4,131076,Q4VC31,D014635


In [24]:
# Shape of the disease table after dropping rows with any missing value
# (e.g. rows whose GeneID had no DiseaseID in the lookup)
df_uni_ids_d.dropna().shape # 2914 presumably from an earlier run; the output recorded below shows 14795 rows

(14795, 3)

### 2. Mine the GOA file, attaching the GO functions of each UniProt ID to the parent chemical/disease

In [25]:
# Import the GOA file (UniProt ID to GO functions): tab-separated, read without a header row
# NOTE(review): skiprows=30 assumes the lines to skip at the top of the file number
# exactly 30 — confirm against the file version in use.
go_funcs = pd.read_csv('../goa_human.gaf', header=None, skiprows=30, sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [26]:
# Name the two columns we need (positions 1 and 4 of the GOA table) and keep only those
go_funcs = go_funcs.rename(columns={
    go_funcs.columns[1]: "UniprotID",
    go_funcs.columns[4]: "gofunc",
})
col_list = ['UniprotID', 'gofunc']
df_go = go_funcs[col_list]

In [27]:
# Attach the GO functions to the chemical and disease tables via UniprotID.
# The outer join keeps unmatched rows from both sides, with NaN in the columns
# they lack; dropna() then removes every row that contains a NaN.
df_uni_ids = df_uni_ids.merge(df_go, how='outer', on='UniprotID').dropna()
df_uni_ids_d = df_uni_ids_d.merge(df_go, how='outer', on='UniprotID').dropna()

### 3. Prep and write to output file 

In [28]:
# Preview the disease table with its gofunc column
df_uni_ids_d.head()

Unnamed: 0,GeneID,UniprotID,DiseaseID,gofunc
0,1,P04217,MESH:D012559,GO:0002576
1,1,P04217,MESH:D012559,GO:0003674
2,1,P04217,MESH:D012559,GO:0005576
3,1,P04217,MESH:D012559,GO:0005576
4,1,P04217,MESH:D012559,GO:0005576


In [29]:
# Preview the chemical table with its gofunc column
df_uni_ids.head()

Unnamed: 0,GeneID,UniprotID,ChemicalID,gofunc
0,1,P04217,D014635,GO:0002576
1,1,P04217,D014635,GO:0003674
2,1,P04217,D014635,GO:0005576
3,1,P04217,D014635,GO:0005576
4,1,P04217,D014635,GO:0005576


In [30]:
# Add a go_url column holding the full GO term URL in angle brackets,
# e.g. GO:0002576 -> <http://purl.obolibrary.org/obo/GO_0002576>
for assoc_frame in (df_uni_ids, df_uni_ids_d):
    assoc_frame['go_url'] = '<' + 'http://purl.obolibrary.org/obo/' + assoc_frame['gofunc'].str.replace(':', '_') + '>'

In [31]:
# Keep only the two columns that go into each output file:
# (ChemicalID, go_url) for chemicals and (DiseaseID, go_url) for diseases
col_list_c = ['ChemicalID', 'go_url']
col_list_d = ['DiseaseID', 'go_url']
df_c = df_uni_ids.loc[:, col_list_c]
df_d = df_uni_ids_d.loc[:, col_list_d]

In [32]:
# df_c.ChemicalID.unique()

In [33]:
# Show the distinct disease IDs that ended up with GO functions
df_d.DiseaseID.unique()

array(['MESH:D012559', 'MESH:D009404', 'MESH:D001749', ..., 'OMIM:613703',
       'OMIM:613617', 'MESH:C537581'], dtype=object)

In [40]:
# Preview the chemical association rows
df_c.head()

Unnamed: 0,ChemicalID,go_url
0,D014635,<http://purl.obolibrary.org/obo/GO_0002576>
1,D014635,<http://purl.obolibrary.org/obo/GO_0003674>
2,D014635,<http://purl.obolibrary.org/obo/GO_0005576>
3,D014635,<http://purl.obolibrary.org/obo/GO_0005576>
4,D014635,<http://purl.obolibrary.org/obo/GO_0005576>


In [34]:
# Write one association file per entity type, one row of the table per line
for out_name, assoc in ((r'associations_c.txt', df_c), (r'associations_d.txt', df_d)):
    np.savetxt(out_name, assoc.values, fmt='%s')

In [35]:
# Merge the two association files into one single file, chemicals first.
# This is done in Python with the two inputs named explicitly rather than by
# shelling out to `cat associations_* > myassociations`: no shell is required,
# a missing input raises instead of only producing a non-zero exit code, and
# unrelated files that happen to match associations_* are not pulled in.
import shutil

with open('myassociations', 'wb') as merged:
    for part_name in ('associations_c.txt', 'associations_d.txt'):
        with open(part_name, 'rb') as part:
            shutil.copyfileobj(part, merged)

0

In [36]:
# Create entities.lst to tell opa2vec which entities we want vectors for:
# every distinct disease ID followed by every distinct chemical ID, one per line
entities = [*df_d['DiseaseID'].unique().tolist(), *df_c['ChemicalID'].unique().tolist()]
np.savetxt(r'entities.lst', entities, fmt='%s')

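### NEXT STEP in pipeline is to run opa2vec, automated below
Equivalent command line, run from the opa2vec directory (note that the automated cell below writes to go-gofuncs.lst rather than outter.lst):
`python2 runOPA2Vec.py -ontology ../ontologies/go.owl -associations ../msc-thesis/opa/myassociations -entities ../msc-thesis/opa/entities.lst -outfile outter.lst`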

In [37]:
# Ok let's actually run it.
# The command is passed as an argument list and run with cwd set to the opa2vec
# directory, instead of a shell string of the form "(cd DIR ; python2 ...)".
# With ';' the script would still be started if the cd failed; with cwd= a
# missing directory raises before anything is run. check_output raises
# CalledProcessError on a non-zero exit status and returns the captured stdout.
subprocess.check_output(
    [
        'python2', 'runOPA2Vec.py',
        '-ontology', '../ontologies/go.owl',
        '-associations', '../msc-thesis/opa/myassociations',
        '-entities', '../msc-thesis/opa/entities.lst',
        '-outfile', '../msc-thesis/opa/go-gofuncs.lst',
    ],
    cwd='../../opa2vec/',
)

b'Loading of Axioms ...\nLoading ...\n    1%\n    2%\n    3%\n    4%\n    5%\n    6%\n    7%\n    8%\n    9%\n    10%\n    11%\n    12%\n    13%\n    14%\n    15%\n    16%\n    17%\n    18%\n    19%\n    20%\n    21%\n    22%\n    23%\n    24%\n    25%\n    26%\n    27%\n    28%\n    29%\n    30%\n    31%\n    32%\n    33%\n    34%\n    35%\n    36%\n    37%\n    38%\n    39%\n    40%\n    41%\n    42%\n    43%\n    44%\n    45%\n    46%\n    47%\n    48%\n    49%\n    50%\n    51%\n    52%\n    53%\n    54%\n    55%\n    56%\n    57%\n    58%\n    59%\n    60%\n    61%\n    62%\n    63%\n    64%\n    65%\n    66%\n    67%\n    68%\n    69%\n    70%\n    71%\n    72%\n    74%\n    75%\n    76%\n    77%\n    79%\n    80%\n    82%\n    83%\n    84%\n    85%\n    87%\n    88%\n    90%\n    91%\n    93%\n    94%\n    96%\n    97%\n    99%\n    ... finished\n    ... finished\nProperty Saturation Initialization ...\n    ... finished\nReflexive Property Computation ...\n    ... finished\nObje

#### Export GoFunction counts per chem and per disease

In [38]:
# Count the distinct GO functions per chemical and per disease and export them
# (later used in assessing NN results).
# Each table has two columns: the entity ID and 'gofunc', the number of distinct
# GO functions for that entity.
chem_gofunc_counts = df_uni_ids.groupby('ChemicalID')['gofunc'].nunique().reset_index()
dis_gofunc_counts = df_uni_ids_d.groupby('DiseaseID')['gofunc'].nunique().reset_index()

# Stack the two tables: one row per entity, with NaN in the ID column that does
# not apply, giving the columns ChemicalID, gofunc, DiseaseID.
# A merge without on= would join on the only column the tables share, 'gofunc',
# i.e. on the count values themselves, pairing every chemical with every disease
# that happens to have the same count.
gofunc_counts = pd.concat(
    [chem_gofunc_counts, dis_gofunc_counts],
    ignore_index=True,
    sort=False,
)
gofunc_counts.to_csv('gofunc_counts.csv', index=False)