# Notebook: Create OPA2VEC associations file
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Get the genes associated with diseases and chemicals (both from CTDbase), take their GO functions, and assign them to the associated chemical/disease in an associations file. Also output entities.lst, a file that tells opa2vec which entities you would like vectors for.

In [19]:
import pandas as pd
import numpy as np
import subprocess

### 1. Get Genes for Each Chemical
Get the associated genes for each chemical and split into positive/negative/unclear_direction of correlation

In [20]:
# CHEMICALS
# Load the CTD chemical-gene interaction table; it supplies the GeneIDs
# (and the interaction predicates) associated with each chemical.

# Column dtypes intended to minimise memory use on import.
# NOTE(review): this mapping is NOT passed to read_csv below. A later cell
# concatenates ChemicalID with GeneSymbol as plain strings, so confirm that
# still works before wiring dtype=col_types in.
col_types = {
    'GeneID': 'category',
    'ChemicalID': 'category',
    'InteractionActions': 'category',
}
cols = ['ChemicalID', 'GeneID', 'InteractionActions', 'OrganismID', 'GeneSymbol']

# The first 27 lines of the file are skipped, and the first parsed row is
# dropped as well (presumably header residue of the CTD export — verify).
df_cg = pd.read_csv('../ctd-to-nt/csvs/CTD_chem_gene_ixns.csv', usecols=cols, skiprows=27)
df_cg = df_cg.drop(0)

# Cut down to just humans (OrganismID 9606). Take an explicit copy so the
# column assignment below writes to a frame we own rather than to a filtered
# slice of the raw table.
df_cg = df_cg.loc[df_cg['OrganismID'] == 9606.0].copy()

# GeneID is parsed as float; it has to be an int for the later url to work
df_cg['GeneID'] = df_cg.GeneID.astype(int)

In [21]:
## Split the chem-gene relationships into pos/neg correlation, with neutral for unclear
# (further info is likely available to split the unclear ones better)

# InteractionActions holds one or more '|'-separated actions. Produce one row
# per individual action: split into columns, stack into a long Series, drop
# the padding values, then drop the inner index level so the values line up
# with df_cg's own index again. Joining on that index repeats each df_cg row
# once per action.
s = df_cg['InteractionActions'].str.split('|', expand=True).stack().dropna()
s.index = s.index.droplevel(-1)
s.name = 'Predicate'
df_cg = df_cg.join(s)

# Make the new column prettier. regex=False is spelled out because '^' is a
# regex anchor: whether a bare '^' is treated literally otherwise depends on
# the installed pandas version.
df_cg['Predicate'] = df_cg.Predicate.str.replace('^', '_', regex=False)
df_cg['Predicate'] = df_cg.Predicate.str.replace(' ', '_', regex=False)

In [22]:
df_cg.sample(1)

Unnamed: 0,ChemicalID,GeneSymbol,GeneID,OrganismID,InteractionActions,Predicate
592463,D003907,INS,3630,9606.0,affects^cotreatment|decreases^expression,affects_cotreatment


In [23]:
df_cg.shape

(1120709, 6)

In [24]:
# Process InteractionActions
# df_cg['InteractionActions'] = df_cg.InteractionActions.str.replace('^', '_')
# df_cg['InteractionActions'] = df_cg.InteractionActions.str.replace(' ', '_')
# df_cg.InteractionActions = df_cg.InteractionActions.map(lambda x: x.split('|'))

In [25]:
# Classify each chem-gene predicate as a positive (True) or negative (False)
# correlation. Predicates that appear in neither list stay unmapped and end
# up in the 'direction_unknown' column below.
_positive_predicates = [
    'increases_expression',
    'increases_activity',
    'increases_stability',
    'increases_abundance',
    'decreases_degradation',
    'increases_chemical_synthesis',
    'increases_reaction',
]
_negative_predicates = [
    'decreases_activity',
    'decreases_expression',
    'decreases_stability',
    'decreases_abundance',
    'increases_degradation',
    'decreases_reaction',
]
map_to_corr_direction = {predicate: True for predicate in _positive_predicates}
map_to_corr_direction.update({predicate: False for predicate in _negative_predicates})

# Label every row 'True' / 'False' / 'nan' (unmapped), then one-hot encode
# that label into three indicator columns with readable names.
df_cg['dir_corr'] = df_cg['Predicate'].map(map_to_corr_direction).astype(str)
direction_dummies = pd.get_dummies(df_cg['dir_corr'])
df_cg = df_cg.assign(**direction_dummies)
df_cg = df_cg.rename(columns={'False': 'Neg', 'True': 'Pos', 'nan': 'direction_unknown'})

In [26]:
# Collapse back to one row per chem-gene pair (the predicate split produced
# several rows for some pairs).
# Some chem-gene relationships are recorded as both pos and neg; both flags
# are carried over here so that such pairs can be made neutral afterwards.
df_cg['uid'] = df_cg.ChemicalID + df_cg.GeneSymbol

# Before de-duplicating, remember which pairs were flagged negative /
# positive on ANY of their rows (uid -> 1).
neg_rows = df_cg[df_cg.Neg == 1]
pos_rows = df_cg[df_cg.Pos == 1]
map_negs = dict(zip(neg_rows.uid, neg_rows.Neg))
map_pos = dict(zip(pos_rows.uid, pos_rows.Pos))

# Keep the first row of each pair; copy so the writes below go to a frame we
# own rather than to a derived slice.
df_cg = df_cg.drop_duplicates(subset=['uid']).copy()

# Rebuild the flags from the per-pair lookups: pairs missing from a lookup
# get a numeric 0, and the result is stored as a compact 0/1 column.
df_cg['Neg'] = df_cg.uid.map(map_negs).fillna(0).astype(np.uint8)
df_cg['Pos'] = df_cg.uid.map(map_pos).fillna(0).astype(np.uint8)

In [27]:
# Flag pairs whose direction of correlation is unclear: either recorded as
# both negative and positive, or as neither.
contradictory = (df_cg.Neg == 1) & (df_cg.Pos == 1)
unannotated = (df_cg.Neg == 0) & (df_cg.Pos == 0)
df_cg['neut'] = np.where(contradictory | unannotated, 1, 0)

# A neutral pair must not also count as negative or positive
mask = df_cg.neut == 1
for flag_col in ('Neg', 'Pos'):
    df_cg.loc[mask, flag_col] = 0

In [28]:
df_cg.sample(3)

Unnamed: 0,ChemicalID,GeneSymbol,GeneID,OrganismID,InteractionActions,Predicate,dir_corr,Neg,Pos,direction_unknown,uid,neut
280242,C487081,NCALD,83988,9606.0,decreases^expression,decreases_expression,False,1,0,0,C487081NCALD,0
237299,D018501,RABEP1,9135,9606.0,increases^expression,increases_expression,True,0,1,0,D018501RABEP1,0
1493704,D013849,PGBD2,267002,9606.0,increases^expression,increases_expression,True,0,1,0,D013849PGBD2,0


In [29]:
# Keep only the identifier columns and the three direction flags
essential_chem_cols = ['ChemicalID', 'GeneID', 'Neg', 'Pos', 'neut']
df_cg = df_cg.loc[:, essential_chem_cols]

### 2. Get Genes for Each Disease
Get the associated genes for each disease and split into positive/negative/unclear_direction of correlation

In [30]:
# This cell was the original one to use DisGeNet data but the next one replaces it, with CTD data
# # Import DisGeNet with disease IDs and Gene IDs
# df_cgd = pd.read_csv('../disgenet-nt/input_tsvs/curated_gene_disease_associations.tsv', sep='\t')

# # Limit by evidence Score 
# df_cgd = df_cgd[df_cgd.score >= 0.42]

In [31]:
# DISEASES: gene-disease associations are taken from CTD instead of DisGeNet
# (DisGeNet uses UMLS ids, CTD uses MESH ids).
# The input was prepared by the ctd-to-nt notebook; only associations with
# direct evidence are kept.

# Read all three columns as categoricals to optimise reading in the csv
col_types = {name: 'category' for name in ('GeneID', 'DiseaseID', 'DirectEvidence')}

# Skip the first 27 lines of the file, then discard the first parsed row too
df_cgd = pd.read_csv(
    '../ctd-to-nt/csvs/CTD_genes_diseases.csv',
    usecols=['GeneID', 'DiseaseID', 'DirectEvidence'],
    dtype=col_types,
    skiprows=27,
)
df_cgd = df_cgd.drop(0)

print('Total gene-disease associations', len(df_cgd))
# Drop every association that has no direct evidence recorded
has_direct_evidence = df_cgd['DirectEvidence'].notna()
df_cgd = df_cgd[has_direct_evidence]
print('Directly evidenced gene-dis associations', len(df_cgd))
is_marker = df_cgd.DirectEvidence == 'marker/mechanism'
print('Of which are positively correlated', is_marker.sum())
df_cgd['GeneID'] = df_cgd['GeneID'].astype(int)
df_cgd.head()

Total gene-disease associations 64684702
Directly evidenced gene-dis associations 30272
Of which are positively correlated 28412


Unnamed: 0,GeneID,DiseaseID,DirectEvidence
3323,50518,MESH:D003920,marker/mechanism
3326,50518,MESH:D003924,marker/mechanism
3545,50518,MESH:D008113,marker/mechanism
3652,50518,MESH:D009369,marker/mechanism
3702,50518,MESH:D009765,marker/mechanism


In [32]:
df_cgd.DiseaseID.nunique()

5793

In [33]:
# Keep only the columns needed downstream, with GeneID held as an integer
df_cgd = df_cgd[['GeneID', 'DiseaseID', 'DirectEvidence']].assign(
    GeneID=lambda frame: frame['GeneID'].astype(int)
)

### 3. Take All Unique Genes and Collect Their Uniprot IDs

In [34]:
# Distinct GeneIDs associated with any chemical and with any disease
cgene_ids = df_cg['GeneID'].unique()
dgene_ids = df_cgd['GeneID'].unique()

# Union of the two: every gene we need a Uniprot ID for
all_genes = set(cgene_ids) | set(dgene_ids)

In [35]:
# Report how many distinct genes each source contributes
gene_counts = (
    ('Number of Disease Genes: ', dgene_ids),
    ('Number of Chem Genes: ', cgene_ids),
)
for label, gene_ids in gene_counts:
    print(label, len(gene_ids))

Number of Disease Genes:  8371
Number of Chem Genes:  23568


In [36]:
# Dump the combined gene ids, one per line, for upload to the Uniprot mapper
with open('geneIDs.txt', 'w') as f:
    f.writelines("%s\n" % item for item in all_genes)

### NOTE the next step is MANUAL
### You need to go to https://www.uniprot.org/uploadlists/ and give it the created geneIDs.txt file, ask it to convert
### entrez gene to uniprot ID. Then download this as uniprotIDs.txt (as uncompressed, mapping table)

##### Import the list of uniprot IDs

In [43]:
# Load the manually downloaded geneID --> uniprotID mapping table; only its
# first two columns are read.
df_uni_ids = pd.read_csv('uniprotIDs.txt', sep='\t', usecols=[0, 1])
df_uni_ids.columns = ['GeneID', 'UniprotID']
# GeneID is held as text for now (some cells contain several comma-separated ids)
df_uni_ids = df_uni_ids.assign(GeneID=df_uni_ids['GeneID'].astype(str))

In [44]:
df_uni_ids.head()

Unnamed: 0,GeneID,UniprotID
0,1,P04217
1,1,V9HWD8
2,2,P01023
3,131076,C9JQ41
4,131076,Q4VC31


In [45]:
# Some of the GeneIDs are actually a couple of comma-separated IDs on one row.
# Split them so that every (GeneID, UniprotID) pair gets its own row.
# Done with vectorised string ops rather than building one Series per row:
# split into columns, stack to long form, drop the padding values, and drop
# the inner index level so each id points back at its source row.
gene_tokens = df_uni_ids['GeneID'].str.split(',', expand=True).stack().dropna()
gene_tokens.index = gene_tokens.index.droplevel(-1)
gene_tokens.name = 'GeneID'

# Joining on the row index repeats each UniprotID once per GeneID, keeping
# the original row order and the left-to-right order of ids within a row.
df_uni_ids = df_uni_ids[['UniprotID']].join(gene_tokens)
df_uni_ids = df_uni_ids[['GeneID', 'UniprotID']].reset_index(drop=True)
df_uni_ids['GeneID'] = df_uni_ids.GeneID.astype(int)

In [46]:
df_cg.head()

Unnamed: 0,ChemicalID,GeneID,Neg,Pos,neut
3,C534883,4609,1,0,0
13,C004822,2052,0,1,0
15,C039775,5243,0,0,1
20,C534422,836,0,1,0
21,C534422,841,0,1,0


In [47]:
df_cgd.head()

Unnamed: 0,GeneID,DiseaseID,DirectEvidence
3323,50518,MESH:D003920,marker/mechanism
3326,50518,MESH:D003924,marker/mechanism
3545,50518,MESH:D008113,marker/mechanism
3652,50518,MESH:D009369,marker/mechanism
3702,50518,MESH:D009765,marker/mechanism


In [48]:
df_uni_ids.head()

Unnamed: 0,GeneID,UniprotID
0,1,P04217
1,1,V9HWD8
2,2,P01023
3,131076,C9JQ41
4,131076,Q4VC31


### 4. Merge into Final DF for Export

In [49]:
# Lookup from GeneID to UniprotID, used to add a UniprotID column to both the
# chemical-gene and the disease-gene frames.
# NOTE(review): a dict keeps one value per key, so when a GeneID has several
# UniprotIDs only the last one listed survives — confirm this is intended.
gen_uprot = {
    gene_id: uniprot_id
    for gene_id, uniprot_id in zip(df_uni_ids['GeneID'], df_uni_ids['UniprotID'])
}

df_cg['UniprotID'] = df_cg['GeneID'].astype(int).map(gen_uprot)
df_cgd['UniprotID'] = df_cgd['GeneID'].astype(int).map(gen_uprot)

In [50]:
df_cgd.sample(13)

Unnamed: 0,GeneID,DiseaseID,DirectEvidence,UniprotID
20856487,27086,MESH:D000067877,marker/mechanism,Q9H334
48414015,4549,MESH:C565637,marker/mechanism,
11494862,84735,MESH:D009461,marker/mechanism,Q96KN2
45639557,5743,MESH:D009422,marker/mechanism,P35354
7786858,834,MESH:D011014,marker/mechanism,P29466
45652356,5741,MESH:D050723,therapeutic,P01270
61867090,7422,MESH:D008269,marker/mechanism,P15692
45772974,26191,MESH:D001172,marker/mechanism,Q9Y2R2
14989917,4921,MESH:C564794,marker/mechanism,Q16832
25226008,3123,MESH:D001249,marker/mechanism,X5DNQ0


In [None]:
# # Create mapping file of gene ID to chem ID... and gene ID to disease ID
# cg_dict = dict(zip(df_cg.GeneID, df_cg.ChemicalID))
# cgd_dict = dict(zip(df_cgd.GeneID, df_cgd.DiseaseID))

In [None]:
# df_uni_ids_d = df_uni_ids.copy() # make a copy for diseases

In [None]:
# df_uni_ids_d.shape

In [None]:
# # Apply the mapping, thereby creating chemical column
# df_uni_ids['ChemicalID'] = df_uni_ids.GeneID
# df_uni_ids['ChemicalID'] = df_uni_ids['ChemicalID'].astype(int).map(cg_dict)

In [None]:
# # Apply the mapping, thereby creating disease column
# df_uni_ids_d['DiseaseID'] = df_uni_ids_d.GeneID
# df_uni_ids_d['DiseaseID'] = df_uni_ids_d['DiseaseID'].astype(int).map(cgd_dict)

In [None]:
# df_uni_ids

In [None]:
# # Output disease list, later used in nn notebook
# disease_list = df_uni_ids_d.DiseaseID.unique()
# len(disease_list)
# np.savetxt(r'diseases.lst', disease_list, fmt='%s')

In [None]:
## This establishes that the len of a disease id is always 8 while a chem is 7 or 10
# df_uni_ids['IDlen'] = df_uni_ids.ChemicalID.map(lambda x: len(x))
# df_uni_ids_d['IDlen'] = df_uni_ids_d.DiseaseID.map(lambda x: len(x))
# print(df_uni_ids.IDlen.unique())
# print(df_uni_ids_d.IDlen.unique())

##### Group by Chem ID

In [None]:
# df_uni_ids.dropna().shape # 33381

In [None]:
# df_uni_ids.head()

In [None]:
# df_uni_ids.ChemicalID.nunique()

In [None]:
# df_uni_ids_d.dropna().shape #2914

In [None]:
# df_uni_ids_d.DiseaseID.nunique()

### 5. Mine goa file, attaching go function for each uniprot ID to the parent chemical/disease

In [53]:
# Load the goa file (uniprot ID to go_functions). It is tab-separated and is
# read without a header row; the first 30 lines are skipped.
go_funcs = pd.read_csv(
    '../goa_human.gaf',
    sep='\t',
    skiprows=30,
    header=None,
    low_memory=False,
)

In [54]:
# Name the two columns we need (position 1 -> uniprot id, position 4 -> go
# function) and keep only those.
go_funcs = go_funcs.rename(columns={
    go_funcs.columns[1]: "UniprotID",
    go_funcs.columns[4]: "gofunc",
})
col_list = ['UniprotID', 'gofunc']
df_go = go_funcs.loc[:, col_list]

In [55]:
# Merge the go functions into our existing chem-uniprotID and dis-uniprotID dfs
# df_uni_ids_d = df_uni_ids_d.merge(df_go, on='UniprotID', how='outer').dropna()
# df_uni_ids = df_uni_ids.merge(df_go, on='UniprotID', how='outer').dropna()

In [56]:
# Attach GO functions to every chemical-gene / disease-gene row through the
# shared UniprotID, then discard any row left with a missing value.
# NOTE(review): the outer merge followed by dropna upcasts the integer columns
# to float (GeneID shows as 4609.0 afterwards); an inner merge may give the
# same rows more cheaply — confirm before changing.
df_cg = pd.merge(df_cg, df_go, how='outer', on='UniprotID').dropna()
df_cgd = pd.merge(df_cgd, df_go, how='outer', on='UniprotID').dropna()

In [57]:
df_cg.head()

Unnamed: 0,ChemicalID,GeneID,Neg,Pos,neut,UniprotID,gofunc
0,C534883,4609.0,1.0,0.0,0.0,P01106,GO:0000082
1,C534883,4609.0,1.0,0.0,0.0,P01106,GO:0000122
2,C534883,4609.0,1.0,0.0,0.0,P01106,GO:0000122
3,C534883,4609.0,1.0,0.0,0.0,P01106,GO:0000122
4,C534883,4609.0,1.0,0.0,0.0,P01106,GO:0000165


In [58]:
df_cgd.head()

Unnamed: 0,GeneID,DiseaseID,DirectEvidence,UniprotID,gofunc
8,2.0,MESH:D058186,marker/mechanism,P01023,GO:0001869
9,2.0,MESH:D058186,marker/mechanism,P01023,GO:0002020
10,2.0,MESH:D058186,marker/mechanism,P01023,GO:0002020
11,2.0,MESH:D058186,marker/mechanism,P01023,GO:0002576
12,2.0,MESH:D058186,marker/mechanism,P01023,GO:0004867


In [None]:
# df_go[df_go.UniprotID == 'P01023']

#### Alternative vectors: Control for the number of disease go functions

In [None]:
# df_uni_ids.ChemicalID.value_counts()

In [None]:
# len(df_uni_ids_d.DiseaseID.value_counts()[(df_uni_ids_d.DiseaseID.value_counts()>=30) & (df_uni_ids_d.DiseaseID.value_counts()<1000)])

In [None]:
# df_uni_ids_d.DiseaseID.nunique()

### 6. Prep and write to output files

In [None]:
# df_uni_ids_d.shape

In [None]:
# df_uni_ids.sample(13)

In [None]:
# df_uni_ids.ChemicalID.nunique()

In [None]:
# df_uni_ids.shape

In [59]:
# Turn each GO id into its full url: the ':' becomes '_' and the OBO prefix
# is put in front.
GO_URL_PREFIX = 'http://purl.obolibrary.org/obo/'
for annotated in (df_cg, df_cgd):
    annotated['go_url'] = GO_URL_PREFIX + annotated['gofunc'].str.replace(':', '_')

In [None]:
# df_uni_ids[df_uni_ids.ChemicalID == 'C031238'].go_url.unique()

In [None]:
# # Create a col with the full go url
# df_uni_ids['go_url'] = '<' + 'http://purl.obolibrary.org/obo/' + df_uni_ids.gofunc.str.replace(':', '_')  + '>'
# df_uni_ids_d['go_url'] =  '<' + 'http://purl.obolibrary.org/obo/' + df_uni_ids_d.gofunc.str.replace(':', '_')  + '>'

In [None]:
# Grab just the columns we want to output (diseaseID and go_url/ chemicalID and go_url)
# col_list_c = ['ChemicalID', 'go_url']
# col_list_d = ['DiseaseID', 'go_url']
# df_c = df_uni_ids[col_list_c]
# df_d = df_uni_ids_d[col_list_d]

In [60]:
# Restore compact integer dtypes: the direction flags back to uint8 and the
# disease frame's GeneID back to int.
df_cg = df_cg.astype({'Neg': np.uint8, 'Pos': np.uint8, 'neut': np.uint8})

df_cgd = df_cgd.astype({'GeneID': int})

In [61]:
df_cg.sample(13)

Unnamed: 0,ChemicalID,GeneID,Neg,Pos,neut,UniprotID,gofunc,go_url
10555826,C006780,204.0,1,0,0,P54819,GO:0070062,http://purl.obolibrary.org/obo/GO_0070062
10556009,D019833,204.0,1,0,0,P54819,GO:0097226,http://purl.obolibrary.org/obo/GO_0097226
10202592,D003042,5786.0,1,0,0,P18433,GO:0005886,http://purl.obolibrary.org/obo/GO_0005886
13209924,C017947,11186.0,0,1,0,Q9NS23,GO:0005874,http://purl.obolibrary.org/obo/GO_0005874
196467,D060754,836.0,0,1,0,P42574,GO:0097153,http://purl.obolibrary.org/obo/GO_0097153
1152406,D014635,1026.0,0,0,1,P38936,GO:0005515,http://purl.obolibrary.org/obo/GO_0005515
12566859,D014212,57464.0,1,0,0,Q9ULQ0,GO:0005737,http://purl.obolibrary.org/obo/GO_0005737
8905935,C516138,81930.0,1,0,0,Q8NI77,GO:0005829,http://purl.obolibrary.org/obo/GO_0005829
2516442,C059514,3099.0,1,0,0,P52789,GO:0008637,http://purl.obolibrary.org/obo/GO_0008637
11894198,D009538,9862.0,0,1,0,O75448,GO:0005654,http://purl.obolibrary.org/obo/GO_0005654


In [62]:
df_cg[df_cg.ChemicalID == 'C000121']

Unnamed: 0,ChemicalID,GeneID,Neg,Pos,neut,UniprotID,gofunc,go_url
1764947,C000121,4318.0,1,0,0,P14780,GO:0001501,http://purl.obolibrary.org/obo/GO_0001501
1764948,C000121,4318.0,1,0,0,P14780,GO:0001503,http://purl.obolibrary.org/obo/GO_0001503
1764949,C000121,4318.0,1,0,0,P14780,GO:0001934,http://purl.obolibrary.org/obo/GO_0001934
1764950,C000121,4318.0,1,0,0,P14780,GO:0004175,http://purl.obolibrary.org/obo/GO_0004175
1764951,C000121,4318.0,1,0,0,P14780,GO:0004175,http://purl.obolibrary.org/obo/GO_0004175
1764952,C000121,4318.0,1,0,0,P14780,GO:0004175,http://purl.obolibrary.org/obo/GO_0004175
1764953,C000121,4318.0,1,0,0,P14780,GO:0004222,http://purl.obolibrary.org/obo/GO_0004222
1764954,C000121,4318.0,1,0,0,P14780,GO:0004222,http://purl.obolibrary.org/obo/GO_0004222
1764955,C000121,4318.0,1,0,0,P14780,GO:0004222,http://purl.obolibrary.org/obo/GO_0004222
1764956,C000121,4318.0,1,0,0,P14780,GO:0004222,http://purl.obolibrary.org/obo/GO_0004222


In [64]:
df_cgd.sample(3)

Unnamed: 0,GeneID,DiseaseID,DirectEvidence,UniprotID,gofunc,go_url
1099900,23411,MESH:D055370,therapeutic,Q96EB6,GO:0005515,http://purl.obolibrary.org/obo/GO_0005515
618878,3557,MESH:D001171,marker/mechanism,P18510,GO:0005622,http://purl.obolibrary.org/obo/GO_0005622
1051750,84282,OMIM:614192,marker/mechanism,Q8IUD6,GO:0004842,http://purl.obolibrary.org/obo/GO_0004842


In [65]:
# Tag each entity id with the direction of its gene relationship: '-n' when
# negative, '-p' when positive (rows matching neither keep the bare id).
# This lets all vectorisation run at once while retaining the correlation
# for processing in the opa-nn notebook.

# Get a weird error if we don't reset from categorical to string type
df_cgd['DiseaseID'] = df_cgd['DiseaseID'].astype('str')
df_cg['ChemicalID'] = df_cg['ChemicalID'].astype('str')


def _append_suffix(frame, id_col, rows, suffix):
    """Append *suffix* to frame[id_col] on the rows selected by boolean *rows*."""
    frame.loc[rows, id_col] = frame.loc[rows, id_col] + suffix


# Chemicals: direction comes from the Neg / Pos flags
_append_suffix(df_cg, 'ChemicalID', df_cg.Neg == 1, '-n')
_append_suffix(df_cg, 'ChemicalID', df_cg.Pos == 1, '-p')

# Diseases: 'marker/mechanism' evidence is tagged positive, 'therapeutic' negative
_append_suffix(df_cgd, 'DiseaseID', df_cgd.DirectEvidence == 'marker/mechanism', '-p')
mask = (df_cgd.DirectEvidence == 'therapeutic')
_append_suffix(df_cgd, 'DiseaseID', mask, '-n')

In [66]:
# Reduce each frame to the (entity id, GO url) pairs that make up an association line
df_c = df_cg.loc[:, ['ChemicalID', 'go_url']]
df_d = df_cgd.loc[:, ['DiseaseID', 'go_url']]

In [67]:
# Write one association file per entity type (chemicals, then diseases);
# each row of the frame becomes one line.
association_outputs = (
    (r'associations_c.txt', df_c),
    (r'associations_d.txt', df_d),
)
for out_path, pairs in association_outputs:
    np.savetxt(out_path, pairs.values, fmt='%s')

In [68]:
# Merge the two association files into one single file.
# The inputs are named explicitly (not globbed as associations_*) so that a
# stray file with that prefix cannot leak into the output, no shell is
# involved, and check_call raises if cat fails instead of silently leaving a
# partial file behind.
with open('myassociations', 'wb') as merged:
    subprocess.check_call(['cat', 'associations_c.txt', 'associations_d.txt'], stdout=merged)

0

In [69]:
# entities.lst tells opa2vec which entities we want vectors for: every
# distinct disease id followed by every distinct chemical id.
disease_ids = df_d['DiseaseID'].unique().tolist()
chemical_ids = df_c['ChemicalID'].unique().tolist()
entities = disease_ids + chemical_ids
np.savetxt(r'entities.lst', entities, fmt='%s')

## NEXT STEP is to manually run opa2vec, though it could be automated here...

python2 runOPA2Vec.py -ontology ../ontologies/go.owl -associations ../msc-thesis/opa/myassociations  -entities ../msc-thesis/opa/entities.lst -outfile outter.lst

#### Export GoFunction counts per chem and per disease

In [None]:
# Create df of counts of chem and disease gofuncs for export, later used in assessing NN results
# chem_gofunc_counts = df_uni_ids.groupby('ChemicalID')['gofunc'].nunique()
# dis_gofunc_counts = df_uni_ids_d.groupby('DiseaseID')['gofunc'].nunique()

# chem_gofunc_counts = pd.DataFrame(chem_gofunc_counts)
# chem_gofunc_counts = chem_gofunc_counts.reset_index()
# dis_gofunc_counts = pd.DataFrame(dis_gofunc_counts)
# dis_gofunc_counts = dis_gofunc_counts.reset_index()

# gofunc_counts = chem_gofunc_counts.merge(dis_gofunc_counts, how='outer')
# gofunc_counts.to_csv('gofunc_counts.csv', index=False)