# Notebook: Run NN on KG embeddings
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Take word2vec embeddings of walking-rdf-owl random walks from my original Knowledge Graph

In [133]:
import pandas as pd
import numpy as np
import subprocess

### 1. Get Uniprot ID set for each Gene 

In [134]:
# CHEMICALS
# Load the CTD chemical-gene interactions file; it supplies the GeneIDs we want.
# skiprows=27 skips the first 27 lines of the file and .drop(0) removes the row
# labelled 0 (presumably the CTD preamble and the separator line under the
# header -- TODO confirm against the CSV).
df_cg = (
    pd.read_csv('../ctd-to-nt/csvs/CTD_chem_gene_ixns.csv', skiprows=27)
    .drop(0)
    # Cut down to just humans (OrganismID 9606); remove this step to keep every organism
    .loc[lambda ixns: ixns['OrganismID'] == 9606.0]
    # Need to change float to int for the url to work
    .assign(GeneID=lambda ixns: ixns['GeneID'].astype(int))
)

In [135]:
# DISEASES
# Load the curated DisGeNET gene-disease associations (disease IDs and gene IDs)
# and keep only the rows whose evidence score is at least 0.42.
df_cgd = pd.read_csv(
    '../disgenet-nt/input_tsvs/curated_gene_disease_associations.tsv',
    sep='\t',
).loc[lambda assoc: assoc['score'] >= 0.42]

In [136]:
# Collect the distinct GeneIDs associated with chemicals and with diseases
cgene_ids = df_cg['GeneID'].unique()
dgene_ids = df_cgd['geneId'].unique()

# Merge both collections into a single set of genes
all_genes = set(cgene_ids) | set(dgene_ids)

In [139]:
# Write every gene ID in all_genes to geneIDs.txt, one ID per line
with open('geneIDs.txt', 'w') as gene_file:
    gene_file.writelines("%s\n" % gene_id for gene_id in all_genes)

In [140]:
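# Alternative export of the complete gene list as a txt file, kept for reference only
# (disabled: the previous cell already writes geneIDs.txt)
# np.savetxt('geneIDs.txt', all_genes, fmt='%s')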

In [None]:
# NOTE: the next step is MANUAL.
# Go to https://www.uniprot.org/uploadlists/, upload the generated geneIDs.txt file and ask it to convert
# Entrez Gene IDs to UniProt IDs. Then download the result as uniprotIDs.txt (uncompressed, mapping-table format).

##### Import the list of uniprot IDs

In [141]:
# Load the tab-separated mapping table from uniprotIDs.txt
df_uni_ids = pd.read_csv('uniprotIDs.txt', sep='\t')
# Name the two columns, then cast GeneID to str
df_uni_ids.columns = ['GeneID', 'UniprotID']
df_uni_ids = df_uni_ids.astype({'GeneID': str})
# Rows with missing values are kept at this point; the dropna below is left disabled
# df_uni_ids = df_uni_ids.dropna()

In [142]:
# Some GeneID cells hold several comma-separated IDs on one row. Emit one
# (GeneID, UniprotID) record per individual ID so every row carries a single GeneID.
# Building the records in a single pass avoids constructing a Series per row and,
# unlike pd.concat on an empty list, also works when df_uni_ids has no rows.
df_uni_ids = pd.DataFrame(
    [
        (gene_id, uniprot_id)
        for gene_field, uniprot_id in zip(df_uni_ids['GeneID'], df_uni_ids['UniprotID'])
        for gene_id in gene_field.split(',')
    ],
    columns=['GeneID', 'UniprotID'],
)

##### Create merged df to enable grouping by chemicalID and diseaseID

In [143]:
# Lookup tables: gene ID -> chemical ID (from df_cg) and gene ID -> disease ID (from df_cgd).
# NOTE(review): a dict holds one value per key, so when a gene appears in several
# rows only the ID from its last row is retained -- confirm this is intended.
cg_dict = {gene: chem for gene, chem in zip(df_cg['GeneID'], df_cg['ChemicalID'])}
cgd_dict = {gene: disease for gene, disease in zip(df_cgd['geneId'], df_cgd['diseaseId'])}

In [147]:
# Preview the first five rows of the GeneID / UniprotID table
df_uni_ids.head(n=5)

Unnamed: 0,GeneID,UniprotID
0,4149,A0A024R682
1,4149,G3V302
2,4149,P61244
3,4149,Q8TAX8
4,4609,A0A087WVR4


In [148]:
# Independent copy for the disease mapping, taken before ChemicalID is added to df_uni_ids
df_uni_ids_d = df_uni_ids.copy(deep=True)

In [151]:
# Create the ChemicalID column by looking up each GeneID in cg_dict with Series.map.
# GeneID is cast from str to int first so that it can match the integer GeneID keys
# of cg_dict; genes without an entry in cg_dict get NaN.
df_uni_ids['ChemicalID'] = df_uni_ids['GeneID'].astype(int).map(cg_dict)

In [160]:
# Create the DiseaseID column by looking up each GeneID in cgd_dict with Series.map.
# GeneID is cast from str to int first (presumably to match the geneId keys of
# cgd_dict -- TODO confirm their dtype); genes without an entry in cgd_dict get NaN.
df_uni_ids_d['DiseaseID'] = df_uni_ids_d['GeneID'].astype(int).map(cgd_dict)

##### Group by Chem ID

In [165]:
# Shape after dropping rows with a missing value in any column (33381 rows in the recorded run)
df_uni_ids.dropna(how='any').shape

(33381, 3)

In [163]:
# Shape after dropping rows with a missing value in any column (2914 rows in the recorded run)
df_uni_ids_d.dropna(how='any').shape

(2914, 3)

### 2. Mine the GOA file, attaching the GO function of each UniProt ID to the parent chemical/gene

### 3. 
