In [76]:
import pyobo
import pandas as pd

import json
import hetnetpy
from pathlib import Path

In [5]:
# Base directory from which the other Hetionet paths in this notebook are derived.
# NOTE(review): machine-specific absolute path — consider a configurable data directory.
hetio_data_path = Path(
    '/Users/dalivas/Documents/Projects/hetionet/hetnet'
)

In [6]:
# The `json` sub-directory of the Hetionet base path.
# The segment must be relative: joining a segment that starts with '/'
# discards the base path entirely and yields just '/json'.
hetio_json_path = hetio_data_path / 'json'

In [7]:
from hetnetpy.readwrite import open_read_file, load

In [8]:
# Load the Hetionet v1.0 JSON dump into `hetio_data`.
# The handle returned by open_read_file() is closed as soon as load() has
# returned (or raised), so it does not stay open for the life of the kernel.
read_file = open_read_file('../data/hetionet/hetionet-v1.0.json')
try:
    hetio_data = load(read_file, formatting='json')
finally:
    read_file.close()

In [9]:
# Map each Gene node's identifier to its URL; nodes of any other kind are skipped.
gene_dict = {
    entry['identifier']: entry['data']['url']
    for entry in hetio_data['nodes']
    if entry['kind'] == 'Gene'
}

In [16]:
# Sanity check: number of Gene nodes collected into gene_dict.
len(gene_dict)

20945

In [31]:
# Read the tab-separated HGNC identifier mapping export
# (read_table uses a tab separator by default).
hgnc_mapping = pd.read_table('../data/hetionet/hgnc_id_mapping.txt')

In [55]:
# Inspect the HGNC mapping table as loaded from disk.
hgnc_mapping

Unnamed: 0,HGNC ID,RefSeq IDs,NCBI Gene ID,UniProt ID(supplied by UniProt)
0,HGNC:1,,,
1,HGNC:10,,,
2,HGNC:100,NM_020039,41.0,P78348
3,HGNC:1000,,,
4,HGNC:10000,NM_005613,5999.0,P49798
...,...,...,...,...
48658,HGNC:9995,NM_002927,6003.0,O14921
48659,HGNC:9996,NM_006480,10636.0,O43566
48660,HGNC:9997,NM_002928,6004.0,O15492
48661,HGNC:9998,NM_002923,5997.0,P41220


In [61]:
# Look up the mapping row for UniProt accession P0DMR3.
# The column has to be compared against a value to produce a boolean mask;
# indexing the frame with the raw string column is treated as a column-label
# lookup and fails.
hgnc_mapping[hgnc_mapping['UniProt ID(supplied by UniProt)'] == 'P0DMR3']

Unnamed: 0,HGNC ID,RefSeq IDs,NCBI Gene ID,UniProt ID(supplied by UniProt)
619,HGNC:10561,NR_002717,6315.0,P0DMR3


In [33]:
# Keep only the two columns needed to link NCBI gene IDs to UniProt accessions.
mapping_columns = ['NCBI Gene ID', 'UniProt ID(supplied by UniProt)']
hgnc_ncbi_uniprot = hgnc_mapping[mapping_columns]

In [34]:
# Drop every row that has a missing value in any column.
# .copy() makes the result an independent frame, so assigning to its columns
# afterwards does not raise SettingWithCopyWarning.
hgnc_ncbi_uniprot = hgnc_ncbi_uniprot.dropna(axis=0).copy()

In [35]:
# Cast the gene IDs to integers (they are read as floats, e.g. 41.0).
ncbi_gene_ids = hgnc_ncbi_uniprot['NCBI Gene ID']
hgnc_ncbi_uniprot['NCBI Gene ID'] = ncbi_gene_ids.astype(int)

In [36]:
# Inspect the frame after dropping incomplete rows and casting the gene IDs to int.
hgnc_ncbi_uniprot

Unnamed: 0,NCBI Gene ID,UniProt ID(supplied by UniProt)
2,41,P78348
4,5999,P49798
5,8490,O15539
6,9628,P49758
7,6000,P49802
...,...,...
48658,6003,O14921
48659,10636,O43566
48660,6004,O15492
48661,5997,P41220


In [38]:
# NOTE(review): repeats the preceding display of hgnc_ncbi_uniprot with no visible
# transformation in between — candidate for removal.
hgnc_ncbi_uniprot

Unnamed: 0,NCBI Gene ID,UniProt ID(supplied by UniProt)
2,41,P78348
4,5999,P49798
5,8490,O15539
6,9628,P49758
7,6000,P49802
...,...,...
48658,6003,O14921
48659,10636,O43566
48660,6004,O15492
48661,5997,P41220


In [39]:
# Replace each NCBI Gene ID with its gene_dict value; IDs without an entry become None
# (dict.get returns None for a missing key, same as an explicit membership test).
# NOTE(review): the column is overwritten in place, so re-running this cell would look up
# the already-mapped values and turn the whole column into None.
hgnc_ncbi_uniprot['NCBI Gene ID'] = hgnc_ncbi_uniprot['NCBI Gene ID'].apply(gene_dict.get)

In [51]:
# Rows whose 'NCBI Gene ID' is null, i.e. that had no entry in gene_dict.
gene_id_missing = hgnc_ncbi_uniprot['NCBI Gene ID'].isnull()
nan_rows = hgnc_ncbi_uniprot[gene_id_missing]

In [52]:
# Inspect the rows whose NCBI Gene ID is null.
nan_rows

Unnamed: 0,NCBI Gene ID,UniProt ID(supplied by UniProt)
80,,P0C6P0
619,,P0DMR3
625,,Q9GZW5
784,,P0DI82
1882,,P51864
...,...,...
47919,,Q9NRI7
48048,,O43930
48195,,O15172
48385,,Q9NRI6


In [64]:
# Inspect the frame now that 'NCBI Gene ID' holds the values looked up in gene_dict
# rather than the integer IDs.
hgnc_ncbi_uniprot

Unnamed: 0,NCBI Gene ID,UniProt ID(supplied by UniProt)
2,http://identifiers.org/ncbigene/41,P78348
4,http://identifiers.org/ncbigene/5999,P49798
5,http://identifiers.org/ncbigene/8490,O15539
6,http://identifiers.org/ncbigene/9628,P49758
7,http://identifiers.org/ncbigene/6000,P49802
...,...,...
48658,http://identifiers.org/ncbigene/6003,O14921
48659,http://identifiers.org/ncbigene/10636,O43566
48660,http://identifiers.org/ncbigene/6004,O15492
48661,http://identifiers.org/ncbigene/5997,P41220


In [65]:
import torch

In [66]:
# NOTE(review): torch.load relies on pickle — only load files from a trusted source.
# NOTE(review): `protein_emb` is not referenced again in the visible cells — confirm it is still needed.
protein_emb = torch.load('../data/processed/protein_embeddings_full.pt')

In [95]:
# NOTE(review): the loaded object is accessed through `.index` afterwards, so it is presumably
# a pandas object rather than a tensor — confirm, and consider a more descriptive name than `test`.
# NOTE(review): torch.load relies on pickle — only load files from a trusted source.
test = torch.load('../data/processed/prot_id_to_embedding.pt')

In [98]:
# Collect the index labels of the loaded object as a plain Python list.
loaded_index = test.index
identifier_list = loaded_index.to_list()

In [102]:
# Unique identifiers taken from identifier_list.
id_set_0 = {*identifier_list}

In [103]:
# Unique values of the UniProt column. Each cell is added as-is, so a cell that lists
# several accessions stays a single comma-separated string in the set.
uniprot_column = hgnc_ncbi_uniprot['UniProt ID(supplied by UniProt)']
id_set_1 = set(uniprot_column)

In [104]:
# Inspect the UniProt ID set; some entries are comma-separated lists of several accessions.
id_set_1

{'Q16526',
 'P60608',
 'Q9NRY4',
 'Q9Y5I2',
 'O43869',
 'O75871',
 'Q5SZB4',
 'O60645',
 'Q9C0F3',
 'Q9H5K3',
 'Q8WW35',
 'Q9H9V4',
 'P11498',
 'Q7Z3Z2',
 'Q92692',
 'Q16595',
 'Q14145',
 'P16442',
 'Q9H425',
 'P0CG42',
 'Q8WXH0',
 'Q9H7V2',
 'Q5JSL3',
 'O94880',
 'O15392',
 'Q9UH64',
 'Q9NR23',
 'P56962',
 'Q9Y6V0',
 'Q9H8X9',
 'Q96DU9',
 'Q8N1B4',
 'Q96F86',
 'Q96DC9',
 'Q9H4Z3',
 'Q9UHL4',
 'Q9BW91',
 'P07478',
 'O95210',
 'Q8N6Q3',
 'Q9P2D0',
 'Q01094',
 'Q6ZNJ1',
 'Q9UF56',
 'O14958',
 'Q5SWW7',
 'Q5KSL6',
 'O15145',
 'O00634',
 'Q09470',
 'O95183',
 'Q96TA1',
 'Q7Z388',
 'Q9BXU0',
 'Q9H4E7',
 'Q92667',
 'O95336',
 'Q14159',
 'Q9UPR0',
 'P61583, Q9HDB8, Q9HDB9',
 'Q5T6S3',
 'Q9Y5H9',
 'Q6ZVD8',
 'Q969G9',
 'Q9P2E2',
 'Q8NCN2',
 'Q8NGI3',
 'Q96L50',
 'Q86UV6',
 'Q7Z7H8',
 'Q6T310',
 'Q99973',
 'Q5TZ20',
 'Q96R08',
 'Q9H6I2',
 'Q8NFB2',
 'A0A0A0MT76',
 'Q08462',
 'P08651',
 'Q9H3R2',
 'Q9GZN8',
 'Q494U1',
 'Q5VZQ5',
 'Q9UKV3',
 'O94933',
 'Q7Z434',
 'Q8N8E1',
 'Q9HCS5',
 'Q9H6A0',
 

In [107]:
# Show the rows whose UniProt field contains ', ', i.e. lists more than one accession.
has_multiple_ids = hgnc_ncbi_uniprot['UniProt ID(supplied by UniProt)'].str.contains(', ', regex=False)
hgnc_ncbi_uniprot[has_multiple_ids]

Unnamed: 0,NCBI Gene ID,UniProt ID(supplied by UniProt)
932,http://identifiers.org/ncbigene/7979,"P60896, Q6ZVN7"
1744,http://identifiers.org/ncbigene/706,"B1AH88, P30536"
2071,http://identifiers.org/ncbigene/7112,"P42166, P42167"
2205,http://identifiers.org/ncbigene/27433,"Q5JU69, Q8N2E6"
2241,http://identifiers.org/ncbigene/6955,"P0DSE1, P0DTU3"
2383,http://identifiers.org/ncbigene/6957,"P0DSE2, P0DTU4"
3932,http://identifiers.org/ncbigene/23499,"O94854, Q9UPN3"
4011,,"P61583, Q9HDB8, Q9HDB9"
4139,,"Q69383, Q69384, Q7LDI9, Q9BXR3, Q9Y6I0"
4589,http://identifiers.org/ncbigene/796,"P01258, P06881"
