In [None]:
import pandas as pd
import torch
import tqdm
import json

#### ID mapping creation

In [None]:
# Load the de-duplicated BioKG protein table; the first CSV column becomes the index.
unique_biokg_prot_ids = pd.read_csv(
    '../data/biokg/unique_proteins.csv',
    index_col=0,
)

In [None]:
# Series.to_dict() maps each index label to its 'protein_id' value.
# NOTE(review): despite its name, this dict is therefore keyed by the table
# index and the inverse below is keyed by 'protein_id' -- confirm the naming.
uniprot_to_id_mapping = unique_biokg_prot_ids['protein_id'].to_dict()
id_to_uniprot_mapping = {
    protein: idx for idx, protein in uniprot_to_id_mapping.items()
}

In [None]:
# Inspect the inverse mapping (this displays the whole dictionary).
id_to_uniprot_mapping

In [None]:
# Number of entries in the forward mapping.
len(uniprot_to_id_mapping)

In [None]:
# Build the lookup from the BioKG protein table: index label -> 'protein_id' value.
uniprot_to_id_mapping = unique_biokg_prot_ids['protein_id'].to_dict()

# Reverse lookup: 'protein_id' value -> index label.
id_to_uniprot_mapping = {
    protein: idx for idx, protein in uniprot_to_id_mapping.items()
}

# Serialize both dictionaries to JSON-formatted strings.
uniprot_to_id, id_to_uniprot = (
    json.dumps(mapping)
    for mapping in (uniprot_to_id_mapping, id_to_uniprot_mapping)
)

In [None]:
# Save the mappings.
# Dump the dictionaries themselves: passing an already-serialized JSON string
# to json.dump would encode it a second time, so json.load on the file would
# return a str instead of a dict. The `with` blocks make sure each file is
# flushed and closed.
with open("../data/processed/uniprot_to_id_mapping.json", 'w') as fh:
    json.dump(uniprot_to_id_mapping, fh)
with open("../data/processed/id_to_uniprot_mapping.json", 'w') as fh:
    json.dump(id_to_uniprot_mapping, fh)

#### ID mapping done

#### Coverage check

In [None]:
# Distinct values of the 'protein_id' column of the BioKG protein table.
biokg_uniprot_set = {
    accession for accession in unique_biokg_prot_ids['protein_id']
}

In [None]:
# Iterating a DataFrame yields its column labels, so set(df) would only
# contain the column names. Take the 'protein_id' column to get the ids.
unique_biokg_uniprot_ids = set(unique_biokg_prot_ids['protein_id'])

In [None]:
# Tab-separated UniProt export; its 'Entry' column feeds the coverage check below.
swissprot_to_uniprot_ids = pd.read_csv(
    '../data/uniprot.tsv',
    sep='\t',
)

In [None]:
# Distinct values of the 'Entry' column of the UniProt export.
uniprot_emb_set = {entry for entry in swissprot_to_uniprot_ids['Entry']}

In [None]:
# How many BioKG protein ids are absent from the UniProt export?
len(biokg_uniprot_set - uniprot_emb_set)

In [None]:
# How many UniProt export entries are absent from BioKG?
len(uniprot_emb_set - biokg_uniprot_set)

In [None]:
# BioKG protein ids that have no matching 'Entry' in the UniProt export.
biokg_proteins_without_emb = biokg_uniprot_set - uniprot_emb_set

In [None]:
# Wrap the uncovered ids in a one-column DataFrame so they can be exported.
# The input is a set, so the row order is not defined.
test = pd.DataFrame(biokg_proteins_without_emb)

In [None]:
# To store the ones we're missing
# test.to_csv('prot_without_emb.csv', index=False, header=False)

In [None]:
# Read the tab-separated table for the missing proteins
# (the 'From', 'Sequence' and 'Length' columns are used further down).
biokg_missing_proteins = pd.read_csv(
    '../data/uniprot_sequences.tsv',
    sep='\t',
)

In [None]:
# Preview the eight rows with the smallest 'Length'.
biokg_missing_proteins.sort_values('Length').iloc[:8]

In [None]:
# Keep only the id ('From') and 'Sequence' columns. Take an explicit copy:
# new columns are assigned to this frame later, and writing into a bare
# column subset can trigger pandas' SettingWithCopyWarning.
protein_sequences = biokg_missing_proteins[['From', 'Sequence']].copy()

In [None]:
# Quick look at the first two rows.
protein_sequences.head(2)

#### Coverage check done

#### Embedding generation

In [None]:
import numpy as np
import bio_embeddings
from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder, prottrans_t5_embedder, esm_embedder

In [None]:
# If this cell executes for the first time, expect a delay.
prot_trans_embedder = ProtTransBertBFDEmbedder()

In [None]:
# Aggregate per-residue embeddings into a single protein-level vector.

def get_protein_repr(amino_repr, embedder):
    """Embed a protein and mean-pool the result over its first axis.

    Args:
        amino_repr: Input forwarded unchanged to ``embedder.embed``
            (presumably the amino-acid sequence string -- confirm against
            the callers).
        embedder: Object exposing an ``embed`` method whose result can be
            passed to ``torch.Tensor``.

    Returns:
        torch.Tensor: Mean of the embedding matrix along dim 0. For an
        embedding of shape (L, D) this is a vector of shape (D,).

    Example (shapes taken from the original notes, D = 1024):
        M -> (1, 1024), A -> (1, 1024), S -> (1, 1024)  =>  protein -> (1024,)
    """
    per_residue = embedder.embed(amino_repr)
    return torch.Tensor(per_residue).mean(dim=0)

In [None]:
# Length of every sequence, used below to order the table.
protein_sequences['seq_len'] = protein_sequences['Sequence'].map(len)

In [None]:
# Order the table by ascending sequence length.
protein_sequences = protein_sequences.sort_values('seq_len')


In [None]:
# Placeholder column; filled in row by row by the embedding loops below.
protein_sequences['embedding'] = None

In [None]:
# Split the length-sorted table into contiguous positional chunks of 10,000 rows.
# A slice excludes its end position, so each chunk has to start exactly where
# the previous one stopped. Starting at 10001, 20001, ... would silently drop
# the rows at positions 10000, 20000, ... . The last chunk is open-ended so it
# always reaches the final row, whatever the table length is.
# Each chunk is an independent copy because the 'embedding' column is written
# per chunk afterwards.
protein_sequences_a = protein_sequences.iloc[:10000].copy()

protein_sequences_b = protein_sequences.iloc[10000:20000].copy()

protein_sequences_c = protein_sequences.iloc[20000:30000].copy()

protein_sequences_d = protein_sequences.iloc[30000:40000].copy()

protein_sequences_e = protein_sequences.iloc[40000:50000].copy()

protein_sequences_f = protein_sequences.iloc[50000:60000].copy()

protein_sequences_g = protein_sequences.iloc[60000:70000].copy()

protein_sequences_h = protein_sequences.iloc[70000:].copy()

In [None]:
# All chunks that the embedding loop iterates over. The last chunk
# (protein_sequences_h) is included so that no protein is left without an
# embedding.
# NOTE(review): if chunk h was left out on purpose (it holds the longest
# sequences), drop it again and process it separately.
df_list = [
    protein_sequences_a,
    protein_sequences_b,
    protein_sequences_c,
    protein_sequences_d,
    protein_sequences_e,
    protein_sequences_f,
    protein_sequences_g,
    protein_sequences_h,
]

In [None]:
# protein_sequences_a_test = protein_sequences_a.sample(10)
# protein_sequences_a_test

In [None]:
%%time

# Test the first and time it.
for row in protein_sequences_a.itertuples():
    # Embed the amino-acid sequence itself. With itertuples(), position 0 is
    # the index and position 1 the first column ('From', the protein id), so
    # row[1] would feed the id string to the embedder instead of the sequence.
    protein_sequences_a.at[row.Index, 'embedding'] = get_protein_repr(row.Sequence, prot_trans_embedder)

In [None]:
# Write the first chunk as a tab-separated file (the index is written too).
# NOTE(review): the 'embedding' cells are objects that get written in their
# text form -- verify that the full vector survives a save/load round trip.
protein_sequences_a.to_csv(
    '../data/processed/protein_sequences_a.tsv',
    sep='\t',
)

In [None]:
# Generate the embedding for every protein in every chunk of df_list.
for df in df_list:
    for row in df.itertuples():
        # Use the 'Sequence' field by name. row[1] is the first column after
        # the index ('From', the protein id), not the amino-acid sequence.
        df.at[row.Index, 'embedding'] = get_protein_repr(row.Sequence, prot_trans_embedder)

In [None]:
# Inspect the first three rows of the first chunk, including the 'embedding' column.
protein_sequences_a.head(3)

In the above example we see that a protein of 406 amino acids is represented by a (406, 1024) matrix: one 1024-dimensional embedding per amino acid.

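To get the final per-protein representation we collapse this matrix into a single 1024-dimensional vector by "squashing" the amino acids together, i.e. averaging the per-amino-acid embeddings along the sequence axis.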

In [None]:
### Load embeddings - merge - store them for use down the line

In [None]:
from pathlib import Path

# Collect every TSV below the processed-data directory.
# The merged result of this notebook (biokg_prottrans_emb.tsv) is written to
# the same directory, so it is skipped here. Otherwise a re-run would read its
# own previous output back in as input. Sorting makes the file order (and so
# the row order of the merged frame) reproducible.
p = Path(r'../data/processed/').glob('**/*.tsv')
files = sorted(
    x for x in p
    if x.is_file() and x.name != 'biokg_prottrans_emb.tsv'
)

In [None]:
# Show which files were picked up.
files

In [None]:
# Read each collected TSV (first row is the header, no index column) ...
li = [
    pd.read_csv(str(tsv_path), index_col=None, header=0, sep='\t')
    for tsv_path in files
]

# ... and stack the tables row-wise under a fresh 0..n-1 index.
frame = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Quick look at the merged table.
frame.head(3)

In [None]:
# Keep only the protein id and its embedding. Take an explicit copy because a
# new column is assigned to this frame afterwards, and writing into a bare
# column subset can trigger pandas' SettingWithCopyWarning.
frame = frame[['From', 'embedding']].copy()

# Test for errors: True means at least one row has no embedding.
frame.embedding.isnull().values.any()

In [None]:
# Look up every 'From' value in id_to_uniprot_mapping and store the result in
# 'indx'. A value missing from the mapping raises KeyError.
frame['indx'] = frame['From'].map(
    lambda from_value: id_to_uniprot_mapping[from_value]
)


In [None]:
# Inspect the looked-up 'indx' column.
frame[['indx']]

In [None]:
# Persist the merged table as a tab-separated file (the index is written as well).
frame.to_csv(
    '../data/processed/biokg_prottrans_emb.tsv',
    sep='\t',
)