In [1]:
import pandas as pd
import numpy as np
import torch
import tqdm
import json
import dill

#### ID mapping creation

In [2]:
unique_biokg_prot_ids = pd.read_csv('../data/biokg/unique_proteins.csv', 
                                    index_col=0)

In [3]:
uniprot_to_id_mapping = unique_biokg_prot_ids['protein_id'].to_dict()
id_to_uniprot_mapping = {v:k for k,v in uniprot_to_id_mapping.items()}

In [4]:
# Create uniprot_id, idx (internal index) from BioKG proteins.
uniprot_to_id_mapping = unique_biokg_prot_ids['protein_id'].to_dict()

# and the inverse...
id_to_uniprot_mapping = {v:k for k,v in uniprot_to_id_mapping.items()}

# create json objects from dictionaries
uniprot_to_id = json.dumps(uniprot_to_id_mapping)
id_to_uniprot = json.dumps(id_to_uniprot_mapping)

In [5]:
# Save the mappings
# Serialize data into file:
json.dump(uniprot_to_id, open("../data/processed/uniprot_to_id_mapping.json", 'w'))
json.dump(id_to_uniprot, open("../data/processed/id_to_uniprot_mapping.json", 'w'))

#### ID mapping done

#### Coverage check

In [6]:
biokg_uniprot_set = set(unique_biokg_prot_ids['protein_id'])

In [7]:
unique_biokg_uniprot_ids =  set(unique_biokg_prot_ids)

In [8]:
swissprot_to_uniprot_ids = pd.read_csv('../data/uniprot.tsv',
                                    sep='\t')

In [9]:
uniprot_emb_set = set(swissprot_to_uniprot_ids['Entry'])

In [10]:
len(biokg_uniprot_set.difference(uniprot_emb_set))

82379

In [11]:
len(uniprot_emb_set.difference(biokg_uniprot_set))

444

In [12]:
biokg_proteins_without_emb = biokg_uniprot_set.difference(uniprot_emb_set)

In [13]:
test = pd.DataFrame(biokg_proteins_without_emb)

In [14]:
# To store the ones we're missing
# test.to_csv('prot_without_emb.csv', index=False, header=False)

In [15]:
# Load all missing proteins
biokg_missing_proteins = pd.read_csv('../data/uniprot_sequences.tsv',
                                    sep='\t')

In [16]:
biokg_missing_proteins.sort_values(by='Length').head(8)

Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Sequence
68847,P0DPR3,P0DPR3,reviewed,TRDD1_HUMAN,T cell receptor delta diversity 1,TRDD1,Homo sapiens (Human),2,EI
34400,P62968,P62968,reviewed,TRH_PIG,Thyrotropin-releasing hormone (TRH) (Protireli...,TRH,Sus scrofa (Pig),3,QHP
34254,P01858,P01858,reviewed,TUFT_HUMAN,Phagocytosis-stimulating peptide (Tuftsin),,Homo sapiens (Human),4,TKPR
16443,P0DPI4,P0DPI4,reviewed,TDB01_HUMAN,T cell receptor beta diversity 1,TRBD1,Homo sapiens (Human),4,GTGG
50277,P80826,P80826,reviewed,CWP02_ARATH,65 kDa cell wall protein,,Arabidopsis thaliana (Mouse-ear cress),5,EDRTY
45573,P0DKJ0,P0DKJ0,reviewed,P160B_ARATH,Peptide encoded by miPEP160b,miPEP160b At4g17787 FCAALL,Arabidopsis thaliana (Mouse-ear cress),5,MFSPQ
68429,P83308,P83308,reviewed,FARP_CHICK,FMRFamide-like neuropeptide (LPLRF-amide),,Gallus gallus (Chicken),5,LPLRF
26191,P38639,P38639,reviewed,UF01_MOUSE,Unknown protein from 2D-PAGE of fibroblasts (P19),,Mus musculus (Mouse),5,WIGRW


In [17]:
protein_sequences = biokg_missing_proteins[['From', 'Sequence']]

In [18]:
protein_sequences.head(2)

Unnamed: 0,From,Sequence
0,Q8X730,MALWGGRFTQAADQRFKQFNDSLRFDYRLAEQDIVGSVAWSKALVT...
1,Q29RM2,MITLITEQLQKQTLDELKCTRFSISLPLPDHADISNCGNPFQLVSE...


#### Coverage check done

#### Embedding gen

In [19]:
import numpy as np
import bio_embeddings
from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder, prottrans_t5_embedder, esm_embedder

In [20]:
# IF THIS CELLS EXECUTES FOR THE FIRST TIME, EXPECT A DELAY
prot_trans_embedder = ProtTransBertBFDEmbedder()

In [21]:
# Get the aggregated protein representation

def get_protein_repr(amino_repr, embedder):
    # Here we need to go from a collection of amino-acid embeddings to a full protein embedding
    #
    # Example: 
    #    
    #   M : (1,1024)
    #   A : (1,1024)
    #   S : (1,1024)
    #   
    #  Output: An aggregated representation for proteins
    #
    #  Type: Dict(protein_id: (embedding))
    #
    #  e.g Dict(: (LENG8_MOUSE, 1024))
    
    amino_repr_embedding = embedder.embed(amino_repr)
    emb_matrix = torch.Tensor(amino_repr_embedding)
    protein_emb = torch.mean(emb_matrix, dim=0)

    return protein_emb

In [22]:
amino = 'EDRTY'
result = get_protein_repr(amino_repr=amino, embedder=prot_trans_embedder)

In [23]:
## A bit of preprocessing

protein_sequences['seq_len'] = protein_sequences['Sequence'].apply(len)

protein_sequences = protein_sequences.sort_values(by='seq_len')

protein_sequences['embedding'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_sequences['seq_len'] = protein_sequences['Sequence'].apply(len)


In [24]:
protein_sequences.head(3)

Unnamed: 0,From,Sequence,seq_len,embedding
68847,P0DPR3,EI,2,
34400,P62968,QHP,3,
34254,P01858,TKPR,4,


In [25]:
# Batch it

protein_sequences_a = protein_sequences[:10000]

protein_sequences_b = protein_sequences[10001:20000]

protein_sequences_c = protein_sequences[20001:30000]

protein_sequences_d = protein_sequences[30001:40000]

protein_sequences_e = protein_sequences[40001:50000]

protein_sequences_f = protein_sequences[50001:60000]

protein_sequences_g = protein_sequences[60001:70000]

protein_sequences_h = protein_sequences[70001:82324]

In [26]:
df_list = [protein_sequences_a, protein_sequences_b, protein_sequences_c, 
           protein_sequences_d, protein_sequences_e, protein_sequences_f,
           protein_sequences_g, protein_sequences_h]

In [27]:
########### SMALL TEST BEFORE THE FULL THING 
protein_sequences_test = protein_sequences[:10]
    
for row in protein_sequences_test.itertuples():
    protein_sequences_test.at[row.Index, 'embedding'] = torch.Tensor(get_protein_repr(row[1], prot_trans_embedder))
    
# Out of the loop

protein_sequences_test = protein_sequences_test.set_index('From')

protein_emb_df = protein_sequences_test[['embedding']]

# Make it a dict
protein_emb = protein_emb_df.to_dict()

print(f"Saving protein embeddings......")
# Save it
filename = f'../data/processed/test_protein_batch.pt'

with open(filename, 'wb') as f:  # Overwrites any existing file.
    torch.save(protein_emb, f, pickle_module=dill)

########### SMALL TEST BEFORE THE FULL THING    

Saving protein embeddings......


In [28]:
# Loop that generates all embeddings for all proteins
for count, df in enumerate(df_list):
    
    # Get emb for every batch
    for row in df.itertuples():
        df.at[row.Index, 'embedding'] = torch.Tensor(get_protein_repr(row[1], prot_trans_embedder))
        
    df = df.set_index('From')
    
    # save every batch as a .pt
    df = df[['embedding']]

    # Make it a dict
    protein_emb = df.to_dict()

    print(f"Saving protein embeddings......")
    # Save it
    filename = f'../data/processed/{count}_protein_batch.pt'

    with open(filename, 'wb') as f:  # Overwrites any existing file.
        torch.save(protein_emb, f, pickle_module=dill)

Saving protein embeddings......
Saving protein embeddings......
Saving protein embeddings......
Saving protein embeddings......
Saving protein embeddings......
Saving protein embeddings......
Saving protein embeddings......


In [33]:
for row in protein_sequences_h.itertuples():
    protein_sequences_h.at[row.Index, 'embedding'] = torch.Tensor(get_protein_repr(row[1], prot_trans_embedder))
        
df = protein_sequences_h.set_index('From')

# save every batch as a .pt
df = df[['embedding']]

# Make it a dict
protein_emb = df.to_dict()

print(f"Saving protein embeddings......")
# Save it
filename = f'../data/processed/7_protein_batch.pt'

with open(filename, 'wb') as f:  # Overwrites any existing file.
    torch.save(protein_emb, f, pickle_module=dill)

Saving protein embeddings......


In [None]:
protein_sequences_a.head(3)

In the above example we see a protein of 406 amino-acids is represented by a (406,1024) matrix.

To get the final representation we will make it a (1,1024) by "squashing" the amino-acids together.

In [None]:
### Load embeddings - merge - store them for use down the line

In [30]:
test_prot = protein_sequences.set_index('From')

In [32]:
test_prot.sort_index()

Unnamed: 0_level_0,Sequence,seq_len,embedding
From,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A0A023PXA5,MLLSELVATASSLPYTAISIHNNCRVPAARHIHHGCRYFHGPPVMH...,189,"[tensor(-0.0164), tensor(-0.0954), tensor(-0.0..."
A0A023PXB0,MFINGFVNYPVRTPPNDLLQVVLHGFLRCPLDGSQVDSIGIGHTVH...,110,"[tensor(0.0625), tensor(-0.1014), tensor(-0.05..."
A0A023PXB5,MFALIISSKGKTSGFFFNSSFSSSALVGIAPLTAYSALVTPVFKSF...,102,"[tensor(0.0625), tensor(-0.1014), tensor(-0.05..."
A0A023PXB9,MEYVLIYNIWFFSFLQDKPCFCFVDYACSIFLLSSYCGNCLTAVAT...,121,"[tensor(0.0625), tensor(-0.1014), tensor(-0.05..."
A0A023PXC2,MLPLCLTFLSFFLSLGGSFKAVMTKEEADGTTEAAACLFWIFNWTV...,115,"[tensor(0.0523), tensor(-0.0484), tensor(-0.01..."
...,...,...,...
X5HYT8,MASPGPAAGMQQKLEAAAAAAGGGDGAEWGRGMQKMEAVGAGGEGV...,425,"[tensor(0.0267), tensor(-0.0787), tensor(0.020..."
X5JA13,MTEGIRARGPRSSSVNSVPLILDIEDFKGDFSFDALFGNLVNDLLP...,825,
X5JB51,MTERIRARGPRSSSVNSVPLILDIEDFKGDFSFDALFGNLVNDLLP...,829,
X5M5N0,MPDSITNGGRPPAPPSSVSSTTASTTGNFGTRRRLVNRIKKVDELH...,1850,


In [None]:
test_tensor = torch.load('../data/processed/test_protein_batch.pt')
test_df = pd.DataFrame.from_dict(test_tensor)

In [None]:
type(test_df.loc['P01858']['embedding'])