## This notebook demonstrates how to use the Hugging Face Transformers API with the pretrained ProtBERT model to compute embeddings for each protein available in the dataset

In [None]:
!pip install transformers

In [4]:
from transformers import BertForMaskedLM, BertTokenizer, pipeline
#import Auto
import pandas as pd
import numpy as np
import re
import torch

# Read the pickled YAAM dataset from disk.
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load
# files from a trusted source.
yaam_raw = np.load('../data/YAAM.pkl', allow_pickle=True)

# Wrap the loaded records in a DataFrame and give the six columns readable names.
yaam = pd.DataFrame(yaam_raw)
yaam.columns = [
    'orf',
    'residues',
    'ptms',
    'seqs',
    'annotations',
    'protein_token',
]

# Preview the first rows as the cell output.
yaam.head(16)

Unnamed: 0,orf,residues,ptms,seqs,annotations,protein_token
0,YDR148C,14,18,MLSRATRTAAAKSLVKSKVARNVMAASFVKRHASTSLFKQANKVES...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 2, 3, 4, 5, 6, 4, 6, 5, 5, 5, 7, 3, 2, 8, ..."
1,YIL037C,1,1,MNNVHIIKPLSLPQRFFSCIFHPLLLIFFTSVILTIWGSFSVIDIT...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 9, 9, 8, 11, 15, 15, 7, 17, 2, 3, 2, 17, 1..."
2,YPL195W,30,30,MTSLYAPGAEDIRQRLRPFGFFFEKSLKDLIKGIRSHNETPEKLDQ...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 6, 3, 2, 16, 5, 17, 14, 5, 13, 18, 15, 4, ..."
3,YDL194W,15,15,MDPNSNSSSETLRQEKQGFLDKALQRVKGIALRRNNSNKDHTTDDT...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 18, 17, 9, 3, 9, 3, 3, 3, 13, 6, 2, 4, 12,..."
4,YNL331C,1,1,MTDLFKPLPEPPTELGRLRVLSKTAGIRVSPLILGGASIGDAWSGF...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 6, 18, 2, 10, 7, 17, 2, 17, 13, 17, 17, 6,..."
5,YIL079C,6,6,MSTLLSEVESIDTLPYVKDTTPTGSDSSSFNKLLAPSIEDVDANPE...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 3, 6, 2, 2, 3, 13, 8, 13, 3, 15, 18, 6, 2,..."
6,YOL086C,42,80,MSIPETQKGVIFYESHGKLEYKDIPVPKPKANELLINVKYSGVCHT...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 3, 15, 17, 13, 6, 12, 7, 14, 8, 15, 10, 16..."
7,YMR177W,1,1,MLRICVKRPCIKIVLSQVRPALLVRKENLHISTGVKVEKSSIINQK...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 2, 4, 15, 19, 8, 7, 4, 17, 19, 15, 7, 15, ..."
8,YKR067W,24,24,MSAPAADHNAAKPIPHVPQASRRYKNSYNGFVYNIHTWLYDVSVFL...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 3, 5, 17, 5, 5, 18, 11, 9, 5, 5, 7, 17, 15..."
9,YBR093C,14,14,MFKSVVYSILAASLANAGTIPLGKLADVDKIGTQKDIFPFLGGAGP...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 10, 7, 3, 8, 8, 16, 3, 15, 2, 5, 5, 3, 2, ..."


### Loading the ProtBERT tokenizer and model through the Hugging Face API

In [None]:
from transformers import BertForMaskedLM, BertTokenizer, pipeline

# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes (more slowly) on machines without CUDA instead of
# raising on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# do_lower_case=False keeps the residue letters exactly as they appear in the
# sequences instead of lower-casing them during tokenization.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = BertForMaskedLM.from_pretrained("Rostlab/prot_bert")

# Move the weights to the selected device (the cell output shows the model summary).
model.to(device)

With the pre-trained model loaded, pre-process each sequence, run it through the model, and collect the outputs. We keep only the LAST hidden layer, which serves as the per-token feature embedding; this is the standard protocol for using feature embeddings for token classification.

In [None]:
from tqdm import tqdm

# Send inputs to whichever device the model's weights live on, so the two
# always match regardless of whether the model was placed on GPU or CPU.
device = next(model.parameters()).device

# One entry per protein, in the same order as the rows of `yaam`.
representations = []
for sequence in tqdm(yaam['seqs'], desc='embedding'):
    # Replace the residue codes U, Z, O and B with X before tokenizing.
    sequence = re.sub(r"[UZOB]", "X", sequence)
    # Strip line breaks, then separate residues with spaces so the tokenizer
    # treats every residue as its own token.
    sequence = ' '.join(sequence.replace('\n', ''))
    encoded_input = tokenizer(sequence, return_tensors='pt').to(device)

    # Inference only: no_grad avoids building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_input, output_hidden_states=True)

    # Keep only the last layer's hidden state for classification. Move it to
    # the CPU so embeddings for the whole dataset do not accumulate in GPU memory.
    # NOTE(review): the tokenizer presumably adds special tokens (e.g. [CLS]/[SEP]),
    # so the token axis is likely longer than the sequence -- confirm before
    # aligning these embeddings with per-residue labels.
    representations.append(output['hidden_states'][-1].cpu())

The token embeddings can now be used for standard classification with ML tools such as random forests.