# Preprocessing protein embeddings

Here we load the per-protein embeddings for all *E. coli*, human, and *Arabidopsis thaliana* proteins, downloaded directly from [UniProt](https://www.uniprot.org/help/downloads), and export each set to a CSV file.

Each embedding is a 1024-dimensional vector produced by the `prottrans_t5_xl_u50` model.


In [1]:
import pandas as pd
import numpy as np
import h5py

In [2]:
# Load the E. coli embeddings from the HDF5 file into memory.
# The file is opened in a `with` block so the handle is released as soon as
# every embedding has been copied out (previously it was left open).
with h5py.File("ProtT5_embeddings/ecoli_embeddings.h5", "r") as file:
    print(f"number of entries: {len(file.items())}")

    # Sanity check: show only the first entry's id, shape and mean value.
    for sequence_id, embedding in file.items():
        print(
            f"  id: {sequence_id}, "
            f"  embeddings shape: {embedding.shape}, "
            f"  embeddings mean: {np.array(embedding).mean()}"
        )
        break

    # `keys` holds the entry ids and `ecoli_embeds` the matching vectors, in
    # the same order. np.array(...) materializes each dataset as an in-memory
    # NumPy array, so the data stays usable after the file is closed.
    keys = list(file.keys())
    ecoli_embeds = [np.array(file[k]) for k in keys]

number of entries: 4403
  id: A0A385XJ53,   embeddings shape: (1024,),   embeddings mean: -0.004241943359375


In [3]:
# Export the E. coli embeddings as a table: one row per entry id (from `keys`),
# one column per embedding component, labelled PT5_0 ... PT5_1023.
ecoli_columns = ['PT5_%d' % dim for dim in range(1024)]
ecoli_frame = pd.DataFrame(ecoli_embeds, index=keys, columns=ecoli_columns)
ecoli_frame.to_csv('ProtT5_embeddings/Ecoli_preprocessed.csv')

In [4]:
# Load the A. thaliana embeddings from the HDF5 file into memory.
# The file is opened in a `with` block so the handle is released as soon as
# every embedding has been copied out (previously it was left open).
with h5py.File("ProtT5_embeddings/athaliana_embeddings.h5", "r") as file:
    print(f"number of entries: {len(file.items())}")

    # Sanity check: show only the first entry's id, shape and mean value.
    for sequence_id, embedding in file.items():
        print(
            f"  id: {sequence_id}, "
            f"  embeddings shape: {embedding.shape}, "
            f"  embeddings mean: {np.array(embedding).mean()}"
        )
        break

    # `keys` holds the entry ids and `athali_embeds` the matching vectors, in
    # the same order. np.array(...) materializes each dataset as an in-memory
    # NumPy array, so the data stays usable after the file is closed.
    keys = list(file.keys())
    athali_embeds = [np.array(file[k]) for k in keys]

number of entries: 27448
  id: A0A0A7EPL0,   embeddings shape: (1024,),   embeddings mean: 0.00220489501953125


In [5]:
# Export the A. thaliana embeddings as a table: one row per entry id (from
# `keys`), one column per embedding component, labelled PT5_0 ... PT5_1023.
athali_columns = ['PT5_%d' % dim for dim in range(1024)]
athali_frame = pd.DataFrame(athali_embeds, index=keys, columns=athali_columns)
athali_frame.to_csv('ProtT5_embeddings/Athaliana_preprocessed.csv')

In [6]:
# Load the human embeddings from the HDF5 file into memory.
# The file is opened in a `with` block so the handle is released as soon as
# every embedding has been copied out (previously it was left open).
with h5py.File("ProtT5_embeddings/human_embeddings.h5", "r") as file:
    print(f"number of entries: {len(file.items())}")

    # Sanity check: show only the first entry's id, shape and mean value.
    for sequence_id, embedding in file.items():
        print(
            f"  id: {sequence_id}, "
            f"  embeddings shape: {embedding.shape}, "
            f"  embeddings mean: {np.array(embedding).mean()}"
        )
        break

    # `keys` holds the entry ids and `h_embeds` the matching vectors, in the
    # same order. np.array(...) materializes each dataset as an in-memory
    # NumPy array, so the data stays usable after the file is closed.
    keys = list(file.keys())
    h_embeds = [np.array(file[k]) for k in keys]

number of entries: 20594
  id: A0A024R1R8,   embeddings shape: (1024,),   embeddings mean: -0.0029697418212890625


In [7]:
# Export the human embeddings as a table: one row per entry id (from `keys`),
# one column per embedding component, labelled PT5_0 ... PT5_1023.
human_columns = ['PT5_%d' % dim for dim in range(1024)]
human_frame = pd.DataFrame(h_embeds, index=keys, columns=human_columns)
human_frame.to_csv('ProtT5_embeddings/Human_preprocessed.csv')