# Get embeddings for all genes
- Get the data from the UniProt downloads page: https://www.uniprot.org/help/downloads
  - https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/
    - https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/uniprot_sprot/
      - Download per-protein.h5
  - https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/README
    - One thing to note in the README is that:
      - 'Note: Protein sequences longer than 12k residues are excluded due to limitation of GPU memory (this concerns only a handful of proteins).'
  - As seen below, embeddings were found for most of our genes (20,326 embeddings for 20,591 genes in `human_genes_nfs.csv`); the remainder were not available in the downloaded file.

In [17]:
import h5py
import pandas as pd
import pandas as pd
import numpy as np

# Build a table of embeddings for the genes listed in human_genes_nfs.csv
# and cache it on disk as a pickle.

# Only the accession numbers are needed to select embeddings, so load just
# that column and keep the values in a set for membership tests.
accessions = set(
    pd.read_csv(
        './data/human_genes_nfs.csv', usecols=['Accession Number']
    )['Accession Number']
)

# Collect one (key, embedding) pair per gene that is present in the h5 file
data = []

# Open the h5 file in read-only mode; it is kept open only while reading
with h5py.File('data/per-protein.h5', 'r') as f:
    # Iterate over the keys in the h5 file (same order as before)
    for key in f.keys():
        # Test membership first so that datasets for proteins we do not
        # need are never read from disk.
        if key not in accessions:
            continue

        # Read the embedding and wrap it in a 2D array
        embedding = np.array([f[key][:]])

        # Add the key and embedding to the list
        data.append((key, embedding))

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data, columns=['key', 'embedding'])

# Save the DataFrame as a pickle (not a CSV): the 'embedding' column holds
# numpy arrays, which a pickle stores as-is.
df.to_pickle('./data/human_genes_nfs_embeddings.pkl')

In [19]:
# Reload the (key, embedding) table that was pickled to
# ./data/human_genes_nfs_embeddings.pkl by the extraction cell
df_embeddings = pd.read_pickle('./data/human_genes_nfs_embeddings.pkl')

In [21]:
# Summarise the embeddings DataFrame: number of rows, column names,
# non-null counts, dtypes and memory usage
df_embeddings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20326 entries, 0 to 20325
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   key        20326 non-null  object
 1   embedding  20326 non-null  object
dtypes: object(2)
memory usage: 317.7+ KB


In [22]:
# Read in human_genes_nfs.csv
import pandas as pd

# Load the gene table in one chain: read the CSV, key the rows by
# 'Accession Number', and discard the 'Unnamed: 0' column
df = (
    pd.read_csv('./data/human_genes_nfs.csv')
    .set_index('Accession Number')
    .drop(columns='Unnamed: 0')
)
# Print a summary of the resulting frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20591 entries, A0A075B6Y1 to Q9Y5J6
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         20591 non-null  object 
 1   AA Sequence  20591 non-null  object 
 2   NT Sequence  4683 non-null   object 
 3   Score        20591 non-null  float64
dtypes: float64(1), object(3)
memory usage: 804.3+ KB


In [7]:
# Compare the number of distinct Score values (missing values counted as
# one value) with the total number of Score entries
df['Score'].nunique(dropna=False), df['Score'].size