# Getting My Head Around InterPro data

In [16]:
import torch.nn as nn

from accelerate import Accelerator
from esm.models.esm3 import ESM3
from esm.sdk.api import ESMProtein, SamplingConfig
from esm.utils.constants.models import ESM3_OPEN_SMALL
from esm.utils.function.interpro import InterPro

In [2]:
# device = Accelerator().device
# device = "cpu"
interpro = InterPro()

# TODO
- Start with a single InterPro domain - train a model that predicts if one domain is expressed in these proteins or not

# Helpful Links
- https://github.com/evolutionaryscale/esm/issues/2
- https://www.ebi.ac.uk/interpro/protein/reviewed/P00748/entry/InterPro/#table

# Entries

Entries seem to be categories - there are $\approx$29k

In [3]:
len(interpro.entries)

29026

In [29]:
interpro.entries['IPR000742']

InterProEntry(id='IPR000742', type=3, name='EGF-like domain', description=None)

**Example**: Here's a protein `P00748` (Coagulation factor XII - https://www.ebi.ac.uk/interpro/protein/reviewed/P00748/). Its sequence is:

In [4]:
cur_sequence = "MRALLLLGFLLVSLESTLSIPPWEAPKEHKYKAEEHTVVLTVTGEPCHFPFQYHRQLYHKCTHKGRPGPQPWCATTPNFDQDQRWGYCLEPKKVKDHCSKHSPCQKGGTCVNMPSGPHCLCPQHLTGNHCQKEKCFEPQLLRFFHKNEIWYRTEQAAVARCQCKGPDAHCQRLASQACRTNPCLHGGRCLEVEGHRLCHCPVGYTGAFCDVDTKASCYDGRGLSYRGLARTTLSGAPCQPWASEATYRNVTAEQARNWGLGGHAFCRNPDNDIRPWCFVLNRDRLSWEYCDLAQCQTPTQAAPPTPVSPRLHVPLMPAQPAPPKPQPTTRTPPQSQTPGALPAKREQPPSLTRNGPLSCGQRLRKSLSSMTRVVGGLVALRGAHPYIAALYWGHSFCAGSLIAPCWVLTAAHCLQDRPAPEDLTVVLGQERRNHSCEPCQTLAVRSYRLHEAFSPVSYQHDLALLRLQEDADGSCALLSPYVQPVCLPSGAARPSETTLCQVAGWGHQFEGAEEYASFLQEAQVPFLSLERCSAPDVHGSSILPGMLCAGFLEGGTDACQGDSGGPLVCEDQAAERRLTLQGIISWGSGCGDRNKPGVYTDVAYYLAWIREHTVS"

In [5]:
len(cur_sequence)

615

# ESM Embeddings

We can easily get embeddings from ESM models:

In [7]:
client = ESM3.from_pretrained(ESM3_OPEN_SMALL)

Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

  state_dict = torch.load(


In [13]:
protein = ESMProtein(sequence=cur_sequence)
protein_tensor = client.encode(protein)
output = client.forward_and_sample(
    protein_tensor, SamplingConfig(return_per_residue_embeddings=True)
)

In [14]:
output.per_residue_embedding.shape

torch.Size([617, 1536])

# Basic InterPro Head

In [31]:
class InterProHead(nn.Module):
    def __init__(
        self,
        input_dim: int,
        output_dim: int = len(interpro.entries),
        dropout_rate: float = 0.1
    ):
        super(InterProHead, self).__init__()
        layers = []

        # Simple one-layer affair
        layers.append(nn.Linear(input_dim, output_dim))
        layers.append(nn.GELU())  # Activation
        layers.append(nn.Dropout(dropout_rate))
        layers.append(nn.Sigmoid())  # Binary classification output
        
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


In [32]:
interpro_head = InterProHead(
    input_dim=output.per_residue_embedding.shape[-1],
    output_dim=1
)

In [33]:
# Probability of 
interpro_head(output.per_residue_embedding).shape

torch.Size([617, 1])

In [34]:
# Probability of 
interpro_head(output.per_residue_embedding)

tensor([[1.0000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [1

# All UniProtKB proteins and the InterPro entries

More work is needed here - need to figure out how to read the data

In [35]:
import gzip
import os

In [36]:
i = 0

with gzip.open('esm/data/interpro/match_complete.xml.gz', 'rt') as f:
    for line in f:
        print(line)  # process each line
        i = i + 1
        if i > 200:
            break

<?xml version="1.0" encoding="UTF-8"?>

<!DOCTYPE interpromatch SYSTEM "match_complete.dtd">

<interpromatch>

<release>

  <dbinfo dbname="SFLD" version="4" entry_count="303" file_date="07-SEP-18"/>

  <dbinfo dbname="PRINTS" version="42.0" entry_count="2106" file_date="14-JUN-12"/>

  <dbinfo dbname="PFAM" version="37.2" entry_count="24076" file_date="06-FEB-25"/>

  <dbinfo dbname="INTERPRO" version="104.0" entry_count="47677" file_date="06-FEB-25"/>

  <dbinfo dbname="CDD" version="3.21" entry_count="19902" file_date="18-APR-24"/>

  <dbinfo dbname="PROFILE" version="2023_05" entry_count="1379" file_date="08-NOV-23"/>

  <dbinfo dbname="NCBIFAM" version="17.0" entry_count="31915" file_date="16-DEC-24"/>

  <dbinfo dbname="PROSITE" version="2023_05" entry_count="1311" file_date="08-NOV-23"/>

  <dbinfo dbname="HAMAP" version="2023_05" entry_count="2389" file_date="08-NOV-23"/>

  <dbinfo dbname="SMART" version="9.0" entry_count="1322" file_date="14-FEB-20"/>

  <dbinfo dbname="PIRSF