# Learning Encodes Alignment within a Protein Family.

In [1]:
# Needed to import modules from helpers
import sys
import os

# Resolve the parent of the current working directory (one folder up) and make
# sure it is on sys.path, so that modules living there can be imported.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import torch
import esm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.cm as cm
from helpers import helper
import time
import io
import urllib3
import requests

# Smaller model 'esm2_t6_8M_UR50D', used here for testing.
# The paper uses a 36-layer Transformer trained on UniParc (approx. 670M parameters).
model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()

# Move the model to the GPU when CUDA is available; otherwise it is left as loaded.
if torch.cuda.is_available():
    model = model.cuda()
    print("Modell auf GPU geladen.")

### Hypothesis 1:
The final hidden representations of a sequence encode information about the family it belongs to.

### Method:

- Get Dataset (Pfam)
- Compare the distribution of cosine similarities of representations between pairs of residues that are aligned in the family’s MSA with the background distribution of cosine similarities between unaligned pairs of residues.
- Compare with the distributions before learning (we need the embeddings before pretraining, i.e. from a model with randomized weights).


In [5]:
import gzip
import urllib.request
from Bio import AlignIO
from io import StringIO

from typing import Optional

# Official EBI FTP URL for the current release of the Pfam seed alignments
# (gzip-compressed, Stockholm format, one entry per family).
_PFAM_SEED_URL = "http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.seed.gz"


def _accession_of(line):
    """Return the accession named on a ``#=GF AC`` line, or None for any other line."""
    fields = line.split()
    if len(fields) >= 3 and fields[0] == "#=GF" and fields[1] == "AC":
        return fields[2].rstrip(";")
    return None


def _find_stockholm_entry(lines, target):
    """Return the first ``//``-terminated entry in *lines* whose accession is *target*.

    *target* matches an entry when it equals the entry's full accession
    (e.g. ``PF01010.24``) or its unversioned part (``PF01010``).
    Returns None when no entry matches.
    """
    entry_lines = []
    # None: accession line not seen yet; True/False: entry is / is not the target.
    is_target = None
    for line in lines:
        if line.startswith("//"):  # end of a record in Stockholm format
            if is_target:
                entry_lines.append(line)
                return "".join(entry_lines)
            entry_lines = []
            is_target = None
            continue
        if is_target is False:
            # Entry already ruled out: skip its lines instead of buffering them.
            continue
        entry_lines.append(line)
        if is_target is None and line.startswith("#=GF"):
            accession = _accession_of(line)
            if accession is not None:
                # Compare whole accessions, never substrings, so that a prefix
                # such as "PF0101" cannot select "PF01010".
                is_target = accession == target or accession.partition(".")[0] == target
                if not is_target:
                    entry_lines = []
    return None


def get_pfam_seed_by_id(pfam_accession: str, url: str = _PFAM_SEED_URL) -> Optional[str]:
    """
    Streams the Pfam-A seed database and extracts a specific family.
    Bypasses broken API endpoints.

    Args:
        pfam_accession: Pfam accession, with or without version suffix
            (e.g. "PF01010" or "PF01010.24"). Surrounding whitespace is ignored.
        url: Location of the gzip-compressed Stockholm file. Defaults to the
            EBI "current_release" seed file; any URL scheme supported by
            ``urllib.request.urlopen`` works.

    Returns:
        The complete Stockholm entry (including the terminating ``//`` line)
        as a string, or None if no entry has that accession or the accession
        is empty.
    """
    target = pfam_accession.strip()
    if not target:
        # Nothing to look for; avoid streaming the whole database.
        return None

    print(f"Streaming Pfam database to find {pfam_accession}...")
    print("This may take 1-2 minutes as it searches the compressed stream.")

    with urllib.request.urlopen(url) as response:
        # Decompress the stream on the fly. The encoding is pinned so decoding
        # does not depend on the machine's locale, and undecodable bytes are
        # replaced rather than aborting the scan part-way through the file.
        with gzip.open(response, "rt", encoding="utf-8", errors="replace") as stream:
            return _find_stockholm_entry(stream, target)

# Step 1: pull the raw Stockholm text of the PF01010 seed alignment.
# NOTE(review): the earlier comment called PF01010 the "Response regulator
# receiver domain"; that domain is normally listed as PF00072 - confirm which
# family is actually intended here.
msa_data = get_pfam_seed_by_id("PF01010")

if not msa_data:
    # Nothing came back for this accession.
    print("Family not found in the current Pfam release.")
else:
    # Step 2: hand the text to BioPython's Stockholm parser.
    msa = AlignIO.read(StringIO(msa_data), "stockholm")

    # Step 3: sanity-check the parsed alignment (first record id, size, width).
    print(f"\nSuccess! Loaded {msa[0].id}")
    print(f"Sequence count: {len(msa)}")
    print(f"Alignment length: {msa.get_alignment_length()}")

Streaming Pfam database to find PF01010...
This may take 1-2 minutes as it searches the compressed stream.

Success! Loaded Q8HUX0_9ROSA/106-348
Sequence count: 76
Alignment length: 344


In [None]:
# 1. Get embeddings from the TRAINED model.
# NOTE(review): `labels` and `seqs` are not defined in the cells shown here -
# they must be set by an earlier cell before this one runs.
print("Berechne TRAINIERTE Embeddings...")
# Step 1: get the hidden representations
token_reps_trained, batch_strs_trained = helper.get_hidden_representations(model, alphabet, labels, seqs)
# Step 2: perform mean pooling (per the original note; see helper.get_protein_embedding)
emb_trained = helper.get_protein_embedding(token_reps_trained, batch_strs_trained)

Berechne TRAINIERTE Embeddings...


In [None]:
# 2. Get embeddings from a random (UNTRAINED) model. One obvious problem here is
# that we do not know the seed; for that reason alone, results here will differ
# from the original paper.
# NOTE(review): if helper.randomize_model re-initializes the weights in place
# rather than on a copy, `model` is no longer the trained model after this
# cell - confirm against the helper.
print("Berechne UNTRAINIERTE Embeddings...")
untrained_model = helper.randomize_model(model)
if torch.cuda.is_available(): 
    untrained_model = untrained_model.cuda()

# Step 1: get the hidden representations (with the untrained model)
token_reps_untrained, batch_strs_untrained = helper.get_hidden_representations(untrained_model, alphabet, labels, seqs)
# Step 2: perform mean pooling (per the original note; see helper.get_protein_embedding)
emb_untrained = helper.get_protein_embedding(token_reps_untrained, batch_strs_untrained)

Berechne UNTRAINIERTE Embeddings...
