# Learning Encodes Remote Homology.

In [1]:
# Needed to import modules from helpers
import sys
import os

current_dir = os.getcwd()
# Gehe einen Ordner nach oben
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

In [6]:
import torch
import esm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.cm as cm
from helpers import helper
import time
import io
import urllib3
import requests

# kleineres Modell 'esm2_t6_8M_UR50D' zum testen 
# verwendet 36-layer Transformer trained on UniParc" (ca. 670 Mio. Parameter ) im Paper.
model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()

if torch.cuda.is_available():
    model = model.cuda()
    print("Modell auf GPU geladen.")

If structural homology is encoded in the metric structure of the representation space, then the distance between proteins reflects their degree of structural relatedness

### Hypothesis 1:
proteins of same superfamily are closer in the representation space then to proteins of other superfamilies.

### Hypothesis 2:
proteins that have the same fold even if they are of different superfamilies are closer to each other in the representation space than proteins of different superfamilies which dont have the fold. 

### Method:

- Get Dataset
- Test Hypothesis 1
- Test Hypothesis 2


In [7]:
# 1. SETUP: IGNORE SSL ERRORS
# We suppress the warning that appears when verifying is disabled
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# 2. DOWNLOAD/LOAD SCOPe DATA
# Using the latest stable version (2.07)
url = "https://scop.berkeley.edu/downloads/scopeseq-2.07/dir.cla.scope.2.07-stable.txt"

print(f"Downloading SCOPe data from {url}...")

# FIX: Added verify=False to bypass the certificate error
response = requests.get(url, verify=False)

# The file has a header, we skip the first few lines that start with '#'
content = response.content.decode('utf-8')
data_lines = [line for line in content.split('\n') if not line.startswith('#') and line.strip()]

# Parse into DataFrame
# SCOPe format usually: FA-DOMID  PDB  CHAIN  SCCS  SID
data = []
for line in data_lines:
    parts = line.split('\t')
    if len(parts) >= 5:
        data.append({
            'domain_id': parts[0],
            'pdb_id': parts[1],
            'chain': parts[2],
            'sccs': parts[3], # Structural Classification Code (e.g., b.1.1.1)
            'sid': parts[4]
        })

df_scope = pd.DataFrame(data)
print(f"Total domains loaded: {len(df_scope)}")

# 3. DEFINE EXCLUSION CRITERIA
# [cite_start]Based on the paper[cite: 205]:
# "exclude Rossmann-like folds (c.2 to c.5, c.27 and 28, c.30 and 31)"
# "four- to eight-bladed β-propellers (b.66 to b.70)"

rossmann_exclusions = [
    'c.2', 'c.3', 'c.4', 'c.5', 
    'c.27', 'c.28', 
    'c.30', 'c.31'
]

propeller_exclusions = [
    'b.66', 'b.67', 'b.68', 'b.69', 'b.70'
]

# Combine all excluded folds
excluded_folds = set(rossmann_exclusions + propeller_exclusions)

# 4. APPLY FILTERING
def is_excluded(sccs):
    """
    Parses the SCCS code (e.g., 'c.2.1.1') and checks if the fold (e.g., 'c.2')
    is in the exclusion list.
    """
    try:
        parts = sccs.split('.')
        # Fold is the first two parts (Class.Fold)
        fold_code = f"{parts[0]}.{parts[1]}"
        return fold_code in excluded_folds
    except IndexError:
        return False

# Create a boolean mask
mask_excluded = df_scope['sccs'].apply(is_excluded)

# Split datasets
df_excluded_structures = df_scope[mask_excluded]
df_clean_dataset = df_scope[~mask_excluded].copy()

# 5. REPORT RESULTS
print("-" * 40)
print(f"Filtering Report:")
print(f"Original Count:   {len(df_scope)}")
print(f"Excluded Count:   {len(df_excluded_structures)}")
print(f"Final Dataset:    {len(df_clean_dataset)}")
print("-" * 40)
print("Exclusion Breakdown:")
print(f"Rossmann-like excluded: {df_excluded_structures['sccs'].apply(lambda x: x.startswith('c')).sum()}")
print(f"Beta-propellers excluded: {df_excluded_structures['sccs'].apply(lambda x: x.startswith('b')).sum()}")

Downloading SCOPe data from https://scop.berkeley.edu/downloads/scopeseq-2.07/dir.cla.scope.2.07-stable.txt...
Total domains loaded: 0


KeyError: 'sccs'

In [None]:
# 1. Embeddings mit dem TRAINIERTEN Modell holen
print("Berechne TRAINIERTE Embeddings...")
# Schritt 1: Hidden Representations holen
token_reps_trained, batch_strs_trained = helper.get_hidden_representations(model, alphabet, labels, seqs)
# Schritt 2: Mean Pooling durchführen
emb_trained = helper.get_protein_embedding(token_reps_trained, batch_strs_trained)

Berechne TRAINIERTE Embeddings...


In [None]:
# 2. Embeddings mit einem zufälligen (UNTRAINIERTEN) Modell holen (natürlich ist hier ein Problem, 
# dass wir den seed nicht kennen alleine deshalb werden sich hier Sachen vom original Paper unterscheiden)
print("Berechne UNTRAINIERTE Embeddings...")
untrained_model = helper.randomize_model(model)
if torch.cuda.is_available(): 
    untrained_model = untrained_model.cuda()

# Schritt 1: Hidden Representations holen (mit untrainiertem Modell)
token_reps_untrained, batch_strs_untrained = helper.get_hidden_representations(untrained_model, alphabet, labels, seqs)
# Schritt 2: Mean Pooling durchführen
emb_untrained = helper.get_protein_embedding(token_reps_untrained, batch_strs_untrained)

Berechne UNTRAINIERTE Embeddings...
