# CLIP approach

This notebook is inspired by this paper, where the authors tried a CLIP approach to predict structure of antiobody sequences. Here we try to apply the same to RNA folding. 

- https://www.mlsb.io/papers_2023/Enhancing_Antibody_Language_Models_with_Structural_Information.pdf


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

fasta_files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if not"fasta" in filename:
            print(os.path.join(dirname, filename))
        else:
            fasta_files.append(filename)
print(f"{len(fasta_files)} fasta files")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stanford-rna-3d-folding/sample_submission.csv
/kaggle/input/stanford-rna-3d-folding/validation_sequences.csv
/kaggle/input/stanford-rna-3d-folding/test_sequences.csv
/kaggle/input/stanford-rna-3d-folding/validation_labels.csv
/kaggle/input/stanford-rna-3d-folding/train_labels.csv
/kaggle/input/stanford-rna-3d-folding/train_sequences.csv
856 fasta files


In [2]:
sequences_df = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/train_sequences.csv")
sequences_df[["target_id", "sequence"]]

Unnamed: 0,target_id,sequence
0,1SCL_A,GGGUGCUCAGUACGAGAGGAACCGCACCC
1,1RNK_A,GGCGCAGUGGGCUAGCGCCACUCAAAAGGCCCAU
2,1RHT_A,GGGACUGACGAUCACGCAGUCUAU
3,1HLX_A,GGGAUAACUUCGGUUGUCCC
4,1HMH_E,GGCGACCCUGAUGAGGCCGAAAGGCCGAAACCGU
...,...,...
839,8T3E_EC,AAACUCCAUGUAUUGGUUACCCAUCUGCAUCGAAAACUCUCCGAAC...
840,8T3F_EC,AAACUCCAUGUAUUGGUUACCCAUCUGCAUCGAAAACUCUCCGAAC...
841,8XCC_B,GUGCUGCUGUCUCCCAGACGGGAGGCAGAACUGCACCUUCCAUCAG...
842,8Z1G_T,GGUAAAAUGGCUGAGUGAAGCAUUGGACUGUAAAUCUAAAGACAGG...


In [3]:
labels_df = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/train_labels.csv")
labels_df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,ID,resname,resid,x_1,y_1,z_1
0,1SCL_A_1,G,1,13.760,-25.974001,0.102
1,1SCL_A_2,G,2,9.310,-29.638000,2.669
2,1SCL_A_3,G,3,5.529,-27.813000,5.878
3,1SCL_A_4,U,4,2.678,-24.900999,9.793
4,1SCL_A_5,G,5,1.827,-20.136000,11.793
...,...,...,...,...,...,...
137090,8Z1F_T_82,U,82,,,
137091,8Z1F_T_83,C,83,,,
137092,8Z1F_T_84,A,84,,,
137093,8Z1F_T_85,U,85,,,


In [4]:
submission = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/sample_submission.csv")
submission.head()

Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,R1107_1,G,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,R1107_2,G,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,R1107_3,G,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,R1107_4,G,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,R1107_5,G,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Using CLIP approach

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Assume you already have your two dataframes loaded:
# sequences_df has columns: "target_id" and "sequence"
# labels_df has columns: "ID", "resname", "resid", "x_1", "y_1", "z_1"

# Example: sequences_df = pd.read_csv("sequences.csv")
#          labels_df = pd.read_csv("labels.csv")

# --- Process the labels dataframe to get one structure representation per sequence ---
# Extract target_id from the "ID" field (e.g. "1SCL_A_1" -> "1SCL_A")
def extract_target_id(label_id):
    return "_".join(label_id.split("_")[:-1])

labels_df['target_id'] = labels_df['ID'].apply(extract_target_id)

# Group by target_id and compute average coordinates (ignoring NaNs)
structure_agg = labels_df.groupby("target_id")[["x_1", "y_1", "z_1"]].mean().reset_index()

# Merge the structure data with sequences
merged_df = pd.merge(sequences_df, structure_agg, on="target_id", how="inner")
# Now merged_df has columns: target_id, sequence, x_1, y_1, z_1

# --- Define helper functions to compute simple features ---

# For RNA sequence, compute normalized nucleotide frequencies (order: A, C, G, U)
def sequence_to_feature(seq):
    seq = seq.upper()
    counts = [seq.count(nuc) for nuc in "ACGU"]
    total = sum(counts)
    if total > 0:
        return np.array([c/total for c in counts], dtype=np.float32)
    else:
        return np.zeros(4, dtype=np.float32)

# For structure, we already have average x, y, z coordinates
def structure_to_feature(row):
    # row: a pandas Series with x_1, y_1, z_1 values
    return np.array([row["x_1"], row["y_1"], row["z_1"]], dtype=np.float32)

# --- Prepare feature matrices from the merged dataframe ---
seq_features = np.stack(merged_df['sequence'].apply(sequence_to_feature).values)  # shape: (N, 4)
struct_features = np.stack(merged_df.apply(structure_to_feature, axis=1).values)  # shape: (N, 3)

# Convert features to torch tensors
seq_features_tensor = torch.tensor(seq_features)       # shape: (batch_size, 4)
struct_features_tensor = torch.tensor(struct_features) # shape: (batch_size, 3)

# --- Define simple encoder networks ---
# In practice, you'll replace these with your Helix-mRNA model (or its embeddings) and a proper structure encoder.

class RNASequenceEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim):
        super(RNASequenceEncoder, self).__init__()
        self.fc = nn.Linear(input_dim, emb_dim)
    def forward(self, x):
        return self.fc(x)

class RNAStructureEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim):
        super(RNAStructureEncoder, self).__init__()
        self.fc = nn.Linear(input_dim, emb_dim)
    def forward(self, x):
        return self.fc(x)

# Set embedding dimension (for example, 256)
emb_dim = 256

seq_encoder = RNASequenceEncoder(input_dim=4, emb_dim=emb_dim)
struct_encoder = RNAStructureEncoder(input_dim=3, emb_dim=emb_dim)

# --- Define a contrastive loss function similar to CLIP's ---
class ContrastiveCLIPLoss(nn.Module):
    def __init__(self, temperature=0.07):
        super(ContrastiveCLIPLoss, self).__init__()
        self.temperature = temperature
    def forward(self, seq_embeddings, struct_embeddings):
        # Normalize embeddings
        seq_norm = F.normalize(seq_embeddings, dim=1)
        struct_norm = F.normalize(struct_embeddings, dim=1)
        # Compute cosine similarity matrix
        logits = torch.matmul(seq_norm, struct_norm.t()) / self.temperature
        batch_size = logits.size(0)
        labels = torch.arange(batch_size).to(logits.device)
        loss_seq = F.cross_entropy(logits, labels)
        loss_struct = F.cross_entropy(logits.t(), labels)
        return (loss_seq + loss_struct) / 2.0

# --- Compute embeddings and loss ---
seq_embeddings = seq_encoder(seq_features_tensor)         # (N, emb_dim)
struct_embeddings = struct_encoder(struct_features_tensor)  # (N, emb_dim)

# Optionally, normalize embeddings (the loss function does this internally too)
criterion = ContrastiveCLIPLoss(temperature=0.07)
loss = criterion(seq_embeddings, struct_embeddings)
print("Contrastive loss:", loss.item())

# --- Training Considerations ---
# In a real training loop, you would iterate over batches of your data,
# update the sequence encoder (and optionally the structure encoder), and backpropagate the contrastive loss.


Contrastive loss: nan
