# CLIP approach

This notebook is inspired by this paper, where the authors tried a CLIP approach to predict structure of antiobody sequences. Here we try to apply the same to RNA folding. 

- https://www.mlsb.io/papers_2023/Enhancing_Antibody_Language_Models_with_Structural_Information.pdf


In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

fasta_files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if not"fasta" in filename:
            print(os.path.join(dirname, filename))
        else:
            fasta_files.append(filename)
print(f"{len(fasta_files)} fasta files")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stanford-rna-3d-folding/sample_submission.csv
/kaggle/input/stanford-rna-3d-folding/validation_sequences.csv
/kaggle/input/stanford-rna-3d-folding/test_sequences.csv
/kaggle/input/stanford-rna-3d-folding/validation_labels.csv
/kaggle/input/stanford-rna-3d-folding/train_labels.csv
/kaggle/input/stanford-rna-3d-folding/train_sequences.csv
856 fasta files


In [17]:
sequences_df = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/train_sequences.csv")
sequences_df[["target_id", "sequence"]]

Unnamed: 0,target_id,sequence
0,1SCL_A,GGGUGCUCAGUACGAGAGGAACCGCACCC
1,1RNK_A,GGCGCAGUGGGCUAGCGCCACUCAAAAGGCCCAU
2,1RHT_A,GGGACUGACGAUCACGCAGUCUAU
3,1HLX_A,GGGAUAACUUCGGUUGUCCC
4,1HMH_E,GGCGACCCUGAUGAGGCCGAAAGGCCGAAACCGU
...,...,...
839,8T3E_EC,AAACUCCAUGUAUUGGUUACCCAUCUGCAUCGAAAACUCUCCGAAC...
840,8T3F_EC,AAACUCCAUGUAUUGGUUACCCAUCUGCAUCGAAAACUCUCCGAAC...
841,8XCC_B,GUGCUGCUGUCUCCCAGACGGGAGGCAGAACUGCACCUUCCAUCAG...
842,8Z1G_T,GGUAAAAUGGCUGAGUGAAGCAUUGGACUGUAAAUCUAAAGACAGG...


In [18]:
labels_df = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/train_labels.csv")
labels_df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,ID,resname,resid,x_1,y_1,z_1
0,1SCL_A_1,G,1,13.760,-25.974001,0.102
1,1SCL_A_2,G,2,9.310,-29.638000,2.669
2,1SCL_A_3,G,3,5.529,-27.813000,5.878
3,1SCL_A_4,U,4,2.678,-24.900999,9.793
4,1SCL_A_5,G,5,1.827,-20.136000,11.793
...,...,...,...,...,...,...
137090,8Z1F_T_82,U,82,,,
137091,8Z1F_T_83,C,83,,,
137092,8Z1F_T_84,A,84,,,
137093,8Z1F_T_85,U,85,,,


In [19]:
submission = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/sample_submission.csv")
submission.head()

Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,R1107_1,G,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,R1107_2,G,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,R1107_3,G,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,R1107_4,G,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,R1107_5,G,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Using CLIP approach

In [20]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Example dataset class: replace with your actual data loading/preprocessing
class RNADataset(Dataset):
    def __init__(self, seq_features, struct_features):
        self.seq_features = seq_features  # numpy array or list, shape: (N, seq_input_dim)
        self.struct_features = struct_features  # shape: (N, struct_input_dim)
    
    def __len__(self):
        return len(self.seq_features)
    
    def __getitem__(self, idx):
        return {
            'seq_features': torch.tensor(self.seq_features[idx], dtype=torch.float),
            'struct_features': torch.tensor(self.struct_features[idx], dtype=torch.float)
        }


# Create dataset and dataloader
dataset = RNADataset(sequences_df["sequence"], labels_df)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [21]:
seq_input_dim = len(sequences_df)
struct_input_dim = len(labels_df)

In [22]:


# Define the simple encoders and contrastive loss from previous code
class RNASequenceEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim):
        super(RNASequenceEncoder, self).__init__()
        self.fc = nn.Linear(input_dim, emb_dim)
    def forward(self, x):
        return self.fc(x)

class RNAStructureEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim):
        super(RNAStructureEncoder, self).__init__()
        self.fc = nn.Linear(input_dim, emb_dim)
    def forward(self, x):
        return self.fc(x)

class ContrastiveCLIPLoss(nn.Module):
    def __init__(self, temperature=0.07):
        super(ContrastiveCLIPLoss, self).__init__()
        self.temperature = temperature
    def forward(self, seq_embeddings, struct_embeddings):
        seq_norm = F.normalize(seq_embeddings, dim=1, eps=1e-6)
        struct_norm = F.normalize(struct_embeddings, dim=1, eps=1e-6)
        logits = torch.matmul(seq_norm, struct_norm.t()) / self.temperature
        batch_size = logits.size(0)
        labels = torch.arange(batch_size).to(logits.device)
        loss_seq = F.cross_entropy(logits, labels)
        loss_struct = F.cross_entropy(logits.t(), labels)
        return (loss_seq + loss_struct) / 2.0

# Set embedding dimension
emb_dim = 256

# Initialize models
seq_encoder = RNASequenceEncoder(seq_input_dim, emb_dim)
struct_encoder = RNAStructureEncoder(struct_input_dim, emb_dim)
criterion = ContrastiveCLIPLoss(temperature=0.07)

# Set up optimizer (if desired, you could update only one encoder)
optimizer = torch.optim.Adam(list(seq_encoder.parameters()) + list(struct_encoder.parameters()), lr=1e-3)

# Optional: move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seq_encoder.to(device)
struct_encoder.to(device)
criterion.to(device)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in dataloader:
        # Get batch data and send to device
        seq_batch = batch['seq_features'].to(device)      # shape: (batch_size, seq_input_dim)
        struct_batch = batch['struct_features'].to(device)  # shape: (batch_size, struct_input_dim)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass: compute embeddings for each modality
        seq_embeddings = seq_encoder(seq_batch)         # (batch_size, emb_dim)
        struct_embeddings = struct_encoder(struct_batch)  # (batch_size, emb_dim)
        
        # Compute contrastive loss
        loss = criterion(seq_embeddings, struct_embeddings)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

print("Training complete.")


TypeError: new(): invalid data type 'str'

In [None]:
import pandas as pd
import numpy as np

# Read the test sequences file. Adjust engine/quoting if necessary.
test_df = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/test_sequences.csv", engine="python")

# Prepare a list to collect submission rows
submission_rows = []

# For each test sequence, create one row per residue
for _, row in test_df.iterrows():
    target_id = row["target_id"]
    sequence = str(row["sequence"]).strip()  # Ensure it's a string and remove extra whitespace/newlines
    for i, nucleotide in enumerate(sequence):
        resid = i + 1
        # Create an ID by appending the residue index to the target_id, e.g., "R1107_1"
        new_id = f"{target_id}_{resid}"
        # For a random submission, fill coordinates with zeros.
        coords = [0.0] * (3 * 5)  # 5 predictions, each with x, y, z (total 15 numbers)
        submission_rows.append([new_id, nucleotide, resid] + coords)

# Define column names: ID, resname, resid, followed by x_1, y_1, z_1, ..., x_5, y_5, z_5.
columns = ["ID", "resname", "resid"] + [f"{axis}_{i}" for i in range(1, 6) for axis in ["x", "y", "z"]]

# Create the submission DataFrame
submission_df = pd.DataFrame(submission_rows, columns=columns)

# Save the submission file to /kaggle/working (this is the working directory in Kaggle notebooks)
submission_df.to_csv("/kaggle/working/submission.csv", index=False)
print("Submission file saved to /kaggle/working/submission.csv")


In [None]:
submission_df