In [None]:
# Install required packages
!pip install wandb transformers torch datasets -q

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

# ====== TRACKIO/WANDB INTEGRATION ======
import wandb

# Fixed seed so weight initialisation and DataLoader shuffling are repeatable
# from one run to the next. NOTE(review): some GPU kernels may still be
# non-deterministic; this only pins the random number generators.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Initialize Weights & Biases for experiment tracking.
# The hyperparameters below are read back from `wandb.config` by later cells,
# so this dict is the single place to tune them.
wandb.init(
    project="protein-secondary-structure",
    name="bilstm-sst-prediction",
    config={
        "batch_size": 32,
        "embed_dim": 64,
        "hidden_dim": 128,
        "epochs": 10,
        "learning_rate": 1e-3,
        "seed": SEED,
    }
)

# ====== HUGGING FACE INTEGRATION ======
from transformers import AutoTokenizer, AutoModel
# We'll use ProtBERT embeddings as an option
# Uncomment below to use pre-trained protein embeddings
# hf_model_name = "Rostlab/prot_bert"
# hf_tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
# hf_model = AutoModel.from_pretrained(hf_model_name)

print("✓ Trackio/WandB initialized successfully!")
print("✓ Hugging Face transformers loaded!")
# Use the `url` property of the active run; the older `get_url()` method is
# deprecated in the wandb SDK.
print(f"Dashboard URL: {wandb.run.url}")

# Load data.
# The folder holding train.csv / test.csv can be overridden with the
# PROTEIN_DATA_DIR environment variable; the fallback is the original local
# download folder, so an existing setup keeps working unchanged.
DATA_DIR = Path(os.environ.get(
    "PROTEIN_DATA_DIR",
    r'c:/Users/meekg/Downloads/sep-25-dl-gen-ai-nppe-2',
))
train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')

# Log dataset info to WandB
wandb.log({
    "train_size": len(train_df),
    "test_size": len(test_df)
})

# Label strings for the two tasks (c3 = 'sst3' column, c8 = 'sst8' column)
# and the raw sequences.
train_c3 = train_df['sst3']
train_c8 = train_df['sst8']
train_seq = train_df['seq']
test_seq = test_df['seq']

# Encode labels for c3 and c8: the class alphabet is every distinct character
# that occurs in the training label strings.
c3_labels = sorted(set(''.join(train_c3)))
c8_labels = sorted(set(''.join(train_c8)))
c3_encoder = LabelEncoder().fit(c3_labels)
c8_encoder = LabelEncoder().fit(c8_labels)

print(f"C3 classes: {c3_labels} (count: {len(c3_labels)})")
print(f"C8 classes: {c8_labels} (count: {len(c8_labels)})")

# Max sequence length for padding (over train and test together).
# Cast to a plain int so it is not carried around as a numpy scalar.
max_len = int(max(train_seq.str.len().max(), test_seq.str.len().max()))
print(f"Max sequence length: {max_len}")

# Amino acid vocabulary: every distinct character seen in train or test.
# The two sets are built separately to avoid concatenating both corpora.
vocab = sorted(set(''.join(train_seq)) | set(''.join(test_seq)))
vocab_dict = {aa: i+1 for i, aa in enumerate(vocab)}  # 0 is padding
vocab_size = len(vocab_dict) + 1

wandb.config.update({
    "vocab_size": vocab_size,
    "max_len": max_len,
    "c3_classes": len(c3_labels),
    "c8_classes": len(c8_labels)
})

In [None]:
# Helper functions for encoding sequences and labels
def encode_sequence(seq, vocab_dict, max_len):
    """Turn a sequence string into a fixed-length array of vocabulary indices.

    Each of the first ``max_len`` characters of ``seq`` is looked up in
    ``vocab_dict``; characters that are not in the dict map to 0. Positions
    past the end of ``seq`` are left at 0.

    Returns:
        A 1-D integer numpy array of length ``max_len``.
    """
    encoded = np.zeros(max_len, dtype=int)
    indices = [vocab_dict.get(residue, 0) for residue in seq[:max_len]]
    if indices:
        encoded[:len(indices)] = indices
    return encoded

def encode_labels(labels, encoder, max_len):
    """Turn a label string into a fixed-length array of class indices.

    The first ``max_len`` characters of ``labels`` are converted with a single
    ``encoder.transform`` call (one call per sequence rather than one per
    character). Positions past the end of ``labels`` hold -1.

    Args:
        labels: String with one label character per position.
        encoder: Object whose ``transform(list_of_chars)`` returns one integer
            per character (a fitted ``LabelEncoder`` in this notebook).
        max_len: Length of the returned array.

    Returns:
        A 1-D integer numpy array of length ``max_len``.
    """
    arr = np.full(max_len, -1, dtype=int)
    chars = list(labels[:max_len])
    # Skip the encoder entirely for an empty label string.
    if chars:
        arr[:len(chars)] = encoder.transform(chars)
    return arr

# Custom Dataset
class ProteinDataset(Dataset):
    """Dataset that encodes one sequence (and optionally its labels) per item.

    ``seqs`` and ``labels`` are indexed positionally through ``.iloc``.
    When ``labels`` is ``None`` the dataset yields only the encoded sequence;
    otherwise it yields an ``(encoded_sequence, encoded_labels)`` pair.
    Both tensors have dtype ``torch.long``.
    """

    def __init__(self, seqs, labels, vocab_dict, encoder, max_len):
        self.seqs, self.labels = seqs, labels
        self.vocab_dict, self.encoder = vocab_dict, encoder
        self.max_len = max_len

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        tokens = torch.tensor(
            encode_sequence(self.seqs.iloc[idx], self.vocab_dict, self.max_len),
            dtype=torch.long,
        )
        # Unlabelled (test) data: nothing more to encode.
        if self.labels is None:
            return tokens
        targets = torch.tensor(
            encode_labels(self.labels.iloc[idx], self.encoder, self.max_len),
            dtype=torch.long,
        )
        return tokens, targets

In [None]:
# Enhanced BiLSTM Model with Dropout and LayerNorm
def make_bilstm_model(vocab_size, embed_dim, hidden_dim, num_classes, dropout=0.3):
    """Build a per-position classifier around a 2-layer bidirectional LSTM.

    The returned module applies, in order: embedding (index 0 is the padding
    index), dropout, the LSTM, dropout, layer normalisation and a linear
    layer producing ``num_classes`` scores per position.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width, also the LSTM input size.
        hidden_dim: LSTM hidden size per direction.
        num_classes: Width of the final linear layer's output.
        dropout: Probability used by both dropout layers and between the
            stacked LSTM layers.

    Returns:
        A freshly initialised ``nn.Module`` instance.
    """
    # Forward and backward hidden states are concatenated by the LSTM.
    lstm_out_dim = hidden_dim * 2

    class BiLSTM(nn.Module):
        def __init__(self):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
            self.dropout1 = nn.Dropout(dropout)
            self.lstm = nn.LSTM(
                embed_dim,
                hidden_dim,
                num_layers=2,
                batch_first=True,
                bidirectional=True,
                dropout=dropout,
            )
            self.dropout2 = nn.Dropout(dropout)
            self.layer_norm = nn.LayerNorm(lstm_out_dim)
            self.fc = nn.Linear(lstm_out_dim, num_classes)

        def forward(self, x):
            embedded = self.dropout1(self.embedding(x))
            hidden_states, _ = self.lstm(embedded)
            normed = self.layer_norm(self.dropout2(hidden_states))
            return self.fc(normed)

    return BiLSTM()

# Training and evaluation helpers with WandB logging
def train_epoch(model, loader, optimizer, criterion, device, epoch, model_name):
    """Run one training pass over ``loader`` and log the losses to W&B.

    Args:
        model: Module called as ``model(x)``; its output is permuted from
            (batch, seq, classes) to (batch, classes, seq) before the loss.
        loader: Iterable of ``(x, y)`` batches. It does not need to support
            ``len()``; the average is taken over the batches actually seen.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(out, y)``.
        device: Device both tensors of every batch are moved to.
        epoch: Epoch number, logged as ``{model_name}_epoch``.
        model_name: Prefix for the logged metric names.

    Returns:
        Mean of the per-batch losses for this epoch.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    batch_count = 0
    for batch_count, (x, y) in enumerate(loader, start=1):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        out = out.permute(0, 2, 1)  # (batch, classes, seq)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        total_loss += batch_loss

        # Log batch loss to WandB every 50 batches
        if batch_count % 50 == 0:
            wandb.log({f"{model_name}_batch_loss": batch_loss})

    # Divide by the number of batches processed rather than len(loader), so
    # loaders without __len__ work; the value is the same for a normal
    # DataLoader.
    avg_loss = total_loss / batch_count
    # Log epoch metrics to WandB
    wandb.log({
        f"{model_name}_epoch": epoch,
        f"{model_name}_epoch_loss": avg_loss
    })
    return avg_loss

def predict(model, loader, device):
    """Return the highest-scoring class index for every position in ``loader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch is moved to ``device``, the arg-max over the last output axis
    is taken, and the per-batch results are joined along axis 0.

    Returns:
        A numpy array of class indices covering all batches.
    """
    model.eval()
    with torch.no_grad():
        batch_preds = [
            torch.argmax(model(batch.to(device)), dim=-1).cpu().numpy()
            for batch in loader
        ]
    return np.concatenate(batch_preds, axis=0)

In [None]:
# Hyperparameters are read back from the W&B run config, then the datasets
# and dataloaders for both tasks (c3 and c8) are built.
BATCH_SIZE, EMBED_DIM, HIDDEN_DIM, EPOCHS = (
    wandb.config.batch_size,
    wandb.config.embed_dim,
    wandb.config.hidden_dim,
    wandb.config.epochs,
)
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(f"Using device: {DEVICE}")
wandb.config.update({"device": str(DEVICE)})

# c3: shuffled training batches, ordered (unlabelled) test batches
train_c3_dataset = ProteinDataset(
    seqs=train_seq, labels=train_c3, vocab_dict=vocab_dict, encoder=c3_encoder, max_len=max_len
)
train_c3_loader = DataLoader(train_c3_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_c3_dataset = ProteinDataset(
    seqs=test_seq, labels=None, vocab_dict=vocab_dict, encoder=c3_encoder, max_len=max_len
)
test_c3_loader = DataLoader(test_c3_dataset, batch_size=BATCH_SIZE, shuffle=False)

# c8: same layout with the 'sst8' labels and encoder
train_c8_dataset = ProteinDataset(
    seqs=train_seq, labels=train_c8, vocab_dict=vocab_dict, encoder=c8_encoder, max_len=max_len
)
train_c8_loader = DataLoader(train_c8_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_c8_dataset = ProteinDataset(
    seqs=test_seq, labels=None, vocab_dict=vocab_dict, encoder=c8_encoder, max_len=max_len
)
test_c8_loader = DataLoader(test_c8_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("✓ Dataloaders created successfully!")

In [None]:
# C3 task: train the model, decode its test predictions, save the weights.
print("=" * 50, "Training C3 Model", "=" * 50, sep="\n")

model_c3 = make_bilstm_model(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=len(c3_labels),
).to(DEVICE)
optimizer_c3 = torch.optim.Adam(model_c3.parameters(), lr=wandb.config.learning_rate)
# Label positions holding -1 do not contribute to the loss.
criterion_c3 = nn.CrossEntropyLoss(ignore_index=-1)

# Register the model with WandB
wandb.watch(model_c3, criterion_c3, log="all", log_freq=100)

for epoch in range(EPOCHS):
    loss = train_epoch(
        model_c3, train_c3_loader, optimizer_c3, criterion_c3, DEVICE,
        epoch=epoch+1, model_name="c3",
    )
    print(f'Epoch {epoch+1}/{EPOCHS} c3 loss: {loss:.4f}')

print("\n✓ C3 training complete! Making predictions...")
c3_preds = predict(model_c3, test_c3_loader, DEVICE)
# Keep only as many positions as the sequence has characters, then map the
# class indices back to label characters.
c3_preds_str = [
    ''.join(c3_encoder.inverse_transform(pred_row[:len(test_sequence)]))
    for pred_row, test_sequence in zip(c3_preds, test_seq)
]
print("✓ C3 predictions generated!")

# Persist the C3 weights locally and hand the file to WandB
torch.save(model_c3.state_dict(), 'model_c3.pth')
wandb.save('model_c3.pth')

In [None]:
# C8 task: train the model, decode its test predictions, save the weights.
print("\n" + "=" * 50, "Training C8 Model", "=" * 50, sep="\n")

model_c8 = make_bilstm_model(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=len(c8_labels),
).to(DEVICE)
optimizer_c8 = torch.optim.Adam(model_c8.parameters(), lr=wandb.config.learning_rate)
# Label positions holding -1 do not contribute to the loss.
criterion_c8 = nn.CrossEntropyLoss(ignore_index=-1)

# Register the model with WandB
wandb.watch(model_c8, criterion_c8, log="all", log_freq=100)

for epoch in range(EPOCHS):
    loss = train_epoch(
        model_c8, train_c8_loader, optimizer_c8, criterion_c8, DEVICE,
        epoch=epoch+1, model_name="c8",
    )
    print(f'Epoch {epoch+1}/{EPOCHS} c8 loss: {loss:.4f}')

print("\n✓ C8 training complete! Making predictions...")
c8_preds = predict(model_c8, test_c8_loader, DEVICE)
# Keep only as many positions as the sequence has characters, then map the
# class indices back to label characters.
c8_preds_str = [
    ''.join(c8_encoder.inverse_transform(pred_row[:len(test_sequence)]))
    for pred_row, test_sequence in zip(c8_preds, test_seq)
]
print("✓ C8 predictions generated!")

# Persist the C8 weights locally and hand the file to WandB
torch.save(model_c8.state_dict(), 'model_c8.pth')
wandb.save('model_c8.pth')

In [None]:
# Generate submission file: one row per test id with the decoded prediction
# strings for both tasks.
submission = pd.DataFrame({
    'id': test_df['id'],
    'sst3': c3_preds_str,
    'sst8': c8_preds_str
})

submission.to_csv('submission.csv', index=False)
print('\n' + '=' * 50)
print('✓ submission.csv generated successfully!')
print('=' * 50)
print(f'Submission shape: {submission.shape}')
print('\nFirst few rows:')
print(submission.head())

# Log submission to WandB, both as a run file and as an artifact
wandb.save('submission.csv')
submission_artifact = wandb.Artifact('submission', type='dataset')
submission_artifact.add_file('submission.csv')
wandb.log_artifact(submission_artifact)

# Use the `url` property of the active run; the older `get_url()` method is
# deprecated in the wandb SDK.
print(f"\n✓ Submission logged to WandB dashboard: {wandb.run.url}")

In [None]:
# Finish WandB run
# Ends the run that wandb.init() started at the top of the notebook; run this
# cell last, after the submission has been logged.
wandb.finish()
print("\n✓ WandB tracking complete! Check your dashboard for metrics and visualizations.")