In [1]:
# Install required packages
!pip install trackio torch transformers datasets pandas numpy scikit-learn huggingface_hub -q

In [47]:
import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import os

# TrackIO import
try:
    import trackio
    TRACKIO_AVAILABLE = True
    print("‚úì TrackIO available")
except ImportError:
    TRACKIO_AVAILABLE = False
    print("‚ö†Ô∏è TrackIO not available - install with: pip install trackio")

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

‚úì TrackIO available
PyTorch version: 2.7.1+cu118
CUDA available: True


In [48]:
# ============================================================================
# ü§ó HUGGING FACE LOGIN (for shareable TrackIO dashboard)
# ============================================================================
# Get your token from: https://huggingface.co/settings/tokens

from huggingface_hub import login, HfApi

# Login to Hugging Face
print("Please login to Hugging Face to enable shareable dashboards...")
login()

# Get your username
api = HfApi()
user_info = api.whoami()
HF_USERNAME = user_info['name']
print(f"‚úì Logged in as: {HF_USERNAME}")
print(f"‚úì Your dashboard will be at: https://huggingface.co/spaces/{HF_USERNAME}/protein-track")

Please login to Hugging Face to enable shareable dashboards...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

‚úì Logged in as: cjayyy05
‚úì Your dashboard will be at: https://huggingface.co/spaces/cjayyy05/protein-track


In [49]:
# Set environment variables if needed
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# os.environ['HF_TOKEN'] = 'your_token_here' # Uncomment and set if needed

In [50]:
# Load data using local paths
train_path = r'c:/Users/meekg/Downloads/sep-25-dl-gen-ai-nppe-2/train.csv'
test_path = r'c:/Users/meekg/Downloads/sep-25-dl-gen-ai-nppe-2/test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("Train columns:", train_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train columns: ['id', 'seq', 'sst8', 'sst3']
Test columns: ['id', 'seq']
Train shape: (7262, 4)
Test shape: (1816, 2)


In [51]:
# =============================================================================
# 1Ô∏è‚É£ DATA PREPARATION
# =============================================================================

# Split train/validation
train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42, shuffle=True)
print(f"\nTrain samples: {len(train_data)} | Val samples: {len(val_data)}")

# Build vocabularies
aa_list = list("ACDEFGHIKLMNPQRSTVWY")
aa2idx = {aa: i+1 for i, aa in enumerate(aa_list)}
aa2idx['<PAD>'] = 0
aa2idx['<UNK>'] = 21

Q8_LABELS = sorted(list(set("".join(train_df["sst8"]))))
q8_2idx = {l: i for i, l in enumerate(Q8_LABELS)}
idx2q8 = {i: l for l, i in q8_2idx.items()}

Q3_LABELS = sorted(list(set("".join(train_df["sst3"]))))
q3_2idx = {l: i for i, l in enumerate(Q3_LABELS)}
idx2q3 = {i: l for l, i in q3_2idx.items()}

print(f"AA vocab size: 22")
print(f"Q8 classes: {len(q8_2idx)} -> {Q8_LABELS}")
print(f"Q3 classes: {len(q3_2idx)} -> {Q3_LABELS}")


Train samples: 6535 | Val samples: 727
AA vocab size: 22
Q8 classes: 8 -> ['B', 'C', 'E', 'G', 'H', 'I', 'S', 'T']
Q3 classes: 3 -> ['C', 'E', 'H']


In [52]:
# =============================================================================
# 2Ô∏è‚É£ DATASET & DATALOADER
# =============================================================================

class ProteinDataset(Dataset):
    def __init__(self, df, task="q8"):
        self.seqs = df["seq"].values
        self.task = task
        if task == "q8":
            self.labels = df["sst8"].values
            self.label2idx = q8_2idx
        else:
            self.labels = df["sst3"].values
            self.label2idx = q3_2idx

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        seq = self.seqs[idx]
        lab = self.labels[idx]
        
        seq_encoded = torch.tensor([aa2idx.get(a, 21) for a in seq], dtype=torch.long)
        lab_encoded = torch.tensor([self.label2idx.get(c, 0) for c in lab], dtype=torch.long)
        
        return seq_encoded, lab_encoded, len(seq_encoded)

def collate_fn(batch):
    seqs, labels, lengths = zip(*batch)
    lengths = torch.tensor(lengths)
    seqs_padded = pad_sequence(seqs, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)
    return seqs_padded, labels_padded, lengths

In [53]:
# =============================================================================
# 3Ô∏è‚É£ MODEL ARCHITECTURE
# =============================================================================

class BiLSTM_CNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim,
                 num_layers=1, cnn_filters=[64, 64], kernel_sizes=[3,5], dropout=0.33,use_masking=True):
        super().__init__()
        
        # Embedding layer for amino acids
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.dropout_embed = nn.Dropout(dropout)
        
        # -------------------
        # CNN for local patterns
        # -------------------
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=f, kernel_size=k, padding=k//2)
            for f, k in zip(cnn_filters, kernel_sizes)
        ])
        
        # -------------------
        # BiLSTM for long-range dependencies
        # -------------------
        lstm_input_dim = sum(cnn_filters)  # CNN output channels concatenated
        self.rnn = nn.LSTM(
            input_size=lstm_input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        self.layer_norm = nn.LayerNorm(hidden_dim*2)
        self.dropout_out = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        
    def forward(self, x, lengths):
        # -------------------
        # Embedding
        # x: [B, L]
        # -------------------
        embed = self.embedding(x)       # [B, L, embed_dim]
        embed = self.dropout_embed(embed)
        
        # -------------------
        # CNN expects [B, embed_dim, L]
        # -------------------
        cnn_input = embed.transpose(1,2)
        cnn_outs = [F.relu(conv(cnn_input)) for conv in self.convs]  # list of [B, out_channels, L]
        cnn_out = torch.cat(cnn_outs, dim=1).transpose(1,2)          # [B, L, sum(out_channels)]
        
        # -------------------
        # Pack for LSTM
        # -------------------
        packed = nn.utils.rnn.pack_padded_sequence(cnn_out, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.rnn(packed)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # [B, L, hidden*2]
        
        # -------------------
        # Layer norm + dropout + classifier
        # -------------------
        lstm_out = self.layer_norm(lstm_out)
        lstm_out = self.dropout_out(lstm_out)
        logits = self.fc(lstm_out)  # [B, L, output_dim]
        
        return logits

In [54]:
# =============================================================================
# 4Ô∏è‚É£ TRAINING FUNCTION (with TrackIO!)
# =============================================================================

def train_model(task="q8", num_epochs=20, batch_size=64, lr=1e-3, use_trackio=True, hf_username=None):
    """Train model for either Q8 or Q3 task using BiLSTM+CNN"""
    
    print(f"\n{'='*60}")
    print(f"Training BiLSTM+CNN for {task.upper()} task")
    print(f"{'='*60}")
    
    tracker = None
    if use_trackio and TRACKIO_AVAILABLE:
        try:
            trackio.init(
                project="25-t3-nppe2",
                group=f"bilstm_cnn_{task}",
                name=f"bilstm_cnn_{task}_run",
                resume="never"
            )
            
            trackio.config.update({
                "model": "BiLSTM_CNN",
                "task": task,
                "vocab_size": 22,
                "embed_dim": 128,
                "hidden_dim": 256,
                "num_layers": 2,
                "cnn_filters": [64,128],
                "kernel_sizes": [3,5],
                "dropout": 0.55,
                "use_masking": True,
                "batch_size": batch_size,
                "learning_rate": lr,
                "num_epochs": num_epochs,
                "output_classes": len(q8_2idx) if task == "q8" else len(q3_2idx)
            })
            tracker = True
            print("‚úì TrackIO initialized (saving locally)")
            if hf_username:
                print(f"üìù Results will be uploaded to: https://huggingface.co/spaces/{hf_username}/protein-sst-tracking")
        except Exception as e:
            print(f"‚ö†Ô∏è TrackIO initialization failed: {e}")
            print("   Training will continue without tracking.")
            tracker = None
    
    train_dataset = ProteinDataset(train_data, task=task)
    val_dataset = ProteinDataset(val_data, task=task)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    output_dim = len(q8_2idx) if task == "q8" else len(q3_2idx)

    model = BiLSTM_CNN(
        vocab_size=22,
        embed_dim=128,
        hidden_dim=256,
        output_dim=output_dim,
        num_layers=2,
        cnn_filters=[64,128],
        kernel_sizes=[3,5],
        dropout=0.55
    ).to(device)

    class_weights = torch.ones(output_dim, device=device)
    criterion = nn.CrossEntropyLoss(ignore_index=-100, weight=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=2
    )
    
    print(f"Device: {device}")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    start_epoch = 0
    best_f1 = 0
     
    print("\nStarting training...")
    for epoch in range(start_epoch, num_epochs):
        model.train()
        total_loss = 0
        
        for seqs, labels, lengths in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            seqs, labels, lengths = seqs.to(device), labels.to(device), lengths.to(device)
            
            optimizer.zero_grad()
            logits = model(seqs, lengths)
            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        
        model.eval()
        all_preds, all_labels = [], []
        
        with torch.no_grad():
            for seqs, labels, lengths in val_loader:
                seqs, labels, lengths = seqs.to(device), labels.to(device), lengths.to(device)
                
                logits = model(seqs, lengths)
                preds = torch.argmax(logits, dim=-1)
                
                mask = labels.view(-1) != -100
                all_preds.extend(preds.view(-1)[mask].cpu().numpy())
                all_labels.extend(labels.view(-1)[mask].cpu().numpy())
        
        val_f1 = f1_score(all_labels, all_preds, average='macro')
        scheduler.step(val_f1)
        
        current_lr = optimizer.param_groups[0]['lr']
        
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Val F1: {val_f1:.4f} | LR: {current_lr:.6f}")
        
        if tracker:
            trackio.log({
                "train_loss": avg_loss,
                "val_f1": val_f1,
                "learning_rate": current_lr,
                "epoch": epoch + 1
            })
        
        if val_f1 > best_f1:
            best_f1 = val_f1
            print(f"  ‚Üí New best F1: {best_f1:.4f}")
            
            if tracker:
                try:
                    trackio.log({"best_val_f1": best_f1})
                except:
                    pass
    
    if tracker:
        try:
            trackio.log({
                "final_best_f1": best_f1,
                "total_epochs": num_epochs,
                "task": task
            })
            trackio.finish()
            print("‚úì TrackIO logging complete")
            if hf_username:
                print(f"\nüì§ Run the 'upload' cell after training to get your shareable URL!")
        except:
            pass
    
    print(f"\nTraining complete! Best F1: {best_f1:.4f}")
    return model, best_f1

In [55]:
# =============================================================================
# 5Ô∏è‚É£ INFERENCE FUNCTION
# =============================================================================

def predict(model, test_df, task="q8", device="cuda"):
    """Generate predictions for test set"""
    
    class TestDataset(Dataset):
        def __init__(self, df):
            self.seqs = df["seq"].values
        
        def __len__(self):
            return len(self.seqs)
        
        def __getitem__(self, idx):
            seq = self.seqs[idx]
            seq_encoded = torch.tensor([aa2idx.get(a, 21) for a in seq], dtype=torch.long)
            return seq_encoded, len(seq_encoded)
    
    def test_collate_fn(batch):
        seqs, lengths = zip(*batch)
        lengths = torch.tensor(lengths)
        seqs_padded = pad_sequence(seqs, batch_first=True, padding_value=0)
        return seqs_padded, lengths
    
    test_dataset = TestDataset(test_df)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=test_collate_fn)
    
    idx2label = idx2q8 if task == "q8" else idx2q3
    
    model.eval()
    all_predictions = []
    
    with torch.no_grad():
        for seqs, lengths in tqdm(test_loader, desc=f"Predicting {task.upper()}"):
            seqs, lengths = seqs.to(device), lengths.to(device)
            logits = model(seqs, lengths)
            preds = torch.argmax(logits, dim=-1)
            
            # Convert to strings
            for i, length in enumerate(lengths):
                pred_seq = "".join([idx2label.get(idx.item(), 'C') for idx in preds[i][:length]])
                all_predictions.append(pred_seq)
    
    return all_predictions

In [56]:
# Train Q8 Model with TrackIO
model_q8, f1_q8 = train_model(
    task="q8", 
    num_epochs=20, 
    batch_size=32, 
    lr=1e-3, 
    use_trackio=True,
    hf_username=HF_USERNAME
)

# Train Q3 Model
model_q3, f1_q3 = train_model(
    task="q3", 
    num_epochs=20, 
    batch_size=32, 
    lr=1e-3, 
    use_trackio=True,
    hf_username=HF_USERNAME
)


Training BiLSTM+CNN for Q8 task
* Created new run: nice-star-11
‚úì TrackIO initialized (saving locally)
üìù Results will be uploaded to: https://huggingface.co/spaces/cjayyy05/protein-sst-tracking
Device: cuda
Model parameters: 2,613,192

Starting training...




Epoch 1/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 1 | Loss: 1.4460 | Val F1: 0.2550 | LR: 0.000300
  ‚Üí New best F1: 0.2550


Epoch 2/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 2 | Loss: 1.2961 | Val F1: 0.2751 | LR: 0.000300
  ‚Üí New best F1: 0.2751


Epoch 3/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 3 | Loss: 1.2541 | Val F1: 0.2820 | LR: 0.000300
  ‚Üí New best F1: 0.2820


Epoch 4/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 4 | Loss: 1.2222 | Val F1: 0.2934 | LR: 0.000300
  ‚Üí New best F1: 0.2934


Epoch 5/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 5 | Loss: 1.2027 | Val F1: 0.2904 | LR: 0.000300


Epoch 6/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 6 | Loss: 1.1870 | Val F1: 0.2939 | LR: 0.000300
  ‚Üí New best F1: 0.2939


Epoch 7/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 7 | Loss: 1.1715 | Val F1: 0.3077 | LR: 0.000300
  ‚Üí New best F1: 0.3077


Epoch 8/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 8 | Loss: 1.1597 | Val F1: 0.3059 | LR: 0.000300


Epoch 9/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 9 | Loss: 1.1479 | Val F1: 0.3102 | LR: 0.000300
  ‚Üí New best F1: 0.3102


Epoch 10/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 10 | Loss: 1.1368 | Val F1: 0.3159 | LR: 0.000300
  ‚Üí New best F1: 0.3159


Epoch 11/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 11 | Loss: 1.1316 | Val F1: 0.3155 | LR: 0.000300


Epoch 12/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 12 | Loss: 1.1214 | Val F1: 0.3132 | LR: 0.000300


Epoch 13/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 13 | Loss: 1.1136 | Val F1: 0.3241 | LR: 0.000300
  ‚Üí New best F1: 0.3241


Epoch 14/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 14 | Loss: 1.1097 | Val F1: 0.3171 | LR: 0.000300


Epoch 15/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 15 | Loss: 1.1004 | Val F1: 0.3248 | LR: 0.000300
  ‚Üí New best F1: 0.3248


Epoch 16/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 16 | Loss: 1.0932 | Val F1: 0.3310 | LR: 0.000300
  ‚Üí New best F1: 0.3310


Epoch 17/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 17 | Loss: 1.0851 | Val F1: 0.3318 | LR: 0.000300
  ‚Üí New best F1: 0.3318


Epoch 18/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 18 | Loss: 1.0804 | Val F1: 0.3241 | LR: 0.000300


Epoch 19/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 19 | Loss: 1.0712 | Val F1: 0.3264 | LR: 0.000300


Epoch 20/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 20 | Loss: 1.0652 | Val F1: 0.3291 | LR: 0.000150
* Run finished. Uploading logs to Trackio (please wait...)
‚úì TrackIO logging complete

üì§ Run the 'upload' cell after training to get your shareable URL!

Training complete! Best F1: 0.3318

Training BiLSTM+CNN for Q3 task
* Created new run: proud-rain-12
‚úì TrackIO initialized (saving locally)
üìù Results will be uploaded to: https://huggingface.co/spaces/cjayyy05/protein-sst-tracking
Device: cuda
Model parameters: 2,610,627

Starting training...




Epoch 1/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 1 | Loss: 0.8980 | Val F1: 0.6343 | LR: 0.000300
  ‚Üí New best F1: 0.6343


Epoch 2/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 2 | Loss: 0.7900 | Val F1: 0.6564 | LR: 0.000300
  ‚Üí New best F1: 0.6564


Epoch 3/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 3 | Loss: 0.7559 | Val F1: 0.6731 | LR: 0.000300
  ‚Üí New best F1: 0.6731


Epoch 4/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 4 | Loss: 0.7322 | Val F1: 0.6840 | LR: 0.000300
  ‚Üí New best F1: 0.6840


Epoch 5/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 5 | Loss: 0.7198 | Val F1: 0.6669 | LR: 0.000300


Epoch 6/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 6 | Loss: 0.7084 | Val F1: 0.6921 | LR: 0.000300
  ‚Üí New best F1: 0.6921


Epoch 7/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 7 | Loss: 0.6977 | Val F1: 0.6982 | LR: 0.000300
  ‚Üí New best F1: 0.6982


Epoch 8/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 8 | Loss: 0.6896 | Val F1: 0.6879 | LR: 0.000300


Epoch 9/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 9 | Loss: 0.6807 | Val F1: 0.7015 | LR: 0.000300
  ‚Üí New best F1: 0.7015


Epoch 10/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 10 | Loss: 0.6723 | Val F1: 0.6989 | LR: 0.000300


Epoch 11/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 11 | Loss: 0.6640 | Val F1: 0.7053 | LR: 0.000300
  ‚Üí New best F1: 0.7053


Epoch 12/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 12 | Loss: 0.6596 | Val F1: 0.7088 | LR: 0.000300
  ‚Üí New best F1: 0.7088


Epoch 13/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 13 | Loss: 0.6525 | Val F1: 0.7096 | LR: 0.000300
  ‚Üí New best F1: 0.7096


Epoch 14/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 14 | Loss: 0.6478 | Val F1: 0.7118 | LR: 0.000300
  ‚Üí New best F1: 0.7118


Epoch 15/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 15 | Loss: 0.6425 | Val F1: 0.7100 | LR: 0.000300


Epoch 16/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 16 | Loss: 0.6367 | Val F1: 0.7104 | LR: 0.000300


Epoch 17/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 17 | Loss: 0.6311 | Val F1: 0.7127 | LR: 0.000300
  ‚Üí New best F1: 0.7127


Epoch 18/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 18 | Loss: 0.6268 | Val F1: 0.7120 | LR: 0.000300


Epoch 19/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 19 | Loss: 0.6190 | Val F1: 0.7101 | LR: 0.000300


Epoch 20/20:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 20 | Loss: 0.6162 | Val F1: 0.7137 | LR: 0.000300
  ‚Üí New best F1: 0.7137
* Run finished. Uploading logs to Trackio (please wait...)
‚úì TrackIO logging complete

üì§ Run the 'upload' cell after training to get your shareable URL!

Training complete! Best F1: 0.7137


In [16]:
# Generate predictions
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Generating Q8 predictions...")
q8_preds = predict(model_q8, test_df, task="q8", device=device)

print("Generating Q3 predictions...")
q3_preds = predict(model_q3, test_df, task="q3", device=device)

Generating Q8 predictions...


Predicting Q8:   0%|          | 0/29 [00:00<?, ?it/s]

Generating Q3 predictions...


Predicting Q3:   0%|          | 0/29 [00:00<?, ?it/s]

In [17]:
# Create submission file
submission = pd.DataFrame({
    'id': test_df['id'],
    'sst3': q3_preds,
    'sst8': q8_preds
})

# Ensure order matches test_df
print("Sample submission head:")
print(submission.head())

submission.to_csv('submission.csv', index=False)
print("\n‚úì submission.csv generated successfully!")

Sample submission head:
   id                                               sst3  \
0   0  CEECCCCCCHHHHHHHHHHHHHHCCEEEEECCCCCCCCCEEEEECC...   
1   1  CCCCCCCCCEEEEEEEECCCCCEEEEEECCCHHHHHHHHCCCCCHH...   
2   2  CCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHH...   
3   3  CCCCCHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCEHCHCHHH...   
4   4  CEEEEEECCCCCHHHHHHHHHHHHHHHCECCCCCCCCCEEEECCEE...   

                                                sst8  
0  CCCTTHHHHHHHHHHHHHHHHHHCSEEEEECCCCCCCCCEEEEECC...  
1  CCCCCCCCCEEEEEEEECSTTCEEEEEECTTCCHGGHGCCCCTTHH...  
2  CTHHHHHHHHHHHHHHHHHCTHHHHHHHHHHHHHHCCCCHHHHHHH...  
3  CCCTTHHHHHHHHHHHHHHHHHHHHHHHCCEEEECTTHHHHHHHHH...  
4  CEEEEEESCCCTHHHHHHHHHHHHHHHCCCCCBCSTTCEEEECSEE...  

‚úì submission.csv generated successfully!


In [20]:
# ============================================================================
# üì§ UPLOAD TRACKIO RESULTS TO HUGGING FACE (Run after training!)
# ============================================================================

from huggingface_hub import HfApi, create_repo
import os

print("Uploading TrackIO results to Hugging Face Space...")

try:
    api = HfApi()
    
    # Create the Space
    space_id = f"{HF_USERNAME}/protein-sst-tracking"
    try:
        create_repo(
            repo_id=space_id,
            repo_type="space",
            space_sdk="gradio",
            exist_ok=True
        )
        print(f"‚úì Space created: {space_id}")
    except:
        print(f"‚úì Space exists: {space_id}")
    
    # Upload trackio folder
    trackio_path = os.path.expanduser("~/.trackio")
    
    if os.path.exists(trackio_path):
        api.upload_folder(
            folder_path=trackio_path,
            repo_id=space_id,
            repo_type="space",
            path_in_repo="trackio_data"
        )
        print(f"\nüéâ SUCCESS! Your TrackIO dashboard is live at:")
        print(f"   https://huggingface.co/spaces/{space_id}")
        print(f"\nüìã Submit this URL for your project!")
    else:
        print(f"‚ö†Ô∏è TrackIO data not found at: {trackio_path}")
        print("   Make sure training completed successfully.")
        
except Exception as e:
    print(f"‚ùå Upload failed: {e}")
    print("   Make sure you're logged into Hugging Face")

Uploading TrackIO results to Hugging Face Space...
‚úì Space created: cjayyy05/protein-sst-tracking


No files have been modified since last commit. Skipping to prevent empty commit.



üéâ SUCCESS! Your TrackIO dashboard is live at:
   https://huggingface.co/spaces/cjayyy05/protein-sst-tracking

üìã Submit this URL for your project!
