In [33]:

import pandas as pd

# Read the two CSV inputs: the metadata table and the sequence table.
metadata = pd.read_csv('../pdb_data_no_dups.csv')
sequences = pd.read_csv('../pdb_data_seq.csv')

# Attach each structure's classification to its sequence rows (join on
# 'structureId'), keep only the two columns used downstream, and drop every
# row that has a missing value in either of them.
df = (
    sequences
    .merge(metadata[['structureId', 'classification']], on='structureId')
    .loc[:, ['sequence', 'classification']]
    .dropna()
)


In [42]:
# Keep the ten most frequent classifications as their own labels and collapse
# every other classification into the catch-all label 'others'.
top_classes = df['classification'].value_counts().nlargest(10).index
# Debug aid: print(top_classes.tolist())
df['label'] = df['classification'].where(
    df['classification'].isin(top_classes), 'others'
)

# Show one example sequence per retained label. drop_duplicates keeps the first
# row of each label in order of first appearance, which is the same order
# df['label'].unique() yields, so the frame is scanned once instead of once
# per label.
first_per_label = df.drop_duplicates(subset='label')
for label, sequence in zip(first_per_label['label'], first_per_label['sequence']):
    if label != 'others':
        print(f"{label}: {sequence}")

HYDROLASE: TYTTRQIGAKNTLEYKVYIEKDGKPVSAFHDIPLYADKENNIFNMVVEIPRWTNAKLEITKEETLNPIIQDTKKGKLRFVRNCFPHHGYIHNYGAFPQTWEDPNVSHPETKAVGDNEPIDVLEIGETIAYTGQVKQVKALGIMALLDEGETDWKVIAIDINDPLAPKLNDIEDVEKYFPGLLRATNEWFRIYKIPDGKPENQFAFSGEAKNKKYALDIIKETHDSWKQLIAGKSSDSKGIDLTNVTLPDTPTYSKAASDAIPPASLKADAPIDKSIDKWFFISGSV
TRANSFERASE: MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPKFQDGDLTLYQSNTILRHLGRTLGLYGKDQQEAALVDMVNDGVEDLRCKYISLIYTNYEAGKDDYVKALPGQLKPFETLLSQNQGGKTFIVGDQISFADYNLLDLLLIHEVLAPGCLDAFPLLSAYVGRLSARPKLKAFLASPEYVNLPINGNGKQ
OXIDOREDUCTASE: MKKIAIFAGDGIGPEIVAAARQVLDAVDQAAHLGLRCTEGLVGGAALDASDDPLPAASLQLAMAADAVILGAVGGPRWDAYPPAKRPEQGLLRLRKGLDLYANLRPAQIFPQLLDASPLRPELVRDVDILVVRELTGDIYFGQPRGLEVIDGKRRGFNTMVYDEDEIRRIAHVAFRAAQGRRKQLCSVDKANVLETTRLWREVVTEVARDYPDVRLSHMYVDNAAMQLIRAPAQFDVLLTGNMFGDILSDEASQLTGSIGMLPSASLGEGRAMYEPIHGSAPDIAGQDKANPLATILSVAMMLRHSLNAEPWAQRVEAAVQRVLDQGLRTADIAAPGTPVIGTKAMGAAVVNALNLKD
HYDROLASE/HYDROLASE INHIBITOR: SPLLETCVPDRGREYRGRLAVTTHGSRCLAWSSEQAKALSKDQDFNPAVPLAENFCRNPDGDEEGAWCYVAD

In [None]:

# Single-letter codes of the 20 standard amino acids.
amino_acids = 'ARNDCQEGHILKMFPSTWYV'
# Lookup table: amino-acid letter -> integer code 1..20.
# Code 0 is deliberately left unused here so it can stand for both padding
# and any character that is not one of the 20 letters above.
aa_dict = {letter: code for code, letter in enumerate(amino_acids, start=1)}

def encode_seq(seq, max_len=500):
    """Encode a protein sequence string as a fixed-length list of integers.

    Args:
        seq: sequence string of single-letter amino-acid codes.
        max_len: length of the returned list. Longer sequences are truncated
            to their first ``max_len`` characters; shorter ones are padded at
            the end with 0.

    Returns:
        A list of ``max_len`` ints: 1..20 for the letters in ``amino_acids``,
        0 for padding and for any other character.
    """
    codes = [aa_dict.get(residue, 0) for residue in seq[:max_len]]
    # Right-pad with the reserved code 0 up to the requested length.
    codes.extend([0] * (max_len - len(codes)))
    return codes
# Encode every sequence into a fixed-length integer list and store the result
# in a new 'encoded' column.
# Sequences are cut at encode_seq's default max_len (500 residues): an LSTM
# processes a sequence step by step, so compute time and memory per sample
# grow with sequence length, and very long sequences make training slow.
# Original author's inspection note, kept as written:
#   df['encoded'].apply(len).describe()  -> 471117.0
# NOTE(review): 471117.0 is presumably the row count reported by describe() - confirm.
df['encoded'] = df['sequence'].apply(encode_seq)
# For quick early experiments, train on a random subset of the data instead,
# e.g. a random sample of 2,000 rows:
# df_small = df.sample(n=2000, random_state=42)
# print(len(df_small))


In [36]:

from torch.utils.data import DataLoader, Dataset
import torch

class ProteinDataset(Dataset):
    """Map-style dataset pairing integer-encoded sequences with class labels.

    Args:
        sequences: indexable collection of integer-encoded sequences
            (one list of ints per sample).
        labels: indexable collection of label keys, aligned with ``sequences``.
        label_dict: mapping from label key to integer class index.
    """

    def __init__(self, sequences, labels, label_dict):
        self.sequences = sequences
        self.labels = labels
        self.label_dict = label_dict

    def __len__(self):
        """Return the number of samples (the length of ``sequences``)."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(sequence_tensor, label_tensor)`` for sample ``idx``.

        Raises:
            KeyError: if the sample's label is not a key of ``label_dict``.
        """
        # Pin the dtype instead of relying on inference: torch.tensor([]) would
        # otherwise come out as float32, and embedding lookups as well as
        # CrossEntropyLoss class targets need integer indices.
        seq = torch.tensor(self.sequences[idx], dtype=torch.long)
        label = torch.tensor(self.label_dict[self.labels[idx]], dtype=torch.long)
        return seq, label

# Give each distinct label an integer class index, numbered in the order the
# labels first appear in df['label'].
label_dict = dict(
    (name, position) for position, name in enumerate(df['label'].unique())
)
# Dataset over the full frame, plus a small shuffled loader over it.
dataset = ProteinDataset(
    sequences=df['encoded'].tolist(),
    labels=df['label'].tolist(),
    label_dict=label_dict,
)
loader = DataLoader(dataset, shuffle=True, batch_size=8)


In [None]:

import torch.nn as nn

class ProteinClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over integer-encoded sequences.

    Args:
        num_classes: number of output classes (size of the logit vector).
        embedding_dim: size of each token embedding (default 32).
        hidden_dim: size of the LSTM hidden state (default 64).
        vocab_size: number of distinct integer codes the embedding accepts.
            The default 21 matches the previously hard-coded value.
        padding_idx: optional code whose embedding is fixed at zero and not
            updated by training. The default ``None`` keeps the original
            behaviour, where every code has a trainable embedding.
    """

    def __init__(self, num_classes, embedding_dim=32, hidden_dim=64,
                 vocab_size=21, padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        ``x`` is a ``(batch, seq_len)`` tensor of integer codes (the LSTM is
        built with ``batch_first=True``).
        """
        x = self.embedding(x)
        # Only the final hidden state is used; per-step outputs are discarded.
        _, (h, _) = self.lstm(x)
        # h is (num_layers, batch, hidden_dim); h[-1] selects the last layer.
        return self.fc(h[-1])


In [None]:
# protein_classification.ipynb (Cell 6 - enhanced)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the encoded sequences and their labels into train / validation sets
# (80% / 20%, fixed random_state so the split is repeatable).
X_train, X_val, y_train, y_val = train_test_split(
    df['encoded'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a dataset; only the training loader shuffles.
train_dataset = ProteinDataset(X_train, y_train, label_dict)
val_dataset = ProteinDataset(X_val, y_val, label_dict)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, batch_size=64)

# Initialise the model, loss and optimizer; use the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = ProteinClassifier(len(label_dict))
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training loop: one training pass and one validation pass per epoch; the
# weights with the best validation accuracy so far are checkpointed to disk.
epochs = 5
best_val_acc = 0.0  # best validation accuracy seen so far

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_losses, train_preds, train_labels = [], [], []

    for seq, labels in train_loader:
        seq, labels = seq.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(seq)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        train_preds.extend(output.argmax(dim=1).tolist())
        train_labels.extend(labels.tolist())

    train_acc = accuracy_score(train_labels, train_preds)
    # Report the training metrics that are collected above. The original print
    # was commented out because it used np.mean without numpy being imported;
    # a plain Python mean avoids that dependency.
    mean_train_loss = sum(train_losses) / max(len(train_losses), 1)
    print(f"[Epoch {epoch+1}] Train Loss: {mean_train_loss:.4f}, Train Acc: {train_acc:.4f}")

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for seq, labels in val_loader:
            seq, labels = seq.to(device), labels.to(device)
            output = model(seq)
            val_preds.extend(output.argmax(dim=1).tolist())
            val_labels.extend(labels.tolist())

    val_acc = accuracy_score(val_labels, val_preds)
    print(f"           → Val Accuracy: {val_acc:.4f}")

    # Checkpoint whenever validation accuracy improves; label_dict is stored
    # alongside the weights so predictions can be mapped back to label names.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({
            'model_state_dict': model.state_dict(),
            'label_dict': label_dict,
        }, 'best_protein_model.pth')
        print("           ✓ Best model saved.")

KeyboardInterrupt: 

In [None]:

# Persist the model's current weights together with the label -> index mapping.
# This writes whatever state `model` holds when the cell runs, to a different
# file than the best-validation checkpoint.
torch.save(
    dict(model_state_dict=model.state_dict(), label_dict=label_dict),
    'protein_model.pth',
)
