<a href="https://colab.research.google.com/github/daothu2023/generalized_aggregation/blob/main/2MLP_pretrain_gui_%C4%90%E1%BB%8Bnh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Install required packages.
# The PyG wheel index URL below is keyed by the installed torch version, so the
# version string is exported as $TORCH for the shell commands to interpolate.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
# torch-scatter / torch-sparse come from the prebuilt wheel index for this torch build.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# NOTE(review): installs PyG from the tip of the git repository (unpinned), so
# results may change between runs — consider pinning a release or commit.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git


2.6.0+cu124
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler
import numpy as np

# ---------- 1. Load data ----------
# All inputs live under one Drive folder. The original mixed the
# '/content/drive/My Drive' and '/content/drive/MyDrive' spellings; a single
# base path keeps the four reads consistent and easy to relocate.
PPI_DIR = '/content/drive/MyDrive/PPI_STRING'

# PPI edge list; columns 'protein1' and 'protein2' are read further down.
edges_df = pd.read_csv(f'{PPI_DIR}/ppi_for_gnn_filled.csv')
# Per-gene feature table; the first CSV column becomes the index.
features_df = pd.read_csv(f'{PPI_DIR}/features_for_BRCA.csv', index_col=0)
# Label tables; columns 'Gene' and 'Labels' are read further down.
label1_df = pd.read_csv(f'{PPI_DIR}/New_BRCA_labels(0_1).csv')
label2_df = pd.read_csv(f'{PPI_DIR}/New_dataset_dinhgui/label_telomere.csv')

# ---------- 2. List of all genes taken from the PPI edge list ----------
# The node set is defined by the edge list only; genes that appear solely in the
# feature table do not become nodes.
genes_from_edges = set(edges_df['protein1']).union(set(edges_df['protein2']))
genes_from_features = set(features_df.index)
all_genes = sorted(genes_from_edges)  # sorted so the node ordering is deterministic

# Mapping gene <-> index
node_to_idx = {gene: i for i, gene in enumerate(all_genes)}
idx_to_node = {i: gene for gene, i in node_to_idx.items()}

# ---------- 3. edge_index ----------
# Rows with a missing endpoint are dropped; every remaining (protein1, protein2)
# row becomes one column of the 2 x E index tensor. Reverse edges are not added
# here, so the graph is only symmetric if the CSV already lists both directions.
edges = edges_df[['protein1', 'protein2']].dropna()
edge_index = torch.tensor([[node_to_idx[a], node_to_idx[b]]
                           for a, b in edges.values if a in node_to_idx and b in node_to_idx],
                          dtype=torch.long).t().contiguous()

# ---------- 4. Build the node feature matrix x ----------
feature_dim = features_df.shape[1]
x_matrix = np.zeros((len(all_genes), feature_dim))
has_feature = np.zeros(len(all_genes), dtype=bool)  # True where a gene has its own feature row

# Standardize the features (the scaler is fit on every row of the feature table,
# including genes that are not nodes of the graph).
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_df.values)
features_scaled_df = pd.DataFrame(features_scaled, index=features_df.index)

# Copy the scaled features for genes that are present in the graph.
# NOTE(review): `.loc[gene]` returns a 2-D frame if the index has duplicate gene
# names, which would break the row assignment — assumes unique index; confirm.
for gene in features_scaled_df.index:
    if gene in node_to_idx:
        idx = node_to_idx[gene]
        x_matrix[idx] = features_scaled_df.loc[gene].values
        has_feature[idx] = True

# Build adjacency lists; every edge is recorded in both directions.
neighbors_dict = {i: [] for i in range(len(all_genes))}
for src, dst in edge_index.t().tolist():
    neighbors_dict[src].append(dst)
    neighbors_dict[dst].append(src)

# Nodes without features receive the mean of their neighbours' features.
# Only neighbours with original features contribute: has_feature is not updated
# for imputed nodes, so imputed values never propagate to other nodes.
for i in range(len(all_genes)):
    if not has_feature[i]:
        neighbor_feats = [x_matrix[n] for n in neighbors_dict[i] if has_feature[n]]
        if neighbor_feats:
            x_matrix[i] = np.mean(neighbor_feats, axis=0)
        # if no neighbour has features, the row is left unchanged (all zeros)

x = torch.tensor(x_matrix, dtype=torch.float)

# -------- 5. Build the label tensors --------
num_nodes = len(all_genes)

# Gene -> label lookups for the two tasks.
label1_map = dict(zip(label1_df['Gene'], label1_df['Labels']))
label2_map = dict(zip(label2_df['Gene'], label2_df['Labels']))

# -1 marks a node without a label for that task.
y1 = torch.full((num_nodes,), -1, dtype=torch.long)
y2 = torch.full((num_nodes,), -1, dtype=torch.long)

# Fill in the labels of genes that are nodes of the graph; other genes are skipped.
for target, mapping in ((y1, label1_map), (y2, label2_map)):
    for gene_name, raw_label in mapping.items():
        node = node_to_idx.get(gene_name)
        if node is not None:
            target[node] = int(raw_label)

# -------- 6. Create the Data object --------
# y1 / y2 are stored as custom attributes; -1 means "no label for this task".
data = Data(x=x, edge_index=edge_index, y1=y1, y2=y2)

# -------- 7. Print summary information --------
print(data)
print(f"Tổng số đỉnh: {data.num_nodes}")
print(f"Tổng số cạnh: {data.num_edges}")
print(f"Số đỉnh có label1: {(y1 != -1).sum().item()}")
print(f"Số đỉnh có label2: {(y2 != -1).sum().item()}")

# -------- 8. Count labels 0 and 1 for each task --------
label1_0 = (y1 == 0).sum().item()
label1_1 = (y1 == 1).sum().item()
label2_0 = (y2 == 0).sum().item()
label2_1 = (y2 == 1).sum().item()

print(f"📊 Label1 - Số nhãn 0: {label1_0}, nhãn 1: {label1_1}")
print(f"📊 Label2 - Số nhãn 0: {label2_0}, nhãn 1: {label2_1}")



Data(x=[12809, 11], edge_index=[2, 243840], y1=[12809], y2=[12809])
Tổng số đỉnh: 12809
Tổng số cạnh: 243840
Số đỉnh có label1: 1337
Số đỉnh có label2: 3286
📊 Label1 - Số nhãn 0: 1290, nhãn 1: 47
📊 Label2 - Số nhãn 0: 1539, nhãn 1: 1747


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

# ===== GCNEncoder definition =====
class GCNEncoder(nn.Module):
    """Two GCNConv layers followed by a linear layer and an output projection.

    The output has `in_channels` columns, i.e. the same width as the input
    features, so the network can be trained to reconstruct its input.

    Args:
        in_channels: number of input feature columns (also the output width).
        hidden_channels: width of every hidden layer.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        # Layers are created in a fixed order so seeded initialisation is unchanged.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc1 = nn.Linear(hidden_channels, hidden_channels)
        self.out_proj = nn.Linear(hidden_channels, in_channels)

    def forward(self, x, edge_index):
        """Return the reconstruction of `x` given the graph `edge_index`."""
        hidden = self.conv1(x, edge_index).relu()
        hidden = self.conv2(hidden, edge_index).relu()
        hidden = self.fc1(hidden).relu()
        return self.out_proj(hidden)

# ===== Pretrain the encoder and save checkpoints at the requested epochs =====
def pretrain_encoder_save_multiple(data, hidden_channels=32, num_epochs=200,
                                   mask_ratio=0.15, lr=0.01,
                                   save_epochs=[50, 100, 150, 200],
                                   save_folder="/content/drive/MyDrive/Pretrained_Encoders"):
    """Pretrain a GCNEncoder by reconstructing randomly masked node features.

    Each epoch zeroes a fresh random subset of feature entries, runs the encoder
    on the masked features and minimises the MSE against the unmasked features
    (over all entries, not only the masked ones).

    Args:
        data: graph object providing `x`, `edge_index` and `num_node_features`.
        hidden_channels: hidden width passed to GCNEncoder.
        num_epochs: number of full-graph optimisation steps.
        mask_ratio: probability with which each feature entry is zeroed.
        lr: Adam learning rate.
        save_epochs: epochs (1-based) after which the encoder state_dict is
            saved; None or empty disables checkpointing. Never mutated.
        save_folder: directory for checkpoints; created if missing.

    Returns:
        The trained GCNEncoder (on the selected device).
    """
    import os

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Copy only the tensors needed for training to the device. Calling
    # `data.to(device)` would move the caller's Data object in place, leaving the
    # shared notebook-level `data` on the GPU for every later cell.
    x_full = data.x.to(device)
    edge_index = data.edge_index.to(device)

    # Set membership is O(1) and tolerates save_epochs=None.
    checkpoint_epochs = set(save_epochs) if save_epochs else set()
    if checkpoint_epochs:
        # torch.save does not create missing directories.
        os.makedirs(save_folder, exist_ok=True)

    encoder = GCNEncoder(data.num_node_features, hidden_channels).to(device)
    optimizer = torch.optim.Adam(encoder.parameters(), lr=lr)

    for epoch in range(1, num_epochs + 1):
        encoder.train()
        optimizer.zero_grad()

        # Draw a new element-wise mask every epoch and zero the selected entries.
        mask = torch.rand_like(x_full) < mask_ratio
        x_masked = x_full.masked_fill(mask, 0)

        out = encoder(x_masked, edge_index)
        loss = F.mse_loss(out, x_full)

        loss.backward()
        optimizer.step()

        if epoch % 10 == 0 or epoch == 1:
            print(f"Epoch {epoch:03d} | Pretrain Loss: {loss.item():.6f}")

        # === Save when one of the requested epochs is reached
        if epoch in checkpoint_epochs:
            save_path = f"{save_folder}/BRCA_encoder_epoch{epoch}.pth"
            torch.save(encoder.state_dict(), save_path)
            print(f"💾 Đã lưu encoder tại epoch {epoch} -> {save_path}")

    return encoder


In [None]:
# Pretrain and automatically save checkpoints at epochs 50, 100, 150 and 200.
# The returned encoder is the cell's last expression, so its repr is displayed.
pretrain_encoder_save_multiple(
    data,
    hidden_channels=32,
    num_epochs=200,
    mask_ratio=0.15,
    lr=0.01,
    save_epochs=[50, 100, 150, 200],
    save_folder="/content/drive/MyDrive/Pretrained_Encoders"
)


Epoch 001 | Pretrain Loss: 1.155570
Epoch 010 | Pretrain Loss: 0.919709
Epoch 020 | Pretrain Loss: 0.700116
Epoch 030 | Pretrain Loss: 0.617062
Epoch 040 | Pretrain Loss: 0.564116
Epoch 050 | Pretrain Loss: 0.542802
💾 Đã lưu encoder tại epoch 50 -> /content/drive/MyDrive/Pretrained_Encoders/BRCA_encoder_epoch50.pth
Epoch 060 | Pretrain Loss: 0.524768
Epoch 070 | Pretrain Loss: 0.519284
Epoch 080 | Pretrain Loss: 0.513322
Epoch 090 | Pretrain Loss: 0.507946
Epoch 100 | Pretrain Loss: 0.491350
💾 Đã lưu encoder tại epoch 100 -> /content/drive/MyDrive/Pretrained_Encoders/BRCA_encoder_epoch100.pth
Epoch 110 | Pretrain Loss: 0.491382
Epoch 120 | Pretrain Loss: 0.482907
Epoch 130 | Pretrain Loss: 0.477237
Epoch 140 | Pretrain Loss: 0.476862
Epoch 150 | Pretrain Loss: 0.480598
💾 Đã lưu encoder tại epoch 150 -> /content/drive/MyDrive/Pretrained_Encoders/BRCA_encoder_epoch150.pth
Epoch 160 | Pretrain Loss: 0.477450
Epoch 170 | Pretrain Loss: 0.469857
Epoch 180 | Pretrain Loss: 0.465825
Epoch 190

GCNEncoder(
  (conv1): GCNConv(11, 32)
  (conv2): GCNConv(32, 32)
  (fc1): Linear(in_features=32, out_features=32, bias=True)
  (out_proj): Linear(in_features=32, out_features=11, bias=True)
)

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import average_precision_score
from collections import Counter
import numpy as np

# Weight of the label-1 loss in the combined training loss; label-2 gets (1 - ALPHA).
ALPHA = 0.7

class GCN_MultiHead(torch.nn.Module):
    """Shared two-layer GCN trunk with two independent MLP output heads.

    Args:
        in_channels: number of input feature columns.
        hidden_channels: width of the GCN layers and of each head's hidden layer.
        out_channels1: output width of the first head (`mlp1`).
        out_channels2: output width of the second head (`mlp2`).
    """

    def __init__(self, in_channels, hidden_channels, out_channels1, out_channels2):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        # Heads are built in the order mlp1, mlp2 so that seeded parameter
        # initialisation matches the attribute order.
        self.mlp1 = self._make_head(hidden_channels, out_channels1)
        self.mlp2 = self._make_head(hidden_channels, out_channels2)

    @staticmethod
    def _make_head(hidden_channels, out_channels):
        """Build one Linear -> ReLU -> Linear head."""
        return torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_channels, out_channels),
        )

    def forward(self, x, edge_index):
        """Return `(head1_output, head2_output)` computed from the shared trunk."""
        shared = F.relu(self.conv1(x, edge_index))
        shared = F.relu(self.conv2(shared, edge_index))
        return self.mlp1(shared), self.mlp2(shared)

def train_one_epoch_dual(model, data, optimizer, loss_fn, alpha=None):
    """Run one full-graph optimisation step on both tasks.

    Task 1 is trained on nodes that are in `data.train_mask` and have a y1 label;
    task 2 is trained on every node that has a y2 label (no train/test split).

    Args:
        model: module returning `(out1, out2)` from `(x, edge_index)`.
        data: object with `x`, `edge_index`, `y1`, `y2` and `train_mask`;
            a label of -1 means "unlabelled".
        optimizer: optimiser over the model parameters.
        loss_fn: loss called as `loss_fn(predictions, targets)`.
        alpha: weight of the task-1 loss (task 2 gets `1 - alpha`). Defaults to
            the module-level `ALPHA` when None.

    Returns:
        `(loss1, loss2, total_loss)` as Python floats.
    """
    if alpha is None:
        alpha = ALPHA

    model.train()
    optimizer.zero_grad()
    out1, out2 = model(data.x, data.edge_index)

    mask1 = data.train_mask & (data.y1 != -1)
    mask2 = data.y2 != -1  # full y2

    # When a task has no labelled node, use a zero that is still attached to the
    # graph: a plain Python 0 has neither .item() nor .backward().
    loss1 = loss_fn(out1[mask1], data.y1[mask1]) if mask1.any() else out1.sum() * 0.0
    loss2 = loss_fn(out2[mask2], data.y2[mask2]) if mask2.any() else out2.sum() * 0.0
    total_loss = alpha * loss1 + (1 - alpha) * loss2

    total_loss.backward()
    optimizer.step()
    return loss1.item(), loss2.item(), total_loss.item()

def evaluate_y1_only(model, data, mask, loss_fn):
    """Evaluate the first task on the nodes selected by `mask`.

    Only nodes inside `mask` that have a y1 label (y1 != -1) are scored.

    Args:
        model: module returning `(out1, out2)` from `(x, edge_index)`.
        data: object with `x`, `edge_index` and `y1`.
        mask: boolean node mask (e.g. validation or test mask).
        loss_fn: loss called as `loss_fn(predictions, targets)`.

    Returns:
        `(accuracy, auprc, loss)`. AUPRC is 0 when the selection contains no
        positive (label 1) node; all three are 0 when the selection is empty.
    """
    model.eval()
    with torch.no_grad():
        out1, _ = model(data.x, data.edge_index)
        mask1 = mask & (data.y1 != -1)

        # Nothing to score: the original fell back to the int 0 and then
        # crashed on `.item()`.
        if not mask1.any():
            return 0.0, 0.0, 0.0

        logits = out1[mask1]
        targets = data.y1[mask1]

        loss1 = loss_fn(logits, targets).item()
        acc1 = (logits.argmax(dim=1) == targets).float().mean().item()

        auprc1 = 0.0
        # Average precision is only computed when at least one positive exists.
        if (targets == 1).any():
            probs1 = F.softmax(logits, dim=1)
            auprc1 = average_precision_score(targets.cpu(), probs1[:, 1].cpu())
    return acc1, auprc1, loss1

def oversample_features(x, y, idx):
    """Balance classes by appending repeated rows of the under-represented classes.

    Class counts are taken over `y[idx]`. For every class with fewer members than
    the largest class, its member indices are cycled until the gap is filled and
    the corresponding rows of `x` / `y` are appended.

    Args:
        x: feature tensor, indexed along its first dimension.
        y: label tensor aligned with `x`.
        idx: index tensor selecting the rows to balance.

    Returns:
        `(x_out, y_out)`: the inputs followed by the appended rows, or the inputs
        themselves when the classes are already balanced.
    """
    subset_labels = y[idx]
    counts = Counter(subset_labels.cpu().numpy())
    target_size = max(counts.values())

    extra_x, extra_y = [], []
    for cls, n_cls in counts.items():
        deficit = target_size - n_cls
        if deficit <= 0:
            continue
        members = idx[subset_labels == cls]
        # Tile the member indices often enough to cover the deficit, then trim.
        tiles = deficit // len(members) + 1
        picked = members.repeat(tiles)[:deficit]
        extra_x.append(x[picked])
        extra_y.append(y[picked])

    if not extra_x:
        return x, y
    return torch.cat([x, torch.cat(extra_x)]), torch.cat([y, torch.cat(extra_y)])

def run_kfold_training_with_pretrained_encoder(data, pretrained_encoder_path, hidden_channels=32):
    """Fine-tune a two-head GCN from a pretrained encoder with 5-fold CV on label 1.

    For every fold the nodes labelled for both tasks are split into
    train / validation / test (stratified on y1). The minority class of the
    training nodes is oversampled by appending duplicated feature rows as extra
    nodes; `edge_index` is reused unchanged, so those rows have no edges in it.
    The model is trained with early stopping on the validation loss of task 1
    and scored on the test nodes.

    Args:
        data: graph object with `x`, `edge_index`, `y1`, `y2` and
            `num_node_features`; a label of -1 means "unlabelled".
        pretrained_encoder_path: path of a GCNEncoder state_dict checkpoint.
        hidden_channels: hidden width; must match the checkpoint.

    Returns:
        None. Per-epoch metrics and the cross-fold summary are printed.
    """
    # === 1. Load pretrained encoder ===
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = GCNEncoder(data.num_node_features, hidden_channels).to(device)
    encoder.load_state_dict(torch.load(pretrained_encoder_path, map_location=device))
    print(f"✅ Loaded pretrained encoder từ: {pretrained_encoder_path}")

    # Splitting and oversampling are done on CPU copies so the function works
    # whether `data` currently lives on the CPU or on the GPU.
    x_cpu = data.x.detach().cpu()
    y1_cpu = data.y1.cpu()
    y2_cpu = data.y2.cpu()
    n_original = x_cpu.size(0)

    labeled_np = torch.where((y1_cpu != -1) & (y2_cpu != -1))[0].numpy()
    labels_for_split = y1_cpu.numpy()[labeled_np]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    results = []

    for fold, (train_val_idx, test_idx) in enumerate(skf.split(labeled_np, labels_for_split)):
        print(f"\n📁 Fold {fold+1}/5")

        sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        train_idx, val_idx = next(sss_val.split(train_val_idx, labels_for_split[train_val_idx]))

        # Positions inside the labelled subset -> global node ids.
        train_nodes = torch.from_numpy(labeled_np[train_val_idx[train_idx]])
        val_nodes = torch.from_numpy(labeled_np[train_val_idx[val_idx]])
        test_nodes = torch.from_numpy(labeled_np[test_idx])

        # === 2. Build the model and copy the pretrained GCN layers into it
        model = GCN_MultiHead(data.num_node_features, hidden_channels, out_channels1=2, out_channels2=2).to(device)
        model.conv1.load_state_dict(encoder.conv1.state_dict())
        model.conv2.load_state_dict(encoder.conv2.state_dict())

        # === 3. Prepare the data
        n_train = len(train_nodes)
        x_train = x_cpu[train_nodes]
        y1_train = y1_cpu[train_nodes]
        x_resampled, y1_resampled = oversample_features(x_train, y1_train, torch.arange(n_train))

        # Rows past n_train are the duplicates added by oversampling; they are
        # appended after the original nodes and carry no label for task 2.
        x_extra = x_resampled[n_train:]
        y1_extra = y1_resampled[n_train:]
        x_new = torch.cat([x_cpu, x_extra], dim=0)
        y1_new = torch.cat([y1_cpu, y1_extra], dim=0)
        y2_new = torch.cat([y2_cpu, torch.full((x_extra.size(0),), -1, dtype=torch.long)], dim=0)

        n_total = x_new.size(0)
        train_mask = torch.zeros(n_total, dtype=torch.bool)
        val_mask = torch.zeros(n_total, dtype=torch.bool)
        test_mask = torch.zeros(n_total, dtype=torch.bool)

        train_mask[train_nodes] = True
        # Only the appended oversampled rows (index >= n_original) join the
        # training set. Slicing from len(train_nodes) instead would also mark
        # validation and test nodes as training nodes and leak their labels.
        train_mask[n_original:] = True
        val_mask[val_nodes] = True
        test_mask[test_nodes] = True

        data_new = Data(
            x=x_new.to(device),
            edge_index=data.edge_index.to(device),
            y1=y1_new.to(device),
            y2=y2_new.to(device),
            train_mask=train_mask.to(device),
            val_mask=val_mask.to(device),
            test_mask=test_mask.to(device)
        )

        # === 4. Loss + Optimizer
        classes = np.unique(y1_resampled.cpu().numpy())
        weights = compute_class_weight(class_weight='balanced', classes=classes, y=y1_resampled.cpu().numpy())
        loss_fn = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float32).to(device))
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

        best_val_loss = float('inf')
        patience_counter = 0
        # state_dict() returns references to the live parameters, so the best
        # weights must be cloned; otherwise later updates overwrite them.
        # Snapshot up front so a restore is always possible.
        best_model_state = {k: v.detach().clone() for k, v in model.state_dict().items()}

        # === 5. Fine-tune
        for epoch in range(1, 201):
            loss1, loss2, train_loss = train_one_epoch_dual(model, data_new, optimizer, loss_fn)
            acc1, auprc1, val_loss1 = evaluate_y1_only(model, data_new, data_new.val_mask, loss_fn)

            print(f"Epoch {epoch:03d} | Train Loss1: {loss1:.4f} | Loss2: {loss2:.4f} | Total: {train_loss:.4f} || "
                  f"Val Loss1: {val_loss1:.4f} || Acc1: {acc1:.4f} | AUPRC1: {auprc1:.4f}")

            if val_loss1 < best_val_loss:
                best_val_loss = val_loss1
                best_model_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
                patience_counter = 0
            else:
                patience_counter += 1
            # Early stopping: 40 consecutive epochs without a better validation loss.
            if patience_counter >= 40:
                break

        model.load_state_dict(best_model_state)
        acc1, auprc1, _ = evaluate_y1_only(model, data_new, data_new.test_mask, loss_fn)
        print(f"\n✅ Test Accuracy: {acc1:.4f} | AUPRC1: {auprc1:.4f}")
        results.append((acc1, auprc1))

    acc1s, auprc1s = zip(*results)
    print("\n===== Tổng kết sau 5 fold =====")
    print(f"[Label 1] Accuracy: {np.mean(acc1s):.4f} ± {np.std(acc1s):.4f}")
    print(f"[Label 1] AUPRC:   {np.mean(auprc1s):.4f} ± {np.std(auprc1s):.4f}")
# Fine-tune and evaluate starting from the encoder checkpoint saved at epoch 150.
# hidden_channels must match the value used when that checkpoint was pretrained.
run_kfold_training_with_pretrained_encoder(
    data,
    pretrained_encoder_path="/content/drive/MyDrive/Pretrained_Encoders/BRCA_encoder_epoch150.pth",
    hidden_channels=32
)


✅ Loaded pretrained encoder từ: /content/drive/MyDrive/Pretrained_Encoders/BRCA_encoder_epoch150.pth

📁 Fold 1/5
Epoch 001 | Train Loss1: 0.7301 | Loss2: 0.6855 | Total: 0.7167 || Val Loss1: 0.7459 || Acc1: 0.1061 | AUPRC1: 0.5603
Epoch 002 | Train Loss1: 0.6830 | Loss2: 0.6714 | Total: 0.6795 || Val Loss1: 0.6948 || Acc1: 0.6364 | AUPRC1: 0.5267
Epoch 003 | Train Loss1: 0.6292 | Loss2: 0.6630 | Total: 0.6393 || Val Loss1: 0.6639 || Acc1: 0.8788 | AUPRC1: 0.2048
Epoch 004 | Train Loss1: 0.5959 | Loss2: 0.6561 | Total: 0.6140 || Val Loss1: 0.6348 || Acc1: 0.8939 | AUPRC1: 0.1334
Epoch 005 | Train Loss1: 0.5547 | Loss2: 0.6492 | Total: 0.5831 || Val Loss1: 0.5891 || Acc1: 0.8939 | AUPRC1: 0.2470
Epoch 006 | Train Loss1: 0.4974 | Loss2: 0.6425 | Total: 0.5409 || Val Loss1: 0.5374 || Acc1: 0.8939 | AUPRC1: 0.4567
Epoch 007 | Train Loss1: 0.4426 | Loss2: 0.6373 | Total: 0.5010 || Val Loss1: 0.4842 || Acc1: 0.8636 | AUPRC1: 0.4539
Epoch 008 | Train Loss1: 0.3863 | Loss2: 0.6339 | Total: 0.46