<a href="https://colab.research.google.com/github/daothu2023/generalized_aggregation/blob/main/Prediction_Gene__PPI_String_pretrain_gui_DInh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Install required packages.
# The installed torch version is exported as the TORCH environment variable so
# that the pip commands below can substitute it into the wheel index URL.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
# torch-scatter / torch-sparse: looked up via the data.pyg.org wheel index for this torch version.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# PyTorch Geometric itself is installed from the GitHub repository.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git


2.6.0+cu124
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and the saved model
# checkpoints used by the later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data

# Load the PPI edge list, the per-gene feature table and the gene labels.
edges_df = pd.read_csv('/content/drive/My Drive/PPI_STRING/ppi_for_gnn_filled.csv')
features_df = pd.read_csv('/content/drive/My Drive/PPI_STRING/features_for_CESC.csv', index_col=0)
labels_df = pd.read_csv('/content/drive/My Drive/PPI_STRING/CESC_labels(0_1).csv')

# Node set = every gene appearing on either side of an edge, sorted so that the
# gene -> index mapping is deterministic.
genes_from_edges = set(edges_df['protein1']) | set(edges_df['protein2'])
all_genes = sorted(genes_from_edges)
num_nodes = len(all_genes)
node_to_idx = dict(zip(all_genes, range(num_nodes)))
idx_to_node = dict(enumerate(all_genes))

# edge_index: one column per edge, rows are (source index, target index).
edges = edges_df[['protein1', 'protein2']].dropna()
edge_pairs = []
for gene_a, gene_b in edges.values:
    if gene_a in node_to_idx and gene_b in node_to_idx:
        edge_pairs.append([node_to_idx[gene_a], node_to_idx[gene_b]])
edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()

# Feature matrix, zero-filled as a placeholder; has_feature records which rows
# received real (still unnormalised) values from the feature table.
feature_dim = features_df.shape[1]
x_matrix = np.zeros((num_nodes, feature_dim))
has_feature = np.zeros(num_nodes, dtype=bool)

for gene in features_df.index:
    idx = node_to_idx.get(gene)
    if idx is None:
        # Gene has features but is not part of the graph.
        continue
    x_matrix[idx] = features_df.loc[gene].values
    has_feature[idx] = True

# Undirected adjacency lists, used to impute features for nodes without any.
neighbors_dict = {node: [] for node in range(num_nodes)}
for src, dst in edge_index.t().tolist():
    neighbors_dict[src].append(dst)
    neighbors_dict[dst].append(src)

# A node without features gets the mean of its neighbours that have real
# features. has_feature is not updated, so imputed rows never feed other rows.
for i in np.flatnonzero(~has_feature).tolist():
    neighbor_feats = [x_matrix[n] for n in neighbors_dict[i] if has_feature[n]]
    if neighbor_feats:
        x_matrix[i] = np.mean(neighbor_feats, axis=0)

# Labels: -1 marks a node with no label; labelled genes get their integer label.
labels_map = {}
for _, row in labels_df.iterrows():
    labels_map[row['Gene']] = row['Labels']

y = torch.full((x_matrix.shape[0],), -1, dtype=torch.long)
for gene, label in labels_map.items():
    if gene not in node_to_idx:
        continue
    y[node_to_idx[gene]] = int(label)

# Convert the feature matrix to a tensor (deliberately NOT normalised yet).
x = torch.as_tensor(x_matrix, dtype=torch.float)
data = Data(x=x, edge_index=edge_index, y=y)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, average_precision_score
from collections import Counter
import numpy as np

# ======================= 1. Định nghĩa các mô hình =======================
class GCNEncoder(nn.Module):
    """Two GCN layers and a linear layer, followed by a projection back to the input width.

    The output has ``in_channels`` columns, i.e. the same width as the input
    features, so it can be compared against them directly.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        # Layers are created in this fixed order: conv1, conv2, fc1, out_proj.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc1 = nn.Linear(hidden_channels, hidden_channels)
        self.out_proj = nn.Linear(hidden_channels, in_channels)

    def forward(self, x, edge_index):
        """Return the projected output for features ``x`` on graph ``edge_index``."""
        hidden = x
        # Both graph convolutions are followed by a ReLU.
        for graph_layer in (self.conv1, self.conv2):
            hidden = graph_layer(hidden, edge_index).relu()
        hidden = self.fc1(hidden).relu()
        return self.out_proj(hidden)

class GCN(nn.Module):
    """Two GCN layers and a linear layer, followed by a linear output head.

    The ``conv1``/``conv2``/``fc1`` attributes have the same names and shapes
    as those of ``GCNEncoder``; ``fc2`` maps to ``out_channels`` outputs.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # Layers are created in this fixed order: conv1, conv2, fc1, fc2.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc1 = nn.Linear(hidden_channels, hidden_channels)
        self.fc2 = nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return raw (un-normalised) outputs of width ``out_channels`` per node."""
        first = torch.relu(self.conv1(x, edge_index))
        second = torch.relu(self.conv2(first, edge_index))
        dense = torch.relu(self.fc1(second))
        logits = self.fc2(dense)
        return logits

# ======================= 2. Hàm huấn luyện & đánh giá =======================
def evaluate(model, data, mask, loss_fn):
    """Score ``model`` on the labelled nodes selected by ``mask``.

    Nodes whose label is -1 are dropped from ``mask`` before scoring. The
    model is put in eval mode and run without gradient tracking.

    Returns:
        ``(accuracy, auprc, loss)``. ``auprc`` is the average precision
        computed from the softmax probability of column 1; ``loss`` is
        ``loss_fn`` on the selected nodes as a Python float.
    """
    model.eval()
    labelled = mask & (data.y != -1)
    with torch.no_grad():
        logits = model(data.x, data.edge_index)
        selected_logits = logits[labelled]
        targets = data.y[labelled]
        loss_value = loss_fn(selected_logits, targets).item()
        # Probability of class index 1, used as the ranking score for AUPRC.
        positive_scores = F.softmax(logits, dim=1)[labelled][:, 1]
        predicted = selected_logits.argmax(dim=1)
        accuracy = accuracy_score(targets.cpu(), predicted.cpu())
        pr_auc = average_precision_score(targets.cpu(), positive_scores.cpu())
    return accuracy, pr_auc, loss_value

def train_one_epoch(model, data, optimizer, loss_fn):
    """Run one full-batch optimisation step and return the training loss.

    The loss is computed only on nodes that are in ``data.train_mask`` and
    whose label is not -1.

    Returns:
        The loss value (Python float) computed before the optimiser step.
    """
    model.train()
    optimizer.zero_grad()

    # Supervised nodes: in the training mask AND carrying a real label.
    supervised = data.y.ne(-1) & data.train_mask
    targets = data.y[supervised]

    logits = model(data.x, data.edge_index)
    step_loss = loss_fn(logits[supervised], targets)
    step_loss.backward()
    optimizer.step()

    return step_loss.item()

def oversample_features(x, y, idx):
    """Balance classes by appending repeated rows of the under-represented classes.

    Class counts are taken over ``y[idx]``. For every class, enough rows
    (cycling through that class's members in ``idx`` order) are appended to
    bring it up to the size of the largest class.

    Returns:
        ``(x_out, y_out)``: the full ``x`` / ``y`` followed by the extra rows,
        grouped by class in order of first appearance in ``y[idx]``.
    """
    selected_labels = y[idx]
    class_counts = Counter(selected_labels.tolist())
    target_size = max(class_counts.values())

    extra_x = []
    extra_y = []
    for cls, count in class_counts.items():
        members = idx[selected_labels == cls]
        deficit = target_size - count
        # Tile the member indices often enough to cover the deficit, then trim.
        tiles = deficit // len(members) + 1
        picked = members.repeat(tiles)[:deficit]
        extra_x.append(x[picked])
        extra_y.append(y[picked])

    return torch.cat([x, *extra_x]), torch.cat([y, *extra_y])

# ======================= 3. Hàm chính: pretrain + fine-tune =======================
def run_pretrain_finetune_pipeline(data_raw, hidden_channels=32):
    """Run 5-fold cross-validation of masked-feature pretraining + GCN fine-tuning.

    For every fold the function:
      1. splits the labelled nodes into train / validation / test,
      2. standardises all node features with a scaler fitted on the training nodes,
      3. pretrains a ``GCNEncoder`` to reconstruct the features from a randomly
         zero-masked copy (no labels used),
      4. copies the pretrained ``conv1``/``conv2``/``fc1`` weights into a ``GCN``,
      5. oversamples the minority class of the training nodes and appends the
         duplicated rows to the node feature / label tensors,
      6. fine-tunes with early stopping on validation loss, saves the best
         weights to Drive and evaluates them on the test nodes.

    Args:
        data_raw: graph object exposing ``x`` (node features), ``edge_index``
            and ``y`` (node labels, where -1 marks an unlabelled node).
        hidden_channels: hidden width used by both the encoder and the classifier.

    Returns:
        None. Per-epoch metrics, per-fold test metrics and the final
        mean ± std of accuracy and AUPRC are printed.
    """
    # Local import keeps this function self-contained.
    import copy

    labeled_idx = torch.where(data_raw.y != -1)[0]
    labeled_y = data_raw.y[labeled_idx]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=45)

    accs, auprcs = [], []

    for fold, (train_val_idx, test_idx) in enumerate(skf.split(labeled_idx, labeled_y)):
        print(f"\n📂 Fold {fold+1}/5")

        # Carve a stratified 20% validation set out of the train+val portion.
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=fold+1)
        train_idx, val_idx = next(sss.split(train_val_idx, labeled_y[train_val_idx]))

        # Map fold-local positions back to node indices in the graph.
        train_nodes = labeled_idx[train_val_idx][train_idx]
        val_nodes = labeled_idx[train_val_idx][val_idx]
        test_nodes = labeled_idx[test_idx]

        # 1. Standardise ALL node features with a scaler fitted on TRAIN nodes only.
        x_np = data_raw.x.cpu().numpy()
        scaler = StandardScaler()
        x_scaled = scaler.fit(data_raw.x[train_nodes].cpu()).transform(x_np)
        data = Data(x=torch.tensor(x_scaled, dtype=torch.float32),
                    edge_index=data_raw.edge_index.clone(),
                    y=data_raw.y.clone())

        # 2. Pretrain on every node (labels unused): reconstruct the features
        #    from a copy in which ~15% of the entries are zeroed at random.
        encoder = GCNEncoder(data.num_node_features, hidden_channels)
        optimizer = torch.optim.Adam(encoder.parameters(), lr=0.01)
        for epoch in range(200):
            encoder.train()
            optimizer.zero_grad()
            x_masked = data.x.clone()
            mask = torch.rand_like(x_masked) < 0.15
            x_masked[mask] = 0
            out = encoder(x_masked, data.edge_index)
            loss = F.mse_loss(out, data.x)
            loss.backward()
            optimizer.step()

        # 3. Build the classifier and initialise its shared layers from the encoder.
        model = GCN(data.num_node_features, hidden_channels, out_channels=2)
        model.conv1.load_state_dict(encoder.conv1.state_dict())
        model.conv2.load_state_dict(encoder.conv2.state_dict())
        model.fc1.load_state_dict(encoder.fc1.state_dict())

        # 4. Oversample the training nodes and build the train/val/test masks.
        x_train = data.x[train_nodes]
        y_train = data.y[train_nodes]
        x_resampled, y_resampled = oversample_features(x_train, y_train, torch.arange(len(train_nodes)))

        # Number of real graph nodes; the oversampled rows are appended after
        # them and have no entries in edge_index.
        num_original_nodes = data.x.shape[0]
        data.x = torch.cat([data.x, x_resampled[len(train_nodes):]], dim=0)
        data.y = torch.cat([data.y, y_resampled[len(train_nodes):]], dim=0)

        n_total = data.x.shape[0]
        train_mask = torch.zeros(n_total, dtype=torch.bool)
        val_mask = torch.zeros(n_total, dtype=torch.bool)
        test_mask = torch.zeros(n_total, dtype=torch.bool)

        train_mask[train_nodes] = True
        # Only the appended oversampled rows join the training set. Slicing from
        # len(train_nodes) instead would also flag validation and test nodes
        # and leak their labels into training.
        train_mask[num_original_nodes:] = True
        val_mask[val_nodes] = True
        test_mask[test_nodes] = True

        data.train_mask = train_mask
        data.val_mask = val_mask
        data.test_mask = test_mask

        # 5. Fine-tune. Class weights are computed on the oversampled training labels.
        y_train_all = torch.cat([y_train, y_resampled[len(train_nodes):]]).cpu().numpy()
        weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_all), y=y_train_all)
        loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float32))

        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
        best_val_loss = float('inf')
        patience_counter = 0
        # Always defined, even if the validation loss never improves (e.g. NaN).
        best_model = copy.deepcopy(model.state_dict())

        for epoch in range(1, 201):
            train_loss = train_one_epoch(model, data, optimizer, loss_fn)
            train_acc, train_auprc, _ = evaluate(model, data, train_mask, loss_fn)
            val_acc, val_auprc, val_loss = evaluate(model, data, val_mask, loss_fn)

            print(f"Epoch {epoch:03d} | Train Acc: {train_acc:.4f} | AUPRC: {train_auprc:.4f} || "
                  f"Val Acc: {val_acc:.4f} | AUPRC: {val_auprc:.4f} | Val Loss: {val_loss:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # state_dict() returns references to the live parameters, so a
                # deep copy is needed to freeze the weights of the best epoch.
                best_model = copy.deepcopy(model.state_dict())
                patience_counter = 0
            else:
                patience_counter += 1
            # Early stopping: give up after 40 epochs without improvement.
            if patience_counter >= 40:
                break

        # Save the best weights of this fold.
        model_path = f"/content/drive/My Drive/PPI_STRING/Best_model_pretrain/CESCrun2_best_model_fold{fold+1}.pth"
        torch.save(best_model, model_path)
        print(f"💾 Mô hình tốt nhất Fold {fold+1} đã được lưu vào: {model_path}")

        # 6. Test with the best (lowest validation loss) weights.
        model.load_state_dict(best_model)
        test_acc, test_auprc, _ = evaluate(model, data, test_mask, loss_fn)
        print(f"✅ Test Accuracy: {test_acc:.4f} | AUPRC: {test_auprc:.4f}")
        accs.append(test_acc)
        auprcs.append(test_auprc)

    print("\n📊 Tổng kết sau 5 fold:")
    print(f"Accuracy: {np.mean(accs):.4f} ± {np.std(accs):.4f}")
    print(f"AUPRC:    {np.mean(auprcs):.4f} ± {np.std(auprcs):.4f}")


In [None]:
# Run the cross-validated pretrain + fine-tune pipeline on the graph built in
# the data-preparation cell, with the default hidden width.
run_pretrain_finetune_pipeline(data)