<a href="https://colab.research.google.com/github/daothu2023/generalized_aggregation/blob/main/Thu_Prediction_Gene__PPI_String_gui_Dinh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Install required packages.
# Notebook cell: the lines starting with "!" are shell commands run by the
# notebook front end, not Python statements.
import os
import torch
# Export the installed torch version as an environment variable so that the
# shell commands below can expand ${TORCH} in the wheel-index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
# torch-scatter and torch-sparse are resolved against the wheel index page
# that matches the torch version exported above.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# PyTorch Geometric itself is installed from the GitHub repository.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git


In [None]:
# Mount Google Drive under /content/drive; the CSV paths read later in this
# notebook all live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler
import numpy as np

# ---------- 1. Load data ----------
# Three CSV files on the mounted Drive:
#   * the PPI edge list (columns 'protein1' and 'protein2' are read below),
#   * the per-gene feature table (its first column becomes the index),
#   * the label table (columns 'Gene' and 'Labels' are read below).
edges_df = pd.read_csv('/content/drive/My Drive/PPI_STRING/ppi_for_gnn_filled.csv')
features_df = pd.read_csv('/content/drive/My Drive/PPI_STRING/features_for_BRCA.csv', index_col=0)
labels_df = pd.read_csv('/content/drive/My Drive/PPI_STRING/BRCA_labels(0_1).csv')

# ---------- 2. Node list: every gene that appears in the PPI ----------
# Incomplete edges are dropped *before* the node names are collected. Otherwise
# a NaN endpoint would become a node of its own and sorted() would fail when
# comparing it with the string gene names.
edges = edges_df[['protein1', 'protein2']].dropna()
genes_from_edges = set(edges['protein1']).union(edges['protein2'])
genes_from_features = set(features_df.index)
all_genes = sorted(genes_from_edges)  # sorted => node order is reproducible

# Mapping gene <-> node index
node_to_idx = {gene: i for i, gene in enumerate(all_genes)}
idx_to_node = {i: gene for gene, i in node_to_idx.items()}

# ---------- 3. edge_index ----------
# Every endpoint is in node_to_idx by construction, so the lookup can be done
# column-wise. The result has shape (2, num_edges), also when there are no edges.
# NOTE(review): edges are kept exactly as listed in the CSV (one direction per
# row). If the file lists each interaction only once, the graph may need to be
# symmetrised before message passing — confirm against the file.
src_idx = edges['protein1'].map(node_to_idx).to_numpy(dtype=np.int64)
dst_idx = edges['protein2'].map(node_to_idx).to_numpy(dtype=np.int64)
edge_index = torch.from_numpy(np.vstack([src_idx, dst_idx]))

# ---------- 4. Node feature matrix x ----------
feature_dim = features_df.shape[1]
x_matrix = np.zeros((len(all_genes), feature_dim))
has_feature = np.zeros(len(all_genes), dtype=bool)

# Standardise every feature column (fitted on all rows of the feature table).
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_df.values)
features_scaled_df = pd.DataFrame(features_scaled, index=features_df.index)

# Copy the scaled rows of genes that are nodes of the graph. Iterating rows by
# position keeps this well-defined even if a gene occurs twice in the index
# (the last occurrence wins).
for gene, scaled_row in zip(features_df.index, features_scaled):
    idx = node_to_idx.get(gene)
    if idx is not None:
        x_matrix[idx] = scaled_row
        has_feature[idx] = True

# Adjacency lists; each edge is recorded at both of its endpoints.
neighbors_dict = {i: [] for i in range(len(all_genes))}
for u, v in zip(src_idx.tolist(), dst_idx.tolist()):
    neighbors_dict[u].append(v)
    neighbors_dict[v].append(u)

# Nodes without their own features get the mean of those neighbours that do
# have features. Only original features are averaged (has_feature is not
# updated here), so the result does not depend on the processing order.
for i in np.flatnonzero(~has_feature).tolist():
    nbrs = np.asarray(neighbors_dict[i], dtype=np.int64)
    nbrs = nbrs[has_feature[nbrs]]
    if nbrs.size:
        x_matrix[i] = x_matrix[nbrs].mean(axis=0)
    # otherwise no neighbour has features: the row stays all zeros

x = torch.tensor(x_matrix, dtype=torch.float)

# ---------- 5. Labels ----------
# -1 marks "no label"; genes of the label table that are not nodes are ignored.
labels_map = dict(zip(labels_df['Gene'], labels_df['Labels']))
y = torch.full((x.size(0),), -1, dtype=torch.long)

for gene, label in labels_map.items():
    idx = node_to_idx.get(gene)
    if idx is not None:
        y[idx] = int(label)

# ---------- 6. Data object ----------
# No masks are attached here; the cross-validation code builds them per fold.
data = Data(x=x, edge_index=edge_index, y=y)


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import average_precision_score
import numpy as np

# GCN Model
class GCN(torch.nn.Module):
    """Two graph-convolution layers followed by a two-layer linear head.

    Args:
        in_channels: size of the input node features.
        hidden_channels: width of both graph layers and of the first linear layer.
        out_channels: size of the per-node output (raw scores, no softmax).
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # Graph part: in -> hidden -> hidden.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        # Dense head: hidden -> hidden -> out.
        self.fc1 = torch.nn.Linear(hidden_channels, hidden_channels)
        self.fc2 = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the raw per-node scores for features `x` on graph `edge_index`."""
        hidden = F.relu(self.conv1(x, edge_index))
        hidden = F.relu(self.conv2(hidden, edge_index))
        hidden = F.relu(self.fc1(hidden))
        # The last layer has no activation: callers apply softmax / the loss.
        return self.fc2(hidden)

# Evaluate
def evaluate(model, data, mask, loss_fn):
    """Score `model` on the labelled nodes selected by `mask`.

    Nodes whose label is -1 are excluded even if `mask` selects them.

    Args:
        model: callable as ``model(data.x, data.edge_index)`` returning per-node scores.
        data: object exposing ``x``, ``edge_index`` and ``y``.
        mask: boolean node mask.
        loss_fn: loss taking ``(scores, targets)``.

    Returns:
        Tuple ``(accuracy, auprc, loss)``. The AUPRC uses the softmax
        probability of column 1 as the score of the positive class.
    """
    model.eval()
    with torch.no_grad():
        selected = mask & (data.y != -1)
        logits = model(data.x, data.edge_index)
        class_probs = F.softmax(logits, dim=1)

        sel_logits = logits[selected]
        targets = data.y[selected]

        loss = loss_fn(sel_logits, targets).item()
        n_correct = (sel_logits.argmax(dim=1) == targets).sum().item()
        acc = n_correct / selected.sum().item()

        positive_scores = class_probs[selected][:, 1]
        auprc = average_precision_score(targets.cpu(), positive_scores.cpu())
    return acc, auprc, loss

# Train
def train_one_epoch(model, data, optimizer, loss_fn):
    """Do one full-graph optimisation step and return the training loss.

    The loss is computed only on nodes that are in ``data.train_mask`` and
    carry a label (``data.y != -1``).

    Args:
        model: callable as ``model(data.x, data.edge_index)``.
        data: object exposing ``x``, ``edge_index``, ``y`` and ``train_mask``.
        optimizer: optimiser over the parameters of `model`.
        loss_fn: loss taking ``(scores, targets)``.

    Returns:
        The loss value of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    labelled_train = data.train_mask & (data.y != -1)
    logits = model(data.x, data.edge_index)
    step_loss = loss_fn(logits[labelled_train], data.y[labelled_train])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

# Oversampling helper
def oversample_features(x, y, idx):
    """Balance the classes among `idx` by repeating minority-class rows.

    For every class that is smaller than the largest class among ``y[idx]``,
    rows of that class are repeated cyclically (in the order they appear in
    `idx`) until the class is as large as the largest one. The repeated rows
    are appended after the original rows; nothing is shuffled or removed.

    Args:
        x: tensor of per-row features, indexed by the values in `idx`.
        y: tensor of per-row integer labels, indexed by the values in `idx`.
        idx: 1-D long tensor with the row positions to balance.

    Returns:
        Tuple ``(x_out, y_out)``: `x` and `y` followed by the repeated rows.
        When nothing has to be added (empty `idx` or already balanced
        classes) `x` and `y` are returned unchanged.
    """
    from collections import Counter

    # Nothing to balance; also avoids max() on an empty sequence below.
    if idx.numel() == 0:
        return x, y

    idx_labels = y[idx]
    # Counter keeps first-occurrence order, which fixes the order in which the
    # classes contribute their repeated rows.
    class_counts = Counter(idx_labels.tolist())
    majority_size = max(class_counts.values())

    extra_rows = []
    for cls, count in class_counts.items():
        deficit = majority_size - count
        if deficit == 0:
            continue
        members = idx[idx_labels == cls]
        # Tile the members often enough to cover the deficit, then cut to size.
        n_tiles = -(-deficit // count)
        extra_rows.append(members.repeat(n_tiles)[:deficit])

    if not extra_rows:
        return x, y

    picked = torch.cat(extra_rows)
    return torch.cat([x, x[picked]]), torch.cat([y, y[picked]])
# Module-level training settings.
# lr and weight_decay are passed to the Adam optimiser by the training code.
lr=0.01
weight_decay=5e-4
# Presumably the hidden width to pass to the model — confirm at the call site.
hidden_channels = 32
# Used by the training code as the bound of the epoch loop.
epoch = 200

# Run training
def run_gcn_with_oversampling(data, hidden_channels, max_epochs=None, patience=40):
    """Cross-validate the GCN with minority-class oversampling and print the scores.

    The labelled nodes (``data.y != -1``) are split into 5 stratified folds.
    In each fold the non-test part is split again (80/20, stratified) into
    training and validation nodes. Minority-class training nodes are
    duplicated with `oversample_features`; the duplicates are appended as
    extra nodes that have no incident edges in ``edge_index``. A fresh model
    is trained with early stopping on the validation loss, and the weights
    with the lowest validation loss are used for the test score.

    `data` itself is not modified: every fold works on a shallow copy, so
    oversampled nodes of one fold can never reach another fold.

    Args:
        data: graph object exposing ``x``, ``edge_index``, ``y`` and
            ``num_node_features``; label -1 means "unlabelled".
        hidden_channels: hidden width passed to `GCN`.
        max_epochs: maximum number of training epochs per fold. ``None``
            (default) uses the module-level ``epoch`` setting.
        patience: number of consecutive epochs without a new best validation
            loss after which training of a fold stops.

    Returns:
        None. Per-epoch metrics, per-fold test scores and the mean/std summary
        are printed.
    """
    import copy  # local import: not part of the cell's import list

    # The epoch loop below uses its own variable name. Reusing the name `epoch`
    # for it would make `epoch` local to this function and the module-level
    # setting unreadable (UnboundLocalError).
    if max_epochs is None:
        max_epochs = epoch

    n_original = data.x.size(0)
    labels_np = data.y.cpu().numpy()
    labeled_idx = torch.where(data.y != -1)[0].cpu().numpy()
    labeled_y = labels_np[labeled_idx]

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    results = []

    for fold, (train_val_pos, test_pos) in enumerate(skf.split(labeled_idx, labeled_y)):
        print(f"\n📂 Fold {fold+1}/5")

        # Positions returned by the splitters index into `labeled_idx`
        # (outer split) and into `train_val_pos` (inner split) respectively.
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        train_pos, val_pos = next(sss.split(train_val_pos, labeled_y[train_val_pos]))

        train_val_nodes = labeled_idx[train_val_pos]
        train_nodes = torch.from_numpy(train_val_nodes[train_pos])
        val_nodes = torch.from_numpy(train_val_nodes[val_pos])
        test_nodes = torch.from_numpy(labeled_idx[test_pos])

        # Oversample within the training nodes only.
        x_train = data.x[train_nodes]
        y_train = data.y[train_nodes]
        x_resampled, y_resampled = oversample_features(
            x_train, y_train, torch.arange(len(train_nodes)))
        x_extra = x_resampled[len(train_nodes):]
        y_extra = y_resampled[len(train_nodes):]

        # Per-fold view of the graph: original nodes followed by the duplicates.
        fold_data = copy.copy(data)
        fold_data.x = torch.cat([data.x, x_extra], dim=0)
        fold_data.y = torch.cat([data.y, y_extra], dim=0)

        n_total = fold_data.x.shape[0]
        train_mask = torch.zeros(n_total, dtype=torch.bool)
        val_mask = torch.zeros(n_total, dtype=torch.bool)
        test_mask = torch.zeros(n_total, dtype=torch.bool)

        train_mask[train_nodes] = True
        # Only the appended duplicates (positions >= n_original) join the
        # training set; validation and test nodes must stay out of it.
        train_mask[n_original:] = True
        val_mask[val_nodes] = True
        test_mask[test_nodes] = True

        fold_data.train_mask = train_mask
        fold_data.val_mask = val_mask
        fold_data.test_mask = test_mask

        # Weighted loss
        y_train_all = torch.cat([y_train, y_extra]).cpu().numpy()
        weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_all), y=y_train_all)
        loss_fn = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float32))

        model = GCN(fold_data.num_node_features, hidden_channels, out_channels=2)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

        best_val_loss = float('inf')
        # state_dict() hands out references to the live parameters, so the best
        # weights have to be stored as a copy. Starting from the initial state
        # keeps `best_state` defined even if the loss never improves.
        best_state = copy.deepcopy(model.state_dict())
        patience_counter = 0
        for epoch_num in range(1, max_epochs + 1):
            train_one_epoch(model, fold_data, optimizer, loss_fn)
            train_acc, train_auprc, _ = evaluate(model, fold_data, train_mask, loss_fn)
            val_acc, val_auprc, val_loss = evaluate(model, fold_data, val_mask, loss_fn)

            print(f"Epoch {epoch_num:03d} | Train Acc: {train_acc:.4f} | AUPRC: {train_auprc:.4f} || "
                  f"Val Acc: {val_acc:.4f} | AUPRC: {val_auprc:.4f} | Val Loss: {val_loss:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_state = copy.deepcopy(model.state_dict())
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    break

        model.load_state_dict(best_state)
        test_acc, test_auprc, _ = evaluate(model, fold_data, test_mask, loss_fn)
        print(f"✅ Test Accuracy: {test_acc:.4f} | AUPRC: {test_auprc:.4f}")
        results.append((test_acc, test_auprc))

    accs, auprcs = zip(*results)
    print("\n📊 Tổng kết sau 5 fold:")
    print(f"Accuracy: {np.mean(accs):.4f} ± {np.std(accs):.4f}")
    print(f"AUPRC:    {np.mean(auprcs):.4f} ± {np.std(auprcs):.4f}")

# Run the experiment. hidden_channels is a required argument of the function,
# so the module-level setting is passed explicitly.
run_gcn_with_oversampling(data, hidden_channels)
