In [1]:
import torch
from torch.nn import Sequential, Linear, ReLU, BatchNorm1d
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn.functional as F
from torch.nn import Linear, Sequential, ReLU, BatchNorm1d
from torch_geometric.data import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
import numpy as np
import os
import pandas as pd
import glob
import random

  from .autonotebook import tqdm as notebook_tqdm


## Model construction for GCN

In [2]:
class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    The tracker is called once per epoch with the current validation loss.
    Whenever the loss improves, the model's ``state_dict`` is written to
    ``path``; otherwise a counter is incremented, and ``early_stop`` is set
    once the counter reaches ``patience``.

    Args:
        patience: Number of consecutive non-improving epochs tolerated
            before ``early_stop`` becomes True.
        verbose: If True, print a message every time a checkpoint is saved.
        delta: Margin added to the best score; an epoch counts as an
            improvement only when ``-val_loss >= best_score + delta``.
        path: Destination file for the checkpoint (defaults to the
            historical ``'checkpoint.pt'`` in the working directory).
    """

    def __init__(self, patience=10, verbose=False, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # ``np.Inf`` was removed in NumPy 2.0; ``np.inf`` works on every version.
        self.val_loss_min = np.inf
        self.delta = delta
        self.best_epoch = None
        self.path = path

    def __call__(self, val_loss, model, epoch):
        """Record one epoch's validation loss and update the stopping state.

        Args:
            val_loss: Validation loss of the current epoch (lower is better).
            model: Model whose ``state_dict`` is checkpointed on improvement.
            epoch: Epoch index, stored in ``best_epoch`` on improvement.
        """
        # Work with a score where "higher is better".
        score = -val_loss
        if self.best_score is None:
            # First call: always becomes the reference point.
            self.best_score = score
            self.best_epoch = epoch
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # No sufficient improvement this epoch.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.best_epoch = epoch
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Write ``model.state_dict()`` to ``self.path`` and remember ``val_loss``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

class GCN(torch.nn.Module):
    """Graph-level predictor: two GCN layers, mean pooling, three-layer MLP head.

    Args:
        num_node_features: Width of the per-node input feature vector.
        num_classes: Width of the final linear layer's output.
    """

    def __init__(self, num_node_features, num_classes):
        super().__init__()
        self.build_model(num_node_features, num_classes)

    def build_model(self, num_node_features, num_classes):
        """Instantiate every layer (called once from ``__init__``)."""
        hidden = 64

        # Message-passing stage: each convolution is followed by batch norm.
        self.conv1 = GCNConv(num_node_features, hidden)
        self.bn1 = BatchNorm1d(hidden)
        self.conv2 = GCNConv(hidden, hidden)
        self.bn2 = BatchNorm1d(hidden)

        # Read-out head applied to the pooled graph embedding.
        self.fc1 = Linear(hidden, hidden)
        self.fc2 = Linear(hidden, 32)
        self.fc3 = Linear(32, num_classes)

    def forward(self, data):
        """Map a batch object exposing ``x``, ``edge_index`` and ``batch`` to predictions."""
        node_feats, edges, graph_ids = data.x, data.edge_index, data.batch

        # conv -> ReLU -> batch norm, with dropout only after the first layer.
        node_feats = self.bn1(F.relu(self.conv1(node_feats, edges)))
        node_feats = F.dropout(node_feats, p=0.2, training=self.training)
        node_feats = self.bn2(F.relu(self.conv2(node_feats, edges)))

        # Collapse node embeddings into one vector per graph.
        graph_feats = global_mean_pool(node_feats, graph_ids)

        for hidden_layer in (self.fc1, self.fc2):
            graph_feats = F.relu(hidden_layer(graph_feats))
        return self.fc3(graph_feats)

## Model training and evaluation

In [3]:
def r2_score(predictions, targets):
    """Coefficient of determination: ``1 - SS_res / SS_tot`` over all elements.

    Args:
        predictions: Tensor of predicted values.
        targets: Tensor of reference values (its mean defines ``SS_tot``).

    Returns:
        A 0-dim tensor holding the R² value.
    """
    centered = targets - targets.mean()
    residual = targets - predictions
    total_sum_sq = centered.pow(2).sum()
    residual_sum_sq = residual.pow(2).sum()
    return 1 - residual_sum_sq / total_sum_sq

def pcc_score(predictions, targets):
    """Pearson correlation coefficient between predictions and targets.

    Args:
        predictions: Tensor of predicted values; size-1 dimensions are squeezed away.
        targets: Tensor of reference values, squeezed the same way.

    Returns:
        The correlation statistic returned by ``scipy.stats.pearsonr``
        (the p-value is discarded).
    """
    # ``.cpu()`` makes the conversion safe for tensors living on an
    # accelerator; ``.numpy()`` alone raises for non-CPU tensors.
    predicted = predictions.detach().cpu().squeeze().numpy()
    observed = targets.detach().cpu().squeeze().numpy()
    pcc, _ = pearsonr(predicted, observed)
    return pcc

def train(model, optimizer, loader, device):
    """Run one optimisation pass over ``loader`` and report training metrics.

    Args:
        model: Network called as ``model(data)`` on each batch.
        optimizer: Optimiser stepping the model's parameters.
        loader: Iterable of batches exposing ``x``, ``y`` and ``num_graphs``;
            ``loader.dataset`` is used by ``process_results`` for averaging.
        device: Device the batches are moved to.

    Returns:
        Whatever ``process_results`` returns for the accumulated loss,
        labels and predictions of this epoch.
    """
    model.train()
    total_loss = 0
    targets = []
    predictions = []
    for data in loader:
        optimizer.zero_grad()
        data.x = data.x.float()
        data = data.to(device)
        output = model(data)
        # Targets are reshaped to a column so they line up with the model output.
        label = data.y.to(torch.float).view(-1, 1).to(device)
        loss = F.mse_loss(output, label)
        loss.backward()
        optimizer.step()
        # mse_loss is a batch mean; weight it by batch size for a dataset-level mean.
        total_loss += loss.item() * data.num_graphs
        # Detach before storing: the metrics need values only, and keeping the
        # autograd history of every batch alive for the whole epoch is wasteful.
        targets.append(label.detach().cpu())
        predictions.append(output.detach().cpu())
    return process_results(total_loss, targets, predictions, loader)

def evaluate(model, loader, device):
    """Score ``model`` on every batch of ``loader`` without updating weights.

    Args:
        model: Network called as ``model(batch)``; switched to eval mode here.
        loader: Iterable of batches exposing ``x``, ``y`` and ``num_graphs``.
        device: Device the batches are moved to.

    Returns:
        Whatever ``process_results`` returns for the accumulated loss,
        labels and predictions.
    """
    model.eval()
    running_loss = 0
    collected_labels, collected_outputs = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            batch.x = batch.x.float()
            outputs = model(batch)
            # Column-shaped targets, matching the model output.
            labels = batch.y.to(torch.float).view(-1, 1).to(device)
            batch_loss = F.mse_loss(outputs, labels)
            # Batch mean times batch size -> summed squared-error contribution.
            running_loss += batch_loss.item() * batch.num_graphs
            collected_labels.append(labels.cpu())
            collected_outputs.append(outputs.cpu())
    return process_results(running_loss, collected_labels, collected_outputs, loader)

def process_results(total_loss, targets, predictions, loader):
    """Turn per-batch accumulations into epoch-level metrics.

    Args:
        total_loss: Sum of per-batch losses, each weighted by its batch size.
        targets: List of label tensors, one per batch.
        predictions: List of output tensors, one per batch.
        loader: Data loader; ``len(loader.dataset)`` normalises ``total_loss``.

    Returns:
        Tuple ``(mean_loss, rmse, mae, r2, pcc)``.
    """
    y_true = torch.cat(targets, dim=0)
    y_pred = torch.cat(predictions, dim=0)

    rmse = F.mse_loss(y_pred, y_true).sqrt()
    mae = (y_pred - y_true).abs().mean()
    r2 = r2_score(y_pred, y_true)
    pcc = pcc_score(y_pred, y_true)

    mean_loss = total_loss / len(loader.dataset)
    return mean_loss, rmse, mae, r2, pcc


def save_best_results(best_metrics, best_weights, best_results_path, model_path):
    """Persist the best-epoch metrics as a one-row CSV and the weights via ``torch.save``.

    Args:
        best_metrics: Mapping holding ``'epoch'`` and the ``best_<split>_<metric>``
            entries for the val, train and test splits.
        best_weights: Object written to ``model_path`` with ``torch.save``.
        best_results_path: Destination of the CSV file.
        model_path: Destination of the weights file.
    """
    # Column order of the CSV: epoch first, then val, train and test metrics.
    ordered_keys = ('epoch',) + tuple(
        f'best_{split}_{metric}'
        for split in ('val', 'train', 'test')
        for metric in ('loss', 'rmse', 'mae', 'r2', 'pcc')
    )
    # Each value is wrapped in a list so the frame has exactly one row.
    single_row = {key: [best_metrics[key]] for key in ordered_keys}

    pd.DataFrame.from_dict(single_row).to_csv(best_results_path, index=False)
    torch.save(best_weights, model_path)

def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    # Host-side generators.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # GPU generators, only when CUDA is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Disable cuDNN autotuning and ask for deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    

In [5]:
def _to_scalar(value):
    """Return ``value.item()`` for tensors and ``value`` unchanged otherwise."""
    return value.item() if torch.is_tensor(value) else value


def main(data_path, results_path, best_results_path, model_path):
    """Train a GCN on one data fold, log per-epoch metrics and save the best epoch.

    Args:
        data_path: Directory containing ``train.pt``, ``valid.pt`` and ``test.pt``.
        results_path: CSV file receiving one row of rounded metrics per epoch.
        best_results_path: CSV file receiving the metrics of the epoch with the
            lowest validation loss.
        model_path: File receiving the weights of that same epoch.

    Side effects:
        Also writes the final-epoch weights to ``'model.pth'`` and (through
        ``EarlyStopping``) a checkpoint file, both in the working directory.
    """
    set_seed(42)
    train_data = torch.load(os.path.join(data_path, "train.pt"))
    val_data = torch.load(os.path.join(data_path, "valid.pt"))
    test_data = torch.load(os.path.join(data_path, "test.pt"))

    train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=128, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=128, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): 145 is presumably the node-feature width of the stored
    # graphs -- confirm against the dataset before reusing on other data.
    model = GCN(num_node_features=145, num_classes=1).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
    early_stopping = EarlyStopping(patience=50, verbose=True)

    splits = ('train', 'val', 'test')
    # Same order as the tuples returned by train() / evaluate().
    metric_names = ('loss', 'rmse', 'mae', 'r2', 'pcc')

    # Per-epoch log; the column order matches the historical CSV layout:
    # error metrics for every split first, then correlation metrics.
    result_columns = ['epoch']
    result_columns += [f'{split}_{name}' for split in splits for name in ('loss', 'rmse', 'mae')]
    result_columns += [f'{split}_{name}' for split in splits for name in ('r2', 'pcc')]
    results = {column: [] for column in result_columns}

    best_metrics = {
        'best_val_loss': float('inf'),
        'best_val_rmse': float('inf'),
        'best_val_mae': float('inf'),
        'best_val_r2': -float('inf'),
        'best_val_pcc': -float('inf'),
        'epoch': -1
    }
    for split in ('train', 'test'):
        for name in metric_names:
            best_metrics[f'best_{split}_{name}'] = None
    # Stays None until some epoch improves the validation loss.
    best_weights = None

    for epoch in range(1000):
        # Dict literals evaluate in order: train first, then val, then test.
        raw_metrics = {
            'train': train(model, optimizer, train_loader, device),
            'val': evaluate(model, val_loader, device),
            'test': evaluate(model, test_loader, device),
        }
        epoch_metrics = {
            split: {name: _to_scalar(value) for name, value in zip(metric_names, values)}
            for split, values in raw_metrics.items()
        }

        results['epoch'].append(epoch)
        for split in splits:
            for name in metric_names:
                results[f'{split}_{name}'].append(round(epoch_metrics[split][name], 3))

        for split, title in (('train', 'Train'), ('val', 'Validation'), ('test', 'Test')):
            m = epoch_metrics[split]
            print(f"Epoch {epoch}: {title} Loss: {m['loss']:.3f}, RMSE: {m['rmse']:.3f}, MAE: {m['mae']:.3f}, R2: {m['r2']:.3f}, PCC: {m['pcc']:.3f}")

        early_stopping(epoch_metrics['val']['loss'], model, epoch)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        if epoch_metrics['val']['loss'] < best_metrics['best_val_loss']:
            for split in splits:
                for name in metric_names:
                    best_metrics[f'best_{split}_{name}'] = epoch_metrics[split][name]
            best_metrics['epoch'] = epoch
            # state_dict() hands back references to the live parameters, which
            # later optimiser steps overwrite in place. Clone every tensor so
            # the snapshot really is the best epoch, not the last one.
            best_weights = {
                key: tensor.detach().clone()
                for key, tensor in model.state_dict().items()
            }

    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv(results_path, index=False)

    torch.save(model.state_dict(), 'model.pth')
    if best_weights is None:
        # No epoch ever improved (e.g. NaN validation loss): fall back to the
        # final weights rather than failing on an undefined snapshot.
        best_weights = model.state_dict()
    save_best_results(best_metrics, best_weights, best_results_path, model_path)

## GCN training and evaluation

In [7]:
# Root folder holding one sub-directory per cross-validation fold, and the
# root under which per-fold results are written.
base_dir = "/mnt/USR_DATA/ChenGeng/Project/SATCMF/Dataset/10_fold_cv/"
result_base_dir = "/mnt/USR_DATA/ChenGeng/Project/SATCMF/Dataset/"

# glob returns entries in arbitrary order; sorting makes the run order (and
# therefore the log) reproducible.
for fold in sorted(glob.glob(os.path.join(base_dir, "*"))):
    # Stray files next to the fold directories would make main() fail on load.
    if not os.path.isdir(fold):
        continue

    folder_name = os.path.basename(os.path.normpath(fold))
    result_dir = os.path.join(result_base_dir, 'base_gnn_fold_result', folder_name)
    os.makedirs(result_dir, exist_ok=True)

    results_path = os.path.join(result_dir, folder_name + "_GCN_result.csv")
    best_results_path = os.path.join(result_dir, folder_name + "_GCN_best_result.csv")
    model_path = os.path.join(result_dir, folder_name + "_GCN_best_model.pth")

    main(fold, results_path, best_results_path, model_path)



Epoch 0: Train Loss: 144.955, RMSE: 12.040, MAE: 10.398, R2: -2.945, PCC: -0.034
Epoch 0: Validation Loss: 129.838, RMSE: 11.395, MAE: 9.831, R2: -2.889, PCC: 0.316
Epoch 0: Test Loss: 154.477, RMSE: 12.429, MAE: 10.636, R2: -2.718, PCC: 0.313
Validation loss decreased (inf --> 129.838051). Saving model ...
Epoch 1: Train Loss: 122.711, RMSE: 11.077, MAE: 9.263, R2: -2.339, PCC: 0.004
Epoch 1: Validation Loss: 102.882, RMSE: 10.143, MAE: 8.359, R2: -2.082, PCC: 0.052
Epoch 1: Test Loss: 125.282, RMSE: 11.193, MAE: 9.193, R2: -2.015, PCC: 0.076
Validation loss decreased (129.838051 --> 102.881899). Saving model ...
Epoch 2: Train Loss: 63.890, RMSE: 7.993, MAE: 6.195, R2: -0.739, PCC: 0.151
Epoch 2: Validation Loss: 39.200, RMSE: 6.261, MAE: 4.772, R2: -0.174, PCC: 0.284
Epoch 2: Test Loss: 50.383, RMSE: 7.098, MAE: 5.310, R2: -0.213, PCC: 0.288
Validation loss decreased (102.881899 --> 39.200005). Saving model ...
Epoch 3: Train Loss: 32.227, RMSE: 5.677, MAE: 4.264, R2: 0.123, PCC: 0.

In [8]:
# Collect the best-epoch CSV of every fold and summarise each metric as
# "mean ± std" across folds.
result_base_dir = "/mnt/USR_DATA/ChenGeng/Project/SATCMF/Dataset/"
result_dir = os.path.join(result_base_dir, 'base_gnn_fold_result')

# Gather the frames first and concatenate once; concatenating inside the
# loop re-copies the accumulated frame on every iteration.
fold_frames = []
for folder_name in sorted(os.listdir(result_dir)):
    best_results_file = os.path.join(result_dir, folder_name, folder_name + "_GCN_best_result.csv")
    if os.path.isfile(best_results_file):
        fold_frames.append(pd.read_csv(best_results_file))

if not fold_frames:
    raise FileNotFoundError(f"No *_GCN_best_result.csv files found under {result_dir}")

all_data = pd.concat(fold_frames, ignore_index=True)

# Rows = metrics, columns = mean / std across folds.
stats = all_data.agg(['mean', 'std']).transpose()

stats['formatted'] = stats.apply(lambda x: f"{x['mean']:.3f} ± {x['std']:.3f}", axis=1)
print(stats)

# Keep only the formatted strings, laid out as a single row with one column per metric.
formatted_stats = stats[['formatted']]

formatted_stats = formatted_stats.transpose()

stats_output_path = os.path.join(result_base_dir, "GCN_aggregate_formatted_stats.csv")
formatted_stats.to_csv(stats_output_path)
print(f"Saved formatted aggregate statistics to {stats_output_path}")

                       mean        std         formatted
epoch            238.800000  93.623122  238.800 ± 93.623
best_val_loss      2.290877   0.722753     2.291 ± 0.723
best_val_rmse      1.496991   0.235453     1.497 ± 0.235
best_val_mae       0.909719   0.129326     0.910 ± 0.129
best_val_r2        0.934426   0.024239     0.934 ± 0.024
best_val_pcc       0.967145   0.012043     0.967 ± 0.012
best_train_loss    1.948803   0.373255     1.949 ± 0.373
best_train_rmse    1.390243   0.133448     1.390 ± 0.133
best_train_mae     0.980324   0.107285     0.980 ± 0.107
best_train_r2      0.947464   0.009830     0.947 ± 0.010
best_train_pcc     0.973399   0.005033     0.973 ± 0.005
best_test_loss     3.311863   1.157949     3.312 ± 1.158
best_test_rmse     1.793513   0.325189     1.794 ± 0.325
best_test_mae      1.017844   0.131783     1.018 ± 0.132
best_test_r2       0.910653   0.028520     0.911 ± 0.029
best_test_pcc      0.955677   0.014402     0.956 ± 0.014
Saved formatted aggregate stati

In [15]:
import pandas as pd

# Load the aggregate statistics table written by the previous step
file_path = '/mnt/USR_DATA/ChenGeng/Project/SATCMF/Dataset/GCN_aggregate_formatted_stats.csv'
df = pd.read_csv(file_path)

# Source columns to keep and the shorter names they receive in the output
metric_columns = [
    'best_train_rmse', 'best_train_mae', 'best_train_r2', 'best_train_pcc',
    'best_test_rmse', 'best_test_mae', 'best_test_r2', 'best_test_pcc'
]
new_columns = [
    'train_rmse', 'train_mae', 'train_r2', 'train_pcc',
    'test_rmse', 'test_mae', 'test_r2', 'test_pcc'
]
column_renames = dict(zip(metric_columns, new_columns))

# The label is the file-name prefix before the first underscore
label = file_path.rsplit("/", 1)[-1].partition("_")[0]

# Select and rename the metric columns, then put the label in front;
# every row carries the same label
new_df = df[metric_columns].rename(columns=column_renames)
new_df.insert(0, 'label', label)

# Write the trimmed table next to the original
new_file_path = '/mnt/USR_DATA/ChenGeng/Project/SATCMF/Dataset/GCN_aggregate_stats_cleaned.csv'
new_df.to_csv(new_file_path, index=False)
