In [1]:
import os
import re
import pandas as pd

In [2]:
# Directory that is scanned for per-fold GCN prediction files.
data_dir = "../../data/replication_gcn/final_model/benchmark-models"

# Filename pattern, written as adjacent raw-string fragments that Python joins
# into one pattern. Capture groups, in order:
#   1. task      (regression | classification)
#   2. fold      (digits)
#   3. sampling  (random | stratified)
#   4. lr token  (lr_rep | lr0001 | lr001)
#   5. dim       (digits)
#   6. split     (train | test)
# The seed is matched but not captured.
file_pattern = (
    r"gcn_(regression|classification)"
    r"_fold(\d+)"
    r"_(random|stratified)"
    r"_seed_\d+"
    r"_lr_(lr_rep|lr0001|lr001)"
    r"_dim_dim(\d+)"
    r"_predictions_(train|test)"
)

In [3]:
# Collect one DataFrame per matching prediction file, tag each with the
# metadata encoded in its filename, and stack them into `combined_df`.

# List to store individual DataFrames
dataframes = []

# Mapping for lr and dim to human-readable values
lr_map = {
    "lr_rep": "Learning rate as study",
    "lr0001": "Learning rate 0.001",
    "lr001": "Learning rate 0.01"
}

dim_map = {
    "dim50": "50 dimensions",
    "dim100": "100 dimensions"
}

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the combined frame (and of the CSVs written from it) reproducible.
for filename in sorted(os.listdir(data_dir)):
    match = re.match(file_pattern, filename)
    if not match:
        # Not a GCN prediction file; ignore it.
        continue

    # NOTE(review): `split` (train/test) is parsed but never stored, so rows
    # from train and test prediction files are indistinguishable in the
    # combined output. Confirm whether a 'split' column should be added.
    task, fold, sampling, lr, dim, split = match.groups()

    # Read the CSV file
    file_path = os.path.join(data_dir, filename)
    df = pd.read_csv(file_path)

    # Add new columns based on parsed filename values
    df['model'] = "GCN " + task
    # Unmapped tokens fall back to the raw value parsed from the filename.
    df['Learning rate'] = lr_map.get(lr, lr)
    df['Dimensions'] = dim_map.get(f"dim{dim}", dim)
    df['fold'] = int(fold)
    df['sampling'] = sampling

    dataframes.append(df)

# pd.concat() on an empty list raises an opaque "No objects to concatenate";
# fail with the same exception type but a message that points at the cause.
if not dataframes:
    raise ValueError(
        f"No prediction files matching the expected pattern were found in {data_dir!r}"
    )

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Single label that identifies the hyper-parameter configuration.
combined_df['feature'] = combined_df['Learning rate'] + " | " + combined_df['Dimensions']

# Keep only the reporting columns, in a fixed order.
combined_df = combined_df[['smiles', 'prediction_neglogld50', 'prediction_mgkg', 'prediction_epa',
                           'actual_neglogld50', 'actual_mgkg', 'actual_epa', 'model', 'feature',
                           'fold', 'sampling']]
combined_df.head()

Unnamed: 0,smiles,prediction_neglogld50,prediction_mgkg,prediction_epa,actual_neglogld50,actual_mgkg,actual_epa,model,feature,fold,sampling
0,CCOP(=S)(OCC)SCCC(F)=C(F)F,,,1,,,2,GCN classification,Learning rate 0.001 | 100 dimensions,0,random
1,CCN(CC)c1c([N+](=O)[O-])cc(C(F)(F)F)c(N)c1[N+]...,,,1,,,2,GCN classification,Learning rate 0.001 | 100 dimensions,0,random
2,CCCN(CCC)c1c([N+](=O)[O-])cc(C(F)(F)F)cc1[N+](...,,,2,,,3,GCN classification,Learning rate 0.001 | 100 dimensions,0,random
3,Cn1c2cc(C(F)(F)F)ccc2c2oc(C(=O)NCCCN3CCCCC3)cc21,,,2,,,2,GCN classification,Learning rate 0.001 | 100 dimensions,0,random
4,CS(=O)(=O)c1nnc(C(F)(F)F)s1,,,2,,,1,GCN classification,Learning rate 0.001 | 100 dimensions,0,random


In [4]:
# Persist every configuration's predictions; the positional index is not written.
combined_df.to_csv(
    path_or_buf='../../data/replication_gcn/final_model/gcn_predictions.csv',
    index=False,
)

In [5]:
# Restrict the report to the single configuration labelled below.
is_report_config = combined_df['feature'] == 'Learning rate 0.001 | 100 dimensions'
final_report = combined_df.loc[is_report_config]

In [7]:
# Persist the filtered report rows; the positional index is not written.
final_report.to_csv(
    path_or_buf='../../data/replication_gcn/final_model/gcn_predictions_folds.csv',
    index=False,
)