In [2]:
import pandas as pd
from snakemake.utils import Paramspace
from scripts.utils import create_tasks_df
from pprint import pprint
import numpy as np
import os
import yaml # type: ignore
import hashlib




In [3]:

# from https://github.com/HCA-integration/hca_integration_toolbox/blob/main/workflow/utils/misc.py#L129
def create_hash(string: str, digest_size: int = 5):
    """Return a short hexadecimal BLAKE2b digest of ``string``.

    Parameters
    ----------
    string : str
        Text to hash; it is encoded as UTF-8 before hashing.
    digest_size : int, default 5
        Digest length in bytes. The returned hex string has twice as many
        characters.

    Returns
    -------
    str
        Hex-encoded digest.
    """
    hasher = hashlib.blake2b(digest_size=digest_size)
    hasher.update(string.encode('utf-8'))
    return hasher.hexdigest()


def create_tasks_df(config, save=None):
    """Expand a pipeline config into one row per task/method/featsel/parameter set.

    The YAML file must contain a top-level ``TASKS`` mapping. Each task holds a
    ``methods`` mapping whose values are either

    * a string: path to a tab-separated parameter file, or
    * a mapping with optional keys ``params`` (path to such a file) and
      ``featsel`` (a feature-selection name or a list of names).

    Every other key of a task is copied into a column of the same name.

    Parameters
    ----------
    config : str or path-like
        Path to the YAML configuration file.
    save : str or path-like, optional
        If given, the resulting table is also written there as a
        tab-separated file without the index.

    Returns
    -------
    pandas.DataFrame
        Columns ``params`` (string form of one parameter record), ``hash``,
        ``method``, ``featsel`` and ``task``, plus one column per additional
        task-level key. Empty if no task produced any rows.

    Raises
    ------
    ValueError
        If a method entry is neither a string nor a mapping.
    """
    with open(config, "r") as stream:
        params = yaml.safe_load(stream)

    task_frames = []
    for task, task_dict in params['TASKS'].items():
        method_frames = []

        for method, method_data in task_dict['methods'].items():
            # A bare string is a parameter-file path; a mapping may carry both
            # a parameter file and feature-selection options.
            if isinstance(method_data, str):
                method_params = method_data
                featsel_list = None
            elif isinstance(method_data, dict):
                method_params = method_data.get('params')
                featsel_list = method_data.get('featsel')
            else:
                raise ValueError(f"Unexpected format for method_data: {method_data}")

            # Normalise featsel: a missing key or an explicit YAML null means
            # "no feature selection"; a single name must not be iterated
            # character by character.
            if featsel_list is None:
                featsel_list = [None]
            elif isinstance(featsel_list, str):
                featsel_list = [featsel_list]

            # Parameter sets are kept as strings so they can be hashed.
            if method_params:
                df_params = pd.read_csv(method_params, sep='\t', index_col=0)
                params_list = [str(row) for row in df_params.to_dict(orient='records')]
            else:
                # No parameter file: a single empty parameter set. It has to be
                # a string like the file-based records, otherwise building the
                # hash input below raises TypeError.
                params_list = [str({})]

            for featsel in featsel_list:
                featsel_suffix = featsel if featsel else "None"
                method_frames.append(pd.DataFrame({
                    'params': params_list,
                    # Plain concatenation (no separators) keeps hashes equal to
                    # those produced previously for all-string inputs.
                    'hash': [
                        create_hash(f"{row}{method}{task}{featsel_suffix}")
                        for row in params_list
                    ],
                    'method': [method] * len(params_list),
                    'featsel': [featsel] * len(params_list),
                }))

        # Combine all methods for the current task
        if method_frames:
            task_frame = pd.concat(method_frames, ignore_index=True)
            task_frame['task'] = task

            # Add task-level attributes (e.g., input_rna, input_metabolomics)
            for key in task_dict:
                if key != 'methods':
                    task_frame[key] = task_dict[key]

            task_frames.append(task_frame)

    # Combine all tasks
    if task_frames:
        tasks_df = pd.concat(task_frames, ignore_index=True)
    else:
        tasks_df = pd.DataFrame()

    # Save to file if required
    if save is not None:
        tasks_df.to_csv(save, sep='\t', index=False)

    return tasks_df



In [4]:
# Input config and generated task table. The default config location is a
# machine-specific absolute path; set the PIPELINE_CONFIG environment variable
# to point at a different file.
CONFIG_PATH = os.environ.get(
    'PIPELINE_CONFIG', '/home/icb/eirini.giannakoulia/pipeline/config.yaml'
)
TASKS_TSV = 'data/tasks.tsv'

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(TASKS_TSV), exist_ok=True)

tasks_df = create_tasks_df(CONFIG_PATH, save=TASKS_TSV)
# Read the table back from the file that was just written.
tasks_df = pd.read_csv(TASKS_TSV, sep='\t')
# Strip whitespace from all object (string) columns. Non-string cells (e.g. NaN
# for an empty field) are passed through unchanged instead of being lost.
for col in tasks_df.select_dtypes(include=['object']).columns:
    tasks_df[col] = tasks_df[col].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

# Extract unique task details
hashes = tasks_df['hash'].unique()
methods = tasks_df['method'].unique()
tasks = tasks_df['task'].unique()

for _, row in tasks_df.iterrows():
    # Rows without feature selection come back from the TSV as NaN, which has
    # no .strip(); fall back to the "None" placeholder that create_tasks_df
    # uses in the hash input.
    # NOTE(review): confirm "None" is the intended directory name for such rows.
    featsel = row['featsel'] if pd.notna(row['featsel']) else 'None'
    path = f"dataset/processed/{str(row['task']).strip()}/{str(featsel).strip()}/rna_dataset_train.h5ad"
    print(repr(path))


'dataset/processed/lipids/hvg/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg_svd/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg_svd/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg_svd_graph/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg_svd_graph/rna_dataset_train.h5ad'
'dataset/processed/lipids/svd/rna_dataset_train.h5ad'
'dataset/processed/lipids/svd/rna_dataset_train.h5ad'
'dataset/processed/lipids/svd_graph/rna_dataset_train.h5ad'
'dataset/processed/lipids/svd_graph/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg_svd/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg_svd/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg_svd_graph/rna_dataset_train.h5ad'
'dataset/processed/lipids/hvg_svd_graph/rna_dataset_train.h5ad'
'dataset/processed/lipids/svd/rna_dataset_train.h5ad'
'dataset/proce