In [1]:
import pandas as pd
import yaml
import hashlib
import os
import numpy as np

In [2]:
# Switch into the pipeline checkout; later cells read the working directory
# back with os.getcwd() and build the config path from it.
PIPELINE_DIR = "/lustre/groups/ml01/workspace/eirini.giannakoulia/no_fs_pipeline/"
os.chdir(PIPELINE_DIR)

In [3]:
def create_hash(string: str, digest_size: int = 5):
    """Return a short hexadecimal BLAKE2b digest of ``string``.

    The text is UTF-8 encoded before hashing. ``digest_size`` is the digest
    length in bytes, so the returned hex string has twice that many
    characters (10 with the default of 5).
    """
    hasher = hashlib.blake2b(digest_size=digest_size)
    hasher.update(string.encode('utf-8'))
    return hasher.hexdigest()

def create_tasks_df(config, save=None):
    """Expand the ``TASKS`` section of a YAML config into one row per run.

    For every task and every method listed under that task's ``methods`` key,
    the method's parameter table (a tab-separated file whose first column is
    used as the index) is read and each of its rows becomes one output row.
    A method without a parameter file contributes a single row with the empty
    parameter set ``'{}'``.

    Parameters
    ----------
    config : path of the YAML file to read; it must contain a ``TASKS`` mapping.
    save : optional path; when not ``None`` the resulting table is written
        there as a tab-separated file without the index.

    Returns
    -------
    pandas.DataFrame
        Columns ``params`` (stringified parameter dict), ``hash`` (digest of
        params + method + task), ``method``, ``task`` and one column per
        additional task-level key. Empty when no task produced any rows.

    Raises
    ------
    ValueError
        If a method entry is neither a string nor a mapping.
    """
    with open(config, "r") as stream:
        params = yaml.safe_load(stream)

    task_frames = []
    for task, task_dict in params['TASKS'].items():
        method_frames = []

        for method, method_data in task_dict['methods'].items():
            # A method is declared either as a bare path to its parameter
            # table or as a mapping holding that path under 'params'.
            if isinstance(method_data, str):
                method_params = method_data
            elif isinstance(method_data, dict):
                method_params = method_data.get('params')
            else:
                raise ValueError(f"Unexpected format for method_data: {method_data}")

            if method_params:
                df_params = pd.read_csv(method_params, sep='\t', index_col=0)
                params_list = [str(row) for row in df_params.to_dict(orient='records')]
            else:
                # No parameter table: use one empty parameter set. It has to be
                # a string like every other entry, because it is concatenated
                # with the method and task names to form the hash input below
                # (a bare dict here raised TypeError).
                params_list = [str({})]

            # One row per parameter set (feature selection removed).
            method_frames.append(pd.DataFrame({
                'params': params_list,
                'hash': [create_hash(row + method + task) for row in params_list],
                'method': [method] * len(params_list),
            }))

        if method_frames:
            task_frame = pd.concat(method_frames, ignore_index=True)
            task_frame['task'] = task

            # Copy every other task-level attribute onto each row of the task.
            for key in task_dict:
                if key != 'methods':
                    task_frame[key] = task_dict[key]

            task_frames.append(task_frame)

    if task_frames:
        tasks_df = pd.concat(task_frames, ignore_index=True)
    else:
        tasks_df = pd.DataFrame()

    if save is not None:
        tasks_df.to_csv(save, sep='\t', index=False)

    return tasks_df


In [4]:
# Capture the current working directory; a later cell builds the config.yaml
# path from it.
mypath = os.getcwd()

In [5]:
# Show the captured working directory.
# NOTE(review): the recorded output starts with /ictstr01 while os.chdir was
# given a /lustre path — presumably a symlinked mount resolved by os.getcwd();
# confirm.
mypath 

'/ictstr01/groups/ml01/workspace/eirini.giannakoulia/no_fs_pipeline'

In [8]:
# Build the task table from the config.yaml found in the working directory
# captured in `mypath`.
tasks_df = create_tasks_df(f"{mypath}/config.yaml")
# Disabled alternative: load a previously saved task table instead of rebuilding it.
# tasks_df = pd.read_csv('data/tasks.tsv', sep='\t')

# Disabled: collect the distinct hashes, methods and tasks present in the table.
# hashes = tasks_df['hash'].unique()
# methods = tasks_df['method'].unique()
# tasks = tasks_df['task'].unique()


In [9]:
# Display the generated task table.
tasks_df

Unnamed: 0,params,hash,method,task,input_rna,input_metabolomics,split
0,{'alpha': 1.0},64c49ba4e2,ridge,vitatrack,/lustre/groups/ml01/workspace/anastasia.litine...,/lustre/groups/ml01/workspace/anastasia.litine...,split
1,{'alpha': 0.1},03053c6a2c,ridge,vitatrack,/lustre/groups/ml01/workspace/anastasia.litine...,/lustre/groups/ml01/workspace/anastasia.litine...,split


In [6]:
# Previously saved task table, used below as the reference for comparison.
PREV_TASKS_TSV = '/ictstr01/home/icb/eirini.giannakoulia/data_copy/tasks.tsv'
prev = pd.read_csv(PREV_TASKS_TSV, sep='\t')

In [8]:
# Whole-table equality check. DataFrame.equals treats NaNs in the same position
# as equal and returns False on a shape or label mismatch, whereas
# `np.all(tasks_df == prev)` reports False for any NaN cell and raises
# ValueError when the two frames are not identically labelled.
tasks_df.equals(prev)

np.False_

In [9]:
# (row positions, column positions) of every cell that differs between the
# regenerated table and the saved one.
cell_mismatch = np.asarray(tasks_df != prev)
diff_indices = cell_mismatch.nonzero()
print(diff_indices)


(array([160, 160, 160, 161, 161, 161, 162, 162, 162, 162, 163, 163, 163,
       163, 164, 164, 164, 164, 165, 165, 165, 165, 166, 166, 166, 166,
       167, 167, 167, 167, 168, 168, 168, 168, 169, 169, 169, 169, 170,
       170, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175,
       175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179,
       179, 180, 180, 180, 181, 181, 181, 182, 182, 183, 183, 184, 184,
       184, 185, 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188,
       189, 189, 189, 190, 190, 190, 191, 191, 191, 192, 192, 192, 193,
       193, 193, 194, 194, 195, 195, 196, 196, 196, 197, 197, 197, 198,
       198, 198, 199, 199, 199, 200, 200, 200, 201, 201, 201, 202, 202,
       202, 203, 203, 203, 204, 204, 204, 205, 205, 205, 206, 206, 207,
       207, 208, 208, 208, 209, 209, 209, 210, 210, 210, 211, 211, 211,
       212, 212, 212, 213, 213, 213, 214, 214, 214, 215, 215, 215, 216,
       216, 216, 217, 217, 217, 218, 218, 219, 219, 220, 220, 2

In [10]:
# Flag every row in which at least one column disagrees between the two tables.
cellwise_mismatch = tasks_df != prev
diff_mask = cellwise_mismatch.any(axis="columns")

# Index labels of the flagged rows.
diff_row_indices = tasks_df.index[diff_mask]
print("Rows with differences:", diff_row_indices)


Rows with differences: Index([160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       ...
       355, 356, 357, 358, 359, 360, 361, 362, 363, 364],
      dtype='int64', length=140)


In [11]:
# Display the index labels of the rows that differ between tasks_df and prev.
diff_row_indices

Index([160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       ...
       355, 356, 357, 358, 359, 360, 361, 362, 363, 364],
      dtype='int64', length=140)

In [None]:
# Compare only the columns the two tables have in common. The original cell,
# `np.all(prev[] == tasks_df)`, had an empty subscript and was a SyntaxError.
# NOTE(review): restricting to the shared columns is a guess at the intended
# selection — confirm.
# DataFrame.equals returns False (instead of raising) when the shapes differ.
shared_columns = tasks_df.columns.intersection(prev.columns)
prev[shared_columns].equals(tasks_df[shared_columns])

np.False_

In [21]:
# First three differing rows (positions 160-162) as stored in the saved table.
prev.iloc[160:163]

Unnamed: 0,params,hash,method,featsel,task,input_rna,input_metabolomics,split
160,"{'alpha': 50, 'lambda': 100, 'max_depth': 3, '...",9acdf9a0cc,xgboost,hvg,lipids,/lustre/groups/ml01/workspace/anastasia.litine...,/lustre/groups/ml01/workspace/anastasia.litine...,half_split
161,"{'alpha': 10, 'lambda': 50, 'max_depth': 5, 'l...",c7b850df11,xgboost,hvg,lipids,/lustre/groups/ml01/workspace/anastasia.litine...,/lustre/groups/ml01/workspace/anastasia.litine...,half_split
162,"{'alpha': 50, 'lambda': 100, 'max_depth': 3, '...",83dd4f22b5,xgboost,hvg_svd,lipids,/lustre/groups/ml01/workspace/anastasia.litine...,/lustre/groups/ml01/workspace/anastasia.litine...,half_split


In [22]:
# The same positions (160-162) in the regenerated table, for side-by-side comparison.
tasks_df.iloc[160:163]

Unnamed: 0,params,hash,method,featsel,task,input_rna,input_metabolomics,split
160,"{'alpha': 0.001, 'l1_ratio': 0.1}",5c064c1399,elastic_net,hvg,lipids,/lustre/groups/ml01/workspace/anastasia.litine...,/lustre/groups/ml01/workspace/anastasia.litine...,half_split
161,"{'alpha': 0.001, 'l1_ratio': 0.5}",aa9882b566,elastic_net,hvg,lipids,/lustre/groups/ml01/workspace/anastasia.litine...,/lustre/groups/ml01/workspace/anastasia.litine...,half_split
162,"{'alpha': 0.001, 'l1_ratio': 0.9}",4e69bd9dfd,elastic_net,hvg,lipids,/lustre/groups/ml01/workspace/anastasia.litine...,/lustre/groups/ml01/workspace/anastasia.litine...,half_split
