In [23]:
import pandas as pd
import numpy as np
import re
import glob
import os

In [24]:
%cd ../spreading_dynamics_clinical/

/home/gabridele/Desktop/irbio_folder/spreading_dynamics_clinical


In [25]:
def extract_subject_id(file_path):
    """Return the first ``sub-<digits>`` token found in ``file_path``.

    Args:
        file_path: Path string to search.

    Returns:
        The matched identifier (for example ``"sub-10159"``), or ``None`` when
        the string contains no ``sub-`` followed by at least one digit.
    """
    found = re.search(r'sub-\d+', file_path)
    return found.group(0) if found else None

In [26]:
# Function to load union indices for a specific subject from Excel file
def load_union_indices(subject, union_file):
    """Return the union indices listed for ``subject`` in an Excel table.

    The table is read with ``pd.read_excel`` and must contain a ``subject``
    column and a ``union_indices`` column. Only the first row matching
    ``subject`` is used.

    Args:
        subject: Value to look up in the ``subject`` column.
        union_file: Path (or anything ``pd.read_excel`` accepts) of the table.

    Returns:
        A list of ``int``. Empty when the cell is blank/NaN or an empty string.

    Raises:
        ValueError: If ``subject`` is not in the table, if a string cell holds
            a token that is not an integer, or if a numeric cell holds a
            non-integer value.
    """
    df = pd.read_excel(union_file)
    row = df.loc[df['subject'] == subject]
    if row.empty:
        raise ValueError(f"Subject {subject} not found in {union_file}.")

    union_indices = row['union_indices'].values[0]

    if isinstance(union_indices, str):
        # Split on the bare comma: int() ignores surrounding whitespace, so both
        # "1, 2" and "1,2" parse, and blank tokens (empty cell text, trailing
        # comma) are skipped instead of crashing int().
        return [int(token) for token in union_indices.split(',') if token.strip()]

    if pd.isna(union_indices):
        return []

    # A cell that holds a single index is read back as a number rather than a
    # string, so it has no .split(); treat it as a one-element list.
    as_int = int(union_indices)
    if float(union_indices) != as_int:
        raise ValueError(
            f"Non-integer union index {union_indices!r} for subject {subject} in {union_file}."
        )
    return [as_int]

In [27]:
# 1. Remove rows and columns from a symmetrical matrix in a CSV file
def process_sc_matrix(subject, csv_file, union_file):
    """Load a headerless CSV matrix and drop the subject's union indices.

    The indices returned by ``load_union_indices`` are removed from both the
    rows and the columns, so a square input stays square.

    Args:
        subject: Subject identifier looked up in ``union_file``.
        csv_file: Path of the CSV matrix (read with ``header=None``).
        union_file: Excel table passed through to ``load_union_indices``.

    Returns:
        A ``pandas.DataFrame`` without the listed rows and columns. The
        remaining row/column labels keep their original integer values.
    """
    excluded = load_union_indices(subject, union_file)
    full_matrix = pd.read_csv(csv_file, header=None)
    # Non-mutating drop: same result as the in-place form, without side effects.
    return full_matrix.drop(index=excluded, columns=excluded)


In [28]:

# 2. Remove rows and columns from a symmetrical matrix in a numpy array
def process_fc_matrix(subject, npy_file, union_file):
    """Load a ``.npy`` matrix and drop the subject's union indices.

    The array is wrapped in a ``DataFrame`` so the same indices can be removed
    from rows and columns by label, then converted back to NumPy.

    Args:
        subject: Subject identifier looked up in ``union_file``.
        npy_file: Path of the array, read with ``np.load``.
        union_file: Excel table passed through to ``load_union_indices``.

    Returns:
        A ``numpy.ndarray`` without the listed rows and columns.
    """
    excluded = load_union_indices(subject, union_file)
    frame = pd.DataFrame(np.load(npy_file))
    return frame.drop(index=excluded, columns=excluded).to_numpy()


In [29]:
# 3. Remove rows from a numpy array (454x1)
def process_task_matrix(subject, npy_file, union_file):
    """Load a ``.npy`` array and delete the subject's union indices along axis 0.

    Args:
        subject: Subject identifier looked up in ``union_file``.
        npy_file: Path of the array, read with ``np.load``.
        union_file: Excel table passed through to ``load_union_indices``.

    Returns:
        A new ``numpy.ndarray`` with the listed rows removed; other axes are
        left untouched.
    """
    excluded_rows = load_union_indices(subject, union_file)
    return np.delete(np.load(npy_file), excluded_rows, axis=0)

In [30]:
# SC connectome files; the processing loops iterate over this list.
# glob.glob returns matches in arbitrary, filesystem-dependent order, so the
# result is sorted to keep the processing order and printed log reproducible.
file_paths_sc = sorted(
    glob.glob("derivatives/**/dwi/sub*_Schaefer2018_400Parcels_Tian_Subcortex_S4_1mm_5000000mio_connectome.csv", recursive=True)
)
# Patterns for the other inputs, kept for reference only: the loops build these
# paths from the subject id instead of globbing for them.
#file_paths_aw = glob.glob("derivatives/**/dwi/restored_full_association_matrix_sub-*_2seeds.csv", recursive=True)
#file_paths_fc = glob.glob("derivatives/**/func/sub-*_rs_correlation_matrix.npy", recursive=True)
#file_paths_task = glob.glob("preproc_dl/**/scap.feat/sub-*_mean_cope_resampled_ts_1vol.npy", recursive=True)

In [31]:
# Sanity check: report how many SC connectome files the glob matched.
print(f"Found {len(file_paths_sc)} SC files")

Found 202 SC files


In [None]:
# Filter every modality for each subject that has an SC connectome.
# SC is always processed; the 2-seed association matrix, the resting-state FC
# matrix and the task array are processed only when their input file exists.
union_file = "nan_indices_with_union.xlsx"

for file_path_sc in file_paths_sc:
    subject_id = extract_subject_id(file_path_sc)
    print('now processing:', subject_id)

    # Output folders are derived from the subject id, not from the globbed path.
    dwi_dir = f"derivatives/{subject_id}/dwi"
    func_dir = f"derivatives/{subject_id}/func"

    # Structural connectivity (CSV)
    processed_sc_matrix = process_sc_matrix(subject_id, file_path_sc, union_file)
    print('SC shape:', processed_sc_matrix.shape)
    # The SC file was found through a recursive glob, so make sure the
    # subject-level output folder exists before writing into it.
    os.makedirs(dwi_dir, exist_ok=True)
    processed_sc_matrix.to_csv(f"{dwi_dir}/processed_{subject_id}_Schaefer2018_400Parcels_Tian_Subcortex_S4_1mm_5000000mio_connectome.csv", index=False, header=False)

    # Association matrix, 2 seeds (CSV, same handling as SC)
    file_path_aw = f'{dwi_dir}/restored_full_association_matrix_{subject_id}_2seeds.csv'
    if os.path.exists(file_path_aw):
        processed_aw_matrix = process_sc_matrix(subject_id, file_path_aw, union_file)
        print('AW shape:', processed_aw_matrix.shape)
        processed_aw_matrix.to_csv(f"{dwi_dir}/processed_association_matrix_{subject_id}_2seeds.csv", index=False, header=False)

    # Resting-state functional connectivity (.npy)
    file_path_fc = f"{func_dir}/{subject_id}_rs_correlation_matrix.npy"
    if os.path.exists(file_path_fc):
        processed_FC_matrix = process_fc_matrix(subject_id, file_path_fc, union_file)
        print('FC shape:', processed_FC_matrix.shape)
        np.save(f"{func_dir}/processed_functional_connectivity_{subject_id}.npy", processed_FC_matrix)

    # Task array (.npy): read from preproc_dl/, written under derivatives/
    file_path_task = f"preproc_dl/{subject_id}/scap.feat/{subject_id}_mean_cope_resampled_ts_1vol.npy"
    if os.path.exists(file_path_task):
        processed_task_matrix = process_task_matrix(subject_id, file_path_task, union_file)
        print('task shape:', processed_task_matrix.shape)
        # The input lives in a different tree, so the func/ output folder is
        # not guaranteed to exist; np.save would fail with FileNotFoundError.
        os.makedirs(func_dir, exist_ok=True)
        np.save(f"{func_dir}/processed_{subject_id}_mean_cope_resampled_ts_1vol.npy", processed_task_matrix)

In [None]:
# Repeat the association-matrix filtering for the 40-seed matrices.
# Deliberately kept as its own pass (not merged into the previous loop) so the
# printed output of each run stays easy to follow. SC itself is not reprocessed here.

union_file = "nan_indices_with_union.xlsx"

for file_path_sc in file_paths_sc:
    subject_id = extract_subject_id(file_path_sc)
    print('now processing:', subject_id)

    file_path_aw = f'derivatives/{subject_id}/dwi/restored_full_association_matrix_{subject_id}_40seeds.csv'
    if not os.path.exists(file_path_aw):
        # No 40-seed matrix for this subject: nothing to write.
        continue

    processed_aw_matrix = process_sc_matrix(subject_id, file_path_aw, union_file)
    print('AW shape:', processed_aw_matrix.shape)
    processed_aw_matrix.to_csv(
        f"derivatives/{subject_id}/dwi/processed_association_matrix_{subject_id}_40seeds.csv",
        index=False,
        header=False,
    )