In [5]:
import pandas as pd
import numpy as np
from os.path import join

# Read the two input tables (sociocultural measures and RSFC values) from CSV
sociocult_df = pd.read_csv("derivatives/none-reduced/sociocult.csv")
rsfc_df = pd.read_csv("derivatives/none-reduced/rsfc.csv")

# Report the (rows, columns) shape of each table in a single call
print(
    "Loaded datasets:",
    f"Sociocultural data shape: {sociocult_df.shape}",
    f"RSFC data shape: {rsfc_df.shape}",
    sep="\n",
)

# Echo the raw column names so the renaming done later can be checked by eye
print("\nSociocultural columns:")
print(sociocult_df.columns.to_list())

print("\nRSFC columns:")
print(rsfc_df.columns.to_list())

Loaded datasets:
Sociocultural data shape: (236, 20)
RSFC data shape: (236, 66)

Sociocultural columns:
['macv_p_ss_fs', 'macv_p_ss_fo', 'macv_p_ss_fr', 'macv_y_ss_fs', 'macv_y_ss_fo', 'macv_y_ss_fr', 'reshist_addr1_coi_r_coi_nat', 'meim_p_ss_exp', 'meim_p_ss_com', 'reshist_addr1_nanda_disadv_fac', 'meim_ss_exp', 'meim_ss_com', 'reshist_addr1_gstat_h_queen', 'dim_y_ss_mean', 'via_p_ss_hc', 'via_p_ss_amer', 'via_ss_hc', 'via_ss_amer', 'comc_ss_cohesion_p', 'comc_ss_control_p']

RSFC columns:
['rsfmri_c_ngd_cgc_ngd_cgc', 'rsfmri_c_ngd_cgc_ngd_ca', 'rsfmri_c_ngd_cgc_ngd_dt', 'rsfmri_c_ngd_cgc_ngd_dla', 'rsfmri_c_ngd_cgc_ngd_fo', 'rsfmri_c_ngd_cgc_ngd_rspltp', 'rsfmri_c_ngd_cgc_ngd_sa', 'rsfmri_c_ngd_cgc_ngd_smh', 'rsfmri_c_ngd_cgc_ngd_smm', 'rsfmri_c_ngd_cgc_ngd_vta', 'rsfmri_c_ngd_cgc_ngd_vs', 'rsfmri_c_ngd_ca_ngd_ca', 'rsfmri_c_ngd_ca_ngd_dt', 'rsfmri_c_ngd_ca_ngd_dla', 'rsfmri_c_ngd_ca_ngd_fo', 'rsfmri_c_ngd_ca_ngd_rspltp', 'rsfmri_c_ngd_ca_ngd_sa', 'rsfmri_c_ngd_ca_ngd_smh', 'rsfmri_c

In [6]:
# Raw sociocultural column name -> short display label.
# Insertion order matters only for the listing printed below.
socio_column_mapping = {
    # raw names without `_p_` receive the "-Y" suffix
    'meim_ss_exp': 'EIE-Y',
    'meim_ss_com': 'EIC-Y',
    'via_ss_hc': 'HA-Y',
    'via_ss_amer': 'USA-Y',
    'macv_y_ss_fs': 'FS-Y',
    'macv_y_ss_fo': 'FO-Y',
    'macv_y_ss_fr': 'FR-Y',
    # the matching `_p_` variants receive the "-C" suffix
    'meim_p_ss_exp': 'EIE-C',
    'meim_p_ss_com': 'EIC-C',
    'via_p_ss_hc': 'HA-C',
    'via_p_ss_amer': 'USA-C',
    'macv_p_ss_fs': 'FS-C',
    'macv_p_ss_fo': 'FO-C',
    'macv_p_ss_fr': 'FR-C',
    # remaining measures get an unsuffixed label
    'dim_y_ss_mean': 'PD',
    'comc_ss_cohesion_p': 'CCoh',
    'comc_ss_control_p': 'CCon',
    'reshist_addr1_coi_r_coi_nat': 'COI',
    'reshist_addr1_nanda_disadv_fac': 'NDI',
    'reshist_addr1_gstat_h_queen': 'GGS',
}

# Relabel the columns; names absent from the frame are ignored by rename()
sociocult_df = sociocult_df.rename(socio_column_mapping, axis="columns")

# Show every old -> new pair, one per line
print("Renamed sociocultural columns:", "Old column name -> New column name", sep="\n")
print(
    *(f"{old_name} -> {new_name}" for old_name, new_name in socio_column_mapping.items()),
    sep="\n",
)

# Show the resulting header in its on-disk column order
print(f"\nUpdated sociocultural columns:")
print(sociocult_df.columns.to_list())

Renamed sociocultural columns:
Old column name -> New column name
meim_ss_exp -> EIE-Y
meim_ss_com -> EIC-Y
via_ss_hc -> HA-Y
via_ss_amer -> USA-Y
macv_y_ss_fs -> FS-Y
macv_y_ss_fo -> FO-Y
macv_y_ss_fr -> FR-Y
meim_p_ss_exp -> EIE-C
meim_p_ss_com -> EIC-C
via_p_ss_hc -> HA-C
via_p_ss_amer -> USA-C
macv_p_ss_fs -> FS-C
macv_p_ss_fo -> FO-C
macv_p_ss_fr -> FR-C
dim_y_ss_mean -> PD
comc_ss_cohesion_p -> CCoh
comc_ss_control_p -> CCon
reshist_addr1_coi_r_coi_nat -> COI
reshist_addr1_nanda_disadv_fac -> NDI
reshist_addr1_gstat_h_queen -> GGS

Updated sociocultural columns:
['FS-C', 'FO-C', 'FR-C', 'FS-Y', 'FO-Y', 'FR-Y', 'COI', 'EIE-C', 'EIC-C', 'NDI', 'EIE-Y', 'EIC-Y', 'GGS', 'PD', 'HA-C', 'USA-C', 'HA-Y', 'USA-Y', 'CCoh', 'CCon']


In [7]:
# Rename RSFC columns with network abbreviations
# Keys are the raw network codes, written with the leading underscore they
# carry inside the raw column names; values are the display abbreviations.
network_mapping = {
    '_ad': 'AN',
    '_cgc': 'CON',
    '_ca': 'CPN',
    '_dt': 'DN',
    '_dla': 'DAN',
    '_fo': 'FPN',
    '_rspltp': 'RTN',
    '_smh': 'SHN',
    '_smm': 'SMN',
    '_sa': 'SN',
    '_vta': 'VAN',
    '_vs': 'VN'
}

def rename_rsfc_column(col_name):
    """
    Convert RSFC column names from format like 'rsfmri_c_ngd_cgc_ngd_cgc' to 'CON-CON'.

    The expected layout is 'rsfmri_c_ngd_<code1>_ngd_<code2>', where both codes
    are keys of ``network_mapping`` (without their leading underscore).

    Parameters
    ----------
    col_name : str
        Column name to translate.

    Returns
    -------
    str
        '<ABBREV1>-<ABBREV2>' when the name has the expected layout and both
        codes are known; otherwise ``col_name`` unchanged.
    """
    prefix = 'rsfmri_c_ngd_'
    if not col_name.startswith(prefix):
        return col_name  # Return unchanged if not an RSFC column

    # Slice off only the leading prefix (str.replace would also delete any
    # later occurrence), then split on the delimiter so each code is matched
    # as a whole token rather than by prefix.
    codes = col_name[len(prefix):].split('_ngd_')
    if len(codes) != 2:
        return col_name

    # network_mapping keys carry a leading underscore; add it back for lookup
    labels = [network_mapping.get('_' + code) for code in codes]
    if None in labels:
        return col_name  # at least one code is not a known network

    return '-'.join(labels)

# Pair each RSFC column with its translated name, keeping only real changes
rsfc_column_mapping = {
    col: new_name
    for col, new_name in zip(rsfc_df.columns, map(rename_rsfc_column, rsfc_df.columns))
    if new_name != col
}

# Relabel the RSFC columns using the mapping built above
rsfc_df = rsfc_df.rename(rsfc_column_mapping, axis="columns")

# Preview the first 10 translations as a sanity check
print("RSFC column renaming examples:")
for old_name in list(rsfc_column_mapping)[:10]:
    new_name = rsfc_column_mapping[old_name]
    print(f"{old_name} -> {new_name}")

# Summarise how many columns changed and show the final header
print(
    f"\nTotal RSFC columns renamed: {len(rsfc_column_mapping)}",
    f"Updated RSFC columns:",
    sep="\n",
)
print(rsfc_df.columns.to_list())

RSFC column renaming examples:
rsfmri_c_ngd_cgc_ngd_cgc -> CON-CON
rsfmri_c_ngd_cgc_ngd_ca -> CON-CPN
rsfmri_c_ngd_cgc_ngd_dt -> CON-DN
rsfmri_c_ngd_cgc_ngd_dla -> CON-DAN
rsfmri_c_ngd_cgc_ngd_fo -> CON-FPN
rsfmri_c_ngd_cgc_ngd_rspltp -> CON-RTN
rsfmri_c_ngd_cgc_ngd_sa -> CON-SN
rsfmri_c_ngd_cgc_ngd_smh -> CON-SHN
rsfmri_c_ngd_cgc_ngd_smm -> CON-SMN
rsfmri_c_ngd_cgc_ngd_vta -> CON-VAN

Total RSFC columns renamed: 66
Updated RSFC columns:
['CON-CON', 'CON-CPN', 'CON-DN', 'CON-DAN', 'CON-FPN', 'CON-RTN', 'CON-SN', 'CON-SHN', 'CON-SMN', 'CON-VAN', 'CON-VN', 'CPN-CPN', 'CPN-DN', 'CPN-DAN', 'CPN-FPN', 'CPN-RTN', 'CPN-SN', 'CPN-SHN', 'CPN-SMN', 'CPN-VAN', 'CPN-VN', 'DN-DN', 'DN-DAN', 'DN-FPN', 'DN-RTN', 'DN-SN', 'DN-SHN', 'DN-SMN', 'DN-VAN', 'DN-VN', 'DAN-DAN', 'DAN-FPN', 'DAN-RTN', 'DAN-SN', 'DAN-SHN', 'DAN-SMN', 'DAN-VAN', 'DAN-VN', 'FPN-FPN', 'FPN-RTN', 'FPN-SN', 'FPN-SHN', 'FPN-SMN', 'FPN-VAN', 'FPN-VN', 'RTN-RTN', 'RTN-SN', 'RTN-SHN', 'RTN-SMN', 'RTN-VAN', 'RTN-VN', 'SN-SN', 'SN-SHN', '

In [8]:
# Write both relabelled tables to new CSV files, omitting the row index
sociocult_df.to_csv(path_or_buf="derivatives/none-reduced/clean-socio.csv", index=False)
rsfc_df.to_csv(path_or_buf="derivatives/none-reduced/clean-rsfc.csv", index=False)

# Confirm what was written, with the shape of each table
print(
    "Saved cleaned datasets:",
    f"✓ clean-socio.csv - Shape: {sociocult_df.shape}",
    f"✓ clean-rsfc.csv - Shape: {rsfc_df.shape}",
    sep="\n",
)

print("\nFiles saved to derivatives/none-reduced/ directory:")

Saved cleaned datasets:
✓ clean-socio.csv - Shape: (236, 20)
✓ clean-rsfc.csv - Shape: (236, 66)

Files saved to derivatives/none-reduced/ directory:
