In [None]:
import os
import os.path as op
from collections import OrderedDict
import pandas as pd
import numpy as np
import shutil
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Root locations: raw tables are read from data_dir, every output is written under deriv_dir
data_dir = "./dset"
deriv_dir = "./derivatives"

# One output sub-folder per processing stage
demo_dir = op.join(deriv_dir, "demo")
rsfc_dir = op.join(deriv_dir, "rsfc")
hisp_dir = op.join(deriv_dir, "hispanic")

# Create the whole tree up front; exist_ok lets the cell be re-run safely
for _out_dir in (deriv_dir, demo_dir, rsfc_dir, hisp_dir):
    os.makedirs(_out_dir, exist_ok=True)

### Transfer the Data
#### Select certain ABCD variables from CSVs to be copied

In [None]:
# Variables that don't need to be controlled for age: they are copied for
# every event. Layout is {category: {csv name without extension: [columns]}}.
# A "*" in a column name is a wildcard that the extraction loop expands
# against the CSV header.
_longitudinal_vars = ["interview_age", "rel_family_id", "site_id_l"]

_parent_demo_vars = [
    "demo_sex_v2",
    "demo_ethn_v2",
    "demo_race_a_*",
    "demo_prnt_age_v2",
    "demo_prnt_gender_id_v2",
    "demo_prnt_ethn_v2",
    "demo_prnt_race_a_*",
    "demo_prnt_ed_v2_2yr_l",
    "demo_prtnr_ed_v2_2yr_l",
    "demo_comb_income_v2",
    "demo_prnt_income_v2_l",
    "demo_origin_v2",
    "demo_biomother_v2",
    "demo_biofather_v2",
    "demo_matgrandm_v2",
    "demo_matgrandf_v2",
    "demo_patgrandm_v2",
    "demo_patgrandf_v2",
]

file_lists = {
    "abcd": {
        "abcd_y_lt": _longitudinal_vars,
        "abcd_p_demo": _parent_demo_vars,
    },
    "led_l": {
        "led_l_coi": ["reshist_addr1_coi_r_coi_nat"],
        "led_l_nbhsoc": ["reshist_addr1_nanda_disadv_fac"],
        "led_l_gi": ["reshist_addr1_gstat_h_queen"],
    },
    "ph_y": {
        "ph_y_anthro": ["anthroheightcalc", "anthroweightcalc"],
    },
}


def find_files(directory):
    """Yield the path of every file under ``directory``, descending into sub-folders.

    Paths are built by joining each folder reported by ``os.walk`` with the
    file names it contains; directories themselves are not yielded.
    """
    for parent, _subdirs, names in os.walk(directory):
        yield from (os.path.join(parent, name) for name in names)


# Copy the requested columns of every listed table from data_dir into
# demo_dir as "<table>_subset.csv". All events (rows) are kept here.
for category, files_and_vars in file_lists.items():
    for filename, vars_to_extract in files_and_vars.items():
        target_name = filename + ".csv"

        # Search for the file in all subdirectories of data_dir
        found_file = False
        for file_path in find_files(data_dir):
            # Compare the exact file name: a substring test on the whole path
            # would also accept look-alikes such as "<table>.csv.bak" or a
            # longer file name that merely ends with the target.
            if os.path.basename(file_path) != target_name:
                continue

            df = pd.read_csv(file_path)

            # Insertion-ordered, duplicate-free set of columns; the subject ID
            # and event columns always come first.
            columns_to_keep = OrderedDict.fromkeys(
                ["src_subject_id", "eventname"], True
            )

            for col in vars_to_extract:
                if "*" in col:
                    # Expand the wildcard against the CSV header. The pattern
                    # is anchored so it cannot match in the middle of an
                    # unrelated column name (DataFrame.filter uses re.search).
                    regex_pattern = "^" + col.replace("*", ".*") + "$"
                    for matched_col in df.filter(regex=regex_pattern).columns:
                        columns_to_keep[matched_col] = True
                else:
                    columns_to_keep[col] = True

            # Subset in the collected order and write next to the other subsets
            df_subset = df[list(columns_to_keep)]
            output_filename = f"{filename}_subset.csv"
            output_path = os.path.join(demo_dir, output_filename)
            df_subset.to_csv(output_path, index=False)

            print(f"Saved {output_filename} to {demo_dir}")

            # Only the first file with this name is used
            found_file = True
            break

        if not found_file:
            print(f"File {filename}.csv not found in {data_dir}")

#### Specify certain years to be selected

In [None]:
# Variables that need to be controlled for specific years: each table is
# restricted to one assessment wave (eventname) by the loop that follows.
# Layout is {category: {csv name without extension: spec}}.
_YEAR_2 = "2_year_follow_up_y_arm_1"
_YEAR_3 = "3_year_follow_up_y_arm_1"
_YEAR_4 = "4_year_follow_up_y_arm_1"


def _wave_subset(eventname, *measures):
    """Build one table spec: keep ``measures`` plus the ID and event columns at ``eventname``."""
    return {
        "eventname": eventname,
        "columns_to_extract": ["src_subject_id", "eventname", *measures],
    }


file_lists = {
    "ce_y": {
        "ce_y_meim": _wave_subset(_YEAR_3, "meim_ss_exp", "meim_ss_com"),
        "ce_y_via": _wave_subset(_YEAR_3, "via_ss_hc", "via_ss_amer"),
        "ce_y_macv": _wave_subset(
            _YEAR_4, "macv_y_ss_fs", "macv_y_ss_fo", "macv_y_ss_fr"
        ),
        "ce_y_dm": _wave_subset(_YEAR_4, "dim_y_ss_mean"),
    },
    "ce_p": {
        "ce_p_meim": _wave_subset(_YEAR_3, "meim_p_ss_exp", "meim_p_ss_com"),
        "ce_p_via": _wave_subset(_YEAR_3, "via_p_ss_hc", "via_p_ss_amer"),
        # NOTE(review): this is the only table taken at year 2 - confirm intended
        "ce_p_macv": _wave_subset(
            _YEAR_2, "macv_p_ss_fs", "macv_p_ss_fo", "macv_p_ss_fr"
        ),
        "ce_p_comc": _wave_subset(
            _YEAR_4, "comc_ss_cohesion_p", "comc_ss_control_p"
        ),
    },
    "ph_y": {
        "ph_y_yrb": _wave_subset(_YEAR_4, "physical_activity1_y"),
        "ph_y_resp": _wave_subset(
            _YEAR_4,
            "resp_wheeze_yn_y",
            "resp_pmcough_yn_y",
            "resp_diagnosis_yn_y",
            "resp_bronch_yn_y",
        ),
        "ph_y_mctq": _wave_subset(_YEAR_4, "mctq_sdweek_calc", "mctq_msfsc_calc"),
        "ph_y_bp": _wave_subset(
            _YEAR_4, "blood_pressure_sys_mean", "blood_pressure_dia_mean"
        ),
    },
    "mh_p": {
        "mh_p_cbcl": _wave_subset(
            _YEAR_4, "cbcl_scr_syn_internal_t", "cbcl_scr_syn_external_t"
        ),
    },
    "mri_y": {
        # The connectivity table is copied with all of its columns
        "mri_y_rsfmr_cor_gp_gp": {
            "eventname": _YEAR_4,
            "copy_entire_file": True,
        },
        "mri_y_adm_info": _wave_subset(_YEAR_4, "mri_info_manufacturer"),
        "mri_y_qc_motion": _wave_subset(_YEAR_4, "rsfmri_meanmotion"),
    },
}


# NOTE: same helper as the find_files defined in the earlier transfer cell;
# this second definition replaces it with identical behaviour.
def find_files(directory):
    """Walk ``directory`` recursively and yield the full path of each file found."""
    for folder, _, entries in os.walk(directory):
        yield from [os.path.join(folder, entry) for entry in entries]


# For every listed table: keep only the rows of its configured eventname,
# optionally narrow to the listed columns, and write "<table>_subset.csv"
# into demo_dir.
for category, files_and_info in file_lists.items():
    for filename, info in files_and_info.items():
        eventname = info["eventname"]
        copy_entire_file = info.get("copy_entire_file", False)
        columns_to_extract = info.get("columns_to_extract", [])
        target_name = filename + ".csv"

        # Search for the file in all subdirectories of data_dir
        found_file = False
        for file_path in find_files(data_dir):
            # Compare the exact file name: a substring test on the whole path
            # would also accept look-alikes such as "<table>.csv.bak".
            if os.path.basename(file_path) != target_name:
                continue

            df = pd.read_csv(file_path)

            # Keep only the rows of the requested assessment wave
            df_filtered = df[df["eventname"] == eventname]

            # Column narrowing is skipped when the whole file is requested
            if not copy_entire_file and columns_to_extract:
                df_filtered = df_filtered[columns_to_extract]

            output_filename = f"{filename}_subset.csv"
            output_path = os.path.join(demo_dir, output_filename)
            df_filtered.to_csv(output_path, index=False)

            print(f"Saved {output_filename} to {demo_dir}")

            # Only the first file with this name is used
            found_file = True
            break

        if not found_file:
            # The search ran over data_dir, so that is the folder to report
            print(f"File {filename}.csv not found in {data_dir}")

### Clean up the demographic variables

In [None]:
# Reduce the two longitudinal demographic subsets to their year-4 rows,
# overwriting each file in place.
# NOTE: the next cell reuses `df` and `file_path` as left by the LAST
# iteration, so "abcd_p_demo_subset.csv" must stay last in this list.
file_names = [
    "abcd_y_lt_subset.csv",
    "abcd_p_demo_subset.csv",
]

for file_name in file_names:
    file_path = op.join(demo_dir, file_name)
    df = pd.read_csv(file_path)

    # Fill gaps with the previous row's value, separately for each subject.
    # NOTE(review): ffill follows the CSV row order - confirm rows are in
    # chronological order within each subject.
    by_subject = df.set_index("src_subject_id").groupby("src_subject_id")
    df = by_subject.ffill().reset_index()

    # Row selection: year-4 wave only, and for the parent demographics table
    # additionally demo_ethn_v2 == 1
    keep = df["eventname"] == "4_year_follow_up_y_arm_1"
    if file_name == "abcd_p_demo_subset.csv":
        keep = keep & (df["demo_ethn_v2"] == 1)
    df = df[keep].reset_index(drop=True)

    print(df.columns)
    print(df)

    # Overwrite the subset file with the reduced table
    df.to_csv(file_path, index=False)
    print(df)

In [None]:
# Load the cleaned parent-demographics table explicitly instead of relying on
# whatever `df` / `file_path` an earlier loop happened to leave behind.
file_path = op.join(demo_dir, "abcd_p_demo_subset.csv")
df = pd.read_csv(file_path)

# Indicator columns whose name contains "race"; the derived column itself is
# excluded so that re-running this cell cannot feed its own output back in.
# NOTE(review): this covers both the demo_race_a_* and demo_prnt_race_a_*
# families although the result is named demo_prnt_race - confirm intended.
race_cols = [
    col for col in df.columns if "race" in col and col != "demo_prnt_race"
]


def _first_endorsed(row):
    """Return the name of the first column in ``row`` that is neither 0 nor NaN, else None."""
    endorsed = row.index[row.ne(0) & row.notna()]
    return endorsed[0] if len(endorsed) else None


# Skipped when the indicator columns are already gone (cell re-run on an
# already processed file), leaving the existing derived column untouched.
if race_cols:
    df["demo_prnt_race"] = df[race_cols].apply(_first_endorsed, axis=1)

# Drop the raw indicator columns now that they are summarised
df = df.loc[:, ~df.columns.str.startswith("demo_race_a_p___")]
df = df.loc[:, ~df.columns.str.startswith("demo_prnt_race_a")]


def aggregate_column(series):
    """Collapse one column of a per-subject group into a single value.

    Parameters
    ----------
    series : pandas.Series
        The values of one column within one group.

    Returns
    -------
    object
        * more than one distinct non-missing value: those values (duplicates
          included, missing ones dropped) joined as a ", "-separated string;
        * otherwise the first non-missing value;
        * the first (missing) value when every value is missing;
        * ``pd.NA`` for an empty series.
    """
    present = series.dropna()
    if present.nunique() > 1:
        return ", ".join(present.astype(str))
    if not present.empty:
        # Take the first *non-missing* value: looking only at position 0 would
        # return NaN for a group such as [NaN, 2.0] and lose the real value.
        return present.iloc[0]
    return series.iloc[0] if not series.empty else pd.NA


# Collapse to one row per subject; every other column is reduced with
# aggregate_column.
df_merged = df.groupby("src_subject_id").agg(aggregate_column).reset_index()

print(df_merged)

# Write back to the parent-demographics subset by name. Going through a
# `file_path` variable left over from an earlier loop would overwrite whichever
# file that loop touched last if cells are run out of order.
df_merged.to_csv(op.join(demo_dir, "abcd_p_demo_subset.csv"), index=False)

In [None]:

# Columns wanted in the final parent-demographics table
columns_to_keep = [
    "src_subject_id",
    "demo_sex_v2",
    "demo_ethn_v2",
    "demo_ethn2_v2",
    "demo_prnt_age_v2",
    "demo_prnt_gender_id_v2",
    "demo_prnt_ethn_v2",
    "demo_prnt_ethn2_v2",
    "demo_prnt_ed_v2_2yr_l",
    "demo_prtnr_ed_v2_2yr_l",
    "demo_comb_income_v2",
    "demo_prnt_income_v2_l",
    "demo_origin_v2",
    "demo_biomother_v2",
    "demo_biofather_v2",
    "demo_matgrandm_v2",
    "demo_matgrandf_v2",
    "demo_patgrandm_v2",
    "demo_patgrandf_v2",
    "demo_prnt_race",
]

# Requested columns that are absent from df_merged are skipped silently
columns_to_keep = [col for col in columns_to_keep if col in df_merged.columns]

# Filter the DataFrame to keep only the desired columns
df_filtered = df_merged[columns_to_keep]

# Report what is being dropped, measured against the frame that is actually
# being filtered (df_merged) rather than an earlier intermediate frame
columns_to_remove = [col for col in df_merged.columns if col not in columns_to_keep]
print("Columns to remove:", columns_to_remove)

print(df_filtered)

# Overwrite the parent-demographics subset with the reduced table
df_filtered.to_csv(op.join(demo_dir, "abcd_p_demo_subset.csv"), index=False)

#### calculate missing data

In [None]:
# Write a plain-text missing-data report for every CSV in demo_dir
output_file = op.join(demo_dir, "excluded-ppts.txt")

with open(output_file, "w") as file:
    # sorted() makes the report order reproducible; os.listdir order is arbitrary
    for filename in sorted(os.listdir(demo_dir)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(demo_dir, filename)
        df = pd.read_csv(file_path)

        if df.empty:
            file.write(f"File: {filename} is empty.\n")
            continue

        # One boolean mask reused for the column-wise and row-wise figures
        missing_mask = df.isna()

        # Percentage of rows with at least one missing value
        rows_with_nan = missing_mask.any(axis=1).mean() * 100

        # Files without any missing value are left out of the report
        columns_with_nan = df.columns[missing_mask.any()].tolist()
        if not columns_with_nan:
            continue

        file.write(f"File: {filename}\n")
        file.write("Percentage of missing data per column:\n")
        for column in columns_with_nan:
            percentage_missing = missing_mask[column].mean() * 100
            # Only columns above the 5% threshold are listed
            if percentage_missing > 5:
                file.write(f"{column}: {percentage_missing:.2f}% *\n")
        if rows_with_nan > 5:
            file.write(
                f"Percentage of rows with any NaN values: {rows_with_nan:.2f}% *\n"
            )

        # Rows that would survive listwise deletion
        non_empty_rows_left = len(df.dropna())
        file.write(f"Total number of non-empty rows left: {non_empty_rows_left}\n")
        file.write("\n")

### RSFC
#### only keep subjects scanned at year 4

In [None]:
# Path (not a DataFrame) of the year-4 connectivity subset; its subjects
# define who is kept in every other table.
rsfc_df = op.join(demo_dir, "mri_y_rsfmr_cor_gp_gp_subset.csv")

main_df = pd.read_csv(rsfc_df)
valid_subject_ids = main_df["src_subject_id"].unique()

# Write a subject-filtered copy of every other CSV from demo_dir into rsfc_dir
for filename in os.listdir(demo_dir):
    if not filename.endswith(".csv"):
        continue
    if filename == "mri_y_rsfmr_cor_gp_gp_subset.csv":
        continue

    file_path = os.path.join(demo_dir, filename)
    df = pd.read_csv(file_path)

    in_rsfc = df["src_subject_id"].isin(valid_subject_ids)
    df_filtered = df[in_rsfc]

    output_csv_file = os.path.join(rsfc_dir, filename)
    df_filtered.to_csv(output_csv_file, index=False)

# The connectivity table itself is copied unchanged
shutil.copy(rsfc_df, rsfc_dir)

#### filter for only Hispanic/Latino/Latina

In [None]:
# Path (not a DataFrame) of the parent-demographics table in rsfc_dir; its
# subjects define who is kept in every other table.
hisp_df = op.join(rsfc_dir, "abcd_p_demo_subset.csv")

main_df = pd.read_csv(hisp_df)
valid_subject_ids = main_df["src_subject_id"].unique()

# Write a subject-filtered copy of every other CSV from rsfc_dir into hisp_dir
for filename in os.listdir(rsfc_dir):
    if filename.endswith(".csv") and filename != "abcd_p_demo_subset.csv":
        # Read from the directory being listed (rsfc_dir), so this stage
        # builds on the RSFC-filtered files rather than the demo_dir originals.
        file_path = os.path.join(rsfc_dir, filename)
        df = pd.read_csv(file_path)

        # Keep only rows of subjects present in the demographics table
        df_filtered = df[df["src_subject_id"].isin(valid_subject_ids)]

        output_csv_file = os.path.join(hisp_dir, filename)
        df_filtered.to_csv(output_csv_file, index=False)


# Copy the demographics table itself into hisp_dir unchanged
shutil.copy(hisp_df, hisp_dir)

In [None]:
# Write a plain-text missing-data report for every filtered CSV in hisp_dir
output_file = os.path.join(hisp_dir, "excluded_data_summary.txt")

with open(output_file, "w") as file:
    # sorted() makes the report order reproducible; os.listdir order is arbitrary.
    # The ".csv" test already excludes the report file itself.
    for filename in sorted(os.listdir(hisp_dir)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(hisp_dir, filename)
        df_filtered = pd.read_csv(file_path)

        # One boolean mask reused for the column-wise and row-wise figures
        missing_mask = df_filtered.isna()

        # Percentage of rows with at least one missing value
        rows_with_nan = missing_mask.any(axis=1).mean() * 100

        # Files without any missing value are left out of the report
        columns_with_nan = df_filtered.columns[missing_mask.any()].tolist()
        if not columns_with_nan:
            continue

        file.write(f"File: {filename}\n")
        file.write("Percentage of missing data per column:\n")
        for column in columns_with_nan:
            percentage_missing = missing_mask[column].mean() * 100
            # A trailing "*" marks values above the 5% threshold
            if percentage_missing > 5:
                file.write(f"{column}: {percentage_missing:.2f}% *\n")
            else:
                file.write(f"{column}: {percentage_missing:.2f}%\n")
        if rows_with_nan > 5:
            file.write(
                f"Percentage of subjects missing data: {rows_with_nan:.2f}% *\n"
            )
        else:
            file.write(
                f"Percentage of subjects missing data: {rows_with_nan:.2f}%\n"
            )

        # Rows that would survive listwise deletion
        non_empty_rows_left = len(df_filtered.dropna())
        file.write(f"Total number of subjects left: {non_empty_rows_left}\n")
        file.write("\n")

In [None]:
# Start every category with its own empty frame; the merge loop fills them in.
# (rsfc_df is a DataFrame from here on.)
sociocult_df, covariate_df, phyhealth_df, rsfc_df = (
    pd.DataFrame() for _ in range(4)
)


def merge_csv(file_path, df):
    """Read a CSV and outer-merge it into ``df`` on ``src_subject_id``.

    The CSV loses its ``eventname`` column (when present) and is reduced to
    the first row per ``src_subject_id`` before merging.

    Parameters
    ----------
    file_path : str
        CSV file to read; must contain a ``src_subject_id`` column.
    df : pandas.DataFrame
        Table accumulated so far. When it is empty the prepared CSV is
        returned as is.

    Returns
    -------
    pandas.DataFrame
        One row per ``src_subject_id``, covering subjects from either input.
    """
    incoming = (
        pd.read_csv(file_path)
        .drop(columns=["eventname"], errors="ignore")
        .drop_duplicates(subset="src_subject_id", keep="first")
    )

    # Nothing accumulated yet: the new table becomes the starting point
    if df.empty:
        return incoming

    combined = pd.merge(df, incoming, on="src_subject_id", how="outer")

    # Keep the first of any identically named columns, then the first row
    # per subject
    first_occurrence = ~combined.columns.duplicated()
    combined = combined.loc[:, first_occurrence]
    return combined.drop_duplicates(subset="src_subject_id", keep="first")


# Route every CSV in hisp_dir to its category table by file-name prefix.
# sorted() fixes the merge order (and therefore the column order of the
# results); os.listdir order is arbitrary. Non-CSV files such as text reports
# are skipped explicitly.
for filename in sorted(os.listdir(hisp_dir)):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(hisp_dir, filename)
    if filename.startswith("ce"):
        sociocult_df = merge_csv(file_path, sociocult_df)
    elif filename.startswith(("abcd", "mri_y_adm", "mri_y_qc")):
        covariate_df = merge_csv(file_path, covariate_df)
    elif filename.startswith("mri_y_rsfmr"):
        rsfc_df = merge_csv(file_path, rsfc_df)
    elif filename.startswith(("led", "mh", "ph")):
        phyhealth_df = merge_csv(file_path, phyhealth_df)

# Display the first few rows of each DataFrame
print("Sociocultural DataFrame:")
print(sociocult_df.head())

print("\nCovariate DataFrame:")
print(covariate_df.head())

print("\nPhysical Health DataFrame:")
print(phyhealth_df.head())

print("\nRSFC DataFrame:")
print(rsfc_df.head())

In [None]:
# Destination of each merged table inside deriv_dir
sociocult_save_path = os.path.join(deriv_dir, "sociocult.csv")
covariate_save_path = os.path.join(deriv_dir, "covariate.csv")
phyhealth_save_path = os.path.join(deriv_dir, "phyhealth.csv")
rsfc_save_path = os.path.join(deriv_dir, "rsfc.csv")

# Write each table without its index
for _table, _destination in (
    (sociocult_df, sociocult_save_path),
    (covariate_df, covariate_save_path),
    (phyhealth_df, phyhealth_save_path),
    (rsfc_df, rsfc_save_path),
):
    _table.to_csv(_destination, index=False)

print("DataFrames have been saved successfully.")

In [None]:
# Listwise deletion: keep only subjects with a value in every column
sociocult_df_cleaned, covariate_df_cleaned = (
    frame.dropna() for frame in (sociocult_df, covariate_df)
)

print(sociocult_df_cleaned)
print(covariate_df_cleaned)

In [None]:
# Subjects with complete sociocultural AND complete covariate data
common_ids = set(sociocult_df_cleaned["src_subject_id"]) & set(
    covariate_df_cleaned["src_subject_id"]
)
print(common_ids)

# Count the number of common IDs
print("Number of common subject IDs:", len(common_ids))


def _only_common(frame):
    """Return the rows of ``frame`` whose src_subject_id is in common_ids."""
    return frame[frame["src_subject_id"].isin(common_ids)]


# The connectivity table is restricted to the same subjects but is not
# itself filtered for missing values
sociocult_df_final = _only_common(sociocult_df_cleaned)
covariate_df_final = _only_common(covariate_df_cleaned)
rsfc_df_final = _only_common(rsfc_df)


print(sociocult_df_final)
print(covariate_df_final)
print(rsfc_df_final)


In [None]:
# Same destinations as the earlier save cell: those files are overwritten
# with the tables restricted to the common subjects
sociocult_save_path = os.path.join(deriv_dir, "sociocult.csv")
covariate_save_path = os.path.join(deriv_dir, "covariate.csv")
rsfc_save_path = os.path.join(deriv_dir, "rsfc.csv")

for _table, _destination in (
    (sociocult_df_final, sociocult_save_path),
    (covariate_df_final, covariate_save_path),
    (rsfc_df_final, rsfc_save_path),
):
    _table.to_csv(_destination, index=False)

print("DataFrames have been saved successfully.")

In [None]:
def missing_data_summary(df, df_name):
    """Summarise missing values for every column of ``df`` except the first.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column is skipped (callers pass tables that start
        with the subject ID column).
    df_name : str
        Label used in the heading of the text rendering.

    Returns
    -------
    tuple
        ``(summary, summary_str)``: ``summary`` has one row per column with
        "Measure", "Number of Subjects" (count of non-missing values),
        "Percentage Missing" and "Above 5%" ("*" when more than 5% is
        missing, otherwise ""); ``summary_str`` is that table rendered as
        text under a heading.
    """
    measures = df.iloc[:, 1:]
    observed = measures.notna().sum()
    pct_missing = measures.isna().mean() * 100

    summary = pd.DataFrame(
        {
            "Measure": df.columns[1:],
            "Number of Subjects": observed,
            "Percentage Missing": pct_missing,
        }
    )

    def _flag(pct):
        # Mark measures with more than 5% missing
        return "*" if pct > 5 else ""

    summary["Above 5%"] = summary["Percentage Missing"].apply(_flag)

    table = summary.to_string(
        index=False,
        columns=["Measure", "Number of Subjects", "Percentage Missing", "Above 5%"],
    )
    summary_str = f"\n{df_name} Missing Data Summary:\n{table}\n"

    return summary, summary_str


# Build the per-category summaries from the merged (not yet row-filtered) tables
sociocult_summary, sociocult_summary_str = missing_data_summary(
    sociocult_df, "Sociocultural DataFrame"
)
covariate_summary, covariate_summary_str = missing_data_summary(
    covariate_df, "Covariate DataFrame"
)
phyhealth_summary, phyhealth_summary_str = missing_data_summary(
    phyhealth_df, "Physical Health DataFrame"
)
rsfc_summary, rsfc_summary_str = missing_data_summary(rsfc_df, "RSFC DataFrame")

# One text report with the four sections back to back
all_summaries = "".join(
    [
        sociocult_summary_str,
        covariate_summary_str,
        phyhealth_summary_str,
        rsfc_summary_str,
    ]
)

# The report is written relative to the current working directory
with open("PLSC-missing_data.txt", "w") as file:
    file.write(all_summaries)

In [None]:
sns.set_theme(color_codes=True)


def plot_missing_data_summary(summary, df_name, deriv_dir, n_subjects=589):
    """Draw and save a horizontal bar chart of the percentage missing per measure.

    Parameters
    ----------
    summary : pandas.DataFrame
        Table with a "Measure" and a "Percentage Missing" column, one row
        per measure.
    df_name : str
        Category label used in the title, the y-axis label and the file name
        (spaces become underscores).
    deriv_dir : str
        Folder that receives "<df_name>_missing_data.png".
    n_subjects : int, optional
        Sample size quoted in the x-axis label. Defaults to 589, the value
        previously written into the label.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(
        x="Percentage Missing", y="Measure", data=summary, palette="viridis", ax=ax
    )
    ax.set_title(f"{df_name} Percentage of Missing Data")
    ax.set_xlabel(f"Percentage Missing (out of {n_subjects} subjects)")
    ax.set_ylabel(f"ABCD {df_name} Measures")
    ax.set_xlim(0, 100)

    # No legend: the plot has no labelled artists, so requesting one only
    # produced a warning.

    # Label every bar with its value. Bars are addressed by row position
    # (0, 1, 2, ...) rather than by the frame's index label, which may be a
    # string; assumes one row per distinct measure, in plotting order.
    for position, pct in enumerate(summary["Percentage Missing"]):
        ax.annotate(
            f"{pct:.2f}%",
            xy=(pct, position),
            xytext=(20, 0),  # shift the text to the right of the bar end
            textcoords="offset points",
            ha="center",
            va="center",
            color="red" if pct > 5 else "black",  # highlight values above 5%
        )

    # Save the figure
    file_path = os.path.join(deriv_dir, f"{df_name.replace(' ', '_')}_missing_data.png")
    fig.tight_layout()
    fig.savefig(file_path)

    # Show first, then release the figure: showing after closing displays nothing
    plt.show()
    plt.close(fig)


# One saved bar chart per category; the RSFC summary is deliberately left out
for _summary, _label in (
    (sociocult_summary, "Sociocultural"),
    (covariate_summary, "Covariate"),
    (phyhealth_summary, "Physical Health"),
):
    plot_missing_data_summary(_summary, _label, deriv_dir)
# plot_missing_data_summary(rsfc_summary, "RSFC DataFrame", deriv_dir)

### Regression analysis

In [None]:
# Physical-health rows of the common subjects ...
_in_common = phyhealth_df["src_subject_id"].isin(common_ids)
phyhealth_df_final = phyhealth_df[_in_common]

# ... reduced to the subjects with a value in every column
phyhealth_df_final_cleaned = phyhealth_df_final.dropna()

print("DataFrame after removing rows with NaN values:")
print(phyhealth_df_final_cleaned)

In [None]:
# Complete-case physical-health table used for the regression analysis
phyhealth_save_path = os.path.join(deriv_dir, "phyhealth_reg.csv")
phyhealth_df_final_cleaned.to_csv(phyhealth_save_path, index=False)

print("DataFrames have been saved successfully.")

In [None]:
# From here on common_ids means the subjects with complete physical-health
# data (this replaces the earlier value of the variable)
common_ids = phyhealth_df_final_cleaned["src_subject_id"].unique()

# Restrict the other merged tables to those subjects
sociocult_df_reg, covariate_df_reg, rsfc_df_reg = (
    frame[frame["src_subject_id"].isin(common_ids)]
    for frame in (sociocult_df, covariate_df, rsfc_df)
)

print(sociocult_df_reg, covariate_df_reg, rsfc_df_reg)

In [None]:
# Save the regression subsets. These must be the *_reg frames (restricted to
# subjects with complete physical-health data); writing the *_final frames
# here would export the larger, unrestricted sample under the *_reg names.
sociocult_df_reg.to_csv(op.join(deriv_dir, "sociocult_reg.csv"), index=False)
covariate_df_reg.to_csv(op.join(deriv_dir, "covariate_reg.csv"), index=False)
rsfc_df_reg.to_csv(op.join(deriv_dir, "rsfc_reg.csv"), index=False)