## Creating ABIDE Dataframes

### Step 1: Import necessary files
In this step, we import the necessary files for the analysis. We also reduce the ABIDE datasets to only the fields that we are interested in analyzing,
and we create a new CSV file that contains the cleaned data.

In [None]:
import os.path as op
import pandas as pd
import numpy as np
import numpy as np
from scipy.stats import ttest_ind


# Location of the participant CSV files. The path is relative, and the
# input paths further down are built by joining file names onto it.
data_dir = "./dset"



In [None]:
def load_csv(file_path):
    """Read a CSV file into a DataFrame, turning placeholder codes into NaN.

    The placeholder values -9999.0, "#", "-9999", "-9999.0" and "`" are
    replaced with NaN in every column.

    Parameters
    ----------
    file_path : path of the CSV file to read.

    Returns
    -------
    The cleaned DataFrame, or None when reading or cleaning fails. Failures
    are reported with a printed message rather than a raised exception.
    """
    missing_markers = (-9999.0, "#", "-9999", "-9999.0", "`")

    try:
        frame = pd.read_csv(file_path)
        # One marker at a time, mirroring a value-by-value clean-up.
        for marker in missing_markers:
            frame = frame.replace(marker, np.nan)
    except FileNotFoundError:
        print(f"The file at {file_path} does not exist.")
    except pd.errors.EmptyDataError:
        print("The file is empty.")
    except pd.errors.ParserError:
        print("The file could not be parsed.")
    except Exception as e:
        print(f"An error occurred: {e}")
    else:
        return frame

    # Every failure path ends here after printing its message.
    return None

In [None]:
# Read the original ABIDE I and ABIDE II CSV files from data_dir.
abide1_df, abide2_df = (
    load_csv(op.join(data_dir, csv_name))
    for csv_name in ("new_data_abide1.csv", "new_data_abide2.csv")
)

# Tag every row with the ABIDE release it came from (1 or 2) so the source
# stays identifiable once the two tables are combined.
for release, release_df in enumerate((abide1_df, abide2_df), start=1):
    release_df["ABIDE"] = release

In [None]:
# Demographic / phenotypic columns of interest, under their ABIDE I names.
abide1_columns = [
    "ABIDE",
    "alternative_id_1",
    "DX_GROUP",
    "DSM_IV_TR",
    "AGE_AT_SCAN",
    "SEX",
    "HANDEDNESS_CATEGORY",
    "COMORBIDITY",
    "CURRENT_MED_STATUS",
    "ADI_RRB_TOTAL_C",
    "ADI_R_SOCIAL_TOTAL_A",
    "ADI_R_VERBAL_TOTAL_BV",
    "ADI_R_RSRCH_RELIABLE",
    "ADOS_RSRCH_RELIABLE",
    "ADOS_GOTHAM_SOCAFFECT",
    "ADOS_GOTHAM_RRB",
    "SRS_MOTIVATION",
    "VINELAND_DAILYLVNG_STANDARD",
    "VINELAND_COPING_V_SCALED",
]

# The matching selection from ABIDE II, under that release's own names.
abide2_columns = [
    "ABIDE",
    "SUB_ID",
    "DX_GROUP",
    "PDD_DSM_IV_TR",
    "AGE_AT_SCAN",
    "SEX",
    "HANDEDNESS_CATEGORY",
    "NONASD_PSYDX_LABEL",
    "CURRENT_MED_STATUS",
    "ADI_R_RRB_TOTAL_C",
    "ADI_R_SOCIAL_TOTAL_A",
    "ADI_R_VERBAL_TOTAL_BV",
    "ADI_R_RSRCH_RELIABLE",
    "ADOS_RSRCH_RELIABLE",
    "ADOS_2_SOCAFFECT",
    "ADOS_2_RRB",
    "SRS_MOTIVATION_RAW",
    "VINELAND_DAILYLIVING_STANDARD",
    "VINELAND_COPING_V_SCALED",
]

# ABIDE II column name -> ABIDE I column name.
# NOTE: "VINELAND_DAILYLIVING_STANDARD" has no entry here, so the combined
# table keeps it as a separate column next to "VINELAND_DAILYLVNG_STANDARD".
abide2_to_abide1 = {
    "SUB_ID": "alternative_id_1",
    "PDD_DSM_IV_TR": "DSM_IV_TR",
    "NONASD_PSYDX_LABEL": "COMORBIDITY",
    "ADI_R_RRB_TOTAL_C": "ADI_RRB_TOTAL_C",
    "ADOS_2_SOCAFFECT": "ADOS_GOTHAM_SOCAFFECT",
    "ADOS_2_RRB": "ADOS_GOTHAM_RRB",
    "SRS_MOTIVATION_RAW": "SRS_MOTIVATION",
}

# Cut each release down to the selected columns; ABIDE II is renamed to the
# ABIDE I vocabulary in the same step.
abide1_df = abide1_df[abide1_columns]
abide2_df = abide2_df[abide2_columns].rename(columns=abide2_to_abide1)

# Stack both releases into one table with a fresh 0..n-1 row index.
abide_df = pd.concat([abide1_df, abide2_df], ignore_index=True)
print(abide_df)


# Recode handedness from strings to numerics:
# R -> 1, L -> 2, and every ambidextrous / mixed / switched label -> 3.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on abide_df[...]: an in-place call on a column
# selection modifies a temporary object and leaves abide_df unchanged when
# pandas Copy-on-Write is active.
abide_df["HANDEDNESS_CATEGORY"] = abide_df["HANDEDNESS_CATEGORY"].replace(
    {"R": 1, "L": 2, "Ambi": 3, "Mixed": 3, "L->R": 3}
)

# Normalise medication status: string forms of 0/1 become integers and a
# stray apostrophe is treated as missing.
abide_df["CURRENT_MED_STATUS"] = abide_df["CURRENT_MED_STATUS"].replace(
    {"0": 0, "1": 1, "0.0": 0, "1.0": 1, "'": np.nan}
)


# Save the combined DataFrame to the CSV file.
# The save is currently disabled: the statements are wrapped in a string
# literal, so nothing is written when this cell runs.
'''csv_file_path = op.join(data_dir, "abide.csv")
abide_df.to_csv(csv_file_path, index=False)'''

### Step 2: Count Values
Here we find the counts of how many participants fall into various categories. 

In [None]:
# Tally participants per category. DX_GROUP coding: 1 = ASD; 2 = TD.

# Show which medication-status codes are present after cleaning.
print(abide_df["CURRENT_MED_STATUS"].unique())

# Totals across the combined sample (ASD and HC together); missing values
# get their own bucket.
totcolumn_count = ["DX_GROUP", "DSM_IV_TR"]
for single_column in totcolumn_count:
    print(abide_df[single_column].value_counts(dropna=False))

# Counts for each (diagnosis, category) combination.
dxcolumn_count = [
    ["DX_GROUP", "SEX"],
    ["DX_GROUP", "HANDEDNESS_CATEGORY"],
    ["DX_GROUP", "CURRENT_MED_STATUS"],
    ["DX_GROUP", "ADI_R_RSRCH_RELIABLE"],
    ["DX_GROUP", "ADOS_RSRCH_RELIABLE"],
]
for group_column, category_column in dxcolumn_count:
    pair_counts = abide_df[[group_column, category_column]].value_counts(dropna=False)
    print(pair_counts.reset_index())

# Number of missing values in every column, split by diagnosis group.
is_asd = abide_df["DX_GROUP"] == 1
is_td = abide_df["DX_GROUP"] == 2
nan_count1 = abide_df[is_asd].isna().sum()
nan_count2 = abide_df[is_td].isna().sum()

print("Nan Count ASD:", nan_count1, "NaN Count HC:", nan_count2)

## Calculating statistical values

Here, we are creating a df that will be used to run statistical tests in R

In [None]:
# Reload the combined ABIDE table that we are going to manipulate.
# The path is built from data_dir: the name `csv_dir` is not defined
# anywhere in this notebook, and data_dir is the folder the save step in
# Step 1 targets for "abide.csv". That save step is currently disabled, so
# the file must already exist on disk.
df = pd.read_csv(op.join(data_dir, "abide.csv"))
print(df)

# Columns handed to the statistical tests. This is the ABIDE I naming
# without the "ABIDE" release tag.
stat_columns = [
    "alternative_id_1",
    "DX_GROUP",
    "DSM_IV_TR",
    "AGE_AT_SCAN",
    "SEX",
    "HANDEDNESS_CATEGORY",
    "COMORBIDITY",
    "CURRENT_MED_STATUS",
    "ADI_RRB_TOTAL_C",
    "ADI_R_SOCIAL_TOTAL_A",
    "ADI_R_VERBAL_TOTAL_BV",
    "ADI_R_RSRCH_RELIABLE",
    "ADOS_RSRCH_RELIABLE",
    "ADOS_GOTHAM_SOCAFFECT",
    "ADOS_GOTHAM_RRB",
    "SRS_MOTIVATION",
    "VINELAND_DAILYLVNG_STANDARD",
    "VINELAND_COPING_V_SCALED",
]

# Reduce abide df
stat_df = df[stat_columns]

Ignore this part for now; I attempted to run the tests in Python and they didn't work out.

In [None]:
# Ignore this for now
def _age_ttest(release_df):
    """Two-sample t-test of AGE_AT_SCAN between ASD and TD in one release.

    Parameters
    ----------
    release_df : DataFrame with "DX_GROUP" (1 = ASD, 2 = TD) and
        "AGE_AT_SCAN" columns.

    Returns
    -------
    The result of scipy.stats.ttest_ind, which unpacks to
    (t-statistic, p-value).
    """
    # Build the mask from the frame being filtered. A mask taken from the
    # combined abide_df is labelled by that table's fresh 0..n-1 index, so
    # applying it to a per-release frame selects rows by someone else's
    # diagnosis.
    asd_ages = release_df.loc[release_df["DX_GROUP"] == 1, "AGE_AT_SCAN"].to_numpy(
        dtype=float
    )
    td_ages = release_df.loc[release_df["DX_GROUP"] == 2, "AGE_AT_SCAN"].to_numpy(
        dtype=float
    )
    # Ages may be NaN after cleaning; skip them so a single missing value
    # does not turn the whole statistic into NaN.
    return ttest_ind(asd_ages, td_ages, nan_policy="omit")


# Run the comparison for abide 1, then abide 2.
for release_df in (abide1_df, abide2_df):
    t_stat, p_val = _age_ttest(release_df)

    print('t-statistic:', t_stat)
    print('p-value:', p_val)

Here we find some statistics to describe the counts. 

In [None]:
# (grouping column, measure) pairs; only the measure name is used below.
dxcolumn_dist = [
    ["DX_GROUP", "AGE_AT_SCAN"],
    ["DX_GROUP", "ADI_RRB_TOTAL_C"],
    ["DX_GROUP", "ADI_R_SOCIAL_TOTAL_A"],
    ["DX_GROUP", "ADI_R_VERBAL_TOTAL_BV"],
    ["DX_GROUP", "ADOS_GOTHAM_SOCAFFECT"],
    ["DX_GROUP", "ADOS_GOTHAM_RRB"],
    ["DX_GROUP", "SRS_MOTIVATION"],
    ["DX_GROUP", "VINELAND_DAILYLVNG_STANDARD"],
    ["DX_GROUP", "VINELAND_COPING_V_SCALED"],
    ["DX_GROUP", "VINELAND_DAILYLIVING_STANDARD"],
]

# DX_GROUP code paired with the label printed above each summary.
dx_labels = ((1, "ASD"), (2, "TD"))

# For every measure, print the describe() summary for ASD and then for TD.
for _, measure in dxcolumn_dist:
    for dx_code, dx_label in dx_labels:
        group_rows = abide_df[abide_df["DX_GROUP"] == dx_code]
        summary = group_rows[measure].describe()
        print(f"{dx_label} ({measure}):")
        print(summary)

### Step 3: Comorbidities
In this bit of code, we create a new csv file that counts how many times a comorbidity is listed for ASD/TD. 

In [None]:
# List of DX_GROUP values to iterate through (1 = ASD, 2 = TD)
dx_group_values = [1, 2]

# Start from an empty frame; each group contributes one count column.
comorb_df = pd.DataFrame(columns=["COMORBIDITY"])

# Loop through each DX_GROUP
# this will add two columns for ASD, TD comorbidity counts
for dx_group in dx_group_values:
    # Count how often each raw COMORBIDITY entry occurs in this group
    # (value_counts leaves out missing entries).
    comorb_count = abide_df.loc[
        abide_df["DX_GROUP"] == dx_group, "COMORBIDITY"
    ].value_counts()
    print(comorb_count)

    # One count column per group, named after the group.
    column_name = "Count ASD" if dx_group == 1 else "Count TD"
    comorb_df_group = comorb_count.reset_index()
    comorb_df_group.columns = ["COMORBIDITY", column_name]

    # The first non-empty group becomes the table; later groups are
    # outer-merged so entries seen in only one group are kept.
    if comorb_df.empty:
        comorb_df = comorb_df_group
    else:
        comorb_df = comorb_df.merge(comorb_df_group, on="COMORBIDITY", how="outer")

# Append a row holding the column totals of the numeric count columns.
# NOTE: the "Total Counts" label lives in the index, which is not written
# below (index=False), so in the CSV this row has an empty COMORBIDITY cell.
comorb_df.loc["Total Counts"] = comorb_df.sum(numeric_only=True, axis=0, skipna=True)

# Save the combined DataFrame to a CSV file.
# Written under data_dir: the name `csv_dir` is not defined anywhere in this
# notebook, and data_dir is the folder the other output paths use.
csv_file_path = op.join(data_dir, "abide_comorb.csv")
comorb_df.to_csv(csv_file_path, index=False)


Here, we create a CSV of separated comorbidities and then simplify them.

In [None]:

import re

# List of DX_GROUP values to iterate through (1 = ASD, 2 = TD)
dx_group_values = [1, 2]

# Start from an empty frame; each group contributes one count column.
comorb_df = pd.DataFrame(columns=["COMORBIDITY"])


# Regular expression pattern for matching variations of "ADHD"
# (not used by the code in this cell)
adhd_pattern = re.compile(r"ADHD", re.IGNORECASE)
# Regular expression pattern for matching variations of "MDD"
# (not used by the code in this cell)
mdd_pattern = re.compile(r"MDD", re.IGNORECASE)

# Keyword -> canonical comorbidity label. Keywords are searched
# case-insensitively anywhere in an entry. The mapping does not depend on
# the group, so it is built once outside the loop.
keyword_mapping = {
    "anxiety": "Anxiety",
    "phobia": "Phobia",
    "tic": "Tic Disorder",
    "dysth": "Dysthymia",
    "enuresis": "Enuresis",
    "depr": "Depression",
    "MDD": "Depression",
    "adhd": "ADHD",
    "bipolar": "Bipolar Disorder",
    "encopresis": "Encopresis",
    "schizo": "Schizophrenic Disorder",
    "GAD": "GAD",
    "mood": "Mood Disorder",
    "learn": "Nonverbal Learning Disorder",
    "dyslexia": "Developmental Dyslexia",
    "tourettes": "Tourettes Disorder",
    "disrupt": "Disruptive Disorder",
    "sensory": "Sensory Integration Disorder",
    "ODD": "ODD",
    "PTSD": "PTSD",
}
canonical_labels = set(keyword_mapping.values())

# Loop through each DX_GROUP value
for dx_group in dx_group_values:
    # Distinct raw COMORBIDITY strings recorded for this group.
    # NOTE(review): because of .unique(), the counts below are per distinct
    # string, not per participant - confirm that this is intended.
    comorb_array = abide_df[abide_df["DX_GROUP"] == dx_group]["COMORBIDITY"].unique()

    # Split multi-diagnosis entries on ";" and trim surrounding spaces;
    # non-string values (missing entries) are skipped.
    comorb_split = []
    for item in comorb_array.tolist():
        if isinstance(item, str):
            comorb_split.extend(part.strip() for part in item.split(";"))

    # Replace each entry with the label of a matching keyword. `value` is
    # the original text throughout, so when several keywords match, the one
    # listed last in keyword_mapping wins.
    for i, value in enumerate(comorb_split):
        for keyword, mapped_value in keyword_mapping.items():
            if re.search(keyword, value, re.IGNORECASE):
                comorb_split[i] = mapped_value

    # Filter out comorbidities that are not in the keyword mapping
    comorb_split = [value for value in comorb_split if value in canonical_labels]

    # Count how often each canonical label occurs for this group.
    comorb_splitdf = pd.DataFrame(comorb_split, columns=["COMORBIDITY"])
    comorb_count = comorb_splitdf["COMORBIDITY"].value_counts()

    column_name = "Count ASD" if dx_group == 1 else "Count TD"
    comorb_df_group = comorb_count.reset_index()
    comorb_df_group.columns = ["COMORBIDITY", column_name]

    # The first non-empty group becomes the table; later groups are
    # outer-merged so labels seen in only one group are kept.
    if comorb_df.empty:
        comorb_df = comorb_df_group
    else:
        comorb_df = comorb_df.merge(comorb_df_group, on="COMORBIDITY", how="outer")

# Add the totals row once, after both groups are merged. Doing this inside
# the loop leaves the first pass's totals row in the table, where it is
# merged as data and summed again; that stale row then has to be dropped
# by a hard-coded row number.
comorb_df.loc["Total Counts"] = comorb_df.sum(
    numeric_only=True, axis=0, skipna=True
)

print(comorb_df)
# Save DataFrame to the CSV file.
# The save is currently disabled: the statements are wrapped in a string
# literal, so nothing is written when this cell runs.
'''csv_file_path = op.join(data_dir, "abide_comorb_cleaned.csv")
comorb_df.to_csv(csv_file_path, index=False)'''

In [None]:


# Define replacement lists: each list holds the detailed labels that are
# folded into one broader category (or removed).
adhd = [
    "ADHD",
    "Tic Disorder",
    "Nonverbal Learning Disorder",
    "Developmental Dyslexia",
    "Tourettes Disorder",
    "Sensory Integration Disorder",
]
anxiety = ["Phobia", "GAD", "Anxiety", "PTSD"]
mood = ["Dysthymia", "Depression", "Mood Disorder"]
disrupt = ["ODD", "Disruptive Disorder"]
remove = ["Enuresis", "Encopresis", "Bipolar Disorder", "Schizophrenic Disorder"]

# Define replacement dictionary (detailed label -> broad category).
# It is built from the lists above so the grouping is written down once;
# labels in `remove` map to None and are dropped further down.
replacements = {
    **dict.fromkeys(adhd, "ADHD/Other ND"),
    **dict.fromkeys(anxiety, "Anxiety"),
    **dict.fromkeys(mood, "Mood Disorder"),
    **dict.fromkeys(disrupt, "Disruptive"),
    **dict.fromkeys(remove, None),
}

# Replace values in the 'COMORBIDITY' column based on the dictionary.
# The result is assigned back instead of calling replace(..., inplace=True)
# on comorb_df[...]: an in-place call on a column selection modifies a
# temporary object and leaves comorb_df unchanged when pandas Copy-on-Write
# is active.
comorb_df["COMORBIDITY"] = comorb_df["COMORBIDITY"].replace(replacements)

# Drop rows where 'COMORBIDITY' is missing: the labels mapped to None above
# and any row that never had a label.
comorb_df.dropna(subset=["COMORBIDITY"], inplace=True)

# Combine counts for the same comorbidity names
comorb_clean_combined = (
    comorb_df.groupby("COMORBIDITY")
    .agg({"Count ASD": "sum", "Count TD": "sum"})
    .reset_index()
)

# Print the updated DataFrame with combined counts
print(comorb_clean_combined)

# Save DataFrame to the CSV file.
# The save is currently disabled: the statements are wrapped in a string
# literal, so nothing is written when this cell runs.
'''csv_file_path = op.join(data_dir, "abide_comorb_simplified.csv")
comorb_clean_combined.to_csv(csv_file_path, index=False)'''