In [None]:
import os.path as op
import pandas as pd
import numpy as np
import numpy as np

#### load files

In [None]:
# Root locations used by the rest of the notebook; all paths are relative to
# the directory the notebook is run from.
# NOTE(review): `dir` shadows the Python builtin of the same name and is not
# referenced by any cell visible in this notebook - consider renaming/removing.
dir = "./"
data_dir = "./dset"
# Directory holding the whole-group habenula table.
group_dir = op.join(data_dir, "group/habenula")
# Directory holding the age-effect habenula table (only referenced by the
# disabled age-group loading cell).
age_dir = op.join(data_dir, "age-effect5-21old/habenula")

#### Collect a DataFrame with subject info for the 1584 subjects

In [None]:
# run for whole group
# Read the tab-separated whole-group table from group_dir.
subject_df = pd.read_csv(
    op.join(group_dir, "sub-group_task-rest_desc-1S2StTesthabenula_table.txt"), sep="\t"
)
print(subject_df)

# Count distinct values in the "Subj" column (a repeated ID is counted once).
unique_subjects = subject_df["Subj"].nunique()
print(f"Number of unique subjects: {unique_subjects}")

In [None]:
# Disabled alternative loader: reads the age-effect table from age_dir into
# subject_df instead of the whole-group table. The code is wrapped in a string
# literal, so none of it is executed.
'''# run for just age group
subject_df = pd.read_csv(
    op.join(age_dir, "sub-group_task-rest_desc-Agehabenula_table.txt"), sep="\t"
)
print(subject_df)

unique_subjects = subject_df["Subj"].nunique()
print(f"Number of unique subjects: {unique_subjects}")'''

In [None]:
# Load the tab-separated participant metadata table from the dataset root.
# NOTE(review): a later cell rebinds participant_df to a trimmed selection of
# merged_df (without "participant_id"), so this cell has to be re-run before
# the merge cell can be re-run.
participant_df = pd.read_csv(op.join(data_dir, "participants.tsv"), sep="\t")
print(participant_df)

In [None]:
# Attach participant metadata to every subject row. This is pandas' default
# (inner) join on Subj == participant_id, so rows without a match on either
# side are not carried over; the right-hand key column is redundant after the
# join and is discarded.
merged_df = subject_df.merge(
    participant_df, left_on="Subj", right_on="participant_id"
).drop(columns=["participant_id"])
print(merged_df.columns)

In [None]:
# Columns carried forward into the demographic analysis
columns_to_keep = [
    "Subj",
    "DX_GROUP",
    "DSM_IV_TR",
    "AGE_AT_SCAN",
    "SEX",
    "HANDEDNESS_CATEGORY",
    "CURRENT_MED_STATUS",
    "COMORBIDITY",
]

# Keep only the desired columns. The explicit .copy() detaches the result from
# merged_df, so the recoding done in later cells operates on an independent
# frame and pandas does not raise SettingWithCopyWarning.
# Note: this rebinds participant_df (previously the raw participants table).
participant_df = merged_df[columns_to_keep].copy()
print(participant_df)


In [None]:
# Treat the -9999 sentinel (numeric or string form) and a stray backtick as
# missing. The result is assigned back rather than using inplace=True so the
# cleaning works on a fresh frame instead of a selection of merged_df.
participant_df = participant_df.replace([-9999, -9999.0, "-9999", "`"], np.nan)

# Harmonise handedness codes that are stored as strings into letter labels.
# The previous dict literal listed the key "3" twice ("Mixed", then "L->R");
# Python keeps the last entry, so "3" -> "L->R" is the mapping that was
# actually in effect and is the one kept here.
# Column-level replace(..., inplace=True) is avoided throughout this cell: on
# a column selected from a frame it is a chained in-place operation, which
# does not write back to the frame when pandas Copy-on-Write is active.
participant_df["HANDEDNESS_CATEGORY"] = participant_df["HANDEDNESS_CATEGORY"].replace(
    {
        "1": "R",
        "1.0": "R",
        "2": "L",
        "2.0": "L",
        "Mixed": "Ambi",
        "3": "L->R",
        "3.0": "Mixed",
    }
)

# (A former follow-up step mapped '1.0'/'2.0'/'3.0' to integers and filled
# NaN with the column itself. Those strings no longer exist after the
# replacement above and the fill was a self-fill, so it changed nothing and
# has been removed.)

# Show the handedness labels present after harmonisation
final_options = participant_df["HANDEDNESS_CATEGORY"].unique()
print(final_options)

# Final coding for this cell:
#   CURRENT_MED_STATUS  -> 0 / 1 (a stray apostrophe is treated as missing)
#   HANDEDNESS_CATEGORY -> 1 (R), 2 (L), 3 (Ambi / Mixed / L->R)
#   DX_GROUP            -> "ASD" / "NT"
# assign() returns a new frame, so no chained in-place writes are involved.
participant_df = participant_df.assign(
    CURRENT_MED_STATUS=participant_df["CURRENT_MED_STATUS"].replace(
        {"0": 0, "1": 1, "0.0": 0, "1.0": 1, "'": np.nan}
    ),
    HANDEDNESS_CATEGORY=participant_df["HANDEDNESS_CATEGORY"].replace(
        {"R": 1, "L": 2, "Ambi": 3, "Mixed": 3, "L->R": 3}
    ),
    DX_GROUP=participant_df["DX_GROUP"].replace(
        {1: "ASD", 2: "NT", "1": "ASD", "2": "NT"}
    ),
)

# Display the final DataFrame
print(participant_df)

In [None]:
# Turn numeric codes into readable labels for reporting:
#   HANDEDNESS_CATEGORY: 1 -> "R", 2 -> "L", 3 -> "Ambi"
#   SEX:                 1 -> "M", 2 -> "F"
#   DSM_IV_TR:           0 -> "Control", 1 -> "autism", 2 -> "asperg", 3 -> "pdd"
#   CURRENT_MED_STATUS:  0 -> "no med", 1 -> "med"
# Values not listed (including NaN) are left untouched by replace().
# assign() builds a new frame, replacing the former column-level
# replace(..., inplace=True) calls; those are chained in-place operations that
# do not write back to the frame when pandas Copy-on-Write is active.
participant_df = participant_df.assign(
    HANDEDNESS_CATEGORY=participant_df["HANDEDNESS_CATEGORY"].replace(
        {1: "R", 2: "L", 3: "Ambi"}
    ),
    SEX=participant_df["SEX"].replace({1: "M", 2: "F"}),
    DSM_IV_TR=participant_df["DSM_IV_TR"].replace(
        {0: "Control", 1: "autism", 2: "asperg", 3: "pdd"}
    ),
    CURRENT_MED_STATUS=participant_df["CURRENT_MED_STATUS"].replace(
        {0: "no med", 1: "med"}
    ),
)

In [None]:
# List of columns to print unique values for
columns = [
    "DX_GROUP",
    "DSM_IV_TR",
    "SEX",
    "HANDEDNESS_CATEGORY",
    "CURRENT_MED_STATUS",
    "COMORBIDITY",
]

# Print unique values for each column
# (Series.unique() also reports NaN, so missing entries show up in the output.)
for col in columns:
    unique_values = participant_df[col].unique()
    print(f"Unique values in '{col}': {unique_values}")

In [None]:
groups = ["ASD", "NT"]

# Keep only rows whose diagnosis label is one of the groups listed above
filtered_df = participant_df.loc[participant_df["DX_GROUP"].isin(groups)]

# Age summary pooled over both groups
median_age = filtered_df["AGE_AT_SCAN"].median()
std_age = filtered_df["AGE_AT_SCAN"].std()

# Report both statistics, one per line
print(
    f"Median Age for combined groups: {median_age}",
    f"Standard Deviation of Age for combined groups: {std_age}",
    sep="\n",
)

In [None]:

# For each diagnosis group, print DataFrame.value_counts() of the filtered
# frame, i.e. how often each distinct combination of column values (a full
# row) occurs; dropna=False keeps rows that contain NaN.
# NOTE(review): this is not a per-group subject count - the T-test cell prints
# that via .shape[0]. Confirm which output is wanted here.
for group in groups:
    count = participant_df[participant_df["DX_GROUP"] == group].value_counts(dropna=False)
    print(count)
    # NOTE(review): this loop only rebinds `subj` to the "Subj" value of each
    # matching row; no cell visible in this notebook reads it afterwards.
    for index, row in participant_df.iterrows():
        if row["DX_GROUP"] == group:
            subj = row["Subj"]

### T-Test

In [None]:
from scipy.stats import ttest_ind

# Define the groups of interest
groups = ["ASD", "NT"]

# Number of rows in participant_df per diagnosis group
group_counts = {group: 0 for group in groups}
for group in groups:
    count = participant_df[participant_df["DX_GROUP"] == group].shape[0]
    group_counts[group] = count
    print(f"Group {group}: {count} subjects")

# Extract ages for the two groups (may contain NaN where the age was recorded
# as -9999 and converted to missing by the cleaning cell)
asd_ages = participant_df.loc[participant_df["DX_GROUP"] == "ASD", "AGE_AT_SCAN"]
td_ages = participant_df.loc[participant_df["DX_GROUP"] == "NT", "AGE_AT_SCAN"]

# Mean and standard deviation per group; pandas skips NaN for both
asd_mean_age = asd_ages.mean()
asd_std_age = asd_ages.std()
td_mean_age = td_ages.mean()
td_std_age = td_ages.std()

# Independent two-sample t-test; equal_var=False selects Welch's test, which
# does not assume equal variances.
# ttest_ind propagates NaN by default, so a single missing age would turn both
# the statistic and the p-value into NaN. Missing ages are dropped first so
# the test uses the same observations as the mean/std above.
t_stat, p_value = ttest_ind(asd_ages.dropna(), td_ages.dropna(), equal_var=False)

# Print mean age and standard deviation for each group
print("\nMean Age:")
print(f"  Group asd: {asd_mean_age:.2f}")
print(f"  Group td: {td_mean_age:.2f}")

print("\nStandard Deviation Age:")
print(f"  Group asd: {asd_std_age:.2f}")
print(f"  Group td: {td_std_age:.2f}")

# Print the t-statistic and p-value
print("\nIndependent t-test results:")
print(f"  t-statistic: {t_stat:.2f}")
print(f"  p-value: {p_value:.4f}")

### Chi-square tests

In [None]:
from scipy.stats import chi2_contingency
import pandas as pd

# Diagnosis labels present in the data
groups = participant_df["DX_GROUP"].unique()

# Categorical variables tested for association with diagnosis group
columns = ["DSM_IV_TR", "SEX", "HANDEDNESS_CATEGORY", "CURRENT_MED_STATUS"]

for col in columns:
    print(f"Column: {col}")

    # Cross-tabulate diagnosis group (rows) against the variable (columns)
    # and run the chi-square test of independence on the resulting counts
    contingency_table = pd.crosstab(participant_df["DX_GROUP"], participant_df[col])
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)

    # Report the observed table, the test result and the expected counts
    print("Contingency Table:", contingency_table, sep="\n")
    print(f"Chi-square statistic: {chi2}")
    print(f"p-value: {p_value}")
    print(f"Degrees of freedom: {dof}")
    print("Expected frequencies:", expected, sep="\n")
    print("-" * 40)  # Separator for readability

### Cleaning Comorbidity

In [None]:
import re

# List of DX_GROUP values to iterate through
dx_group_values = ["ASD", "NT"]

# Regular expression patterns for "ADHD" / "MDD" (case-insensitive).
# NOTE(review): neither pattern is used in this cell; matching is done with
# keyword_mapping below.
adhd_pattern = re.compile(r"ADHD", re.IGNORECASE)
mdd_pattern = re.compile(r"MDD", re.IGNORECASE)

# Keyword -> canonical comorbidity label. Keywords are searched
# case-insensitively anywhere inside an entry (substring match), and when
# several keywords match the same entry the LAST one in this dict wins.
# Defined once here because it does not depend on the group being processed.
keyword_mapping = {
    "anxiety": "Anxiety",
    "phobia": "Phobia",
    "tic": "Tic Disorder",
    "dysth": "Dysthymia",
    "enuresis": "Enuresis",
    "depr": "Depression",
    "MDD": "Depression",
    "adhd": "ADHD",
    "bipolar": "Bipolar Disorder",
    "encopresis": "Encopresis",
    "schizo": "Schizophrenic Disorder",
    "GAD": "GAD",
    "mood": "Mood Disorder",
    "learn": "Nonverbal Learning Disorder",
    "dyslexia": "Developmental Dyslexia",
    "tourettes": "Tourettes Disorder",
    "disrupt": "Disruptive Disorder",
    "sensory": "Sensory Integration Disorder",
    "ODD": "ODD",
    "PTSD": "PTSD",
}

# Accumulator: one row per canonical comorbidity, one "Count <group>" column
# per diagnosis group (Count ASD, Count NT)
comorb_df = pd.DataFrame(columns=["COMORBIDITY"])

for dx_group in dx_group_values:
    # Distinct COMORBIDITY strings recorded for this group.
    # NOTE(review): because .unique() is applied first, each distinct string
    # is counted once per group no matter how many rows carry it - confirm
    # this is intended rather than counting per subject.
    comorb_array = participant_df.loc[
        participant_df["DX_GROUP"] == dx_group, "COMORBIDITY"
    ].unique()

    # Split ";"-separated entries into single terms; non-string values
    # (e.g. NaN) are skipped
    comorb_split = [
        term.strip()
        for entry in comorb_array.tolist()
        if isinstance(entry, str)
        for term in entry.split(";")
    ]

    # Map each term to its canonical label (last matching keyword wins)
    for i, value in enumerate(comorb_split):
        for keyword, mapped_value in keyword_mapping.items():
            if re.search(keyword, value, re.IGNORECASE):
                comorb_split[i] = mapped_value

    # Drop terms that did not map to any canonical label
    comorb_split = [
        value for value in comorb_split if value in keyword_mapping.values()
    ]

    # Count occurrences of each canonical label for this group
    comorb_splitdf = pd.DataFrame(comorb_split, columns=["COMORBIDITY"])
    comorb_count = comorb_splitdf["COMORBIDITY"].value_counts()

    column_name = f"Count {dx_group}"
    comorb_df_group = comorb_count.reset_index()
    comorb_df_group.columns = ["COMORBIDITY", column_name]

    # Outer-merge so labels seen in only one group are kept (NaN in the other
    # group's column). Merging unconditionally keeps a group's count column
    # even when that group contributed no labels.
    comorb_df = comorb_df.merge(comorb_df_group, on="COMORBIDITY", how="outer")

# Append the totals ONCE, after every group has been merged. Previously this
# ran inside the loop: the first group's total row was merged back in as a
# data row (with a NaN label) and then summed again, which doubled the
# "Count ASD" total.
comorb_df.loc["Total Counts"] = comorb_df.sum(
    numeric_only=True, axis=0, skipna=True
)

print(comorb_df)

# Save DataFrame to the CSV file (disabled)
# csv_file_path = op.join(data_dir, "abide_comorb_cleaned.csv")
# comorb_df.to_csv(csv_file_path, index=False)

In [None]:
# Canonical comorbidity labels grouped into broader categories
adhd = [
    "ADHD",
    "Tic Disorder",
    "Nonverbal Learning Disorder",
    "Developmental Dyslexia",
    "Tourettes Disorder",
    "Sensory Integration Disorder",
]
anxiety = ["Phobia", "GAD", "Anxiety", "PTSD"]
mood = ["Dysthymia", "Depression", "Mood Disorder"]
disrupt = ["ODD", "Disruptive Disorder"]
# Labels excluded from the simplified table
remove = ["Enuresis", "Encopresis", "Bipolar Disorder", "Schizophrenic Disorder"]

# Label -> category lookup, derived from the lists above so the two cannot
# drift apart. Labels in `remove` map to None and are dropped below.
replacements = {
    label: category
    for category, labels in (
        ("ADHD/Other ND", adhd),
        ("Anxiety", anxiety),
        ("Mood Disorder", mood),
        ("Disruptive", disrupt),
        (None, remove),
    )
    for label in labels
}

# Recode the labels and drop rows whose label became missing: the `remove`
# labels and any row that already had no label, such as a "Total Counts" row.
# A new frame is assigned instead of calling replace(..., inplace=True) on the
# column, which is a chained in-place operation that does not write back to
# the frame when pandas Copy-on-Write is active.
comorb_df = comorb_df.assign(
    COMORBIDITY=comorb_df["COMORBIDITY"].replace(replacements)
).dropna(subset=["COMORBIDITY"])

# Combine counts for labels that now share a category
comorb_clean_combined = (
    comorb_df.groupby("COMORBIDITY")
    .agg({"Count ASD": "sum", "Count NT": "sum"})
    .reset_index()
)

# Print the updated DataFrame with combined counts
print(comorb_clean_combined)

# Save DataFrame to the CSV file (disabled)
# csv_file_path = op.join(data_dir, "abide_comorb_simplified.csv")
# comorb_clean_combined.to_csv(csv_file_path, index=False)

In [None]:
from scipy.stats import chi2_contingency

# Build a 2 x k table of counts: one row per diagnosis group ("ASD", "NT"),
# one column per comorbidity category (columns keep the positional index of
# comorb_clean_combined)
contingency_table = (
    comorb_clean_combined[["Count ASD", "Count NT"]]
    .rename(columns={"Count ASD": "ASD", "Count NT": "NT"})
    .T
)

# Chi-square test of independence between diagnosis group and category
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Report the observed table, the test result and the expected counts
print("Contingency Table:", contingency_table, sep="\n")
print(f"Chi-square statistic: {chi2}")
print(f"p-value: {p_value}")
print(f"Degrees of freedom: {dof}")
print("Expected frequencies:", expected, sep="\n")