## Analysis of the relationship between the centrality scores and phenotypic data

In [None]:
# Load and examine the datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Read the NYU centrality scores and the NYU phenotypic table
centrality_df = pd.read_csv('centrality_scores_NYU.csv')
phenotypic_df = pd.read_csv('NYU_phenotypic.csv')

# Preview each table: a heading, its first rows, then its (rows, columns) shape
nyu_previews = [
    ("Centrality scores dataset:", "\nCentrality dataset shape:", centrality_df),
    ("\nPhenotypic dataset:", "\nPhenotypic dataset shape:", phenotypic_df),
]
for heading, shape_label, frame in nyu_previews:
    print(heading)
    print(frame.head())
    print(shape_label, frame.shape)

In [None]:
# Data cleaning: -999 is treated as the missing-value sentinel in the
# phenotypic table, so turn it into NaN before anything is computed from it.
phenotypic_df = phenotypic_df.replace(to_replace=-999, value=np.nan)

# Inspect the layout of the centrality table
centrality_columns = centrality_df.columns.tolist()
print("Centrality columns:", centrality_columns)
print("\nFirst few rows of centrality data:")
print(centrality_df.head())

# Show the subject identifiers of both tables side by side; the centrality
# identifiers are only shown when that column is actually present.
has_subject_number = 'subject_number' in centrality_columns
if has_subject_number:
    print("\nSubject numbers in centrality data:")
    print(centrality_df['subject_number'].head(10))

print("\nScanDir ID in phenotypic data:")
print(phenotypic_df['ScanDir ID'].head(10))

In [None]:
# Compare the subject identifiers of the two tables before merging.
# Missing identifiers are dropped and the rest cast to int so that both
# sides can be compared directly.
centrality_subjects = centrality_df['subject_number'].dropna().astype(int)
phenotypic_ids = phenotypic_df['ScanDir ID'].dropna().astype(int)

# Value range of each identifier column
id_ranges = (
    ("Centrality subject numbers range:", centrality_subjects),
    ("Phenotypic ScanDir ID range:", phenotypic_ids),
)
for range_label, ids in id_ranges:
    print(range_label, ids.min(), "to", ids.max())

# Identifiers that occur in both tables
common_ids = set(centrality_subjects).intersection(set(phenotypic_ids))
print("Direct matches between datasets:", len(common_ids))

# Number of identifiers on each side
print("Number of subjects in centrality data:", centrality_subjects.shape[0])
print("Number of subjects in phenotypic data:", phenotypic_ids.shape[0])

# Leading identifiers of each table, to eyeball whether the row order agrees
print("\nFirst 10 centrality subjects:", centrality_subjects.iloc[:10].tolist())
print("First 10 phenotypic IDs:", phenotypic_ids.iloc[:10].tolist())

In [None]:
# Inner-join the centrality and phenotypic tables on their subject identifiers
merged_df = centrality_df.merge(
    phenotypic_df,
    how='inner',
    left_on='subject_number',
    right_on='ScanDir ID',
)

print("Merged dataset shape:", merged_df.shape)
print("\nColumns in merged dataset:")
print(merged_df.columns.tolist())

# Number of subjects per diagnosis code, ordered by code
print("\nDX distribution:")
dx_counts = merged_df['DX'].value_counts().sort_index()
print(dx_counts)

# Count missing entries in the columns the analysis relies on
# (columns absent from the merged table are silently skipped)
key_columns = ['DX', 'ADHD Index', 'Inattentive', 'Hyper/Impulsive',
               'betweenness_centrality', 'closeness_centrality', 'degree_centrality']
print("\nMissing values in key columns:")
present_key_columns = [name for name in key_columns if name in merged_df.columns]
for col in present_key_columns:
    missing = merged_df[col].isnull().sum()
    print(col + ":", missing)



The merged dataset has 216 subjects with complete centrality and phenotypic data.



The DX distribution shows:
- DX = 0 (TDC - Typically Developing Controls): 98 subjects  
- DX = 1 (ADHD-Inattentive): 73 subjects
- DX = 2 (ADHD-Hyperactive/Impulsive): 2 subjects  
- DX = 3 (ADHD-Combined): 43 subjects



In [None]:
# Clean the data and prepare for analysis
# Measures analysed throughout the NYU section
centrality_measures = ['betweenness_centrality', 'closeness_centrality', 'degree_centrality']
adhd_measures = ['ADHD Index', 'Inattentive', 'Hyper/Impulsive']

# Keep only subjects that have a diagnosis, every ADHD score and every
# centrality score: the correlations, t-tests and ANOVAs run on clean_df are
# not NaN-aware. The explicit .copy() makes clean_df an independent frame, so
# adding the 'Group' column below is not an assignment on a derived frame.
clean_df = merged_df.dropna(subset=['DX'] + adhd_measures + centrality_measures).copy()

print("Clean dataset shape:", clean_df.shape)

# Create ADHD group (DX = 1, 2, 3) vs TDC (DX = 0)
# Rows without a DX were dropped above, so the 'TDC' fallback can no longer
# absorb subjects whose diagnosis is simply missing.
clean_df['Group'] = np.where(clean_df['DX'].isin([1, 2, 3]), 'ADHD', 'TDC')

print("\nGroup distribution:")
group_counts = clean_df['Group'].value_counts()
print(group_counts)

print("\nData ready for analysis")
print("Centrality measures:", centrality_measures)
print("ADHD measures:", adhd_measures)

### Create the scatter plots showing relationships between centrality scores and ADHD measures:

In [None]:
# Create scatter plots for centrality scores vs ADHD measures
# (rows: centrality measures, columns: ADHD measures)
from matplotlib.patches import Patch

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
fig.suptitle('Centrality Scores vs ADHD Measures', fontsize=16, y=0.98)

# Marker colours depend only on each subject's group, so build them once
# instead of once per panel.
colors = ['red' if group == 'ADHD' else 'blue' for group in clean_df['Group']]

for i, centrality in enumerate(centrality_measures):
    for j, adhd_measure in enumerate(adhd_measures):
        ax = axes[i, j]

        x = clean_df[adhd_measure]
        y = clean_df[centrality]
        ax.scatter(x, y, c=colors, alpha=0.6, s=30)

        # One-tailed test of H1: r > 0. Halving the two-sided p-value is only
        # valid when the observed correlation has the hypothesised sign;
        # otherwise the one-sided p-value is 1 - p/2.
        # NOTE(review): the positive direction is taken from the Results text
        # of this notebook — confirm it was the pre-specified hypothesis.
        corr, p_value_two_tailed = stats.pearsonr(x, y)
        if corr > 0:
            p_value_one_tailed = p_value_two_tailed / 2
        else:
            p_value_one_tailed = 1 - p_value_two_tailed / 2

        # Axis labels; the title carries r and the one-tailed p-value
        ax.set_xlabel(adhd_measure)
        ax.set_ylabel(centrality.replace('_', ' ').title())
        ax.set_title('r = ' + str(round(corr, 3)) + ', p = ' + str(round(p_value_one_tailed, 4)) + ' (one-tailed)')

        # Least-squares straight line through the points
        trend = np.poly1d(np.polyfit(x, y, 1))
        ax.plot(x, trend(x), "k--", alpha=0.8, linewidth=1)

# Figure-level legend explaining the marker colours
legend_elements = [Patch(facecolor='red', label='ADHD'),
                   Patch(facecolor='blue', label='TDC')]
fig.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(0.98, 0.95))

plt.tight_layout()
plt.show()

In [None]:
# Create box plots comparing ADHD groups vs TDC for each centrality measure
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Centrality Scores: ADHD vs TDC Groups', fontsize=16)

# Group membership is identical for every panel, so build the masks once
is_adhd = clean_df['Group'] == 'ADHD'
is_tdc = clean_df['Group'] == 'TDC'

for i, centrality in enumerate(centrality_measures):
    ax = axes[i]

    adhd_data = clean_df.loc[is_adhd, centrality]
    tdc_data = clean_df.loc[is_tdc, centrality]

    # TDC is drawn first (blue), ADHD second (red)
    box_data = [tdc_data, adhd_data]
    bp = ax.boxplot(box_data, labels=['TDC', 'ADHD'], patch_artist=True)
    for box, face in zip(bp['boxes'], ('blue', 'red')):
        box.set_facecolor(face)
        box.set_alpha(0.6)

    # One-tailed t-test of H1: ADHD mean > TDC mean. Halving the two-sided
    # p-value is only valid when t has the hypothesised sign; otherwise the
    # one-sided p-value is 1 - p/2.
    # NOTE(review): the direction (ADHD > TDC) is taken from the Results text
    # of this notebook — confirm it was the pre-specified hypothesis.
    t_stat, p_value_two_tailed = stats.ttest_ind(adhd_data, tdc_data)
    if t_stat > 0:
        p_value_one_tailed = p_value_two_tailed / 2
    else:
        p_value_one_tailed = 1 - p_value_two_tailed / 2

    ax.set_ylabel(centrality.replace('_', ' ').title())
    ax.set_title('p = ' + str(round(p_value_one_tailed, 4)) + ' (one-tailed)')

    # Annotate the panel with both group means (axes-relative coordinates)
    adhd_mean = adhd_data.mean()
    tdc_mean = tdc_data.mean()
    ax.text(0.02, 0.98, 'TDC mean: ' + str(round(tdc_mean, 4)),
            transform=ax.transAxes, verticalalignment='top', fontsize=9)
    ax.text(0.02, 0.90, 'ADHD mean: ' + str(round(adhd_mean, 4)),
            transform=ax.transAxes, verticalalignment='top', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# Create box plots for each DX category (0, 1, 2, 3) vs centrality measures
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Centrality Scores by Detailed DX Categories', fontsize=16)

# Map DX values to labels and to a fixed colour, so a category keeps its
# colour even when another category is absent from the data
dx_labels = {0: 'TDC', 1: 'ADHD-I', 2: 'ADHD-H', 3: 'ADHD-C'}
dx_colors = {0: 'blue', 1: 'red', 2: 'orange', 3: 'green'}

# The categories are the same for every panel. Missing DX values are excluded:
# NaN breaks sorted(), cannot be converted with int() and would contribute an
# empty group to the ANOVA.
dx_categories = sorted(clean_df['DX'].dropna().unique())
labels = [dx_labels.get(dx, 'DX=' + str(int(dx))) for dx in dx_categories]
colors = [dx_colors.get(dx, 'gray') for dx in dx_categories]

for i, centrality in enumerate(centrality_measures):
    ax = axes[i]

    # One series of centrality values per DX category
    box_data = [clean_df.loc[clean_df['DX'] == dx, centrality] for dx in dx_categories]

    bp = ax.boxplot(box_data, labels=labels, patch_artist=True)
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.6)

    # One-way ANOVA across the DX categories
    groups = box_data
    f_stat, p_value = stats.f_oneway(*groups)

    ax.set_ylabel(centrality.replace('_', ' ').title())
    ax.set_title('ANOVA p = ' + str(round(p_value, 4)))
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Generate comprehensive statistical summary
# NOTE(review): every one-tailed p-value below tests a directional hypothesis
# (positive correlation / ADHD mean > TDC mean). The direction is taken from
# the Results text of this notebook — confirm it was pre-specified. Halving
# the two-sided p-value is only valid when the observed effect has the
# hypothesised sign; otherwise the one-sided p-value is 1 - p/2.
print("=== COMPREHENSIVE STATISTICAL ANALYSIS ===")
print()

# 1. Correlation analysis between centrality measures and ADHD symptoms
print("1. CORRELATIONS BETWEEN CENTRALITY MEASURES AND ADHD SYMPTOMS")
print("-" * 60)
for centrality in centrality_measures:
    print(centrality.replace('_', ' ').title() + ":")
    for adhd_measure in adhd_measures:
        corr, p_two = stats.pearsonr(clean_df[centrality], clean_df[adhd_measure])
        p_one = p_two / 2 if corr > 0 else 1 - p_two / 2
        print("  vs " + adhd_measure + ": r = " + str(round(corr, 4)) +
              ", p = " + str(round(p_one, 4)) + " (one-tailed)")
    print()

# 2. Group comparisons (ADHD vs TDC)
print("2. GROUP COMPARISONS: ADHD vs TDC")
print("-" * 40)
for centrality in centrality_measures:
    adhd_data = clean_df[clean_df['Group'] == 'ADHD'][centrality]
    tdc_data = clean_df[clean_df['Group'] == 'TDC'][centrality]

    t_stat, p_two = stats.ttest_ind(adhd_data, tdc_data)
    p_one = p_two / 2 if t_stat > 0 else 1 - p_two / 2

    adhd_mean = adhd_data.mean()
    tdc_mean = tdc_data.mean()
    # Cohen's d: mean difference divided by the pooled standard deviation
    pooled_var = (((len(adhd_data) - 1) * adhd_data.var() +
                   (len(tdc_data) - 1) * tdc_data.var()) /
                  (len(adhd_data) + len(tdc_data) - 2))
    effect_size = (adhd_mean - tdc_mean) / np.sqrt(pooled_var)

    print(centrality.replace('_', ' ').title() + ":")
    print("  ADHD mean: " + str(round(adhd_mean, 6)) + " (n=" + str(len(adhd_data)) + ")")
    print("  TDC mean:  " + str(round(tdc_mean, 6)) + " (n=" + str(len(tdc_data)) + ")")
    print("  t = " + str(round(t_stat, 3)) + ", p = " + str(round(p_one, 4)) +
          " (one-tailed), Cohen's d = " + str(round(effect_size, 3)))
    print()

# 3. Detailed DX category analysis
print("3. DETAILED DX CATEGORY ANALYSIS")
print("-" * 35)
dx_labels = {0: 'TDC', 1: 'ADHD-Inattentive', 2: 'ADHD-Hyperactive', 3: 'ADHD-Combined'}

# Missing DX values are excluded: NaN breaks sorted(), cannot be converted
# with int() and would contribute an empty group to the ANOVA.
dx_categories = sorted(clean_df['DX'].dropna().unique())

for centrality in centrality_measures:
    print(centrality.replace('_', ' ').title() + ":")

    # Per-category mean and sample size; the same series feed the ANOVA
    groups = []
    for dx in dx_categories:
        data = clean_df[clean_df['DX'] == dx][centrality]
        groups.append(data)
        mean_val = data.mean()
        print("  " + dx_labels.get(dx, 'DX=' + str(int(dx))) + ": mean = " +
              str(round(mean_val, 6)) + " (n=" + str(len(data)) + ")")

    # ANOVA
    f_stat, p_value = stats.f_oneway(*groups)
    print("  ANOVA: F = " + str(round(f_stat, 3)) + ", p = " + str(round(p_value, 4)))
    print()

# Results

## Dataset Overview


The datasets were successfully merged on subject IDs (216 subjects). After removing rows with missing ADHD measures, 212 complete cases with both centrality measures and ADHD phenotypic data remained.

## Key Findings

### 1. Correlations Between Brain Centrality and ADHD Symptoms

**Betweenness centrality** shows the strongest relationships with ADHD symptoms, particularly with hyperactive/impulsive behaviors (r = 0.177, one-tailed p = 0.005).

### 2. Group Differences: ADHD vs Typically Developing Controls

Both **betweenness** and **closeness centrality** are significantly higher in ADHD participants compared to controls, with small-to-moderate effect sizes (Cohen's d = 0.37 and 0.31 respectively).

### 3. Detailed ADHD Subtype Analysis


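The analysis reveals significant differences in betweenness centrality across diagnostic groups (ANOVA p = 0.033), with the hyperactive subtype showing the highest values. Note that this subtype contains only 2 subjects in the merged data, so its mean should be interpreted with caution.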

## Visual Analysis

The scatter plots show positive correlations between centrality measures and ADHD symptom severity, while the box plots demonstrate clear group differences between ADHD and control participants.

This analysis suggests that individuals with ADHD have altered brain network topology, particularly increased betweenness and closeness centrality, which may reflect compensatory mechanisms or inefficient information processing in neural networks.

In [None]:
# Start fresh and load the NeuroIMAGE data carefully
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Read the NeuroIMAGE centrality scores and phenotypic table
centrality_ni_df, phenotypic_ni_df = (
    pd.read_csv(csv_name)
    for csv_name in ('centrality_scores_NI.csv', 'NeuroIMAGE_phenotypic.csv')
)

# Confirm the load and report the (rows, columns) shape of each table
print("Loaded NeuroIMAGE datasets successfully")
for shape_label, table in (("Centrality shape:", centrality_ni_df),
                           ("Phenotypic shape:", phenotypic_ni_df)):
    print(shape_label, table.shape)

### Analysis of centrality_scores_NI.csv vs NeuroIMAGE_phenotypic.csv: regression plots between centrality scores and ADHD index, impulsivity and inattentiveness.

In [None]:
# Load NeuroIMAGE datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

# Read both NeuroIMAGE tables
centrality_ni_df = pd.read_csv('centrality_scores_NI.csv')
phenotypic_ni_df = pd.read_csv('NeuroIMAGE_phenotypic.csv')
print("Loaded NeuroIMAGE datasets successfully")

# Report first the shape of each table, then the column names of each table
ni_tables = (("Centrality", centrality_ni_df), ("Phenotypic", phenotypic_ni_df))
for table_name, table in ni_tables:
    print(table_name + " shape:", table.shape)
for table_name, table in ni_tables:
    print("\n" + table_name + " columns:", table.columns.tolist())

In [None]:
# Preview the leading rows of both NeuroIMAGE tables
row_previews = (
    ("First few rows of centrality data:", centrality_ni_df),
    ("\nFirst few rows of phenotypic data:", phenotypic_ni_df),
)
for heading, table in row_previews:
    print(heading)
    print(table.head())

# Preview the first ten subject identifiers on each side of the merge
id_previews = (
    ("\nSubject numbers in centrality data:", centrality_ni_df, 'subject_number'),
    ("\nScanDir ID in phenotypic data:", phenotypic_ni_df, 'ScanDir ID'),
)
for heading, table, id_column in id_previews:
    print(heading)
    print(table[id_column].head(10))

In [None]:
# Summarise the identifier column of each table: min, max and distinct count
centrality_id_col = centrality_ni_df['subject_number']
phenotypic_id_col = phenotypic_ni_df['ScanDir ID']

id_summaries = (
    ("Centrality subject_number range:", centrality_id_col),
    ("\nPhenotypic ScanDir ID range:", phenotypic_id_col),
)
for heading, id_values in id_summaries:
    print(heading)
    print("Min:", id_values.min())
    print("Max:", id_values.max())
    print("Unique count:", id_values.nunique())

# Identifiers that occur in both tables
common_ids = set(centrality_id_col) & set(phenotypic_id_col)
print("\nDirect matches between subject_number and ScanDir ID:", len(common_ids))

# Equal row counts would be compatible with the tables sharing a row order
same_row_count = centrality_ni_df.shape[0] == phenotypic_ni_df.shape[0]
print("\nDataset sizes match:", same_row_count)

# Missing entries per key phenotypic column (absent columns are skipped)
print("\nMissing values in phenotypic data:")
key_cols = ['DX', 'ADHD Index', 'Inattentive', 'Hyper/Impulsive']
present_key_cols = [name for name in key_cols if name in phenotypic_ni_df.columns]
for col in present_key_cols:
    missing = phenotypic_ni_df[col].isnull().sum()
    print(col + ":", missing)

In [None]:

# Merge on subject identifiers
# The -999 missing-value sentinel is converted to NaN in the phenotypic table
# first, as the NYU and OHSU sections of this notebook do; otherwise -999
# would be counted as a valid value in the availability report below.
merged_ni_df = pd.merge(centrality_ni_df, phenotypic_ni_df.replace(-999, np.nan),
                        left_on='subject_number', right_on='ScanDir ID', how='inner')

print("Merged NeuroIMAGE dataset shape:", merged_ni_df.shape)

# Check DX distribution
print("\nDX distribution:")
dx_counts = merged_ni_df['DX'].value_counts().sort_index()
print(dx_counts)

# List every column that holds at least one non-missing value
print("\nAvailable columns with non-missing data:")
for col in merged_ni_df.columns:
    non_missing = merged_ni_df[col].notna().sum()
    if non_missing > 0:
        print(col + ": " + str(non_missing) + " non-missing values")

# Check if there are any ADHD-related measures available.
# The fragments are chosen so that the measure names used elsewhere in this
# notebook ('ADHD Index', 'Inattentive', 'Hyper/Impulsive') match, as well as
# names containing 'attention' or 'hyperactive'.
adhd_keywords = ('adhd', 'attent', 'hyper', 'impuls')
adhd_related_cols = [col for col in merged_ni_df.columns
                     if any(keyword in col.lower() for keyword in adhd_keywords)]
print("\nADHD-related columns found:", adhd_related_cols)

In [None]:
# Since ADHD symptom measures are missing, we'll focus on group comparisons
# Create group labels: DX 1, 2, 3 -> 'ADHD', any other recorded DX -> 'TDC'.
# A missing DX stays missing instead of being counted as a control.
merged_ni_df['Group'] = merged_ni_df['DX'].apply(
    lambda x: np.nan if pd.isna(x) else ('ADHD' if x in [1, 2, 3] else 'TDC')
)

# Define centrality measures (excluding eigenvector which has NaN values)
centrality_measures = ['betweenness_centrality', 'closeness_centrality', 'degree_centrality']

print("NeuroIMAGE Dataset Summary:")
print("Total subjects:", len(merged_ni_df))
print("TDC (DX=0):", len(merged_ni_df[merged_ni_df['DX'] == 0]))
print("ADHD (DX=1,2,3):", len(merged_ni_df[merged_ni_df['DX'].isin([1, 2, 3])]))

# Show detailed DX breakdown
# Missing DX values are skipped: NaN breaks sorted() and cannot be passed to int()
print("\nDetailed DX breakdown:")
dx_labels = {0: 'TDC', 1: 'ADHD-Inattentive', 2: 'ADHD-Hyperactive', 3: 'ADHD-Combined'}
for dx in sorted(merged_ni_df['DX'].dropna().unique()):
    count = len(merged_ni_df[merged_ni_df['DX'] == dx])
    print(dx_labels.get(dx, 'DX=' + str(int(dx))) + ": " + str(count) + " subjects")

# Mean, standard deviation and range of every centrality measure
print("\nCentrality measures summary:")
for measure in centrality_measures:
    values = merged_ni_df[measure]
    print(measure + ":")
    print("  Mean:", round(values.mean(), 6))
    print("  Std:", round(values.std(), 6))
    print("  Range:", round(values.min(), 6), "to", round(values.max(), 6))

In [None]:
# Create boxplots comparing ADHD vs TDC groups with p-values
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('NeuroIMAGE: Centrality Scores by Group (ADHD vs TDC)', fontsize=16)

# Group membership is identical for every panel, so build the masks once
is_adhd = merged_ni_df['Group'] == 'ADHD'
is_tdc = merged_ni_df['Group'] == 'TDC'

for i, centrality in enumerate(centrality_measures):
    ax = axes[i]

    adhd_data = merged_ni_df.loc[is_adhd, centrality]
    tdc_data = merged_ni_df.loc[is_tdc, centrality]

    # TDC is drawn first (blue), ADHD second (red)
    bp = ax.boxplot([tdc_data, adhd_data], labels=['TDC', 'ADHD'], patch_artist=True)
    for box, face in zip(bp['boxes'], ('blue', 'red')):
        box.set_facecolor(face)
        box.set_alpha(0.6)

    # One-tailed t-test of H1: ADHD mean > TDC mean. Halving the two-sided
    # p-value is only valid when t has the hypothesised sign; otherwise the
    # one-sided p-value is 1 - p/2.
    # NOTE(review): the direction (ADHD > TDC) is taken from the analysis
    # summary of this notebook — confirm it was the pre-specified hypothesis.
    t_stat, p_two = stats.ttest_ind(adhd_data, tdc_data)
    p_one = p_two / 2 if t_stat > 0 else 1 - p_two / 2

    # Cohen's d: mean difference divided by the pooled standard deviation
    pooled_std = np.sqrt(((len(adhd_data)-1)*adhd_data.var() + (len(tdc_data)-1)*tdc_data.var()) / (len(adhd_data) + len(tdc_data) - 2))
    cohens_d = (adhd_data.mean() - tdc_data.mean()) / pooled_std

    ax.set_ylabel(centrality.replace('_', ' ').title())
    ax.set_title('p = ' + str(round(p_one, 4)) + ' (one-tailed)\nCohen\'s d = ' + str(round(cohens_d, 3)))

plt.tight_layout()
plt.show()

In [None]:
# Create detailed boxplots by DX category with p-values
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('NeuroIMAGE: Centrality Scores by Detailed DX Categories', fontsize=16)

# Define DX categories
dx_labels = {0: 'TDC', 1: 'ADHD-Inatt', 2: 'ADHD-Hyper', 3: 'ADHD-Comb'}
colors = ['blue', 'orange', 'red', 'purple']

# The categories are the same for every panel. Missing DX values are excluded
# (NaN breaks sorted()); a code without an entry in dx_labels gets a generic
# 'DX=<code>' label instead of raising KeyError.
dx_categories = sorted(merged_ni_df['DX'].dropna().unique())
labels = [dx_labels.get(dx, 'DX=' + str(int(dx))) for dx in dx_categories]

for i, centrality in enumerate(centrality_measures):
    ax = axes[i]

    # One series of centrality values per DX category
    data_by_dx = [merged_ni_df.loc[merged_ni_df['DX'] == dx, centrality]
                  for dx in dx_categories]

    bp = ax.boxplot(data_by_dx, labels=labels, patch_artist=True)

    # Cycle through the palette so more than four categories cannot overrun it
    for j, box in enumerate(bp['boxes']):
        box.set_facecolor(colors[j % len(colors)])
        box.set_alpha(0.6)

    # One-way ANOVA across the DX categories
    f_stat, p_anova = stats.f_oneway(*data_by_dx)

    ax.set_ylabel(centrality.replace('_', ' ').title())
    ax.set_title('ANOVA: F = ' + str(round(f_stat, 3)) + ', p = ' + str(round(p_anova, 4)))
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Since ADHD symptom measures are missing in NeuroIMAGE, let's create a comprehensive statistical summary
print("=== NEUROIMAGE DATASET ANALYSIS ===")
print()

# Group comparison statistics
print("1. GROUP COMPARISONS: ADHD vs TDC")
print("----------------------------------------")

for centrality in centrality_measures:
    adhd_data = merged_ni_df[merged_ni_df['Group'] == 'ADHD'][centrality]
    tdc_data = merged_ni_df[merged_ni_df['Group'] == 'TDC'][centrality]

    # One-tailed t-test of H1: ADHD mean > TDC mean. Halving the two-sided
    # p-value is only valid when t has the hypothesised sign; otherwise the
    # one-sided p-value is 1 - p/2.
    # NOTE(review): the direction (ADHD > TDC) is taken from the analysis
    # summary of this notebook — confirm it was the pre-specified hypothesis.
    t_stat, p_two = stats.ttest_ind(adhd_data, tdc_data)
    p_one = p_two / 2 if t_stat > 0 else 1 - p_two / 2

    # Cohen's d: mean difference divided by the pooled standard deviation
    pooled_std = np.sqrt(((len(adhd_data)-1)*adhd_data.var() + (len(tdc_data)-1)*tdc_data.var()) / (len(adhd_data) + len(tdc_data) - 2))
    cohens_d = (adhd_data.mean() - tdc_data.mean()) / pooled_std

    print(centrality.replace('_', ' ').title() + ":")
    print("  ADHD mean: " + str(round(adhd_data.mean(), 6)) + " (n=" + str(len(adhd_data)) + ")")
    print("  TDC mean:  " + str(round(tdc_data.mean(), 6)) + " (n=" + str(len(tdc_data)) + ")")
    print("  t = " + str(round(t_stat, 2)) + ", p = " + str(round(p_one, 4)) + " (one-tailed), Cohen's d = " + str(round(cohens_d, 3)))

print()
print("2. DETAILED DX CATEGORY ANALYSIS")
print("-----------------------------------")

dx_labels = {0: 'TDC', 1: 'ADHD-Inattentive', 2: 'ADHD-Hyperactive', 3: 'ADHD-Combined'}

# Missing DX values are excluded (NaN breaks sorted()); a code without an
# entry in dx_labels gets a generic 'DX=<code>' label instead of raising
# KeyError.
dx_categories = sorted(merged_ni_df['DX'].dropna().unique())

for centrality in centrality_measures:
    print(centrality.replace('_', ' ').title() + ":")

    # Show means for each group; the same series feed the ANOVA
    data_by_dx = []
    for dx in dx_categories:
        subset = merged_ni_df[merged_ni_df['DX'] == dx][centrality]
        if len(subset) > 0:
            data_by_dx.append(subset)
            print("  " + dx_labels.get(dx, 'DX=' + str(int(dx))) + ": mean = " + str(round(subset.mean(), 6)) + " (n=" + str(len(subset)) + ")")

    # ANOVA is run only when there are at least two groups to compare
    if len(data_by_dx) > 1:
        f_stat, p_anova = stats.f_oneway(*data_by_dx)
        print("  ANOVA: F = " + str(round(f_stat, 3)) + ", p = " + str(round(p_anova, 4)))
    print()

In [None]:
# Create a summary table and save results
import pandas as pd

# Create summary statistics table: one record per centrality measure
summary_data = []

# ADHD vs TDC comparisons
for centrality in centrality_measures:
    adhd_data = merged_ni_df[merged_ni_df['Group'] == 'ADHD'][centrality]
    tdc_data = merged_ni_df[merged_ni_df['Group'] == 'TDC'][centrality]

    # One-tailed t-test of H1: ADHD mean > TDC mean. Halving the two-sided
    # p-value is only valid when t has the hypothesised sign; otherwise the
    # one-sided p-value is 1 - p/2. Without this, a significantly LOWER ADHD
    # mean would also be flagged 'p < 0.05' in the exported table.
    # NOTE(review): the direction (ADHD > TDC) is taken from the analysis
    # summary of this notebook — confirm it was the pre-specified hypothesis.
    t_stat, p_two = stats.ttest_ind(adhd_data, tdc_data)
    p_one = p_two / 2 if t_stat > 0 else 1 - p_two / 2

    # Cohen's d: mean difference divided by the pooled standard deviation
    pooled_std = np.sqrt(((len(adhd_data)-1)*adhd_data.var() + (len(tdc_data)-1)*tdc_data.var()) / (len(adhd_data) + len(tdc_data) - 2))
    cohens_d = (adhd_data.mean() - tdc_data.mean()) / pooled_std

    summary_data.append({
        'Measure': centrality.replace('_', ' ').title(),
        'ADHD_Mean': round(adhd_data.mean(), 6),
        'ADHD_N': len(adhd_data),
        'TDC_Mean': round(tdc_data.mean(), 6),
        'TDC_N': len(tdc_data),
        't_statistic': round(t_stat, 3),
        'p_value_one_tailed': round(p_one, 4),
        'Cohens_d': round(cohens_d, 3),
        'Significance': 'p < 0.05' if p_one < 0.05 else 'n.s.'
    })

summary_df = pd.DataFrame(summary_data)

# Save the results table and the merged data as CSV files
summary_df.to_csv('NeuroIMAGE_centrality_analysis_results.csv', index=False)
merged_ni_df.to_csv('NeuroIMAGE_merged_data.csv', index=False)

print("Analysis complete! Files saved:")
print("- NeuroIMAGE_centrality_analysis_results.csv")
print("- NeuroIMAGE_merged_data.csv")
print()
print("Summary of significant findings:")
print(summary_df)


## Analysis Summary

I've completed the analysis of the NeuroIMAGE dataset comparing centrality scores between ADHD and TDC groups. Here are the key findings:

**Important Note**: The NeuroIMAGE dataset does not contain ADHD symptom measures (ADHD Index, Inattentiveness, Hyperactivity/Impulsivity scores are all missing), so I could only perform group comparisons based on diagnostic categories.

**Significant Findings**:
- **Degree Centrality**: ADHD group showed significantly higher scores than TDC (p = 0.0039, Cohen's d = 0.805)
- **Closeness Centrality**: ADHD group showed significantly higher scores than TDC (p = 0.0351, Cohen's d = 0.536)  
- **Betweenness Centrality**: Trend toward higher scores in ADHD group (p = 0.0508, Cohen's d = 0.483)

The boxplots show clear group differences with p-values included. Since the ADHD symptom measures were unavailable, I couldn't create the requested regression plots between centrality scores and symptom dimensions.


In [None]:
# Load and examine the OHSU datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Read the OHSU centrality scores and phenotypic table
centrality_ohsu = pd.read_csv('centrality_scores_OHSU.csv')
phenotypic_ohsu = pd.read_csv('OHSU_phenotypic.csv')

# Shape of each table
print("=== OHSU DATASETS OVERVIEW ===")
print("Centrality scores shape:", centrality_ohsu.shape)
print("Phenotypic data shape:", phenotypic_ohsu.shape)

# Column names of each table, one per line as a bullet list
column_listings = (
    ("\nCentrality columns:", centrality_ohsu),
    ("\nPhenotypic columns:", phenotypic_ohsu),
)
for heading, table in column_listings:
    print(heading)
    for col in table.columns:
        print("- " + col)

In [None]:
# Merge the datasets and examine the data
merged_ohsu_df = pd.merge(centrality_ohsu, phenotypic_ohsu, left_on='subject_number', right_on='ScanDir ID', how='inner')

print("Merged dataset shape:", merged_ohsu_df.shape)

# Replace -999 values with NaN as specified
merged_ohsu_df = merged_ohsu_df.replace(-999, np.nan)

# Check ADHD measures availability
adhd_measures = ['ADHD Index', 'Inattentive', 'Hyper/Impulsive']
centrality_measures = ['betweenness_centrality', 'closeness_centrality', 'degree_centrality']

print("\nADHD measures data availability:")
for measure in adhd_measures:
    non_missing = merged_ohsu_df[measure].notna().sum()
    print(measure + ": " + str(non_missing) + " non-missing values")
    # The range is only meaningful when at least one value is present
    if non_missing > 0:
        print("  Range: " + str(merged_ohsu_df[measure].min()) + " to " + str(merged_ohsu_df[measure].max()))

print("\nDX distribution:")
print(merged_ohsu_df['DX'].value_counts().sort_index())

# Create Group variable (ADHD vs TDC): DX 0 -> 'TDC', any other recorded DX
# -> 'ADHD'. A missing DX (NaN after the -999 replacement above) stays missing
# instead of being counted as ADHD, because NaN == 0 is False.
merged_ohsu_df['Group'] = merged_ohsu_df['DX'].apply(
    lambda x: np.nan if pd.isna(x) else ('TDC' if x == 0 else 'ADHD')
)
print("\nGroup distribution:")
print(merged_ohsu_df['Group'].value_counts())

In [None]:
# Create clean dataset for analysis (removing rows with missing diagnosis, centrality or ADHD measures)
# Note: ADHD Index is completely missing, so we'll use Inattentive and Hyper/Impulsive
from matplotlib.patches import Patch

available_adhd_measures = ['Inattentive', 'Hyper/Impulsive']

# 'DX' is part of the subset so that subjects without a diagnosis are not
# plotted or tested under a group label they were never given.
clean_ohsu_df = merged_ohsu_df.dropna(subset=['DX'] + centrality_measures + available_adhd_measures)
print("Clean dataset size:", len(clean_ohsu_df), "subjects")
print("Group distribution in clean data:")
print(clean_ohsu_df['Group'].value_counts())

# Create scatter plots for centrality scores vs ADHD measures
# (rows: centrality measures, columns: ADHD measures)
fig, axes = plt.subplots(3, 2, figsize=(12, 15))
fig.suptitle('OHSU: Centrality Scores vs ADHD Measures', fontsize=16, y=0.98)

# Marker colours depend only on each subject's group, so build them once
# instead of once per panel.
colors = ['red' if group == 'ADHD' else 'blue' for group in clean_ohsu_df['Group']]

for i, centrality in enumerate(centrality_measures):
    for j, adhd_measure in enumerate(available_adhd_measures):
        ax = axes[i, j]

        x = clean_ohsu_df[adhd_measure]
        y = clean_ohsu_df[centrality]
        ax.scatter(x, y, c=colors, alpha=0.6, s=30)

        # One-tailed test of H1: r > 0. Halving the two-sided p-value is only
        # valid when the observed correlation has the hypothesised sign;
        # otherwise the one-sided p-value is 1 - p/2.
        # NOTE(review): the positive direction is an assumption — confirm it
        # was the pre-specified hypothesis for the OHSU data.
        corr, p_value_two_tailed = stats.pearsonr(x, y)
        if corr > 0:
            p_value_one_tailed = p_value_two_tailed / 2
        else:
            p_value_one_tailed = 1 - p_value_two_tailed / 2

        # Axis labels; the title carries r and the one-tailed p-value
        ax.set_xlabel(adhd_measure)
        ax.set_ylabel(centrality.replace('_', ' ').title())
        ax.set_title('r = ' + str(round(corr, 3)) + ', p = ' + str(round(p_value_one_tailed, 4)) + ' (one-tailed)')

        # Least-squares straight line through the points
        trend = np.poly1d(np.polyfit(x, y, 1))
        ax.plot(x, trend(x), "k--", alpha=0.8, linewidth=1)

# Figure-level legend explaining the marker colours
legend_elements = [Patch(facecolor='red', label='ADHD'),
                   Patch(facecolor='blue', label='TDC')]
fig.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(0.98, 0.95))

plt.tight_layout()
plt.show()

In [None]:
# Create box plots comparing ADHD vs TDC groups for centrality measures


def _compare_adhd_vs_tdc(frame, measure):
    """Compare one centrality measure between the ADHD and TDC groups.

    Parameters
    ----------
    frame : DataFrame with a 'Group' column ('ADHD' / 'TDC') and a `measure` column.
    measure : name of the column to compare.

    Returns
    -------
    dict with the two samples ('adhd', 'tdc'), the independent-samples t statistic
    ('t'), the one-tailed p-value for H1: ADHD mean > TDC mean ('p_one_tailed') and
    Cohen's d based on the pooled standard deviation ('d').
    """
    tdc = frame.loc[frame['Group'] == 'TDC', measure]
    adhd = frame.loc[frame['Group'] == 'ADHD', measure]

    t_stat, p_two_tailed = stats.ttest_ind(adhd, tdc)
    # Halving the two-tailed p is only valid when the difference lies in the
    # hypothesised direction (t > 0); otherwise the one-tailed p is 1 - p/2.
    # NOTE(review): the ADHD > TDC direction is assumed - confirm against the hypothesis.
    p_one_tailed = p_two_tailed / 2 if t_stat > 0 else 1 - p_two_tailed / 2

    # Cohen's d with the pooled (sample-size weighted) standard deviation
    n_adhd, n_tdc = len(adhd), len(tdc)
    pooled_std = np.sqrt(((n_adhd - 1) * adhd.var() + (n_tdc - 1) * tdc.var()) / (n_adhd + n_tdc - 2))
    cohens_d = (adhd.mean() - tdc.mean()) / pooled_std

    return {'adhd': adhd, 'tdc': tdc, 't': t_stat, 'p_one_tailed': p_one_tailed, 'd': cohens_d}


# Run each comparison once; the figure and the printed summary both read from this.
group_stats = {measure: _compare_adhd_vs_tdc(clean_ohsu_df, measure) for measure in centrality_measures}

# One panel per centrality measure; squeeze=False keeps `axes` 2-D for any panel count.
n_panels = len(centrality_measures)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
fig.suptitle('OHSU: Centrality Scores by Group (ADHD vs TDC)', fontsize=16)

for ax, centrality in zip(axes[0], centrality_measures):
    result = group_stats[centrality]

    # Create box plot (TDC first, then ADHD). Tick labels are set on the axis because the
    # `labels=` keyword of boxplot is deprecated in recent Matplotlib releases.
    box_plot = ax.boxplot([result['tdc'], result['adhd']], patch_artist=True)
    ax.set_xticklabels(['TDC', 'ADHD'])
    box_plot['boxes'][0].set_facecolor('blue')
    box_plot['boxes'][1].set_facecolor('red')

    ax.set_ylabel(centrality.replace('_', ' ').title())
    ax.set_title('p = ' + str(round(result['p_one_tailed'], 4)) + ' (one-tailed)\nd = ' + str(round(result['d'], 3)))

plt.tight_layout()
plt.show()

# Print statistical summary
print("\n=== STATISTICAL SUMMARY ===")
print("Group Comparisons (ADHD vs TDC):")
for centrality in centrality_measures:
    result = group_stats[centrality]
    adhd_data, tdc_data = result['adhd'], result['tdc']
    p_value_one_tailed = result['p_one_tailed']

    # Conventional significance stars on the one-tailed p-value
    if p_value_one_tailed < 0.001:
        significance = "***"
    elif p_value_one_tailed < 0.01:
        significance = "**"
    elif p_value_one_tailed < 0.05:
        significance = "*"
    else:
        significance = "n.s."

    print(centrality + ":")
    print("  ADHD: M = " + str(round(adhd_data.mean(), 6)) + ", N = " + str(len(adhd_data)))
    print("  TDC:  M = " + str(round(tdc_data.mean(), 6)) + ", N = " + str(len(tdc_data)))
    print("  t = " + str(round(result['t'], 3)) + ", p = " + str(round(p_value_one_tailed, 4)) + " (one-tailed), d = " + str(round(result['d'], 3)) + " " + significance)

In [None]:
# Create box plots comparing specific ADHD subtypes (DX = 1,2,3) vs TDC (DX = 0)


def _anova_across_nonempty(samples):
    """Run a one-way ANOVA over the non-empty samples.

    Empty groups are dropped first, since f_oneway cannot produce a result when a
    sample has no observations. Returns the f_oneway result (F statistic, p-value),
    or None when fewer than two non-empty samples remain.
    """
    populated = [np.asarray(sample) for sample in samples if len(sample) > 0]
    if len(populated) < 2:
        return None
    return stats.f_oneway(*populated)


print("DX category breakdown:")
dx_counts = clean_ohsu_df['DX'].value_counts().sort_index()
print(dx_counts)

# Create detailed group labels.
# DX coding follows the ADHD-200 phenotypic key: 0 = typically developing,
# 1 = ADHD-Combined, 2 = ADHD-Hyperactive/Impulsive, 3 = ADHD-Inattentive.
# NOTE(review): confirm against the key distributed with the OHSU phenotypic file.
dx_labels = {
    0: 'TDC',
    1: 'ADHD-Combined',
    2: 'ADHD-Hyperactive',
    3: 'ADHD-Inattentive'
}
clean_ohsu_df['DX_Group'] = clean_ohsu_df['DX'].map(dx_labels)

print("\nDetailed group distribution:")
print(clean_ohsu_df['DX_Group'].value_counts())

# Display order and box color of the groups (kept fixed across all panels)
groups = ['TDC', 'ADHD-Inattentive', 'ADHD-Hyperactive', 'ADHD-Combined']
colors = ['blue', 'red', 'orange', 'purple']

# Per-measure samples, one Series per entry of `groups`; shared by the plots and the text summary.
samples_by_measure = {
    measure: [clean_ohsu_df.loc[clean_ohsu_df['DX_Group'] == group, measure] for group in groups]
    for measure in centrality_measures
}

# Create box plots for detailed ADHD subtypes (one panel per centrality measure)
n_panels = len(centrality_measures)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 6), squeeze=False)
fig.suptitle('OHSU: Centrality Scores by ADHD Subtype', fontsize=16)

for ax, centrality in zip(axes[0], centrality_measures):
    samples = samples_by_measure[centrality]

    # Create box plot with all groups; an empty group keeps its slot on the axis.
    # Tick labels are set on the axis because the `labels=` keyword of boxplot is
    # deprecated in recent Matplotlib releases.
    box_plot = ax.boxplot([sample if len(sample) > 0 else [] for sample in samples], patch_artist=True)
    ax.set_xticklabels(groups)

    # Color the boxes
    for patch, color in zip(box_plot['boxes'], colors):
        patch.set_facecolor(color)

    # Perform ANOVA over the groups that actually have subjects
    anova = _anova_across_nonempty(samples)
    if anova is not None:
        f_stat, p_value = anova
        ax.set_title('ANOVA: F = ' + str(round(f_stat, 3)) + ', p = ' + str(round(p_value, 4)))

    ax.set_ylabel(centrality.replace('_', ' ').title())
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Perform detailed statistical comparisons
print("\n=== DETAILED SUBTYPE ANALYSIS ===")
for centrality in centrality_measures:
    print("\n" + centrality.upper() + ":")
    samples = samples_by_measure[centrality]

    # Print descriptives for each group
    for group, sample in zip(groups, samples):
        print("  " + group + " (n=" + str(len(sample)) + "): M = " + str(round(sample.mean(), 6)))

    # ANOVA over the same non-empty groups used for the figure titles, so the printed
    # result agrees with the plot instead of turning into NaN when a subtype is absent.
    anova = _anova_across_nonempty(samples)
    if anova is None:
        print("  ANOVA: not computed (fewer than two non-empty groups)")
    else:
        f_stat, p_value = anova
        print("  ANOVA: F = " + str(round(f_stat, 3)) + ", p = " + str(round(p_value, 4)))

In [None]:
# Save the analysis results
# Headline counts for the cleaned OHSU sample. Subtype counts follow the ADHD-200 DX
# coding: 1 = Combined, 2 = Hyperactive/Impulsive, 3 = Inattentive.
# NOTE(review): confirm the coding against the key distributed with the phenotypic file.
dx_codes = clean_ohsu_df['DX']
group_labels = clean_ohsu_df['Group']
results_summary = {
    'Dataset': 'OHSU',
    'Total_Subjects': len(clean_ohsu_df),
    'TDC_Count': int((group_labels == 'TDC').sum()),
    'ADHD_Count': int((group_labels == 'ADHD').sum()),
    'ADHD_Inattentive_Count': int((dx_codes == 3).sum()),
    'ADHD_Hyperactive_Count': int((dx_codes == 2).sum()),
    'ADHD_Combined_Count': int((dx_codes == 1).sum())
}

# Create correlation summary: one row per (centrality measure, ADHD measure) pair
correlation_results = []
for centrality in centrality_measures:
    for adhd_measure in available_adhd_measures:
        x = clean_ohsu_df[adhd_measure]
        y = clean_ohsu_df[centrality]
        corr, p_value_two_tailed = stats.pearsonr(x, y)
        # One-tailed p for H1: r > 0. Halving is only valid when r is in the
        # hypothesised direction; otherwise the one-tailed p is 1 - p/2.
        # NOTE(review): the positive direction is assumed - confirm against the hypothesis.
        if corr > 0:
            p_value_one_tailed = p_value_two_tailed / 2
        else:
            p_value_one_tailed = 1 - p_value_two_tailed / 2

        correlation_results.append({
            'Centrality_Measure': centrality,
            'ADHD_Measure': adhd_measure,
            'Correlation': corr,
            'P_Value_One_Tailed': p_value_one_tailed,
            'N': len(x)
        })

correlation_df = pd.DataFrame(correlation_results)

# Create group comparison summary: one row per centrality measure
group_comparison_results = []
for centrality in centrality_measures:
    tdc_data = clean_ohsu_df.loc[group_labels == 'TDC', centrality]
    adhd_data = clean_ohsu_df.loc[group_labels == 'ADHD', centrality]

    t_stat, p_value_two_tailed = stats.ttest_ind(adhd_data, tdc_data)
    # One-tailed p for H1: ADHD mean > TDC mean (same direction rule as above)
    if t_stat > 0:
        p_value_one_tailed = p_value_two_tailed / 2
    else:
        p_value_one_tailed = 1 - p_value_two_tailed / 2

    # Cohen's d with the pooled (sample-size weighted) standard deviation
    n_adhd, n_tdc = len(adhd_data), len(tdc_data)
    pooled_std = np.sqrt(((n_adhd - 1) * adhd_data.var() + (n_tdc - 1) * tdc_data.var()) / (n_adhd + n_tdc - 2))
    cohens_d = (adhd_data.mean() - tdc_data.mean()) / pooled_std

    group_comparison_results.append({
        'Centrality_Measure': centrality,
        'ADHD_Mean': adhd_data.mean(),
        'TDC_Mean': tdc_data.mean(),
        'T_Statistic': t_stat,
        'P_Value_One_Tailed': p_value_one_tailed,
        'Cohens_D': cohens_d,
        'ADHD_N': n_adhd,
        'TDC_N': n_tdc
    })

group_comparison_df = pd.DataFrame(group_comparison_results)

# Save results
correlation_df.to_csv('OHSU_correlation_analysis_results.csv', index=False)
group_comparison_df.to_csv('OHSU_group_comparison_results.csv', index=False)
clean_ohsu_df.to_csv('OHSU_merged_data.csv', index=False)

print("Analysis complete! Files saved:")
print("- OHSU_correlation_analysis_results.csv")
print("- OHSU_group_comparison_results.csv")
print("- OHSU_merged_data.csv")

# The sample counts are not written to a file, so show them alongside the result tables.
print("\nDataset Summary:")
for key, value in results_summary.items():
    print("  " + key + ": " + str(value))

print("\nCorrelation Results Summary:")
print(correlation_df.round(4))

print("\nGroup Comparison Results Summary:")
print(group_comparison_df.round(4))


## OHSU Dataset Analysis Results



### Dataset Overview


**Important Note**: The ADHD Index column was completely missing (all NaN values), so the analysis focused on **Inattentive** and **Hyper/Impulsive** measures, which had complete data for all subjects.



### Correlation Analysis (Scatter Plots)
The scatter plots show the relationship between each centrality measure and each ADHD symptom score, annotated with the Pearson correlation coefficient and its one-tailed p-value.


**Key Findings**:
- **Closeness centrality** showed the strongest correlations with both Inattentive (r = 0.18, p = 0.063) and Hyper/Impulsive (r = 0.17, p = 0.077) measures
- All correlations were positive but modest in magnitude
- None reached statistical significance at the p < 0.05 level

### Group Comparisons (ADHD vs TDC)


**Key Findings**:
- ADHD group consistently showed **higher centrality scores** than TDC across all measures
- **Betweenness centrality** showed the largest effect size (Cohen's d = 0.38) and approached significance (p = 0.058)
- Effect sizes were small to medium, but none of the group differences reached statistical significance

### ADHD Subtype Analysis


**ANOVA results** showed no significant differences between ADHD subtypes and TDC (all p > 0.05).

### Summary
The OHSU dataset shows a **consistent trend** in which individuals with ADHD have higher brain network centrality scores than typically developing controls, particularly for betweenness centrality. However, these differences did not reach statistical significance, possibly because of the relatively small sample size (n = 72) and the modest effect sizes.
