# Eye tracking analyses 2025 Gap Overlap

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import time
import os
import statsmodels
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from scipy import stats
import pingouin as pg
import scikit_posthocs as sp
import statsmodels.api as sm
from scipy.stats import boxcox
from statannotations.Annotator import Annotator
from scipy.stats import pearsonr
import docx
from docx.enum.section import WD_ORIENT
from docx.enum.section import WD_SECTION
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
from statsmodels.stats.outliers_influence import variance_inflation_factor



In [2]:
# Papermill parameters
date = "2025_06_27"

In [3]:
# Function to convert df to docx table format

def df_to_table(doc, df):
    # Add a table to the end and create a reference variable
    t = doc.add_table(df.shape[0]+1, df.shape[1])

    # Make sure df is rounded 
    df = df.round(2)

    # Make sure table is auto fit
    for cell in t._cells:
        cell.width = docx.shared.Inches(1.0)
    
    # Set the style of the table
    table_style = doc.styles['Table Grid']

    # Add the header rows.
    for j in range(df.shape[-1]):
        t.cell(0,j).text = df.columns[j]

    # Add the rest of the data frame
    for i in range(df.shape[0]):
        for j in range(df.shape[-1]):
            t.cell(i+1,j).text = str(df.values[i,j])
    
    return doc


In [4]:
# Date 
date

'2025_06_27'

#### Create a word document with results

In [5]:
# Create a document

doc = docx.Document()

# Title page
section = doc.sections[0]
#section.start_type = WD_SECTION.ODD_PAGE
#section.orientation = WD_ORIENT.LANDSCAPE
#section.page_width, section.page_height = section.page_height, section.page_width

title = doc.add_paragraph()
title_run = title.add_run("Gap Overlap Eye Tracking Analysis Results")
title_run.bold = True
title_run.font.size = Pt(28)
title_run.font.name = 'Arial'
title.alignment = WD_ALIGN_PARAGRAPH.CENTER

doc.add_paragraph("")
doc.add_paragraph(f"Date: {date}").alignment = WD_ALIGN_PARAGRAPH.CENTER
doc.add_paragraph("Author: Gabriel Blanco").alignment = WD_ALIGN_PARAGRAPH.CENTER
doc.add_page_break()

# Add new doc section with landscape orientation
section = doc.add_section()
section.start_type = WD_SECTION.ODD_PAGE
new_width, new_height = section.page_height, section.page_width
section.orientation = WD_ORIENT.LANDSCAPE
section.page_width = new_width


doc.add_heading(f"Gap Analysis results", 0)

# Add heading

doc.add_heading('Participants', level=1)
doc.add_paragraph("This section describes the participants included in the analysis after initial filtering and demographic merging.")


<docx.text.paragraph.Paragraph at 0x1220504f760>

## Data loading

In [6]:
# Load all the files
go_files = glob.glob(f"C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source/GO_files*/all_files/*.txt")
p1= f"Total files including tests: " + str(len(go_files))
go_files


['C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\\GO_files\\all_files\\002_Z_GO.txt',
 'C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\\GO_files\\all_files\\0042P_GO.txt',
 'C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\\GO_files\\all_files\\0043F1_GO.txt',
 'C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\\GO_files\\all_files\\004_Z_GO.txt',
 'C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\\GO_files\\all_files\\0050M1_GO.txt',
 'C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\\GO_files\\all_files\\0050P_GO.txt',
 'C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\\GO_files\\all_files\\0062P_GO.txt',
 'C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\\GO_files\\all_files\\0064M1_GO.txt

In [7]:
# Load all the files into a single dataframe
df = []

# Create a dictionary to store empty subjects
subjects_removed = {}
for file in go_files:
    # Load experimental participants
    print(file)
    temp_df=pd.read_table(file)
    print("Completed")
    temp_df = pd.read_csv(file, delimiter = "\t")
    id= file.split("\\")[-1].split(".")[0].replace("_GO", "")
    print(id)
    if len(temp_df) == 0:
        print(f"Empty file: {id}")       
    # Save the empty subjects id to dictionary and include label "empty"
        subjects_removed[id] = "empty file"
        continue
    temp_df["subject"] = id
    # Load pilot participants
    if "Z" in id:
        alphanum = id.split("_")[0]
        family_id = "Z"
        temp_df["subject"] = alphanum + family_id
    df.append(temp_df)
go_df = pd.concat(df, axis=0)
go_df

C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\GO_files\all_files\002_Z_GO.txt
Completed
002_Z
C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\GO_files\all_files\0042P_GO.txt
Completed
0042P
C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\GO_files\all_files\0043F1_GO.txt
Completed
0043F1
C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\GO_files\all_files\004_Z_GO.txt
Completed
004_Z
Empty file: 004_Z
C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\GO_files\all_files\0050M1_GO.txt
Completed
0050M1
C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\GO_files\all_files\0050P_GO.txt
Completed
0050P
C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source\GO_files\all_files\0062P_GO.txt
Completed
0062P
C:/Users/gabot/OneDrive - McGill Univers

Unnamed: 0,Session_Name_,Trial_Index_,Trial_Recycled_,task,condition,detail,GO_JITTER,GO_SAC_RT,GO_VALID,subject
0,s02,1,False,GO,Baseline,Right,42,230.31,True,002Z
1,s02,2,False,GO,Gap,Right,64,255.00,True,002Z
2,s02,3,False,GO,Overlap,Left,72,323.78,True,002Z
3,s02,4,False,GO,Gap,Right,29,197.00,True,002Z
4,s02,5,False,GO,Baseline,Left,83,372.62,True,002Z
...,...,...,...,...,...,...,...,...,...,...
33,Q1256_S2,34,False,GO,Overlap,Right,10,368.29,True,1256S2
34,Q1256_S2,35,False,GO,Gap,Left,51,247.00,True,1256S2
35,Q1256_S2,36,False,GO,Baseline,Right,94,211.76,True,1256S2
36,Q1256_S2,37,False,GO,Overlap,Right,90,319.85,True,1256S2


In [8]:
# Total number of subjects after firts filtering
doc.add_heading('Total number of subjects after first filtering', level=2)



# Add the unique subjects to the document

# Add total number of subject files
doc.add_paragraph(f"Total number of subject files: {len(go_files)}", style='List Bullet')
# Print percetage of empty subjects
empty_subjects_count = len(subjects_removed)
total_subjects = len(go_df.subject.unique())
doc.add_paragraph(f"Total number of empty subjects: {empty_subjects_count}", style='List Bullet')
doc.add_paragraph(f"Percentage of empty subjects: {empty_subjects_count/total_subjects*100:.2f}%", style='List Bullet')

# Print on this notebook as well
print(f"Total number of subject files: {len(go_files)}")
print(f"Percentage of empty subjects: {empty_subjects_count} ({empty_subjects_count/total_subjects*100:.2f}%)")

# Print the unique subjects
print(f"Total number of subjects after first filtering: {len(go_df.subject.unique())}")

# Add the unique subjects to the document
doc.add_paragraph(f"Total number of subjects after first filtering: {len(go_df.subject.unique())}", style='List Bullet')


Total number of subject files: 242
Percentage of empty subjects: 12 (5.22%)
Total number of subjects after first filtering: 230


<docx.text.paragraph.Paragraph at 0x12205034d30>

In [9]:
# Print the unique subjects
go_df.subject.unique()

array(['002Z', '0042P', '0043F1', '0050M1', '0050P', '0062P', '0064M1',
       '0064S1', '0068F1', '0068M1', '0068S1', '0068S3', '006Z', '0083F1',
       '0083M1', '0083P', '0086F1', '0086M1', '0086S1', '008Z', '0093O1',
       '009Z', '0104P', '0105P', '010Z', '0111F1', '0111M1', '0119M1',
       '0128P', '0129M1', '012Z', '0131M1', '0131P', '0131S1', '0134F1',
       '0134M1', '0134S1', '013Z', '0146P', '0147F2', '0147M1', '0147P',
       '014Z', '0150M1', '0152M1', '0154P', '0157M1', '0159M1', '015Z',
       '0162M1', '0162P', '0162S1', '0171F1', '0171M1', '0171P', '0179M1',
       '0179P', '017Z', '0181F1', '0181M1', '0181P', '0183M1', '0183P',
       '0183S1', '0186F1', '0186M1', '018Z', '0196M1', '0196P', '0196S2',
       '0200F1', '0200M1', '0200P', '0200S1', '020Z', '0216M1', '0216P',
       '0218M1', '0218P', '0223M1', '0223P', '0223S2', '023Z', '0248F1',
       '0248M1', '0248P', '0248S1', '024Z', '0265M1', '0265P', '026Z',
       '0275F1', '0275P', '027Z', '0281M1', '0281P',

#### Keep only relevant columns

In [10]:
## keep only the relevant columns
go_df = go_df[["subject","Trial_Index_","condition", "detail", "GO_SAC_RT", "GO_VALID"]]
go_df

Unnamed: 0,subject,Trial_Index_,condition,detail,GO_SAC_RT,GO_VALID
0,002Z,1,Baseline,Right,230.31,True
1,002Z,2,Gap,Right,255.00,True
2,002Z,3,Overlap,Left,323.78,True
3,002Z,4,Gap,Right,197.00,True
4,002Z,5,Baseline,Left,372.62,True
...,...,...,...,...,...,...
33,1256S2,34,Overlap,Right,368.29,True
34,1256S2,35,Gap,Left,247.00,True
35,1256S2,36,Baseline,Right,211.76,True
36,1256S2,37,Overlap,Right,319.85,True


#### Load demographic data

In [11]:
# Load demographic data
demo_df = pd.read_csv(f'C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source/demographics_redcap/participants.tsv', 
                      sep="\t", encoding='utf-8', low_memory=False)

In [12]:
demo_df

Unnamed: 0,participant_id,bids_id,site,sex,eeg_age,group,age_group,ndd,asd,asd_healthform,adhd,language_dis_healthform,q1k_ID
0,sub-0042P,0042P,MNI,female,30.97,proband,adult,ndd,0.0,,0.0,,Q1K_MHC_20042_P
1,sub-0043F1,0043F1,HSJ,male,58.13,father,adult,no_ndd,0.0,0,1.0,0,Q1K_HSJ_10043_F1
2,sub-0043P,0043P,HSJ,male,16.91,proband,child,ndd,,0,,1,Q1K_HSJ_10043_P
3,sub-0050M1,0050M1,HSJ,female,37.10,mother,adult,no_ndd,,,,,Q1K_HSJ_10050_M1
4,sub-0050P,0050P,HSJ,female,8.07,proband,child,ndd,,0,1.0,1,Q1K_HSJ_10050_P
...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,sub-1256S1,1256S1,HSJ,male,17.33,sibling,child,ndd,,0,1.0,0,Q1K_HSJ_1525-1256_S1
273,sub-1256S2,1256S2,HSJ,female,18.32,sibling,adult,ndd,1.0,1,1.0,0,Q1K_HSJ_1525-1256_S2
274,sub-1261F1,1261F1,HSJ,male,40.24,father,adult,no_ndd,,,,,Q1K_HSJ_1525-1261_F1
275,sub-1261P,1261P,HSJ,male,7.71,proband,child,ndd,1.0,,,,Q1K_HSJ_1525-1261_P


In [13]:
demo_df["subject"]=demo_df["bids_id"]

#### Load the Pilot data

In [14]:
pilot_demodf = pd.read_csv(f'C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/source/demographics_redcap/pilot_participants.csv', 
                        encoding='utf-8', low_memory=False)

In [15]:
pilot_demodf

Unnamed: 0,sex,eeg_age,site,q1k_ID,ndd,asd,group,age_group,subject
0,Female,45.88,MNI,Q1K_002_Z,no_ndd,0,,adult,002Z
1,Female,11.13,MNI,Q1K_004_Z,no_ndd,0,,child,004Z
2,Male,33.64,MNI,Q1K_006_Z,no_ndd,0,,adult,006Z
3,Female,23.5,MNI,Q1K_007_Z,no_ndd,0,,adult,007Z
4,Male,40.47,MNI,Q1K_008_Z,no_ndd,0,,adult,008Z
5,Female,23.77,MNI,Q1K_009_Z,no_ndd,0,,adult,009Z
6,Female,28.98,MNI,Q1K_010_Z,no_ndd,0,,adult,010Z
7,Female,32.1,MNI,Q1K_011_Z,no_ndd,0,,adult,011Z
8,Female,23.66,MNI,Q1K_012_Z,no_ndd,0,,adult,012Z
9,Female,22.43,MNI,Q1K_013_Z,no_ndd,0,,adult,013Z


In [16]:
# Change sex column to lowercase
pilot_demodf.loc[:, "sex"] = pilot_demodf["sex"].str.lower()   

In [17]:
# drop trailing spaces
demo_df.loc[:,'subject'] = demo_df['subject'].str.strip()

#### Merge pilot and experimental data

In [18]:
demo_df = pd.concat([demo_df, pilot_demodf], axis=0)

In [19]:
# Remove participants more than 80% missing data
missing_data_threshold = 0.8
demo_df = demo_df[demo_df["subject"].notna()]
missing_data = demo_df.isnull().mean(axis=1)    
demo_df = demo_df[missing_data < missing_data_threshold]

# Add section about demographics filtering
doc.add_heading('Demographics Filtering', level=2)

# Add subheading for demographics filtering
doc.add_paragraph("This section describes the participants included in the analysis after demographic filtering.")

# Add number of subjects with missing data to the document
doc.add_paragraph(f"Total number of subjects with missing data: {len(demo_df[missing_data > missing_data_threshold])}", style='List Bullet')



<docx.text.paragraph.Paragraph at 0x12206663fd0>

In [20]:
# Keep important columns
demo_df = demo_df[["subject", "eeg_age",  "site", "sex","asd", "ndd", "q1k_ID", "group", 'age_group']]

# Change NDD from 0, 1 to "ndd" and "control"
demo_df.loc[:, "ndd"] = demo_df["ndd"].replace({0: "control", 1: "ndd"})

# Change site to lowercase
demo_df.loc[:, "site"] = demo_df["site"].str.lower()

In [21]:
demo_df

Unnamed: 0,subject,eeg_age,site,sex,asd,ndd,q1k_ID,group,age_group
0,0042P,30.97,mni,female,0.0,ndd,Q1K_MHC_20042_P,proband,adult
1,0043F1,58.13,hsj,male,0.0,no_ndd,Q1K_HSJ_10043_F1,father,adult
2,0043P,16.91,hsj,male,,ndd,Q1K_HSJ_10043_P,proband,child
3,0050M1,37.10,hsj,female,,no_ndd,Q1K_HSJ_10050_M1,mother,adult
4,0050P,8.07,hsj,female,,ndd,Q1K_HSJ_10050_P,proband,child
...,...,...,...,...,...,...,...,...,...
25,031Z,28.27,mni,male,0.0,no_ndd,Q1K_031_Z,,adult
26,032Z,18.20,mni,female,0.0,no_ndd,Q1K_032_Z,,adult
27,033Z,9.88,mni,male,0.0,no_ndd,Q1K_033_Z,,child
28,038Z,10.63,mni,male,0.0,no_ndd,Q1K_038_Z,,child


### Sanity check to flag participants with no demographic data

In [22]:
for subject in go_df.subject.unique():
    if subject in demo_df.subject.unique():
        continue
    else: 
        print (f"Flag '{subject}'")
        # Add the subject to the dictionary with label "missing"
        subjects_removed[subject] = "missing demographic data"

In [23]:
# Keep only the subjects that are in the go_df 
demo_df = demo_df[demo_df.subject.isin(go_df.subject.unique())]

## Exploratory Analysis

In [24]:
# Total number of subjects
p2 = f"Total number of subjects after demographic filtering: {len(demo_df)}"
print (p2)
doc.add_paragraph(p2,style='List Bullet')

Total number of subjects after demographic filtering: 230


<docx.text.paragraph.Paragraph at 0x122062390f0>

In [25]:
# Summarize data by diagnosis groups
summary_table = demo_df.groupby("ndd").agg(
    n=("ndd", "count"),
    male=("sex", lambda x: (x == "male").sum()),
    female=("sex", lambda x: (x == "female").sum()),
    mean_age=("eeg_age", "mean"),
    std_age=("eeg_age", "std"),
    hsj=("site", lambda x: (x == "hsj").sum()),
    mni=("site", lambda x: (x == "mni").sum()),
    adults=("age_group", lambda x: (x == "adult").sum()),
    children=("age_group", lambda x: (x == "child").sum()),
).reset_index()

# Add combined row for all participants
all_participants = {
    "ndd": "combined",
    "n": summary_table["n"].sum(),
    "male": summary_table["male"].sum(),
    "female": summary_table["female"].sum(),
    "mean_age": demo_df["eeg_age"].mean(),
    "std_age": demo_df["eeg_age"].std(),
    "hsj": summary_table["hsj"].sum(),
    "mni": summary_table["mni"].sum(),
    "adults": summary_table["adults"].sum(),
    "children": summary_table["children"].sum(),
}

summary_table = pd.concat(
    [pd.DataFrame([all_participants]), summary_table], ignore_index=True
)

# Create combined and percentage columns
summary_table["sex (M:F)"] = summary_table["male"].astype(str) + ":" + summary_table["female"].astype(str)
summary_table["sex % (M:F)"] = (
    (summary_table["male"] / summary_table["n"] * 100).round(2).astype(str)
    + ":"
    + (summary_table["female"] / summary_table["n"] * 100).round(2).astype(str)
)

summary_table["site (HSJ:MNI)"] = summary_table["hsj"].astype(str) + ":" + summary_table["mni"].astype(str)
summary_table["site % (HSJ:MNI)"] = (
    (summary_table["hsj"] / summary_table["n"] * 100).round(2).astype(str)
    + ":"
    + (summary_table["mni"] / summary_table["n"] * 100).round(2).astype(str)
)

# Drop intermediate columns to keep the table clean
summary_table.drop(columns=["male", "female", "hsj", "mni"], inplace=True)

# Rename columns for better clarity
summary_table.rename(
    columns={
        "ndd": "NDD Group",
        "n": "Total Participants",
        "mean_age": "Mean Age",
        "std_age": "Age SD",
        "adults": "Adults",
        "children": "Children",
    },
    inplace=True,
)

print(summary_table)


  NDD Group  Total Participants   Mean Age     Age SD  Adults  Children  \
0  combined                 230  27.137000  16.204714     130       100   
1       ndd                 118  20.312119  13.828630      42        76   
2    no_ndd                 112  34.327500  15.443190      88        24   

  sex (M:F)  sex % (M:F) site (HSJ:MNI) site % (HSJ:MNI)  
0   103:126  44.78:54.78         156:74      67.83:32.17  
1     63:54  53.39:45.76          92:26      77.97:22.03  
2     40:72  35.71:64.29          64:48      57.14:42.86  


In [26]:
summary_table=summary_table.round(2)
summary_table

Unnamed: 0,NDD Group,Total Participants,Mean Age,Age SD,Adults,Children,sex (M:F),sex % (M:F),site (HSJ:MNI),site % (HSJ:MNI)
0,combined,230,27.14,16.2,130,100,103:126,44.78:54.78,156:74,67.83:32.17
1,ndd,118,20.31,13.83,42,76,63:54,53.39:45.76,92:26,77.97:22.03
2,no_ndd,112,34.33,15.44,88,24,40:72,35.71:64.29,64:48,57.14:42.86


In [27]:
## Add table to the document
doc.add_heading('Demographics breakdown', level=1)

doc.add_heading('Demographics table', level=3)

doc = df_to_table(doc, summary_table)


#### Age stats

In [28]:
# Age stats
p3 = f"Mean age: {demo_df.eeg_age.mean().round(2)}"
print (p3)

# Number of adults and children
p5 = f"Number of adults: {len(demo_df[demo_df.age_group == 'adult'])}"
print (p5)

p6 = f"Number of children: {len(demo_df[demo_df.age_group == 'child'])}"
print (p6)

# Add headings for age stats
doc.add_heading('Age statistics', level=2)
# Add age stats to the document
doc.add_paragraph(p3, style='List Bullet')
doc.add_paragraph(p5, style='List Bullet')
doc.add_paragraph(p6, style='List Bullet')


Mean age: 27.14
Number of adults: 130
Number of children: 100


<docx.text.paragraph.Paragraph at 0x1220415a6b0>

#### Sex

In [29]:
# Add heading for sex stats
doc.add_heading('Sex statistics', level=2)


# Number of female
p7 = f"Number of females: {len(demo_df[demo_df.sex == 'female'])}"
print (p7)

# Number of males
p8 = f"Number of male: {len(demo_df[demo_df.sex=='male'])}"
print (p8)

# Add sex stats to the document
doc.add_paragraph(p7, style='List Bullet')
doc.add_paragraph(p8, style='List Bullet')

Number of females: 126
Number of male: 103


<docx.text.paragraph.Paragraph at 0x1220509fe80>

#### Number of NDDs

In [30]:
# Add section for NDD statistics
doc.add_heading('NDD statistics', level=2)

# Number of NDD
p9 = f"Number of NDD: {len(demo_df[demo_df.ndd=='ndd'])}"
print (p9)

# Number of control
p10 = f"Number of control: {len(demo_df[demo_df.ndd=='no_ndd'])}"
print (p10)

# Add NDD stats to the document
doc.add_paragraph(p9, style='List Bullet')
doc.add_paragraph(p10, style='List Bullet')


Number of NDD: 118
Number of control: 112


<docx.text.paragraph.Paragraph at 0x1220509fbb0>

#### Site Count

In [31]:
# Number of MNI
p11 = f"Number of mni: {len(demo_df[demo_df.site=='MNI'])}"
print (p11)

# Number of HSJ
p12 = f"Number of hsj: {len(demo_df[demo_df.site=='HSJ'])}"
print (p12)


Number of mni: 0
Number of hsj: 0


#### Famly dynamics

In [32]:
print(demo_df.group.value_counts())

# Add section for family groups
doc.add_heading('Family groups', level=2)
# Add the breakdown (group values counts) of family groups to the document
doc.add_paragraph(str(demo_df.group.value_counts()), style='List Bullet')


group
proband    67
mother     58
sibling    46
father     35
Name: count, dtype: int64


<docx.text.paragraph.Paragraph at 0x12206267610>

### Summary demographics of only ASD children


In [33]:
# Subset only ASD children
asd_only_df = demo_df[(demo_df.asd == 1)]

In [34]:
# Summarize data by diagnosis groups
summary_table = asd_only_df.groupby("age_group").agg(
    n=("age_group", "count"),
    male=("sex", lambda x: (x == "male").sum()),
    female=("sex", lambda x: (x == "female").sum()),
    mean_age=("eeg_age", "mean"),
    std_age=("eeg_age", "std"),
    hsj=("site", lambda x: (x == "hsj").sum()),
    mni=("site", lambda x: (x == "mni").sum()),

).reset_index()

# Add combined row for all participants
all_participants = {
    "age_group": "combined",
    "n": summary_table["n"].sum(),
    "male": summary_table["male"].sum(),
    "female": summary_table["female"].sum(),
    "mean_age": asd_only_df["eeg_age"].mean(),
    "std_age": asd_only_df["eeg_age"].std(),
    "hsj": summary_table["hsj"].sum(),
    "mni": summary_table["mni"].sum(),
}

summary_table = pd.concat(
    [pd.DataFrame([all_participants]), summary_table], ignore_index=True
)

# Create combined and percentage columns
summary_table["sex (M:F)"] = summary_table["male"].astype(str) + ":" + summary_table["female"].astype(str)
summary_table["sex % (M:F)"] = (
    (summary_table["male"] / summary_table["n"] * 100).round(2).astype(str)
    + ":"
    + (summary_table["female"] / summary_table["n"] * 100).round(2).astype(str)
)

summary_table["site (HSJ:MNI)"] = summary_table["hsj"].astype(str) + ":" + summary_table["mni"].astype(str)
summary_table["site % (HSJ:MNI)"] = (
    (summary_table["hsj"] / summary_table["n"] * 100).round(2).astype(str)
    + ":"
    + (summary_table["mni"] / summary_table["n"] * 100).round(2).astype(str)
)

# Drop intermediate columns to keep the table clean
summary_table.drop(columns=["male", "female", "hsj", "mni"], inplace=True)

# Rename columns for better clarity
summary_table.rename(
    columns={
        "ndd": "NDD Group",
        "n": "Total Participants",
        "mean_age": "Mean Age",
        "std_age": "Age SD"
    },
    inplace=True,
)

print(summary_table)


  age_group  Total Participants   Mean Age     Age SD sex (M:F)  sex % (M:F)  \
0  combined                  22  17.344091  12.055805     11:11    50.0:50.0   
1     adult                   5  35.786000  12.896059       1:4    20.0:80.0   
2     child                  17  11.920000   3.412157      10:7  58.82:41.18   

  site (HSJ:MNI) site % (HSJ:MNI)  
0           17:5      77.27:22.73  
1            5:0        100.0:0.0  
2           12:5      70.59:29.41  


In [35]:
summary_table=summary_table.round(2)
summary_table

Unnamed: 0,age_group,Total Participants,Mean Age,Age SD,sex (M:F),sex % (M:F),site (HSJ:MNI),site % (HSJ:MNI)
0,combined,22,17.34,12.06,11:11,50.0:50.0,17:5,77.27:22.73
1,adult,5,35.79,12.9,1:4,20.0:80.0,5:0,100.0:0.0
2,child,17,11.92,3.41,10:7,58.82:41.18,12:5,70.59:29.41


In [36]:
## Add table to the document
doc.add_heading('Confirmed ASD Demographics breakdown', level=1)

doc.add_heading('ASD Demographics table', level=3)

doc = df_to_table(doc, summary_table)


In [37]:
## Merge the go_df and demo_df
go_df = pd.merge(go_df, demo_df, on="subject", how="inner")

In [39]:
## Rename columns for clarity
go_df.rename(columns={
    "Trial_Index_": "trial_index",
    "GO_SAC_RT": "rt",
    "GO_VALID": "valid_trials",
    "detail": "side"
}, inplace=True)

In [40]:
go_df

Unnamed: 0,subject,trial_index,condition,side,rt,valid_trials,eeg_age,site,sex,asd,ndd,q1k_ID,group,age_group
0,002Z,1,Baseline,Right,230.31,True,45.88,mni,female,0.0,no_ndd,Q1K_002_Z,,adult
1,002Z,2,Gap,Right,255.00,True,45.88,mni,female,0.0,no_ndd,Q1K_002_Z,,adult
2,002Z,3,Overlap,Left,323.78,True,45.88,mni,female,0.0,no_ndd,Q1K_002_Z,,adult
3,002Z,4,Gap,Right,197.00,True,45.88,mni,female,0.0,no_ndd,Q1K_002_Z,,adult
4,002Z,5,Baseline,Left,372.62,True,45.88,mni,female,0.0,no_ndd,Q1K_002_Z,,adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8249,1256S2,34,Overlap,Right,368.29,True,18.32,hsj,female,1.0,ndd,Q1K_HSJ_1525-1256_S2,sibling,adult
8250,1256S2,35,Gap,Left,247.00,True,18.32,hsj,female,1.0,ndd,Q1K_HSJ_1525-1256_S2,sibling,adult
8251,1256S2,36,Baseline,Right,211.76,True,18.32,hsj,female,1.0,ndd,Q1K_HSJ_1525-1256_S2,sibling,adult
8252,1256S2,37,Overlap,Right,319.85,True,18.32,hsj,female,1.0,ndd,Q1K_HSJ_1525-1256_S2,sibling,adult


In [41]:
# Save the go_df to a csv file
go_df.to_csv(f"C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/outputs/gap/go_df.csv", index=False)

In [42]:
# Save the document
doc.save(f"C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/outputs/word_document/gap_analysis_results.docx")

# Save backup to archive 
doc.save(f"C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/outputs/word_document/archive/gap_analysis_results_{date}.docx")

In [43]:
# Convert missing subjects to a DataFrame
subjects_removed_df = pd.DataFrame.from_dict(subjects_removed, orient='index', columns=['status'])
subjects_removed_df.index.name = 'subject'
# Save the subjects_removed_df to a csv file
subjects_removed_df.to_csv(f"C:/Users/gabot/OneDrive - McGill University/Desktop/github_repos/q1k_neurosubs/outputs/missingness/subjects_removed.csv")