In [4]:
# | echo: true
# | output: false

import polars as pl
import seaborn.objects as so
from tabulate import tabulate
import glob
import os
from pathlib import Path

# Find all files matching the pattern
root_dir = "/Volumes/T7/BrainLat/EEG data"
def collect_records_dataframes(root_dir, file_pattern,  schema_overrides=None):
    all_records_dfs = []
    all_folders = [
    folder for folder in glob.glob(f"{root_dir}/*_*")
    if os.path.isdir(folder)
    ]
    for folder in all_folders:
        try:
            # Get the records file using the pattern provided as argument
            records_file = glob.glob(f"{folder}/{file_pattern}")[0]
            
            # Read the CSV into dataframe
            if schema_overrides:
                records_df = pl.read_csv(records_file, schema_overrides=schema_overrides)
            else:
                records_df = pl.read_csv(records_file)
            
            
            # Add condition column
            condition_name = Path(folder).name
            records_df = records_df.with_columns(pl.lit(condition_name).alias("condition"))
            
            # Add to list
            all_records_dfs.append(records_df)
        except (IndexError, Exception) as e:
            print(f"Error with records in {Path(folder).name}: {e}")
    
    # Concatenate all dataframes
    if all_records_dfs:
        all_records = pl.concat(all_records_dfs, how="diagonal")
        return all_records
    else:
        print("No valid records dataframes collected.")
        return None
    

In [13]:
records_df = collect_records_dataframes(root_dir, "Records_*_EEG_data.csv").filter(
    pl.col("condition") != "4_MS"
)
display(records_df.head())
records_df.write_csv("records_df.csv")
# cognition

int_columns = [
    "ifs_months",
    "ifs_verbal_inhibition",
    "laterality",
    "moca_total",
    "moca_visuospatial",
    "moca_recog",
    "moca_attention",
    "moca_language",
    "moca_abstraction",
    "moca_memory",
    "moca_orientation",
    "ifs_total_score",
    "ifs_motor_series",
    "ifs_conflicting_instructions",
    "ifs_motor_inhibition",
    "ifs_digits",
    "ifs_months",
    "ifs_visual_wm",
    "ifs_proverb",
    "ifs_verbal_inhibition",
]

schema = {col: pl.Int64 for col in int_columns}
# Add float columns to the schema
float_columns = ["mini_sea_fer", "mini_sea_tom"]
for col in float_columns:
    schema[col] = pl.Float64


cognition_df = (
    collect_records_dataframes(
        root_dir, "Cognition_*_EEG_data.csv", schema_overrides=schema
    )
    .rename({"id EEG": "id_EEG"})
    .filter(pl.col("condition") != "4_MS")
)
display(cognition_df.head())
cognition_df.write_csv("cognition_df.csv")

demographics_df = (
    collect_records_dataframes(
        root_dir,
        "Demographics_*_EEG_data.csv",
        schema_overrides={"laterality": pl.Int64},
    )
    .rename({"id EEG": "id_EEG"})
    .filter(pl.col("condition") != "4_MS")
)

demographics_df.write_csv("demographics_df.csv")
display(demographics_df.head())

path,id_EEG,diagnosis,T1,Rest,DWI,MF,eeg,condition,id,id_MRI,MRI_path
str,str,str,i64,i64,i64,i64,i64,str,str,str,str
"""2_bVFTD/AR""","""sub-20001""","""FTD""",1,0,0,3,1,"""2_bvFTD""",,,
"""2_bVFTD/AR""","""sub-20003""","""FTD""",1,1,1,3,1,"""2_bvFTD""",,,
"""2_bVFTD/AR""","""sub-20006""","""FTD""",1,1,1,3,1,"""2_bvFTD""",,,
"""2_bVFTD/AR""","""sub-20009""","""FTD""",1,1,1,3,1,"""2_bvFTD""",,,
"""2_bVFTD/AR""","""sub-20010""","""FTD""",1,1,1,3,1,"""2_bvFTD""",,,


id,path,id_EEG,diagnosis,moca_total,moca_visuospatial,moca_recog,moca_attention,moca_language,moca_abstraction,moca_memory,moca_orientation,ifs_total_score,ifs_motor_series,ifs_conflicting_instructions,ifs_motor_inhibition,ifs_digits,ifs_months,ifs_visual_wm,ifs_proverb,ifs_verbal_inhibition,mini_sea_fer,mini_sea_tom,emotion recog,condition,MMSE
str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,str,str,i64
"""PIB00056""","""2_bVFTD/AR""","""sub-20001""","""FTD""",26,5,3,5,3,2,2,6,26,3,3,3,5,2,4,3,3,14.6,15.0,,"""2_bvFTD""",
"""PIB00057""","""2_bVFTD/AR""","""sub-20003""","""FTD""",23,5,3,5,3,2,0,5,23,3,3,2,3,2,3,3,4,11.6,11.3,,"""2_bvFTD""",
"""PIB00059""","""2_bVFTD/AR""","""sub-20006""","""FTD""",12,1,2,4,3,0,0,2,18,3,3,3,3,0,1,2,3,9.4,14.3,,"""2_bvFTD""",
"""PIB00061""","""2_bVFTD/AR""","""sub-20009""","""FTD""",15,4,4,1,4,0,1,2,16,4,3,3,2,2,0,2,0,4.0,6.9,,"""2_bvFTD""",
"""PIB00062""","""2_bVFTD/AR""","""sub-20010""","""FTD""",18,2,2,4,2,1,1,6,20,3,2,0,3,2,2,3,5,10.7,12.8,,"""2_bvFTD""",


path,id_EEG,diagnosis,sex,Age,years_education,laterality,condition,id
str,str,str,i64,i64,i64,i64,str,str
"""2_vbFTD/AR""","""sub-20001""","""FTD""",1,62,18,1,"""2_bvFTD""",
"""2_vbFTD/AR""","""sub-20003""","""FTD""",1,70,15,0,"""2_bvFTD""",
"""2_vbFTD/AR""","""sub-20006""","""FTD""",1,87,7,1,"""2_bvFTD""",
"""2_vbFTD/AR""","""sub-20009""","""FTD""",1,57,14,1,"""2_bvFTD""",
"""2_vbFTD/AR""","""sub-20010""","""FTD""",0,81,6,1,"""2_bvFTD""",


In [None]:
replacements = {
    "2_bVFTD/AR": "2_bvFTD/AR",
    "2_bVFTD/CL": "2_bvFTD/CL",
    "2_vbFTD/AR": "2_bvFTD/AR",
    "HC/AR" :  "5_HC/AR",
    "HC/CL" :  "5_HC/CL",

    # Add more mappings as needed
}
#
# Load from CSV
cognition_df = (
    pl.read_csv("cognition_df.csv")
    .with_columns(
        pl.col("path").str.replace("-", "_").alias("path"),
        pl.col("id_EEG").str.strip_chars().alias("id_EEG"),
    )
    .with_columns(pl.col("path").replace(replacements).alias("path"))
)


demographics_df = pl.read_csv("demographics_df.csv").with_columns(
        pl.col("path").str.replace("-", "_").alias("path"),
        pl.col("id_EEG").str.strip_chars().alias("id_EEG"),
    ).with_columns(pl.col("path").replace(replacements).alias("path"))
                   
records_df = pl.read_csv("records_df.csv").with_columns(
        pl.col("path").str.replace("-", "_").alias("path"),
        pl.col("id_EEG").str.strip_chars().alias("id_EEG"),
    ).with_columns(pl.col("path").replace(replacements).alias("path"))

result_df = records_df.join(
    demographics_df,
    on=["id_EEG", "path", "diagnosis", "condition"],
    suffix="_demo",
    how="full",
).join(
    cognition_df,
    on=["id_EEG", "path", "diagnosis", "condition"],
    suffix="_cog",
    how="full",
)
print(result_df)


shape: (129, 47)
┌────────────┬───────────┬───────────┬─────┬───┬──────────────┬─────────┬───────────────┬──────┐
│ path       ┆ id_EEG    ┆ diagnosis ┆ T1  ┆ … ┆ mini_sea_tom ┆ emotion ┆ condition_cog ┆ MMSE │
│ ---        ┆ ---       ┆ ---       ┆ --- ┆   ┆ ---          ┆ recog   ┆ ---           ┆ ---  │
│ str        ┆ str       ┆ str       ┆ i64 ┆   ┆ f64          ┆ ---     ┆ str           ┆ i64  │
│            ┆           ┆           ┆     ┆   ┆              ┆ str     ┆               ┆      │
╞════════════╪═══════════╪═══════════╪═════╪═══╪══════════════╪═════════╪═══════════════╪══════╡
│ 2_bvFTD/AR ┆ sub-20001 ┆ FTD       ┆ 1   ┆ … ┆ 15.0         ┆ null    ┆ 2_bvFTD       ┆ null │
│ 2_bvFTD/AR ┆ sub-20003 ┆ FTD       ┆ 1   ┆ … ┆ 11.3         ┆ null    ┆ 2_bvFTD       ┆ null │
│ 2_bvFTD/AR ┆ sub-20006 ┆ FTD       ┆ 1   ┆ … ┆ 14.3         ┆ null    ┆ 2_bvFTD       ┆ null │
│ 2_bvFTD/AR ┆ sub-20009 ┆ FTD       ┆ 1   ┆ … ┆ 6.9          ┆ null    ┆ 2_bvFTD       ┆ null │
│ 2_bvFTD/AR 

In [None]:
# Get the unique values for each column
unique_values1 = cognition_df["path"].unique().to_list()
unique_values2 = records_df["path"].unique().to_list()

# Convert to sets for comparison
set1 = set([str(x) for x in unique_values1])  # Convert to strings to handle None/null values
set2 = set([str(x) for x in unique_values2])

# Find values in set1 but not in set2
only_in_path = set1 - set2
print("Values only in path:", only_in_path if only_in_path else "None")

# Find values in set2 but not in set1
only_in_path_demo = set2 - set1
print("Values only in path_demo:", only_in_path_demo if only_in_path_demo else "None")

In [16]:
result_df = result_df.with_columns([
        pl.col("path").str.split("/").list.get(1).alias("country")  ])

columns_to_move = ["condition", "country"]

# Move to beginning
new_column_order = columns_to_move + [col for col in result_df.columns if col not in columns_to_move]
result_df = result_df.select(new_column_order)


result_df.write_csv("brainlat_metadata_clean.csv")


In [19]:
""" # In a separate notebook cell
# Check schemas to identify mismatches
if 'all_processed_dfs' in globals() and all_processed_dfs:
    # Print schema of first dataframe as reference
    print("Reference schema (first dataframe):")
    for col, dtype in zip(all_processed_dfs[0].columns, all_processed_dfs[0].dtypes):
        print(f"{col}: {dtype}")
    
    # Check for type inconsistencies
    for i, df in enumerate(all_processed_dfs[1:], 1):
        for col in df.columns:
            if col in all_processed_dfs[0].columns:
                ref_type = all_processed_dfs[0].schema[col]
                curr_type = df.schema[col]
                if ref_type != curr_type:
                    print(f"Type mismatch in dataframe {i}, column '{col}': {ref_type} vs {curr_type}")
    
    # Now try to concatenate
    try:
        combined_df = pl.concat(all_processed_dfs, how="diagonal")
        print(f"Combined dataframe shape: {combined_df.shape}")
        
        # Save result
        combined_df.write_csv("combined_eeg_data.csv")
        print("Successfully saved to combined_eeg_data.csv")
    except Exception as e:
        print(f"Error during concatenation: {e}")
        print("You may need to convert column types to be consistent")
else:
    print("No processed dataframes found") """

' # In a separate notebook cell\n# Check schemas to identify mismatches\nif \'all_processed_dfs\' in globals() and all_processed_dfs:\n    # Print schema of first dataframe as reference\n    print("Reference schema (first dataframe):")\n    for col, dtype in zip(all_processed_dfs[0].columns, all_processed_dfs[0].dtypes):\n        print(f"{col}: {dtype}")\n\n    # Check for type inconsistencies\n    for i, df in enumerate(all_processed_dfs[1:], 1):\n        for col in df.columns:\n            if col in all_processed_dfs[0].columns:\n                ref_type = all_processed_dfs[0].schema[col]\n                curr_type = df.schema[col]\n                if ref_type != curr_type:\n                    print(f"Type mismatch in dataframe {i}, column \'{col}\': {ref_type} vs {curr_type}")\n\n    # Now try to concatenate\n    try:\n        combined_df = pl.concat(all_processed_dfs, how="diagonal")\n        print(f"Combined dataframe shape: {combined_df.shape}")\n\n        # Save result\n       

In [29]:
# | output: false

cognition_path = "brainlat_metadata_clean.csv"
cognition_all = (
    pl.scan_csv(cognition_path)
    .collect()
    .filter(~pl.all_horizontal(pl.all().is_null()))
    .group_by("diagnosis")
    .agg(pl.all().count())
)
display(cognition_all)

diagnosis,condition,country,path,id_EEG,T1,Rest,DWI,MF,eeg,id,id_MRI,MRI_path,path_demo,id_EEG_demo,diagnosis_demo,sex,Age,years_education,laterality,condition_demo,id_demo,id_cog,path_cog,id_EEG_cog,diagnosis_cog,moca_total,moca_visuospatial,moca_recog,moca_attention,moca_language,moca_abstraction,moca_memory,moca_orientation,ifs_total_score,ifs_motor_series,ifs_conflicting_instructions,ifs_motor_inhibition,ifs_digits,ifs_months,ifs_visual_wm,ifs_proverb,ifs_verbal_inhibition,mini_sea_fer,mini_sea_tom,emotion recog,condition_cog,MMSE
str,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""FTD""",19,19,19,19,19,19,19,19,19,0,0,0,19,19,19,19,19,19,18,19,0,19,19,19,19,14,14,14,14,14,14,14,14,17,17,17,17,17,16,17,17,19,19,19,0,19,0
"""PD""",29,29,29,29,29,29,29,29,29,0,24,24,29,29,29,29,29,29,14,29,0,0,29,29,29,14,7,7,14,14,14,14,14,14,7,7,7,7,0,14,7,0,14,14,0,29,15
"""AD""",35,35,35,35,35,35,35,28,35,0,0,0,35,35,35,35,35,35,34,35,0,0,35,35,35,29,29,29,29,29,29,29,29,33,31,30,31,31,31,31,31,31,31,31,0,35,0
"""CN""",46,46,46,46,45,45,45,30,45,46,0,0,46,46,46,46,46,46,46,46,0,0,46,46,46,31,31,31,31,31,31,31,31,28,25,25,24,24,24,24,24,26,30,30,0,46,0


In [30]:
df_all = cognition_all.to_pandas().T
df_all.columns = df_all.iloc[0]
df_all = df_all.iloc[1:]
print(tabulate(df_all, headers="keys", tablefmt="grid"))

+------------------------------+-------+------+------+------+
|                              |   FTD |   PD |   AD |   CN |
| condition                    |    19 |   29 |   35 |   46 |
+------------------------------+-------+------+------+------+
| country                      |    19 |   29 |   35 |   46 |
+------------------------------+-------+------+------+------+
| path                         |    19 |   29 |   35 |   46 |
+------------------------------+-------+------+------+------+
| id_EEG                       |    19 |   29 |   35 |   46 |
+------------------------------+-------+------+------+------+
| T1                           |    19 |   29 |   35 |   45 |
+------------------------------+-------+------+------+------+
| Rest                         |    19 |   29 |   35 |   45 |
+------------------------------+-------+------+------+------+
| DWI                          |    19 |   29 |   35 |   45 |
+------------------------------+-------+------+------+------+
| MF    