# **Q1K** Validation of EEG-ET Log

In [562]:
# Import modules
import os
import pandas as pd
from pathlib import Path
import numpy as np
import glob
import shutil
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as py
import plotly.io as pio
import seaborn as sns
import kaleido


### Functions

In [563]:
def map_values(series, col_name):
    """Map numeric values to their labels if a mapping exists"""
    if col_name in category_mappings:
        return series.map(category_mappings[col_name]).fillna(series)
    return series

def get_top_categories(series, col_name, percent=False):
    """Get top categories with proper value mapping"""
    mapped_series = map_values(series, col_name)

    if percent:
        value_counts = mapped_series.value_counts(normalize=True, dropna=False)
        value_counts = value_counts.round(2)
    else:
        value_counts = mapped_series.value_counts(dropna=False)
    return {k: v for k, v in value_counts.head(5).items() if not pd.isna(k)}

# 1. Extracting REDCap EEG LOG

In [564]:
# Select the date
date="2025_05_06"

In [565]:
# Select the tasks
tasks = ['RS','TO','GO', 'VEP', 'AEP', 'NSP', 'PLR','VS','MMN']

In [566]:
for file in glob.glob(f"../source/{date}/*"):
    if "Validation" in file:
        vali_df = pd.read_csv(file)
        print( "Validation file is: " , file)
    if "LABELS" in file:
        labels_df = pd.read_csv(file)
        print( "Labels file is: " , file)

 

Labels file is:  ../source/2025_05_06\DATA_LABELS_2025-05-06_1717.csv


Validation file is:  ../source/2025_05_06\Q1KDatabase-EEGLogValidation_DATA_2025-05-06_1716.csv


In [567]:
labels_df

Unnamed: 0,Record ID,Event Name,Was EEG attempted?,Reasons:,EEG site:,Participant EEG code,Eye tracking code,Birthdate,EEG date 1,Age in years REMOVE,...,Notes: .22,Good.24,Questionable.23,Invalid.24,Reasons.23,Notes: .23,End of recording,General session notes:,status change?,Complete?
0,21,Phase 3 (Arm 1: Phase 2),,,,,,1980-05-01,,,...,,,,,,,,,,Incomplete
1,40,Phase 3 (Arm 1: Phase 2),No,,HSJ,Q1K_HSJ_10040_P,,2018-03-17,,,...,,,,,,,,,,Incomplete
2,41,Phase 3 (Arm 1: Phase 2),No,,HSJ,Q1K_HSJ_10040_M1,,1985-01-10,,,...,,,,,,,,,,Incomplete
3,42,Phase 3 (Arm 1: Phase 2),Yes,,MHC,Q1K_MHC_20042_P,Q042_P,1993-08-19,2024-08-09,30.973942,...,,,,,,,,,,Complete
4,43,Phase 3 (Arm 1: Phase 2),Yes,,HSJ,Q1K_HSJ_10043_P,,2007-05-02,2024-03-28,16.906576,...,,,,,,,,,,Incomplete
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528,525-1290,Phase 3 (Arm 1: Phase 2),,,,,,,,,...,,,,,,,,,,Incomplete
529,525-1291,Phase 3 (Arm 1: Phase 2),,,,,,,,,...,,,,,,,,,,Incomplete
530,525-1292,Phase 3 (Arm 1: Phase 2),,,,,,,,,...,,,,,,,,,,Incomplete
531,Q1K test - (for Irini),Phase 3 (Arm 1: Phase 2),,,,,,,,,...,,,,,,,,,,Complete


In [568]:
# Import the label mapping file
mapping_df = pd.read_csv(f"../source/{date}/label_mapping.csv")
mapping_df

Unnamed: 0,raw_var_name,new_label
0,record_id,Record ID
1,redcap_event_name,Event Name
2,eeg_attempted,Was EEG attempted?
3,eeg_attempted_reasons,Reasons:
4,eeg_site,EEG site:
...,...,...
272,eeget_p8_b2_notes_v2_v2,Notes: .23
273,eeg_endrecord,End of recording
274,eeget_general_notes_v2_v2,General session notes:
275,change_status,status change?


### Subset only participants who completed EEG form

In [569]:
vali_df= vali_df.loc[vali_df['eeg_participant_code'].isna() == False]

In [570]:
vali_df

Unnamed: 0,record_id,redcap_event_name,eeg_attempted,eeg_attempted_reasons,eeg_site,eeg_participant_code,eeg_code_software,eeg_birthdate_v2_v2,eeg_today_date,eeg_age_years_testdate,...,eeget_p8_b1_notes_v2_v2,eeget_p8_b2_good_v2_v2,eeget_p8_b2_quest_v2_v2,eeget_p8_b2_inv_v2_v2,eeget_p8_b2_reasons_v2_v2,eeget_p8_b2_notes_v2_v2,eeg_endrecord,eeget_general_notes_v2_v2,change_status,eeget_session_log_complete
1,40,phase_3_arm_1,0.0,,1.0,Q1K_HSJ_10040_P,,2018-03-17,,,...,,,,,,,,,,0
2,41,phase_3_arm_1,0.0,,1.0,Q1K_HSJ_10040_M1,,1985-01-10,,,...,,,,,,,,,,0
3,42,phase_3_arm_1,1.0,,2.0,Q1K_MHC_20042_P,Q042_P,1993-08-19,2024-08-09,30.973942,...,,,,,,,,,,2
4,43,phase_3_arm_1,1.0,,1.0,Q1K_HSJ_10043_P,,2007-05-02,2024-03-28,16.906576,...,,,,,,,,,,0
6,45,phase_3_arm_1,1.0,,1.0,Q1K_HSJ_10043_F1,Q043_F1,1966-04-13,2024-05-31,58.133865,...,,,,,,,14:21,,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,525-1256,phase_3_arm_1,1.0,,1.0,Q1K_HSJ_1525-1256_P,Q1256_P,2009-10-28,2025-05-01,15.507505,...,,,,,,,13:53,,,2
503,525-1257,phase_3_arm_1,1.0,,1.0,Q1K_HSJ_1525-1256_S1,Q1256_S1,2008-01-01,2025-05-01,17.330837,...,,,,,,,12:05,,,2
504,525-1258,phase_3_arm_1,1.0,,1.0,Q1K_HSJ_1525-1256_S2,Q1256_S2,2007-01-03,2025-05-01,18.324698,...,,,,,,,11:00,,,2
505,525-1259,phase_3_arm_1,0.0,No time left. Family had to leave.,1.0,Q1K_HSJ_1525-1256_M1,Q1256_M1,1980-01-06,2025-05-01,45.317723,...,,,,,,,,,,2


In [571]:
# Principal columns

principal_cols = ['record_id','eeg_attempted',
 'eeg_attempted_reasons',
 'eeg_site',
 'eeg_participant_code',
 'eeg_code_software',
 'eeg_birthdate_v2_v2',
 'eeg_today_date',
 'eeg_age_years_testdate',
 'eeg_age_years',
 'eeg_sex_birth',
 'eeg_participant_medic',
 'eeg_participant_handedness',
 'eeg_age_v2_v2',
 'eeget_date_v2_v2']

In [572]:
# Select the columns that are needed
main_vali= vali_df[principal_cols]

## Validation of main vairables (e..g, Age, Sex, etc)

In [573]:
main_vali.columns

Index(['record_id', 'eeg_attempted', 'eeg_attempted_reasons', 'eeg_site',
       'eeg_participant_code', 'eeg_code_software', 'eeg_birthdate_v2_v2',
       'eeg_today_date', 'eeg_age_years_testdate', 'eeg_age_years',
       'eeg_sex_birth', 'eeg_participant_medic', 'eeg_participant_handedness',
       'eeg_age_v2_v2', 'eeget_date_v2_v2'],
      dtype='object')

In [574]:
# Column type mappings
column_types = {
            'yes_no': ['eeg_attempted', 'eeg_diagnosis___1', 'eeg_diagnosis___2', 
                            'eeg_diagnosis___3', 'eeg_diagnosis___4', 'eeg_diagnosis___5', 
                            'eeg_diagnosis_unk'],
            'categorical': ['eeg_sex_birth', 'eeg_site', 'eeg_participant_handedness'],
            'numerical': ['eeg_age_years', 'eeg_age_years_testdate', 'eeg_age_v2_v2'],
            'date': ['eeg_birthdate_v2_v2', 'eeg_today_date', 'eeget_date_v2_v2'],
            'text': ['eeg_participant_code', 'eeg_code_software', 'eeg_sex_birth_specify', 
                    'eeg_diagnosis_unk', 'eeg_participant_medic', 'eeg_attempted_reasons',"record_id"]
        }




In [575]:
category_mappings = {
    'eeg_attempted': {1.0: 'Yes', 0.0: 'No'},
    'eeg_sex_birth': {1: 'Female', 2: 'Male', 3: 'Other'},
    'eeg_participant_handedness': {1: 'Right', 2: 'Left', 3: 'Ambidextrous'},
    'eeg_site': {1: 'HSJ', 2: 'MHC'},
    'eeg_fit_v2_v2': {1.0: 'Good', 2.0: "Okay", 3.0: 'Poor'},
    'eeg_imped_v2_v2': {1.0: 'Good', 2.0: 'Okay', 3.0: 'Poor'},
    'eeg_vep_a_b_file': {0.0: "A", 1.0: "B"},
    'eeg_ssaep_a_b_version': {1.0: "A", 0.0: "B"},
    'eeg_mmn_a_b_file': {1.0: "A", 0.0: "B"},
    'eeg_p2_rand_v2_v2': {1.0: "A", 0.0: "B"},
    'eeg_rs_rio_reason': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand", 
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p1_b1_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p1_b2_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p1_b3_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p1_b4_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p1_b4_reasons_2_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p1_b4_reasons_3_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_to_b1_reasons_v2_v3': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p6_b1_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p3_b1_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p3_b2_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p3_b3_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p3_b3_reasons_2_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p5_b1_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p5_b2_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p5_b3_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p5_b4_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p9_b1_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p7_b1_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p7_b1_reasons_2_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p4_b1_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p2_b1_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p10_b1_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p8_b1_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    },
    'eeget_p8_b2_reasons_v2_v2': {
        "0": "Movement", 
        "1": "Attention", 
        "2": "Talking", 
        "3": "Did not understand",
        "4": "Distance from screen", 
        "5": "Eye-tracker recalibration", 
        "6": "EEG Net movement", 
        "7": "Other"
    }
}

# Special columns
special_cols = ['eeg_sex_birth_specify', 'eeg_diagnosis_unk', 'eeg_participant_medic', 'eeg_diagnosis_other']

In [576]:
validation_results = []

# Create table for validation results

#Forms to check 
inds_val_required= []
participant_list=  set()

# Check columns

symmary_df=main_vali.copy()
for col in symmary_df.columns: 
    # Process each column
    col_type = next((t for t, cols in column_types.items() if col in cols), 'unknown')

 
    if col in ['eeg_attempted', 'eeg_today_date', 'eeget_date_v2_v2']:
        df = symmary_df
        print("All columns checked for column:", col)
        inds_val_required.append({'Column': col, 'Participants': df.loc[df[col].isna(), 'eeg_participant_code'].unique().tolist()})
        missing_ids = df.loc[df[col].isna(), 'record_id'].unique()
        participant_list.update(missing_ids)
 
    elif col == 'eeg_attempted_reasons':
        # Check those 
        df = symmary_df[symmary_df['eeg_attempted'] == 0.0]
        print("All columns checked for column:", col)
        inds_val_required.append({'Column': col, 'Participants': df.loc[df[col].isna(), 'eeg_participant_code'].unique().tolist()})
        missing_ids = df.loc[df[col].isna(), 'record_id'].unique()
        participant_list.update(missing_ids)
    
    elif col in special_cols:
        if col == 'eeg_sex_birth_specify':
            # Check those that are not in the list
            df = symmary_df[symmary_df['eeg_sex_birth'].isna()]
        elif col == 'eeg_diagnosis_unk':
            # Check those that are not in the list
            df = symmary_df[symmary_df['eeg_diagnosis_unk'].isna()]

    else: 
        # Get rid of all those that did not attempt the test
        df = symmary_df[symmary_df['eeg_attempted'] == 1.0]
        inds_val_required.append({'Column': col, 'Participants': df.loc[df[col].isna(), 'eeg_participant_code'].unique().tolist()})
        missing_ids = df.loc[df[col].isna(), 'record_id'].unique()
        participant_list.update(missing_ids)
    

        
    na_count = df[col].isna().sum()
    total_count = len(df)
    na_percent = round((na_count / total_count) * 100, 1)
    
    result = {
        'Raw variable name': col,
        'Type': col_type.capitalize(),
        'Missing n': na_count,
        'Missing %': na_percent,
        'Unique Values': df[col].nunique(),
        'Issues': []
    }



    # Type-specific validation
    if col_type == 'yes_no':
        valid_values = df[col].dropna().unique()
        yes_count = df[col].isin([1, 1.0]).sum()
        no_count = df[col].isin([0, 0.0]).sum()
        yes_percent = round((yes_count / total_count) * 100, 1)
        no_percent = round((no_count / total_count) * 100, 1)
        other_count = total_count - yes_count - no_count - na_count
        
        # Get mapped top categories
        top_cats = get_top_categories(df[col], col)
        
        result.update({
            "Top Categories": str(top_cats),
            "Examples": df[col].dropna().unique()[:2].tolist(),
            "Extra info": f"Yes %: {yes_percent}, No % : {no_percent}",
        })
        
        if other_count > 0:
            result['Issues'].append(f"Found {other_count} values that are not 0/1")
            
    elif col_type == 'categorical':
        value_counts = df[col].value_counts(dropna=False)
        value_percent = (value_counts / total_count) * 100
        value_percent = value_percent.round(1)
        top_values = value_counts.head(5).to_dict()
        top_values_percent = value_percent.head(5).to_dict()


        
        # Get mapped top categories
        top_cats = get_top_categories(df[col], col, percent=False)
        top_cats_percent = get_top_categories(df[col], col, percent=True)
        
        result.update({
            "Top Categories": str(top_cats),
            "Examples": df[col].dropna().unique()[:2].tolist(), 
            "Extra info": str(top_cats_percent),
        })
 
    elif col_type == 'numerical':
        non_null = df[col].dropna()
        non_null= non_null.round(2)
        if len(non_null) > 0:
            result.update({

                'Examples': non_null.unique()[:2].tolist(),
                "Extra info": f"Min: {non_null.min()}, Max: {non_null.max()}, Mean: {round(non_null.mean(), 2)}"
             })
                
    elif col_type == 'date':
        # This would need to be adapted to your specific date format
        try:
            dates = pd.to_datetime(df[col].dropna())
            result.update({
                'Examples': dates.dt.strftime('%Y-%m-%d').unique()[:2].tolist(), 
                'Extra info': f"Date range: {dates.min().strftime('%Y-%m-%d')} to {dates.max().strftime('%Y-%m-%d')}"
            })
            
 
        except Exception as e:
            result['Issues'].append(f"Date parsing error: {str(e)}")
            
    elif col_type == 'text':
        non_null = df[col].dropna()
        if len(non_null) > 0:
            # Check for empty strings
            empty_strings = (non_null == '').sum()
            if empty_strings > 0:
                result['Issues'].append(f"Found {empty_strings} empty strings")
        if non_null.empty:
            extra_info = f"Column '{col}' has no values."
        else:
            most_common = non_null.value_counts().idxmax() if not non_null.value_counts().empty else None
            extra_info = f"Most common: {most_common}" if most_common else "No common value"


        result.update({
            'Examples': non_null.unique()[:2].tolist(), 
            "Extra info": extra_info,
                            })
    
    # Format issues as a string
    result['Issues'] = '; '.join(result['Issues']) if result['Issues'] else 'None'
    validation_results.append(result)

    # Create a DataFrame for the current column's validation results
    summary_data = pd.DataFrame(validation_results)
    
# Convert inds_val_required to a DataFrame
inds_val_required = pd.DataFrame(inds_val_required)
inds_val_required


All columns checked for column: eeg_attempted
All columns checked for column: eeg_attempted_reasons
All columns checked for column: eeg_today_date
All columns checked for column: eeget_date_v2_v2


Unnamed: 0,Column,Participants
0,record_id,[]
1,eeg_attempted,"[Q1K_MHC_200229_F1, Q1K_HSJ_1525-1130_M1, Q1K_..."
2,eeg_attempted_reasons,"[Q1K_HSJ_10040_P, Q1K_HSJ_10040_M1, Q1K_HSJ_10..."
3,eeg_site,"[Q1K_MHC_200179_P, Q1K_MHC_200179_M1]"
4,eeg_participant_code,[]
5,eeg_code_software,"[Q1K_HSJ_10043_P, Q1K_HSJ_100119_M1, Q1K_HSJ_1..."
6,eeg_birthdate_v2_v2,[]
7,eeg_today_date,"[Q1K_HSJ_10040_P, Q1K_HSJ_10040_M1, Q1K_HSJ_10..."
8,eeg_age_years_testdate,[]
9,eeg_age_years,[]


In [577]:
summary_data

Unnamed: 0,Raw variable name,Type,Missing n,Missing %,Unique Values,Issues,Examples,Extra info,Top Categories
0,record_id,Text,0,0.0,274,,"[42, 43]",Most common: 42,
1,eeg_attempted,Yes_no,8,2.5,2,,"[0.0, 1.0]","Yes %: 84.8, No % : 12.7","{'Yes': 274, 'No': 41}"
2,eeg_attempted_reasons,Text,14,34.1,16,,[Hair way too thick. Braided not possible to u...,Most common: Participant had to leave.,
3,eeg_site,Categorical,2,0.7,2,,"[2.0, 1.0]","{'HSJ': 0.78, 'MHC': 0.21}","{'HSJ': 215, 'MHC': 57}"
4,eeg_participant_code,Text,0,0.0,274,,"[Q1K_MHC_20042_P, Q1K_HSJ_10043_P]",Most common: Q1K_MHC_20042_P,
5,eeg_code_software,Text,4,1.5,269,,"[Q042_P, Q043_F1]",Most common: Q123_P,
6,eeg_birthdate_v2_v2,Date,0,0.0,271,,"[1993-08-19, 2007-05-02]",Date range: 1959-07-03 to 2022-05-27,
7,eeg_today_date,Date,19,5.9,119,,"[2024-08-09, 2024-03-28]",Date range: 2024-02-07 to 2025-05-02,
8,eeg_age_years_testdate,Numerical,0,0.0,270,,"[30.97, 16.91]","Min: 2.63, Max: 65.14, Mean: 24.77",
9,eeg_age_years,Numerical,0,0.0,271,,"[31.7, 18.0]","Min: 2.93, Max: 65.83, Mean: 25.38",


In [578]:
participant_list= pd.DataFrame(participant_list)
participant_list

Unnamed: 0,0
0,180
1,56
2,101
3,525-1132
4,122
5,525-1018
6,525-1211
7,525-1127
8,98
9,525-1161


In [579]:
# Merge summary data with mapping_df
summary_data = summary_data.merge(mapping_df, left_on='Raw variable name', right_on='raw_var_name', how='left')
summary_data = summary_data.drop(columns=['raw_var_name'])

# Make new label be the second column
summary_data = summary_data.rename(columns={'new_label': 'Column', })
summary_data = summary_data[['Column'] + [col for col in summary_data.columns if col != 'Column']]

summary_data


Unnamed: 0,Column,Raw variable name,Type,Missing n,Missing %,Unique Values,Issues,Examples,Extra info,Top Categories
0,Record ID,record_id,Text,0,0.0,274,,"[42, 43]",Most common: 42,
1,Was EEG attempted?,eeg_attempted,Yes_no,8,2.5,2,,"[0.0, 1.0]","Yes %: 84.8, No % : 12.7","{'Yes': 274, 'No': 41}"
2,Reasons:,eeg_attempted_reasons,Text,14,34.1,16,,[Hair way too thick. Braided not possible to u...,Most common: Participant had to leave.,
3,EEG site:,eeg_site,Categorical,2,0.7,2,,"[2.0, 1.0]","{'HSJ': 0.78, 'MHC': 0.21}","{'HSJ': 215, 'MHC': 57}"
4,Participant EEG code,eeg_participant_code,Text,0,0.0,274,,"[Q1K_MHC_20042_P, Q1K_HSJ_10043_P]",Most common: Q1K_MHC_20042_P,
5,Eye tracking code,eeg_code_software,Text,4,1.5,269,,"[Q042_P, Q043_F1]",Most common: Q123_P,
6,Birthdate,eeg_birthdate_v2_v2,Date,0,0.0,271,,"[1993-08-19, 2007-05-02]",Date range: 1959-07-03 to 2022-05-27,
7,EEG date 1,eeg_today_date,Date,19,5.9,119,,"[2024-08-09, 2024-03-28]",Date range: 2024-02-07 to 2025-05-02,
8,Age in years REMOVE,eeg_age_years_testdate,Numerical,0,0.0,270,,"[30.97, 16.91]","Min: 2.63, Max: 65.14, Mean: 24.77",
9,Age in years,eeg_age_years,Numerical,0,0.0,271,,"[31.7, 18.0]","Min: 2.93, Max: 65.83, Mean: 25.38",


In [580]:
# Save the summary DataFrame to a CSV file
#summary_data.to_csv(f"../source/{date}/Main_validation_summary_main_variables_{date}.csv", index=False)
# Save the participant_list DataFrame to a CSV file
#participant_list.to_csv(f"../source/{date}/Main_record_ids_to_check_{date}.csv", index=False)    



## Validation of ALL variables

In [581]:
vali_df.columns.tolist()


['record_id',
 'redcap_event_name',
 'eeg_attempted',
 'eeg_attempted_reasons',
 'eeg_site',
 'eeg_participant_code',
 'eeg_code_software',
 'eeg_birthdate_v2_v2',
 'eeg_today_date',
 'eeg_age_years_testdate',
 'eeg_age_years',
 'eeg_sex_birth',
 'eeg_sex_birth_specify',
 'eeg_diagnosis',
 'eeg_diagnosis_unk',
 'eeg_diagnosis_other',
 'eeg_participant_medic',
 'eeg_participant_handedness',
 'eeg_age_v2_v2',
 'eeget_date_v2_v2',
 'eeg_time_v2_v2',
 'eeg_exp_v2_v2',
 'eeg_bathroom',
 'eeg_cellphone',
 'egg_light_off',
 'eeg_instal_obs',
 'eeg_head_circ_v2_v2',
 'eeg_height_v2',
 'eeg_nasion_inion_v2_v2',
 'eeg_lefttoright_v2_v2',
 'eeg_44_47',
 'eeg_47_51',
 'eeg_51_54',
 'eeg_54_56',
 'eeg_56_58',
 'eeg_58_61',
 'eeg_other_net_size',
 'eeg_h12664',
 'eeg_s003466',
 'eeg_s004367',
 'eeg_s002720',
 'eeg_s004597',
 'eeg_s004463',
 'eeg_h12417',
 'eeg_h12416',
 'eeg_h12419',
 'eeg_h12424',
 'eeg_h12432',
 'eeg_h12421',
 'eeget_hc_netsize_v2_v2',
 'eeg_size_other_v2_v2',
 'eeg_fit_v2_v2',
 '

### Clean up variables


#### Net size

In [582]:
net_size_cols = [
    'eeg_44_47',
    'eeg_47_51',
    'eeg_51_54',
    'eeg_54_56',
    'eeg_56_58',
    'eeg_58_61',
    'eeg_other_net_size'
]

In [583]:
# Create a new column with the selected net size
vali_df['eeg_net_size'] = vali_df[net_size_cols].apply(
    lambda row: next((col for col in net_size_cols if row[col] == 0.0), None),
    axis=1
)

# If the net size is set to other, use the value from the other column
vali_df['eeg_net_size'] = vali_df.apply(
    lambda row: row['eeg_size_other_v2_v2'] if row['eeg_net_size'] == 'eeg_other_net_size' else row['eeg_net_size'],
    axis=1
)

# Remove the eeg 
vali_df['eeg_net_size'] = vali_df['eeg_net_size'].str.replace('eeg_', '')

# Drop the original columns
#vali_df = vali_df.drop(columns=net_size_cols)

vali_df.eeg_net_size.value_counts()


54_56    63
51_54    55
56_58    52
58_61    42
47_51    27
44_47     2
61-64     1
Name: eeg_net_size, dtype: int64

#### EEG Net ids

In [584]:
net_ids_cols = [ 'eeg_h12664',
 'eeg_s003466',
 'eeg_s004367',
 'eeg_s002720',
 'eeg_s004597',
 'eeg_s004463',
 'eeg_h12417',
 'eeg_h12416',
 'eeg_h12419',
 'eeg_h12424',
 'eeg_h12432',
 'eeg_h12421'
 ]

In [585]:
# create a new column with the selected net size
vali_df['eeg_net_ids'] = vali_df[net_ids_cols].apply(
    lambda row: next((col for col in net_ids_cols if row[col] == 1), None),
    axis=1
)
# Remove the eeg
vali_df['eeg_net_ids'] = vali_df['eeg_net_ids'].str.replace('eeg_', '')

# Drop the original columns
#vali_df = vali_df.drop(columns=net_ids_cols)
vali_df.eeg_net_ids.value_counts()


h12421    21
h12419    13
h12432    10
h12424    10
Name: eeg_net_ids, dtype: int64

In [586]:
vali_df

Unnamed: 0,record_id,redcap_event_name,eeg_attempted,eeg_attempted_reasons,eeg_site,eeg_participant_code,eeg_code_software,eeg_birthdate_v2_v2,eeg_today_date,eeg_age_years_testdate,...,eeget_p8_b2_quest_v2_v2,eeget_p8_b2_inv_v2_v2,eeget_p8_b2_reasons_v2_v2,eeget_p8_b2_notes_v2_v2,eeg_endrecord,eeget_general_notes_v2_v2,change_status,eeget_session_log_complete,eeg_net_size,eeg_net_ids
1,40,phase_3_arm_1,0.0,,1.0,Q1K_HSJ_10040_P,,2018-03-17,,,...,,,,,,,,0,,
2,41,phase_3_arm_1,0.0,,1.0,Q1K_HSJ_10040_M1,,1985-01-10,,,...,,,,,,,,0,,
3,42,phase_3_arm_1,1.0,,2.0,Q1K_MHC_20042_P,Q042_P,1993-08-19,2024-08-09,30.973942,...,,,,,,,,2,,h12432
4,43,phase_3_arm_1,1.0,,1.0,Q1K_HSJ_10043_P,,2007-05-02,2024-03-28,16.906576,...,,,,,,,,0,51_54,
6,45,phase_3_arm_1,1.0,,1.0,Q1K_HSJ_10043_F1,Q043_F1,1966-04-13,2024-05-31,58.133865,...,,,,,14:21,,,2,51_54,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,525-1256,phase_3_arm_1,1.0,,1.0,Q1K_HSJ_1525-1256_P,Q1256_P,2009-10-28,2025-05-01,15.507505,...,,,,,13:53,,,2,58_61,
503,525-1257,phase_3_arm_1,1.0,,1.0,Q1K_HSJ_1525-1256_S1,Q1256_S1,2008-01-01,2025-05-01,17.330837,...,,,,,12:05,,,2,56_58,
504,525-1258,phase_3_arm_1,1.0,,1.0,Q1K_HSJ_1525-1256_S2,Q1256_S2,2007-01-03,2025-05-01,18.324698,...,,,,,11:00,,,2,56_58,
505,525-1259,phase_3_arm_1,0.0,No time left. Family had to leave.,1.0,Q1K_HSJ_1525-1256_M1,Q1256_M1,1980-01-06,2025-05-01,45.317723,...,,,,,,,,2,,


In [587]:
column_types = {
    'yes_no': [
        'eeg_attempted', 
        'eeg_bathroom', 
        'eeg_cellphone',
        'egg_light_off',
        'eeg_44_47', 
        'eeg_47_51', 
        'eeg_51_54',
        'eeg_54_56', 
        'eeg_56_58', 
        'eeg_58_61', 
        'eeg_other_net_size',
        'eeg_h12664', 
        'eeg_s003466', 
        'eeg_s004367', 
        'eeg_s002720',
        'eeg_s004597', 
        'eeg_s004463', 
        'eeg_h12417', 
        'eeg_h12416',
        'eeg_h12419', 
        'eeg_h12424', 
        'eeg_h12432', 
        'eeg_h12421',
        'eeg_elec_digitized', 
        'eeg_imped_v2_v2', 
        'eeg_rsrio_done', 
        'eeg_rs_done',
        'eeg_datagood_rsrio',
        'eeg_dataquest_rsrio',
        'eeg_datainvalid_rsrio',
        'eeg_to_done', 
        'eeg_go_done', 
        'eeg_vep_done', 
        'eeg_nsp_done', 
        'eeg_pl_done',
        'eeg_vs_done', 
        'eeg_as_done', 
        'eeg_fsp_done', 
        'eeg_mmn_done',
        'eeg_aep_done',
        'eeget_p1_b1_good_v2_v2', 
        'eeget_p1_b1_quest_v2_v2', 
        'eeget_p1_b1_inv_v2_v2',
        'eeget_p1_b2_good_v2_v2', 
        'eeget_p1_b2_quest_v2_v2', 
        'eeget_p1_b2_inv_v2_v2',
        'eeget_p1_b3_good_v2_v2', 
        'eeget_p1_b3_quest_v2_v2', 
        'eeget_p1_b3_inv_v2_v2',
        'eeget_p1_b4_good_v2_v2', 
        'eeget_p1_b4_quest_v2_v2', 
        'eeget_p1_b4_inv_v2_v2',
        'eeget_p1_b4_good_2_v2_v2', 
        'eeget_p1_b5_quest_v2_v2', 
        'eeget_p1_b5_inv_v2_v2',
        'eeget_p1_b4_good_3_v2_v2', 
        'eeget_p1_b5_quest_2_v2_v2', 
        'eeget_p1_b5_inv_2_v2_v2',
        'eeget_to_b1_good_v2_v3', 
        'eeget_to_b1_quest_v2_v3', 
        'eeget_to_b1_inv_v2_v3',
        'eeget_p6_b1_good_v2_v2', 
        'eeget_p6_b1_quest_v2_v2', 
        'eeget_p6_b1_inv_v2_v2',
        'eeget_p3_b1_good_v2_v2', 
        'eeget_p3_b1_quest_v2_v2', 
        'eeget_p3_b1_inv_v2_v2',
        'eeget_p3_b2_good_v2_v2', 
        'eeget_p3_b2_quest_v2_v2', 
        'eeget_p3_b2_inv_v2_v2',
        'eeget_p3_b3_good_v2_v2', 
        'eeget_p3_b3_quest_v2_v2', 
        'eeget_p3_b3_inv_v2_v2',
        'eeget_p3_b3_inv_3_v2_v2', 
        'eeget_p3_b3_good_2_v2_v2', 
        'eeget_p3_b3_quest_2_v2_v2',
        'eeget_p3_b3_inv_2_v2_v2', 
        'eeget_p5_b1_good_v2_v2', 
        'eeget_p5_b1_quest_v2_v2',
        'eeget_p5_b1_inv_v2_v2', 
        'eeget_p5_b2_good_v2_v2', 
        'eeget_p5_b2_quest_v2_v2',
        'eeget_p5_b2_inv_v2_v2', 
        'eeget_p5_b3_good_v2_v2', 
        'eeget_p5_b3_quest_v2_v2',
        'eeget_p5_b3_inv_v2_v2', 
        'eeget_p5_b4_good_v2_v2', 
        'eeget_p5_b4_quest_v2_v2',
        'eeget_p5_b4_inv_v2_v2', 
        'eeget_p9_b1_good_v2_v2', 
        'eeget_p9_b1_quest_v2_v2',
        'eeget_p9_b1_inv_v2_v2', 
        'eeget_p7_b1_good_v2_v2', 
        'eeget_p7_b1_quest_v2_v2',
        'eeget_p7_b1_inv_v2_v2', 
        'eeget_p7_b1_good_2_v2_v2', 
        'eeget_p7_b1_quest_2_v2_v2',
        'eeget_p7_b1_inv_2_v2_v2', 
        'eeget_p4_b1_good_v2_v2', 
        'eeget_p4_b1_quest_v2_v2',
        'eeget_p4_b1_inv_v2_v2', 
        'eeget_p2_b1_good_v2_v2', 
        'eeget_p2_b1_quest_v2_v2',
        'eeget_p2_b1_inv_v2_v2', 
        'eeget_p10_b1_good_v2_v2', 
        'eeget_p10_b1_quest_v2_v2',
        'eeget_p10_b1_inv_v2_v2', 
        'eeget_p8_b1_good_v2_v2', 
        'eeget_p8_b1_quest_v2_v2',
        'eeget_p8_b1_inv_v2_v2', 
        'eeget_p8_b2_good_v2_v2', 
        'eeget_p8_b2_quest_v2_v2',
        'eeget_p8_b2_inv_v2_v2',
        'eeget_p3_b3_100_v2_v2',
        'eeget_p3_b4_yes_v2_v2',
        'eeg_p1_b5_v2_v2',
        'eeg_p1_b6_v2_v2',
        'eeg_vep_block3',
        'eeg_plr_block2'
    ],
    
    'categorical': [
        'eeg_sex_birth', 
        'eeg_site', 
        'eeg_participant_handedness', 
        'eeg_fit_v2_v2',
        'eeg_vep_a_b_file', 
        'eeg_ssaep_a_b_version', 
        'eeg_mmn_a_b_file',
        'eeg_p2_rand_v2_v2', 
        'eeget_p2_chklist_impedance_v2_v2',
        'eeg_net_size',
        'eeg_rs_rio_eyeo_com',
        'eeg_rs_rio_reason',        'eeget_p1_b1_reasons_v2_v2',
        'eeget_p1_b2_reasons_v2_v2',
        'eeget_p1_b3_reasons_v2_v2',
        'eeget_p1_b4_reasons_v2_v2',
        'eeget_p1_b4_reasons_2_v2_v2',
        'eeget_p1_b4_reasons_3_v2_v2',
        'eeget_to_b1_reasons_v2_v3',
        'eeget_p6_b1_reasons_v2_v2',
        'eeget_p3_b1_reasons_v2_v2',
        'eeget_p3_b2_reasons_v2_v2',
        'eeget_p3_b3_reasons_v2_v2',
        'eeget_p3_b3_reasons_2_v2_v2',
        'eeget_p5_b1_reasons_v2_v2',
        'eeget_p5_b2_reasons_v2_v2',
        'eeget_p5_b3_reasons_v2_v2',
        'eeget_p5_b4_reasons_v2_v2',
        'eeget_p9_b1_reasons_v2_v2',
        'eeget_p7_b1_reasons_v2_v2',
        'eeget_p7_b1_reasons_2_v2_v2',
        'eeget_p4_b1_reasons_v2_v2',
        'eeget_p2_b1_reasons_v2_v2',
        'eeget_p10_b1_reasons_v2_v2',
        'eeget_p8_b1_reasons_v2_v2',
        'eeget_p8_b2_reasons_v2_v2'
    ],
    
    'numerical': [
        'eeg_age_years', 
        'eeg_age_years_testdate', 
        'eeg_age_v2_v2', 
        'eeg_head_circ_v2_v2',
        'eeg_height_v2', 
        'eeg_nasion_inion_v2_v2', 
        'eeg_lefttoright_v2_v2',
        'eeg_p5_cal_v2_v2', 
        'eeg_p5_val_v2_v2', 
        'eeg_p7_cal_v2_v2', 
        'eeg_p7_val_v2_v2',
        'eeg_p8_imp_v2_v2', 
        'eeg_p8_cal_v2_v2', 
        'eeg_p8_val_v2_v2', 
        'eeg_p5_imp_v2_v2',
        'eeg_p6_cal_v2_v2', 
        'eeg_p6_val_v2_v2', 
        'eeg_p6_imp_v2_v2', 
        'eeg_p3_cal_v2_v2',
        'eeg_p3_val_v2_v2', 
        'eeg_p9_imp_v2_v2', 
        'eeg_p9_cal_v2_v2', 
        'eeg_p9_val_v2_v2',
        'eeg_p2_imp_v2_v2'
    ],
    
    'date': [
        'eeg_birthdate_v2_v2', 
        'eeg_today_date', 
        'eeget_date_v2_v2',
        'eeg_beginrecord',
        'eeg_endrecord'
    ],

    'time': [
        'eeg_time_v2_v2',
        'eeg_rs_rio_times_v2', 
        'eeget_p1_start_v2_v2',
        'eeget_to_start_v2', 
        'eeget_p6_start_v2_v2', 
        'eeget_p3_start_v2_v2',
        'eeget_p5_start_v2_v2', 
        'eeget_p9_start_v2_v2', 
        'eeget_p7_start_v2_v2',
        'eeget_p4_start_v2_v2', 
        'eeget_p2_start_v2_v2', 
        'eeget_p10_start_v2_v2',
        'eeget_p8_start_v2_v2'
    ],
    
    'text': [
        'record_id',
        'redcap_event_name',
        'eeg_participant_code', 
        'eeg_code_software', 
        'eeg_sex_birth_specify',
        'eeg_diagnosis', 
        'eeg_diagnosis_unk', 
        'eeg_diagnosis_other',
        'eeg_participant_medic', 
        'eeg_attempted_reasons',
        'eeg_exp_v2_v2', 
        'eeg_instal_obs', 
        'eeget_hc_netsize_v2_v2', 
        'eeg_size_other_v2_v2',
        'eeg_dis_notes_v2_v2', 
        'eeg_code_dig', 
        'eeg_elec_digitalized_notes',
        'eeg_imped_notes_v2_v2', 
        'eeg_p0_code', 
        'eeg_rsrio_notdone', 
        'eeg_rs_rio_notes',
        'eeg_rs_notdone', 
        'eeg_p1_code',
        'eeget_p1_b1_notes_v2_v2', 
        'eeget_p1_b2_notes_v2_v2', 
        'eeget_p1_b3_notes_v2_v2',
        'eeget_p1_b4_notes_v2_v2', 
        'eeget_p1_b4_notes_2_v2_v2', 
        'eeget_p1_b4_notes_3_v2_v2',
        'eeg_to_code', 
        'eeg_to_notdone', 
        'eeg_go_code', 
        'eeg_go_notdone',
        'eeg_p5_cal_val_notes_v2_v2', 
        'eeg_p7_val_cal_notes_v2_v2', 
        'eeg_p2_imp_notes_v2_v2',
        'eeg_vep_code', 
        'eeg_vep_notdone', 
        'eeg_p9_imp_notes_v2_v2', 
        'eeg_p8_val_cal_notes_v2_v2',
        'eeg_nsp_code', 
        'eeg_nsp_notdone', 
        'eeg_p6_cal_val_notes_v2_v2', 
        'eeg_p5_imp_notes_v2_v2',
        'eeg_pl_code', 
        'eeg_pl_notdone', 
        'eeg_vs_code', 
        'eeg_vs_notdone', 
        'eeg_p3_cal_val_notes_v2_v2',
        'eeg_as_code', 
        'eeg_as_notdone', 
        'eeg_p1_notes_v2_v2', 
        'eeg_fsp_code', 
        'eeg_fsp_notdone',
        'eeg_p10_imp_notes_v2_v2', 
        'eeg_p9_cal_val_notes_v2_v2', 
        'eeg_mmn_code', 
        'eeg_mmn_notdone',
        'eeget_general_notes_v2_v2', 
        'change_status', 
        'eeget_p3_b1_notes_v2_v2',
        'eeget_p3_b2_notes_v2_v2', 
        'eeget_p3_b3_notes_v2_v2', 
        'eeget_p3_b3_notes_2_v2_v2',
        'eeget_p5_b1_notes_v2_v2', 
        'eeget_p5_b2_notes_v2_v2', 
        'eeget_p5_b3_notes_v2_v2',
        'eeget_p5_b4_notes_v2_v2', 
        'eeget_p6_b1_notes_v2_v2', 
        'eeget_p9_b1_notes_v2_v2',
        'eeget_p7_b1_notes_v2_v2', 
        'eeget_p4_b1_notes_v2_v2', 
        'eeget_p2_b1_notes_v2_v2',
        'eeget_p10_b1_notes_v2_v2', 
        'eeget_p8_b1_notes_v2_v2', 
        'eeget_p8_b2_notes_v2_v2',
        'eeget_to_b1_notes_v2_v3', 
        'eeget_session_log_complete',
        'eeg_net_ids',

    ]
}

In [588]:
vali_df.columns.tolist()  # Check the columns in the DataFrame

['record_id',
 'redcap_event_name',
 'eeg_attempted',
 'eeg_attempted_reasons',
 'eeg_site',
 'eeg_participant_code',
 'eeg_code_software',
 'eeg_birthdate_v2_v2',
 'eeg_today_date',
 'eeg_age_years_testdate',
 'eeg_age_years',
 'eeg_sex_birth',
 'eeg_sex_birth_specify',
 'eeg_diagnosis',
 'eeg_diagnosis_unk',
 'eeg_diagnosis_other',
 'eeg_participant_medic',
 'eeg_participant_handedness',
 'eeg_age_v2_v2',
 'eeget_date_v2_v2',
 'eeg_time_v2_v2',
 'eeg_exp_v2_v2',
 'eeg_bathroom',
 'eeg_cellphone',
 'egg_light_off',
 'eeg_instal_obs',
 'eeg_head_circ_v2_v2',
 'eeg_height_v2',
 'eeg_nasion_inion_v2_v2',
 'eeg_lefttoright_v2_v2',
 'eeg_44_47',
 'eeg_47_51',
 'eeg_51_54',
 'eeg_54_56',
 'eeg_56_58',
 'eeg_58_61',
 'eeg_other_net_size',
 'eeg_h12664',
 'eeg_s003466',
 'eeg_s004367',
 'eeg_s002720',
 'eeg_s004597',
 'eeg_s004463',
 'eeg_h12417',
 'eeg_h12416',
 'eeg_h12419',
 'eeg_h12424',
 'eeg_h12432',
 'eeg_h12421',
 'eeget_hc_netsize_v2_v2',
 'eeg_size_other_v2_v2',
 'eeg_fit_v2_v2',
 '

In [589]:
symmary_df

Unnamed: 0,record_id,eeg_attempted,eeg_attempted_reasons,eeg_site,eeg_participant_code,eeg_code_software,eeg_birthdate_v2_v2,eeg_today_date,eeg_age_years_testdate,eeg_age_years,eeg_sex_birth,eeg_participant_medic,eeg_participant_handedness,eeg_age_v2_v2,eeget_date_v2_v2
1,40,0.0,,1.0,Q1K_HSJ_10040_P,,2018-03-17,,,7.124034,,,,,
2,41,0.0,,1.0,Q1K_HSJ_10040_M1,,1985-01-10,,,40.304615,,,,,
3,42,1.0,,2.0,Q1K_MHC_20042_P,Q042_P,1993-08-19,2024-08-09,30.973942,31.699487,1.0,Gabapentin,1.0,372.0,2024-08-09
4,43,1.0,,1.0,Q1K_HSJ_10043_P,,2007-05-02,2024-03-28,16.906576,17.999001,2.0,Épilepsie et TDAH (parent ne connait pas le nom),3.0,203.0,2024-03-28
6,45,1.0,,1.0,Q1K_HSJ_10043_F1,Q043_F1,1966-04-13,2024-05-31,58.133865,59.051064,2.0,,1.0,698.0,2024-05-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,525-1256,1.0,,1.0,Q1K_HSJ_1525-1256_P,Q1256_P,2009-10-28,2025-05-01,15.507505,15.507505,2.0,Vyvanse 40mg,1.0,186.0,2025-05-01
503,525-1257,1.0,,1.0,Q1K_HSJ_1525-1256_S1,Q1256_S1,2008-01-01,2025-05-01,17.330837,17.330837,2.0,Vyvanse 40 mg,1.0,208.0,2025-05-01
504,525-1258,1.0,,1.0,Q1K_HSJ_1525-1256_S2,Q1256_S2,2007-01-03,2025-05-01,18.324698,18.324698,1.0,sertraline 125mg 1 mg Intuniv 10 mg Vyvanse,3.0,220.0,2025-05-01
505,525-1259,0.0,No time left. Family had to leave.,1.0,Q1K_HSJ_1525-1256_M1,Q1256_M1,1980-01-06,2025-05-01,45.317723,45.317723,1.0,,,544.0,2025-05-01


In [590]:
symmary_df.columns

Index(['record_id', 'eeg_attempted', 'eeg_attempted_reasons', 'eeg_site',
       'eeg_participant_code', 'eeg_code_software', 'eeg_birthdate_v2_v2',
       'eeg_today_date', 'eeg_age_years_testdate', 'eeg_age_years',
       'eeg_sex_birth', 'eeg_participant_medic', 'eeg_participant_handedness',
       'eeg_age_v2_v2', 'eeget_date_v2_v2'],
      dtype='object')

In [591]:
validation_results = []

# Create table for validation results

#Forms to check 
inds_val_required= []
participant_list=  set()

# Check columns

symmary_df=vali_df.copy()
for col in symmary_df.columns: 
    # Process each column
    col_type = next((t for t, cols in column_types.items() if col in cols), 'unknown')

 
    if col in ['eeg_attempted', 'eeg_today_date', 'eeget_date_v2_v2']:
        print("All columns checked for column:", col)
        df = symmary_df
        inds_val_required.append({'Column': col, 'Participants': df.loc[df[col].isna(), 'eeg_participant_code'].unique().tolist()})
        missing_ids = df.loc[df[col].isna(), 'record_id'].unique()
        participant_list.update(missing_ids)
 
    elif col == 'eeg_attempted_reasons':
        # Check those 
        df = symmary_df[symmary_df['eeg_attempted'] == 0.0]
        print("All columns checked for column:", col)
        inds_val_required.append({'Column': col, 'Participants': df.loc[df[col].isna(), 'eeg_participant_code'].unique().tolist()})
        missing_ids = df.loc[df[col].isna(), 'record_id'].unique()
        participant_list.update(missing_ids)
    
    elif col in special_cols:
        if col == 'eeg_sex_birth_specify':
            # Check those that are not in the list
            df = symmary_df[symmary_df['eeg_sex_birth'].isna()]
        elif col == 'eeg_diagnosis_unk':
            # Check those that are not in the list
            df = symmary_df[symmary_df['eeg_diagnosis_unk'].isna()]

    else: 
        # Get rid of all those that did not attempt the test
        df = symmary_df[symmary_df['eeg_attempted'] == 1.0]
        inds_val_required.append({'Column': col, 'Participants': df.loc[df[col].isna(), 'eeg_participant_code'].unique().tolist()})
        missing_ids = df.loc[df[col].isna(), 'record_id'].unique()
        participant_list.update(missing_ids)
    

        
    na_count = df[col].isna().sum()
    total_count = len(df)
    na_percent = round((na_count / total_count) * 100, 1)
    
    result = {
        'Raw variable name': col,
        'Type': col_type.capitalize(),
        'Missing n': na_count,
        'Missing %': na_percent,
        'Unique Values': df[col].nunique(),
        'Issues': []
    }



    # Type-specific validation
    if col_type == 'yes_no':
        valid_values = df[col].dropna().unique()
        if len(valid_values) >1:
            yes_count = df[col].isin([1, 1.0, "1"]).sum()
            no_count = df[col].isin([0, 0.0, "0"]).sum()
        elif len(valid_values) == 1:
            no_count = df[col].isin([1, 1.0, "1"]).sum()
            yes_count_count = df[col].isin([0, 0.0, "0"]).sum()
        yes_percent = round((yes_count / total_count) * 100, 1)
        no_percent = round((no_count / total_count) * 100, 1)
        other_count = total_count - yes_count - no_count - na_count
        
        # Get mapped top categories
        top_cats = get_top_categories(df[col], col)
        
        result.update({
            "Top Categories": str(top_cats),
            "Examples": df[col].dropna().unique()[:2].tolist(),
            "Extra info": f"Yes %: {yes_percent}, No % : {no_percent}",
        })
        
        if other_count > 0:
            result['Issues'].append(f"Found {other_count} values that are not 0/1")
            
    elif col_type == 'categorical':
        value_counts = df[col].value_counts(dropna=False)
        value_percent = (value_counts / total_count) * 100
        value_percent = value_percent.round(1)
        top_values = value_counts.head(5).to_dict()
        top_values_percent = value_percent.head(5).to_dict()


        
        # Get mapped top categories
        top_cats = get_top_categories(df[col], col, percent=False)
        top_cats_percent = get_top_categories(df[col], col, percent=True)
        
        result.update({
            "Top Categories": str(top_cats),
            "Examples": df[col].dropna().unique()[:2].tolist(), 
            "Extra info": str(top_cats_percent),
        })
 
    elif col_type == 'numerical':
        non_null = df[col].dropna()
        non_null= non_null.round(2)
        if len(non_null) > 0:
            result.update({

                'Examples': non_null.unique()[:2].tolist(),
                "Extra info": f"Min: {non_null.min()}, Max: {non_null.max()}, Mean: {round(non_null.mean(), 2)}"
             })
                
    elif col_type == 'date':
        try:
            dates = pd.to_datetime(df[col].dropna())
            result.update({
                'Examples': dates.dt.strftime('%Y-%m-%d').unique()[:2].tolist(), 
                'Extra info': f"Date range: {dates.min().strftime('%Y-%m-%d')} to {dates.max().strftime('%Y-%m-%d')}"
            })
            
 
        except Exception as e:
            result['Issues'].append(f"Date parsing error: {str(e)}")
            
    elif col_type == 'time':
        try:
            times = pd.to_datetime(df[col].dropna(), format='%H:%M', errors='coerce')
            result.update({
                'Examples': times.dt.strftime('%H:%M').unique()[:2].tolist(), 
                'Extra info': f"Time range: {times.min().strftime('%H:%M')} to {times.max().strftime('%H:%M')}"
            })
            
        except Exception as e:
            result['Issues'].append(f"Time parsing error: {str(e)}")


    elif col_type == 'text':
        non_null = df[col].dropna()
        if len(non_null) > 0:
            # Check for empty strings
            empty_strings = (non_null == '').sum()
            if empty_strings > 0:
                result['Issues'].append(f"Found {empty_strings} empty strings")
        if non_null.empty:
            extra_info = f"Column '{col}' has no values."
        else:
            most_common = non_null.value_counts().idxmax() if not non_null.value_counts().empty else None
            extra_info = f"Most common: {most_common}" if most_common else "No common value"


        result.update({
            'Examples': non_null.unique()[:2].tolist(), 
            "Extra info": extra_info,
                            })
    
    # Format issues as a string
    result['Issues'] = '; '.join(result['Issues']) if result['Issues'] else 'None'
    validation_results.append(result)

    # Create a DataFrame for the current column's validation results
    summary_data = pd.DataFrame(validation_results)
    
# Convert inds_val_required to a DataFrame
inds_val_required = pd.DataFrame(inds_val_required)
 

# Convert participant_list to a DataFrame
participant_list= pd.DataFrame(participant_list)
 

All columns checked for column: eeg_attempted
All columns checked for column: eeg_attempted_reasons
All columns checked for column: eeg_today_date
All columns checked for column: eeget_date_v2_v2


In [592]:
summary_data

Unnamed: 0,Raw variable name,Type,Missing n,Missing %,Unique Values,Issues,Examples,Extra info,Top Categories
0,record_id,Text,0,0.0,274,,"[42, 43]",Most common: 42,
1,redcap_event_name,Text,0,0.0,1,,[phase_3_arm_1],Most common: phase_3_arm_1,
2,eeg_attempted,Yes_no,8,2.5,2,,"[0.0, 1.0]","Yes %: 84.8, No % : 12.7","{'Yes': 274, 'No': 41}"
3,eeg_attempted_reasons,Text,14,34.1,16,,[Hair way too thick. Braided not possible to u...,Most common: Participant had to leave.,
4,eeg_site,Categorical,2,0.7,2,,"[2.0, 1.0]","{'HSJ': 0.78, 'MHC': 0.21}","{'HSJ': 215, 'MHC': 57}"
...,...,...,...,...,...,...,...,...,...
274,eeget_general_notes_v2_v2,Text,256,93.4,18,,"[Not done, participant refusal, Globalement, q...","Most common: Not done, participant refusal",
275,change_status,Text,266,97.1,2,,"[0.0, 1.0]",Most common: 1.0,
276,eeget_session_log_complete,Text,0,0.0,3,,"[2, 0]",Most common: 2,
277,eeg_net_size,Categorical,35,12.8,7,,"[51_54, 47_51]","{'54_56': 0.23, '51_54': 0.2, '56_58': 0.19, '...","{'54_56': 63, '51_54': 55, '56_58': 51, '58_61..."


In [593]:
# Merge summary data with mapping_df
summary_data = summary_data.merge(mapping_df, left_on='Raw variable name', right_on='raw_var_name', how='left')
summary_data = summary_data.drop(columns=['raw_var_name'])

# Make new label be the second column
summary_data = summary_data.rename(columns={'new_label': 'Column', })
summary_data = summary_data[['Column'] + [col for col in summary_data.columns if col != 'Column']]

summary_data
 

Unnamed: 0,Column,Raw variable name,Type,Missing n,Missing %,Unique Values,Issues,Examples,Extra info,Top Categories
0,Record ID,record_id,Text,0,0.0,274,,"[42, 43]",Most common: 42,
1,Event Name,redcap_event_name,Text,0,0.0,1,,[phase_3_arm_1],Most common: phase_3_arm_1,
2,Was EEG attempted?,eeg_attempted,Yes_no,8,2.5,2,,"[0.0, 1.0]","Yes %: 84.8, No % : 12.7","{'Yes': 274, 'No': 41}"
3,Reasons:,eeg_attempted_reasons,Text,14,34.1,16,,[Hair way too thick. Braided not possible to u...,Most common: Participant had to leave.,
4,EEG site:,eeg_site,Categorical,2,0.7,2,,"[2.0, 1.0]","{'HSJ': 0.78, 'MHC': 0.21}","{'HSJ': 215, 'MHC': 57}"
...,...,...,...,...,...,...,...,...,...,...
274,General session notes:,eeget_general_notes_v2_v2,Text,256,93.4,18,,"[Not done, participant refusal, Globalement, q...","Most common: Not done, participant refusal",
275,status change?,change_status,Text,266,97.1,2,,"[0.0, 1.0]",Most common: 1.0,
276,Complete?,eeget_session_log_complete,Text,0,0.0,3,,"[2, 0]",Most common: 2,
277,,eeg_net_size,Categorical,35,12.8,7,,"[51_54, 47_51]","{'54_56': 0.23, '51_54': 0.2, '56_58': 0.19, '...","{'54_56': 63, '51_54': 55, '56_58': 51, '58_61..."


In [594]:

# Save the summary DataFrame to a CSV file
#summary_data.to_csv(f"../source/{date}/full_validation_summary_main_variables_{date}.csv", index=False)
# Save the participant_list DataFrame to a CSV file
#participant_list.to_csv(f"../source/{date}/full_record_ids_to_check_{date}.csv", index=False)    
