This script reuses a lot of code from the make_full_dataset file. It requires the LORIS-release CSV file.

In [None]:
import pandas as pd
import numpy as np

def remove_admin_cols(full):
    """Return ``full`` without administrative / bookkeeping columns.

    A column is removed when its name contains any of the marker strings
    listed below (a substring match anywhere in the name, not only at the
    end). The input frame is left untouched; a new frame is returned.
    """
    column_suffixes_to_drop = [
        "Administration",
        "Data_entry",
        "Days_Baseline",
        "START_DATE",
        "Season",
        "Site",
        "Study",
        "Year",
        "Commercial_Use",
        "Release_Number",
    ]

    # Gather matches marker by marker. A column that matches several markers
    # is listed more than once, which is harmless for the drop below.
    matched_columns = [
        column
        for suffix in column_suffixes_to_drop
        for column in full.columns
        if suffix in column
    ]

    admin_frame = full.filter(items=matched_columns)
    return full.drop(columns=list(admin_frame.columns))

def get_ID_from_EID(full, EID_cols):
    """Derive a single ``ID`` column from the questionnaire-specific EID columns.

    Args:
        full: DataFrame that contains every column named in ``EID_cols``.
        EID_cols: Names of the questionnaire-specific EID columns.

    Returns:
        A new DataFrame holding only the rows whose EID columns all agree
        (missing EIDs are ignored), with an added ``ID`` column. Rows in which
        every EID is missing are dropped too, because a missing value never
        compares equal to itself. ``full`` itself is not modified.
    """
    # Get only EID cols
    eid_frame = full[EID_cols]

    # Fill missing EIDs with EIDs from the other questionnaires in the same row
    eid_frame = eid_frame.ffill(axis=1).bfill(axis=1)

    # After filling, every cell of a consistent row equals its first cell
    first_eid = eid_frame.iloc[:, 0]
    consistent = eid_frame.eq(first_eid, axis=0).all(axis=1)

    # Take an explicit copy: assigning a new column to a filtered slice would
    # otherwise trigger pandas' SettingWithCopyWarning.
    result = full[consistent].copy()

    # Fill ID field with the first non-null questionnaire-specific EID
    result["ID"] = first_eid[consistent]

    return result

# Drop rows with underscores in ID (NDARZZ007YMP_1, NDARAA075AMK_Visit_1)
def drop_rows_w_underscore_in_id(full):
    """Remove rows whose ``ID`` contains an underscore, plus their questionnaires.

    Any questionnaire (the part of a column name before the first comma) that
    has at least one non-missing value in an underscore row is removed from
    the result entirely, e.g. {'DailyMeds', 'TRF', 'TRF_P', 'TRF_Pre'}.

    Args:
        full: DataFrame with a string ``ID`` column and questionnaire columns
            named ``"<questionnaire>,<field>"``.

    Returns:
        A new DataFrame without the underscore rows and without the columns
        of the affected questionnaires. ``full`` itself is not modified.
    """
    # na=False: a missing ID counts as "no underscore" instead of producing a
    # NaN entry that cannot be used as a boolean mask.
    has_underscore = full["ID"].str.contains("_", regex=False, na=False)

    underscore_rows = full[has_underscore]
    filled_columns = underscore_rows.columns[~underscore_rows.isna().all()]
    questionnaires_to_drop = {column.split(",")[0] for column in filled_columns}

    # discard() rather than remove(): these entries are absent when no ID
    # contains an underscore or when there is no "Identifiers" column.
    questionnaires_to_drop.discard("Identifiers")
    questionnaires_to_drop.discard("ID")

    # Match the exact "<questionnaire>," prefix. An unanchored regex search
    # would also hit questionnaires whose name merely ends with the same text.
    prefixes = tuple(name + "," for name in questionnaires_to_drop)
    columns_to_drop = [column for column in full.columns if column.startswith(prefixes)]

    return full[~has_underscore].drop(columns=columns_to_drop)

relevent_assessments_list = ["WISC", "TOWRE", "WIAT"]

# LORIS saved query (all data); every column is read as object first
full = pd.read_csv("../../diagnosis_predictor/data/raw/LORIS-release-10.csv", dtype=object)

# Replace NaN (currently ".") values with np.nan
full = full.replace(".", np.nan)

# Drop first row (doesn't have ID)
full = full.iloc[1:, :]

# Drop empty columns
full = full.dropna(how='all', axis=1)

full = remove_admin_cols(full)

# Get ID columns (contain questionnaire names, e.g. 'ACE,EID', will be used to check if an assessment is filled)
EID_cols = [x for x in full.columns if ",EID" in x]

# Get ID col from EID cols
full = get_ID_from_EID(full, EID_cols)

full_wo_underscore = drop_rows_w_underscore_in_id(full)

# Keep only the EID columns that are still present: the questionnaires found in
# underscore rows were removed above, and dropping a column that no longer
# exists would raise a KeyError. Checking the actual columns avoids hard-coding
# the removed questionnaire names here.
EID_cols = [x for x in EID_cols if x in full_wo_underscore.columns]

# Remove EID columns: not needed anymore
full_wo_underscore = full_wo_underscore.drop(EID_cols, axis=1)

# Convert numeric columns to numeric type (all except ID and DX)
# NOTE(review): convert_numeric_col_to_numeric_type is not defined or imported
# in the visible part of this file -- presumably it lives in make_full_dataset;
# confirm it is in scope before running this cell.
full_wo_underscore = full_wo_underscore.apply(convert_numeric_col_to_numeric_type)

# Remove ID column - not needed anymore
full_wo_underscore = full_wo_underscore.drop("ID", axis=1)

In [24]:
# Keep only the selected column(s) and discard rows with missing values;
# the bare `df` on the last line displays the result in the notebook.
relevant_columns = ["WIAT,WIAT_Word_Stnd"]
df = full_wo_underscore[relevant_columns].dropna()
df

Unnamed: 0,"TOWRE,TOWRE_Complete","TOWRE,TOWRE_IncompleteReason","TOWRE,TOWRE_PDE_AE","TOWRE,TOWRE_PDE_Desc","TOWRE,TOWRE_PDE_Perc","TOWRE,TOWRE_PDE_Raw","TOWRE,TOWRE_PDE_Scaled","TOWRE,TOWRE_SWE_AE","TOWRE,TOWRE_SWE_Desc","TOWRE,TOWRE_SWE_Perc",...,"WISC,WISC_VP_Raw","WISC,WISC_VP_Scaled","WISC,WISC_VSI","WISC,WISC_VSI_Percentile","WISC,WISC_VSI_Sum","WISC,WISC_Vocab_Raw","WISC,WISC_Vocab_Scaled","WISC,WISC_WMI","WISC,WISC_WMI_Percentile","WISC,WISC_WMI_Sum"
