In [52]:
import numpy as np
import pandas as pd

# Locations of the analysis results and of the ADNI utility tables.
RES_DIR = "/home/riccardo/petTOAD/results/"
UTL_DIR = "/home/riccardo/petTOAD/data/utils/"

# Read the three input tables in one pass:
#   df_petTOAD   - table stored in the results folder
#   df_sel_pts   - ADNI selected-patients table
#   df_adnimerge - ADNIMERGE table
df_petTOAD, df_sel_pts, df_adnimerge = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{RES_DIR}/df_petTOAD.csv",
        f"{UTL_DIR}/ADNI_selected_pts.csv",
        f"{UTL_DIR}/ADNIMERGE.csv",
    )
)
# Bring the participant identifier to one common spelling in all three tables:
# underscores removed, and an "ADNI" prefix added for the two ADNI tables
# (the selected-patients table stores the raw identifier under "Subject").
df_petTOAD["PTID"] = df_petTOAD["PTID"].str.replace("_", "")
df_adnimerge["PTID"] = ("ADNI" + df_adnimerge["PTID"]).str.replace("_", "")
df_sel_pts["PTID"] = ("ADNI" + df_sel_pts["Subject"]).str.replace("_", "")
# Participants that have an axial T2 acquisition were selected by mistake:
# collect their identifiers and remove every row belonging to them.
to_drop = df_sel_pts.loc[
    df_sel_pts["Description"].isin(["Axial T2-FLAIR", "Axial T2-Star"]), "PTID"
]
df_sel_pts = df_sel_pts.loc[~df_sel_pts["PTID"].isin(to_drop)]
# Restrict every table to the participants that enter the simulations and pair
# each imaging session with the ADNIMERGE visit closest in time.
# Subjects absent from df_petTOAD were already excluded upstream (the reasons
# are not recorded in this notebook).

# Upper bound applied to "WMH_load_subj_space" (units not stated here).
WMH_LOAD_MAX = 80000
# Maximum distance allowed between an imaging session and an ADNIMERGE exam.
MATCH_WINDOW = pd.Timedelta(weeks=16)

df_petTOAD = df_petTOAD.rename(columns={"Group_bin_Fazekas": "Group_Fazekas_2.0"})
df_petTOAD = df_petTOAD[df_petTOAD["WMH_load_subj_space"] < WMH_LOAD_MAX]
sim_pts = df_petTOAD["PTID"]

# One row per (participant, acquisition date). The explicit copy makes the
# column assignment below operate on an independent frame rather than on a
# slice of df_sel_pts (avoids SettingWithCopyWarning).
df_sel_pts_sim = (
    df_sel_pts[df_sel_pts["PTID"].isin(sim_pts)]
    .drop_duplicates(["PTID", "Acq Date"])
    .copy()
)
df_sel_pts_sim["Acq Date"] = pd.to_datetime(df_sel_pts_sim["Acq Date"])

df_adnimerge = df_adnimerge.assign(EXAMDATE=pd.to_datetime(df_adnimerge["EXAMDATE"]))
df_adnimerge = df_adnimerge[df_adnimerge["PTID"].isin(sim_pts)]

# Every imaging session is paired with every ADNIMERGE visit of the same
# participant; the date filter below keeps only the plausible pairings.
df_merged = pd.merge(df_sel_pts_sim, df_adnimerge, on="PTID")

# Absolute distance between exam and acquisition. |gap| <= window is the same
# condition as EXAMDATE lying in [Acq Date - window, Acq Date + window];
# rows with a missing date compare as False and are dropped.
_exam_gap = (df_merged["EXAMDATE"] - df_merged["Acq Date"]).abs()

# Several ADNIMERGE visits can fall inside the window. Keep the one closest to
# the acquisition date (stable sort, so ties keep the earlier merge row) instead
# of whichever happens to come first in the file, then restore the merge order.
df_merged_unique_dates = (
    df_merged.assign(_exam_gap=_exam_gap)
    .loc[_exam_gap <= MATCH_WINDOW]
    .sort_values("_exam_gap", kind="mergesort")
    .drop_duplicates(["PTID", "Acq Date"])
    .sort_index()
    .drop(columns="_exam_gap")
)

In [65]:
# Shape of the array of distinct participants having at least one imaging visit
# labelled "init", "bl" or "sc".
df_sel_pts.loc[df_sel_pts["Visit"].isin(["init", "bl", "sc"]), "PTID"].unique().shape

(557,)

In [68]:
# Number of simulated participants that have an imaging visit labelled
# "init", "bl" or "sc". The boolean mask is summed with the vectorised
# Series.sum() rather than by iterating it element by element with the builtin.
sim_pts.isin(df_sel_pts.loc[df_sel_pts["Visit"].isin(["init", "bl", "sc"]), "PTID"]).sum()

262

In [53]:
# The table obtained when downloading the imaging data has a "Group" column
# that is not the same thing as the "DX" column of ADNIMERGE. For each imaging
# session matched to an ADNIMERGE visit (df_merged_unique_dates) we check that
# the two labels agree.

# Visit codes treated as screening/baseline imaging visits.
SCREENING_VISITS = ["sc", "init", "bl"]
# Collapse diagnostic sub-labels so that both columns share one vocabulary.
# NOTE(review): the "Dementia" -> "AD" entry assumes ADNIMERGE's DX column spells
# dementia as "Dementia" while the imaging table uses "AD"; it is applied to both
# columns, so it is a no-op if that label never occurs. Confirm against the ADNI
# data dictionary.
DX_HARMONISATION = {"EMCI": "MCI", "LMCI": "MCI", "SMC": "CN", "Dementia": "AD"}

check = df_merged_unique_dates[["PTID", "DX", "Group"]].copy()
check[["DX", "Group"]] = check[["DX", "Group"]].replace(DX_HARMONISATION)

# Sessions whose harmonised labels agree: no problem. A missing DX never
# compares equal, so those sessions end up in the "different" list below.
labels_agree = check["Group"] == check["DX"]
pts_adnimerge_and_same_DXBL_DX = check.loc[labels_agree, "PTID"].to_list()
# Participants of matched sessions that never appear in the "same" list: these
# need attention. Vectorised membership test instead of a per-row list scan.
pts_adnimerge_and_diff_DXBL_DX = df_merged_unique_dates.loc[
    ~df_merged_unique_dates["PTID"].isin(pts_adnimerge_and_same_DXBL_DX), "PTID"
].to_list()

# Participants for whom no ADNIMERGE visit could be matched at all.
df_pts_with_no_adnimerge_data = df_sel_pts_sim[
    ~df_sel_pts_sim["PTID"].isin(df_merged_unique_dates["PTID"])
]
no_match_is_screening = df_pts_with_no_adnimerge_data["Visit"].isin(SCREENING_VISITS)
# Unmatched sessions acquired at a screening/baseline visit are accepted: the
# analysis is imaging-centred and the "Group" label is taken as correct there.
pts_with_no_adnimerge_data_screening = df_pts_with_no_adnimerge_data.loc[
    no_match_is_screening, "PTID"
].to_list()
# Unmatched sessions acquired at a later timepoint are not accepted: the
# cognitive status may have changed since the label was assigned.
pts_with_no_adnimerge_data_no_screening = df_pts_with_no_adnimerge_data.loc[
    ~no_match_is_screening, "PTID"
].to_list()

# Participants to exclude / check / decide what to do with.
pts_to_exclude = pts_adnimerge_and_diff_DXBL_DX + pts_with_no_adnimerge_data_no_screening

In [54]:
# Length of the list of PTIDs with no matched ADNIMERGE visit whose imaging
# visit is a screening/baseline one (one entry per row, not per unique PTID).
len(pts_with_no_adnimerge_data_screening)

63