In [1]:
import pandas as pd
import numpy as np

In [30]:
def add_group_column(
    roi_activations_df: pd.DataFrame,
    pilot_anx_subjects_path: str = "../subjects/pilot_anx_subjects.txt",
    same_mri_subjects_path: str = "../subjects/subject_same_mri.txt",
) -> pd.DataFrame:
    """Add a ``group`` column classifying each row by its ``subject_id``.

    The group is one of:

    * ``'pilot_anx'``   -- the subject ID is listed in ``pilot_anx_subjects_path``;
    * ``'original_50'`` -- the subject ID is listed in ``same_mri_subjects_path``
      but not in ``pilot_anx_subjects_path``;
    * ``'extra'``       -- the subject ID appears in neither file.

    Parameters
    ----------
    roi_activations_df:
        Frame with a ``subject_id`` column. It is modified in place (the
        ``group`` column is added or overwritten).
    pilot_anx_subjects_path:
        Text file with one subject ID per line.
    same_mri_subjects_path:
        Text file with one subject ID per line.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in, returned for convenience.

    Raises
    ------
    OSError
        If either subject list cannot be opened.
    KeyError
        If ``roi_activations_df`` has no ``subject_id`` column.
    """
    # Sets give constant-time membership; the frame can hold millions of rows,
    # so list lookups per row would be needlessly slow.
    with open(pilot_anx_subjects_path) as f:
        pilot_anx_subject_ids = set(f.read().splitlines())

    with open(same_mri_subjects_path) as f:
        # Pilot subjects take precedence, so drop them from this list.
        original_50_subject_ids = set(f.read().splitlines()) - pilot_anx_subject_ids

    subject_ids = roi_activations_df['subject_id']

    # np.select picks the first matching condition; rows matching neither
    # list fall through to the 'extra' default.
    roi_activations_df['group'] = np.select(
        [
            subject_ids.isin(pilot_anx_subject_ids),
            subject_ids.isin(original_50_subject_ids),
        ],
        ['pilot_anx', 'original_50'],
        default='extra',
    )

    return roi_activations_df

In [31]:
roi_activations_xyz_path = "../data/additional_150_with_path.csv"

# Load the ROI activation table from disk.
print("loading csv...")
df_with_coords = pd.read_csv(roi_activations_xyz_path)
print(f"loaded {len(df_with_coords)} rows")

# add_group_column writes the 'group' column onto the frame it receives and
# returns that same frame; rebinding the name makes the data flow explicit.
print("adding group column...")
df_with_coords = add_group_column(df_with_coords)

print(df_with_coords.head())

# num rows in the dataframe
print(df_with_coords.shape[0])

# Non-null counts per (session, subject, image, run, ROI) combination.
# Left as the last expression so it is rendered as the cell output.
df_with_coords.groupby(
    by=["session", "subject_id", "image_name", "run", "roi_num"]
).count()

loading csv...
loaded 7712100 rows
adding group column...
   roi_value  x_coord  y_coord  z_coord  \
0   0.531260       58       75       37   
1   0.821149       59       73       36   
2   0.505634       59       73       37   
3   0.068516       59       73       38   
4   0.343551       59       74       35   

                                         zfstat_path  roi_num  \
0  /mnt/storage/daniel/feat-preprocess-datasink/a...        1   
1  /mnt/storage/daniel/feat-preprocess-datasink/a...        1   
2  /mnt/storage/daniel/feat-preprocess-datasink/a...        1   
3  /mnt/storage/daniel/feat-preprocess-datasink/a...        1   
4  /mnt/storage/daniel/feat-preprocess-datasink/a...        1   

        subject_id  run image_name  is_nonlinear            session      group  
0  NDARINVY6FE3R8A    2      corGo         False  baselineYear1Arm1  pilot_anx  
1  NDARINVY6FE3R8A    2      corGo         False  baselineYear1Arm1  pilot_anx  
2  NDARINVY6FE3R8A    2      corGo         False 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,roi_value,x_coord,y_coord,z_coord,zfstat_path,is_nonlinear,group
session,subject_id,image_name,run,roi_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
baselineYear1Arm1,NDARINV00CY2MDM,corGo,1,1,246,246,246,246,246,246,246
baselineYear1Arm1,NDARINV00CY2MDM,corGo,1,2,246,246,246,246,246,246,246
baselineYear1Arm1,NDARINV00CY2MDM,corGo,1,3,246,246,246,246,246,246,246
baselineYear1Arm1,NDARINV00CY2MDM,corGo,1,4,246,246,246,246,246,246,246
baselineYear1Arm1,NDARINV00CY2MDM,corGo,1,5,246,246,246,246,246,246,246
baselineYear1Arm1,...,...,...,...,...,...,...,...,...,...,...
baselineYear1Arm1,NDARINVZT44Y065,incStopvcorGo,2,7,246,246,246,246,246,246,246
baselineYear1Arm1,NDARINVZT44Y065,incStopvcorGo,2,8,246,246,246,246,246,246,246
baselineYear1Arm1,NDARINVZT44Y065,incStopvcorGo,2,9,246,246,246,246,246,246,246
baselineYear1Arm1,NDARINVZT44Y065,incStopvcorGo,2,10,246,246,246,246,246,246,246


In [36]:
# Number of distinct subject IDs in each group
# ('pilot_anx', 'original_50', 'extra'), as assigned by add_group_column.
print(df_with_coords.groupby("group")["subject_id"].nunique())

group
extra           59
original_50     46
pilot_anx      139
Name: subject_id, dtype: int64


In [41]:
def filter_nonlinear_feat(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``zfstat_path`` contains ``"NL"``.

    The match is a literal, case-sensitive substring test on the whole path
    string. Rows with a missing ``zfstat_path`` are excluded rather than
    raising.

    NOTE(review): because the test looks at the entire path, an "NL" that
    occurs anywhere in it (not only in the registration marker) also matches.
    This notebook's recorded outputs show 3,888,522 "nonlinear" plus
    3,856,050 "linear" rows against 7,712,100 loaded rows, so some rows
    appear to satisfy both filters -- confirm against the path layout.

    Parameters
    ----------
    df:
        Frame with a string ``zfstat_path`` column.

    Returns
    -------
    pd.DataFrame
        The matching rows, with the original index and columns.
    """
    # Vectorized literal match; na=False turns missing paths into "no match".
    is_nonlinear_path = df["zfstat_path"].str.contains("NL", regex=False, na=False)
    return df[is_nonlinear_path]

# Subset of rows whose zfstat_path contains "NL" (see filter_nonlinear_feat).
nonlinear_df_with_coords = filter_nonlinear_feat(df_with_coords)

# Row count of that subset.
print(len(nonlinear_df_with_coords))

# Uncomment to render the filtered frame as the cell output:
# nonlinear_df_with_coords

3888522


In [42]:
def filter_linear_feat(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``zfstat_path`` contains ``"1LN"`` or ``"2LN"``.

    Both markers are matched as literal, case-sensitive substrings of the
    whole path string. Rows with a missing ``zfstat_path`` are excluded
    rather than raising.

    NOTE(review): this is not the complement of an ``"NL"`` substring test --
    a path may contain both kinds of marker. This notebook's recorded
    row counts (3,856,050 here plus 3,888,522 for the "NL" filter, against
    7,712,100 loaded rows) suggest that happens for some rows -- confirm.

    Parameters
    ----------
    df:
        Frame with a string ``zfstat_path`` column.

    Returns
    -------
    pd.DataFrame
        The matching rows, with the original index and columns.
    """
    paths = df["zfstat_path"]
    # Vectorized literal matches; na=False turns missing paths into "no match".
    is_linear_path = (
        paths.str.contains("1LN", regex=False, na=False)
        | paths.str.contains("2LN", regex=False, na=False)
    )
    return df[is_linear_path]

# Subset of rows whose zfstat_path contains "1LN" or "2LN" (see filter_linear_feat).
linear_df_with_coords = filter_linear_feat(df_with_coords)

# Row count of that subset.
print(len(linear_df_with_coords))

# Uncomment to render the filtered frame as the cell output:
# linear_df_with_coords

3856050


In [44]:
# Rows whose roi_value is exactly zero, taken separately from the linear and
# nonlinear subsets.
act_zero_df_linear_feat = linear_df_with_coords.loc[
    linear_df_with_coords["roi_value"].eq(0)
]
act_zero_df_nonlinear_feat = nonlinear_df_with_coords.loc[
    nonlinear_df_with_coords["roi_value"].eq(0)
]

# number of rows with roi_value == 0 (linear subset)
print(len(act_zero_df_linear_feat))

# Distinct subjects that have at least one zero-valued row in each subset.
unique_subjects_linear_feat = act_zero_df_linear_feat['subject_id'].unique()
unique_subjects_nonlinear_feat = act_zero_df_nonlinear_feat['subject_id'].unique()

print(f"total num subjects: {df_with_coords['subject_id'].nunique()}")
print(f"num subjects (linear FEAT) with 0 activations: {act_zero_df_linear_feat['subject_id'].nunique()}")
print(f"num subjects (nonlinear FEAT) with 0 activations: {act_zero_df_nonlinear_feat['subject_id'].nunique()}")

# Subjects that show zero-valued rows in BOTH subsets.
overlap = np.intersect1d(unique_subjects_linear_feat, unique_subjects_nonlinear_feat)

print(f"num matches: {len(overlap)}")

# All rows (from the full frame) belonging to the overlapping subjects,
# summarised as distinct subjects per group.
is_overlap_subject = df_with_coords["subject_id"].isin(overlap)
overlap_df = df_with_coords[is_overlap_subject]

print(overlap_df.groupby("group")["subject_id"].nunique())

75006
total num subjects: 244
num subjects (linear FEAT) with 0 activations: 22
num subjects (nonlinear FEAT) with 0 activations: 22
num matches: 22
group
extra          6
original_50    7
pilot_anx      9
Name: subject_id, dtype: int64
