<a href="https://colab.research.google.com/github/cristinarainich/Child-Mind-Institute-Project/blob/main/behavioral_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem; the project paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import math
import pylab as pl
import seaborn as sns
from scipy import stats
# NOTE(review): %pylab populates the interactive namespace with numpy and matplotlib names
# (see the message it prints below). Explicit imports are easier to trace — confirm that
# no cell relies on the injected names before replacing this with `%matplotlib inline`.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Apply seaborn's default theme to subsequent matplotlib plots.
sns.set()

In [4]:
# Project folder on the mounted Drive; the CSV/TXT inputs below are read from here.
working_path = Path('/content/drive/MyDrive/KKNV/CMI')

In [5]:
cd /content/drive/MyDrive/KKNV/CMI

/content/drive/MyDrive/KKNV/CMI


In [25]:
# List the project folder to see which input files are available.
!ls /content/drive/MyDrive/KKNV/CMI

'CMI Project. Updates.gslides'	 HBN_R11_Pheno.csv    HBN_R6_Pheno.csv	 missing_ids.txt
 final_data.csv			 HBN_R2_1_Pheno.csv   HBN_R7_Pheno.csv	 old
 final_ids.csv			 HBN_R3_Pheno.csv     HBN_R8_Pheno.csv	 query_September2024.csv
 HBN_R10_Pheno.csv		 HBN_R4_Pheno.csv     HBN_R9_Pheno.csv
 HBN_R1_1_Pheno.csv		 HBN_R5_Pheno.csv     id_list.txt


In [13]:
# Per-release tables with info on whether full phenotypic files and imaging data are available.
# The file names are not uniform (releases 1 and 2 carry an extra "_1"), so they are listed explicitly.
pheno_filenames = [
    'HBN_R1_1_Pheno.csv',
    'HBN_R2_1_Pheno.csv',
    'HBN_R3_Pheno.csv',
    'HBN_R4_Pheno.csv',
    'HBN_R5_Pheno.csv',
    'HBN_R6_Pheno.csv',
    'HBN_R7_Pheno.csv',
    'HBN_R8_Pheno.csv',
    'HBN_R9_Pheno.csv',
    'HBN_R10_Pheno.csv',
    'HBN_R11_Pheno.csv',
]

# Read the tables in release order and bind each one to its own name.
(
    file1, file2, file3, file4, file5, file6,
    file7, file8, file9, file10, file11,
) = (
    pd.read_csv(working_path.joinpath(filename), index_col=None)
    for filename in pheno_filenames
)

In [14]:
# Release tables in release order (position 0 holds release 1).
files = [
    file1, file2, file3, file4, file5, file6,
    file7, file8, file9, file10, file11,
]
# Stamp every table with its 1-based release number. enumerate() keeps the loop
# in step with the list instead of relying on a hard-coded range(0, 11).
for release_number, release_table in enumerate(files, start=1):
    release_table['Release'] = release_number

In [15]:
# Combine the info from all the releases into one DataFrame.
# A single concat over the list avoids re-copying the growing frame on every
# loop iteration; row order and the original per-file index values are kept.
fullpheno = pd.concat(files)

In [16]:
# Size of the combined table, then the number of rows whose EID already occurred earlier in it.
print('The shape of the fullpheno file :', fullpheno.shape)
duplicated_eid_count = fullpheno['EID'].duplicated().sum()
print('The number of duplicated entries in the fullpheno file: ', duplicated_eid_count)
# Repeated EIDs mean the same participant is listed more than once across the release
# tables, i.e. their neuroimaging data could be spread over different releases.

The shape of the fullpheno file : (5810, 7)
The number of duplicated entries in the fullpheno file:  1246


In [17]:
# Table of the subjects in our sample; the 'Identifiers' column holds their IDs.
ids = pd.read_csv(working_path / 'final_ids.csv', index_col=None)

In [18]:
# Plain Python list of the sample's identifiers, used for the lookups below.
final_ids = ids['Identifiers'].tolist()
print('The number of subjects in our sample: ', len(final_ids))

The number of sibjects in our sample:  619


In [10]:
# Write the IDs to a text file in Colab, one identifier per line.
# The loop variable is deliberately not called `id`: at notebook scope that name
# would shadow the built-in id().
with open('id_list.txt', 'w') as id_file:
    id_file.writelines(subject_id + '\n' for subject_id in final_ids)


In [11]:
# Download the ID list to the local machine.
# The Colab helper is imported under an alias: binding it to the bare name `files`
# would overwrite the `files` list of release tables defined earlier in this notebook.
from google.colab import files as colab_files
colab_files.download('id_list.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
# Compare our subjects against the release tables to see in which release(s)
# each subject's data might be.

# Build the EID -> [release, ...] lookup in one pass over the combined table
# instead of re-filtering the whole table for every subject. Row order is kept,
# so each subject's releases appear in the same order as before.
releases_by_eid = {}
for eid, release_number in zip(fullpheno['EID'].tolist(), fullpheno['Release'].tolist()):
    releases_by_eid.setdefault(eid, []).append(release_number)

release = []
no_release_ids = []  # ids that are in final_ids.csv and in our sample, but absent from the pheno data files
for subject_id in final_ids:
    subject_releases = releases_by_eid.get(subject_id, [])
    if len(subject_releases) == 1:
        # Exactly one release: store the bare release number.
        release.append(subject_releases[0])
    elif not subject_releases:
        no_release_ids.append(subject_id)
        release.append('No release information')
    else:
        # Several releases: space-separated label such as '1 7 '. The trailing
        # space is kept so the labels stay identical to those produced before.
        release.append(''.join(f'{number} ' for number in subject_releases))

ids['Release'] = release

In [20]:
# Number of sample subjects per 'Release' label assigned above.
ids.groupby(['Release'])[['Identifiers']].count()

Unnamed: 0_level_0,Identifiers
Release,Unnamed: 1_level_1
1,120
2,15
3,13
4,39
5,22
6,16
7,50
8,51
9,38
10,32


In [21]:
# Subjects from our sample that appear in none of the release phenotype tables.
# These need a manual search in the database to find where they could be, if they are there at all.
no_release_ids

['NDARAE264WPZ',
 'NDARBE220VRK',
 'NDARCU811WCY',
 'NDARDX969ECK',
 'NDAREA136BDX',
 'NDAREY721PVD',
 'NDARKK745DHA',
 'NDARLN658KCD',
 'NDARMG114FCW',
 'NDARNK354JWK',
 'NDARRY217HD3',
 'NDARTK435YWU',
 'NDARVN280JTN',
 'NDARWP864WGV',
 'NDARZT011LBZ',
 'NDARBV503LF7',
 'NDARFB784LDG',
 'NDARPK265ZXW',
 'NDARUL596DY9']

I checked and downloaded the MRI datasets that we have. The file missing_ids.txt lists the identifiers for whom we don't have an MRI set.

In [32]:
# Read the whole text file of identifiers without an MRI set.
missing_ids_path = working_path / 'missing_ids.txt'
content = missing_ids_path.read_text()

# One list item per line of the file
missing_ids = content.splitlines()
print('The number of missing ids: ', len(missing_ids))

<class 'str'>
The number of missing ids:  139


In [54]:
# First batch of identifiers from the hand-kept notes.
# NOTE(review): compared with the no_release_ids list displayed earlier in this notebook,
# 'NDARVN280JTN' is not included here — confirm that this is intentional.
notes_missing_ids_1 = ['NDARAE264WPZ',
 'NDARBE220VRK',
 'NDARCU811WCY',
 'NDARDX969ECK',
 'NDAREA136BDX',
 'NDAREY721PVD',
 'NDARKK745DHA',
 'NDARLN658KCD',
 'NDARMG114FCW',
 'NDARNK354JWK',
 'NDARRY217HD3',
 'NDARTK435YWU',
 'NDARWP864WGV',
 'NDARZT011LBZ',
 'NDARBV503LF7',
 'NDARFB784LDG',
 'NDARPK265ZXW',
 'NDARUL596DY9']

In [61]:
# Second batch of identifiers from the notes, kept as a single comma-separated string;
# it is split on commas and stripped of whitespace in the next cell.
notes_missing_ids_2 = 'NDARAN385MDH, NDARBH024NH2, NDAREL622FA6, NDAREW976FNL, NDARFF598HGT, NDARFY075REK, NDARGR875AXY, NDARGV956EGX, NDARHB000YF8, NDARHN131ZP6, NDARHW650FXU, NDARJH441HJD, NDARKU278YRR, NDARLA395AG8, NDARLC358CYJ, NDARLX816JUZ, NDARMH625WKL, NDARNP399JVF, NDARRG415BJM, NDARTX934NH6, NDARUV418FB8, NDARUX114RGJ, NDARVV248VW0, NDARVX337AC0, NDARWZ495PG4, NDARYJ389DWX, NDARYP516VUU, NDARAU447JZH, NDARVM025NCF, NDARDR296XHN, NDARTF566PYH, NDARXT325FV6, NDARAG584XLU, NDARDJ092YKH, NDARDN924BV2, NDAREW201WD9, NDARGZ282DLD, NDARMV575DC1, NDARTC707KPU, NDARVX856RHN, NDARWX051KEV, NDARYY694NE7, NDARDV245WJG, NDARJH763NPD, NDARLE091XAZ, NDARPE551CK7, NDARPL596YTD, NDARRV505ND6, NDARYJ735XPK, NDARHP039DBU, NDARJJ817UP1, NDARLJ168LXY, NDARWF205BUM, NDARWN424BPK, NDAREU438HAF, NDARGN721GKT, NDARJV411EH6, NDARKH291KRE, NDARME573TRB, NDARPC931KR1, NDARTL667CCG,NDARUV147TDD, NDARVF039ZLX, NDARXG799KWJ, NDARZK891FTB, NDARZM580BG2, NDARBJ016AKE, NDARFJ988HKR, NDARGV263ZRY, NDARHA780UYE, NDARMH488KY0, NDARRH199NCH, NDARRV410KY1, NDARTF150VPJ, NDARTH529YKZ, NDARVK196LEK, NDARWB903FVY, NDARYG172DBJ, NDARZK709KEY, NDARAE710YWG, NDARAH239PGG, NDARAU939WUK, NDARCD453JG4, NDARET484TTP, NDARFR601RDQ, NDARGJ653MG2, NDARJC299PRZ, NDARJW989EM6, NDARLC655XBR, NDARLK690ZK9, NDARNM838ABN, NDARTC527WPZ, NDARTL878RZ2, NDARZB345GKD, NDARCM677TC1, NDARDG644LJX, NDARHL237MPV, NDARLK034DJB, NDARYR692VVC, NDARAT696TMM, NDARBH536UD3, NDARCH084YL1, NDARFE593CUC, NDARFF757TUX, NDARGW785MWF, NDARJK827DBP, NDARLB930UWR, NDARMJ741DZB, NDARRR351RXB, NDARTF250GYT, NDARTH373NE7, NDARVM414ZWU, NDARWR139LVZ, NDARZW623WYG, NDARAA773LUW, NDARAG644KE3, NDARBH701KA1, NDARCE788KXW, NDARJT819VAX, NDARPB701XDP, NDARTH261GB5'

In [62]:
# Break the comma-separated string into its raw pieces (surrounding spaces still attached).
notes_2 = notes_missing_ids_2.split(',')
# Join both batches and trim the whitespace around every identifier.
notes = [entry.strip() for entry in notes_missing_ids_1 + notes_2]

In [63]:
# Sanity check: compare the number of entries in the notes with the number read from missing_ids.txt.
print(len(notes))
print(len(missing_ids))

139
139


In [64]:
# Entries read from missing_ids.txt that the notes do not mention (file order is kept).
noted_ids = set(notes)
missing_from_notes = [entry for entry in missing_ids if entry not in noted_ids]

# Show what was found
print("Missing IDs from notes:", missing_from_notes)

Missing IDs from notes: ['NDARDR804MFE', '']


In [65]:
# Identifiers mentioned in the notes that do not occur in missing_ids.txt (notes order is kept).
listed_missing_ids = set(missing_ids)
missing_from_missing_ids = [entry for entry in notes if entry not in listed_missing_ids]

# Display the result. The label now names this direction of the comparison;
# it previously repeated the label of the opposite check.
print("IDs in notes but not in missing_ids:", missing_from_missing_ids)

Missing IDs from notes: ['NDARDV245WJG', 'NDARAG644KE3']


In [66]:
# Look up the one identifier that is listed in missing_ids.txt but not in the notes.
ids[ids['Identifiers'] == 'NDARDR804MFE']

Unnamed: 0.1,Unnamed: 0,Identifiers,Release
107,714,NDARDR804MFE,5


In [70]:
# Length of missing_ids without its last element (the comparison output above shows
# that the list contains an empty-string entry).
print(len(missing_ids[:-1]))

138


In [71]:
# Remove blank entries from missing_ids (the comparison output above shows an
# empty string in the list). Filtering replaces slicing off the last element:
# the slice removed one more real identifier every time this cell was re-run,
# whereas the filter gives the same result on every run.
missing_ids = [entry for entry in missing_ids if entry.strip()]
missing_id_lookup = set(missing_ids)  # constant-time membership checks

values = ids['Identifiers'].values.tolist()
# 'No' when the subject is listed as lacking an MRI set, 'Yes' otherwise
mri_present = ['No' if subject_id in missing_id_lookup else 'Yes' for subject_id in values]
print(len(mri_present))

619


In [72]:
# Attach the per-subject 'Yes'/'No' MRI flag as a new column (same row order as ids['Identifiers']).
ids['MRI_status'] = mri_present

In [73]:
# Number of sample subjects per MRI_status value.
ids.groupby(['MRI_status'])[['Identifiers']].count()

Unnamed: 0_level_0,Identifiers
MRI_status,Unnamed: 1_level_1
No,138
Yes,481
