In [1]:
# Imports
import os

# Data science and vis tools
import numpy as np
import pandas as pd

## Extract Data

Network matrices (netmats) data file: **Data/HCP/netmats_100.txt**

- 1003 subjects
- Four time series (4800 total timepoints)

Network matrices: Using partial temporal correlation between nodes' timeseries. This aims to estimate direct connection strengths better than achieved by full correlation. To slightly improve the estimates of partial correlation coefficients, a small amount of L2 regularization is applied (setting rho=0.01 in the Ridge Regression netmats option in FSLNets) [Smith OHBM 2014, FSLNets].

In [16]:
# Resolve each HCP input file (netmats, subject IDs, subject descriptions)
# to an absolute path in a single pass.
hcp_netmats_file, id_file, subject_descriptions_file = (
    os.path.abspath(relative_path)
    for relative_path in (
        '../../Data/HCP/netmats_100.txt',
        '../../Data/HCP/subjectIDs.txt',
        '../../Data/HCP/subjects.csv',
    )
)

In [3]:
# Read Network Matrices
# Every row of the text file is reshaped below into one square matrix.
# ndmin=2 keeps the array two-dimensional even if the file holds a single row.
subjects = np.loadtxt(hcp_netmats_file, dtype=float, ndmin=2)

# Side length of each square matrix, recovered from the flattened row length
ICAd = int(round(np.sqrt(subjects.shape[1])))
if ICAd * ICAd != subjects.shape[1]:
    raise ValueError(
        'Expected each row to hold a flattened square matrix, but rows have '
        '{} values'.format(subjects.shape[1])
    )

# Convert into a (rows, ICAd, ICAd) stack with one vectorized reshape
# instead of reshaping row by row in a Python loop
matrix = subjects.reshape(subjects.shape[0], ICAd, ICAd)

data_raw = matrix

In [4]:
# Display the dimensions of the stacked matrix array
data_raw.shape

(1003, 100, 100)

In [6]:
# Read Subject IDs
# ndmin=1 keeps this a 1-D array even when the file lists a single ID.
subject_IDs = np.loadtxt(id_file, dtype=int, ndmin=1)

# IDs and matrices are paired by position, so their counts must agree.
# Fail loudly here rather than silently dropping the unmatched entries.
if len(subject_IDs) != len(matrix):
    raise ValueError(
        'Found {} subject IDs but {} network matrices'.format(
            len(subject_IDs), len(matrix)
        )
    )

# One row per subject ID; each cell of 'netmat' holds that subject's 2-D matrix
net_mat_data = pd.Series(list(matrix), index=subject_IDs, name='netmat').to_frame()
net_mat_data.head()

Unnamed: 0,netmat
100206,"[[0.0, 0.61676, 9.5727, -5.4959, 0.34639, 3.00..."
100307,"[[0.0, -0.29664, 17.317, -9.0467, -0.28723, 1...."
100408,"[[0.0, 1.6486, 6.6189, -8.8877, 1.4337, 1.006,..."
100610,"[[0.0, -0.90275, 7.7215, -8.3907, 3.3144, 2.93..."
101006,"[[0.0, -0.088768, 9.4979, -10.412, 1.0646, 4.3..."


In [7]:
# Load the subject description table, indexed by its 'Subject' column
subject_data = pd.read_csv(
    subject_descriptions_file,
    index_col='Subject',
)
subject_data.head()

Unnamed: 0_level_0,Release,Acquisition,Gender,Age,3T_Full_MR_Compl,T1_Count,T2_Count,3T_RS-fMRI_Count,3T_RS-fMRI_PctCompl,3T_Full_Task_fMRI,...,Noise_Comp,Odor_Unadj,Odor_AgeAdj,PainIntens_RawScore,PainInterf_Tscore,Taste_Unadj,Taste_AgeAdj,Mars_Log_Score,Mars_Errs,Mars_Final
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100004,S900,Q06,M,22-25,False,0,0,0,0.0,False,...,5.2,101.12,86.45,2.0,45.9,107.17,105.31,1.8,0.0,1.8
100206,S900,Q11,M,26-30,True,1,1,4,100.0,True,...,6.0,108.79,97.19,1.0,49.7,72.63,72.03,1.84,0.0,1.84
100307,Q1,Q01,F,26-30,True,1,1,4,100.0,True,...,3.6,101.12,86.45,0.0,38.6,71.69,71.76,1.76,0.0,1.76
100408,Q3,Q03,M,31-35,True,1,1,4,100.0,True,...,2.0,108.79,98.04,2.0,52.6,114.01,113.59,1.76,2.0,1.68
100610,S900,Q08,M,26-30,True,2,1,4,100.0,True,...,2.0,122.25,110.45,0.0,38.6,84.84,85.31,1.92,1.0,1.88


In [8]:
# Keep only the two descriptive columns that accompany each netmat
subject_data = subject_data.loc[:, ['Gender', 'Age']]

# Index-on-index right merge: every row of net_mat_data is kept, and the
# Gender/Age values are attached wherever the subject ID matches
data = subject_data.merge(
    net_mat_data,
    how='right',
    left_index=True,
    right_index=True,
)
data.head()

Unnamed: 0,Gender,Age,netmat
100206,M,26-30,"[[0.0, 0.61676, 9.5727, -5.4959, 0.34639, 3.00..."
100307,F,26-30,"[[0.0, -0.29664, 17.317, -9.0467, -0.28723, 1...."
100408,M,31-35,"[[0.0, 1.6486, 6.6189, -8.8877, 1.4337, 1.006,..."
100610,M,26-30,"[[0.0, -0.90275, 7.7215, -8.3907, 3.3144, 2.93..."
101006,F,31-35,"[[0.0, -0.088768, 9.4979, -10.412, 1.0646, 4.3..."


In [9]:
# Display the dimensions of the merged table
data.shape

(1003, 3)

## Save Cleaned Data

In [14]:
# Absolute path of the output file for the cleaned data
clean_data_file = os.path.abspath('../../Data/data_clean.csv.gz')

In [15]:
# Save it as a cleaned file so other notebooks can use the data.
# The netmat arrays end up in the CSV as text, and numpy abbreviates large
# arrays with "..." once they exceed the print threshold, so the threshold is
# raised to the full matrix size (ICAd ** 2 entries). The context manager
# restores the previous print options afterwards instead of changing them
# for the rest of the kernel session.
with np.printoptions(threshold=ICAd ** 2):
    data.to_csv(clean_data_file, index_label='subject_id', compression='gzip')

## Get Cleaned Data

**Copy code block below to get cleaned data**

In [2]:
import pandas as pd
import numpy as np
import os

import gzip # Extract gz file
from ast import literal_eval # Gets network matrix array from csv 
import re

data_file = os.path.abspath('../../Data/data_clean.csv.gz')

# Whitespace runs (unless directly after an opening bracket) and newlines are
# the separators to turn into commas. Written as a raw string so the regex
# escapes (\[ and \s) are not treated as invalid Python string escapes, and
# compiled once so it is not rebuilt for every row.
netmat_separator = re.compile(r'(?<!\[)\s+|[\n]')


def parse_netmat(text):
    """Rebuild a network matrix from the text stored in the 'netmat' column.

    Separators matched by ``netmat_separator`` are replaced with ', ' so the
    text becomes a Python list literal, which is then evaluated safely with
    ``literal_eval`` and wrapped in a NumPy array.

    Parameters
    ----------
    text : str
        Raw cell content of the 'netmat' column.

    Returns
    -------
    numpy.ndarray
        The parsed matrix.

    Raises
    ------
    ValueError, SyntaxError
        If the substituted text is not a valid Python literal.
    """
    return np.array(literal_eval(netmat_separator.sub(', ', text)))


# Decompress the file and parse every netmat cell while reading the CSV
with gzip.open(data_file) as gz_handle:
    data_clean = pd.read_csv(
        gz_handle,
        index_col='subject_id',
        converters={'netmat': parse_netmat},
    )

In [3]:
# Preview the first rows of the reloaded table
data_clean.head()

Unnamed: 0_level_0,Gender,Age,netmat
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100206,M,26-30,"[[0.0, 0.61676, 9.5727, -5.4959, 0.34639, 3.00..."
100307,F,26-30,"[[0.0, -0.29664, 17.317, -9.0467, -0.28723, 1...."
100408,M,31-35,"[[0.0, 1.6486, 6.6189, -8.8877, 1.4337, 1.006,..."
100610,M,26-30,"[[0.0, -0.90275, 7.7215, -8.3907, 3.3144, 2.93..."
101006,F,31-35,"[[0.0, -0.088768, 9.4979, -10.412, 1.0646, 4.3..."
