# AMP-SCZ MRIQC data prep

In [None]:
import bids
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [None]:
# XXX Change this to location of MRIQC directory
# (used further down as the root directory of the BIDSLayout that is queried
# for the MRIQC JSON files)
MRIQC_PATH = '/mnt/church/amp-scz/mriqc'
# XXX Change this to ODS export location of U24 MRI QC spreadsheet
# (loaded further down with pd.read_excel as the table of manual ratings)
MANUAL_QC_EXPORT = 'U24 MRI QC.ods'

In [None]:
# Load the manual QC spreadsheet export into a DataFrame.
# NOTE(review): reading an .ods file through pd.read_excel presumably needs the
# optional odfpy package installed -- confirm in the target environment.
orig_ratings = pd.read_excel(MANUAL_QC_EXPORT)

In [None]:
# Keep only the rows whose 'sub-*/ses-*_gs' cell is filled in, restricted to the
# subject/session identifiers and the MRIQC-relevant rating columns.
# Rows with any empty cell among those columns (sessions not yet rated) are dropped.
# The session labels are read as floats, so they are cast to int and then to str.
pared_ratings = (
    orig_ratings.loc[
        orig_ratings['sub-*/ses-*_gs'].notna(),
        [
            'sub_id_gs',
            'ses_id_gs',
            'T1w',
            'T2w',
            'fmri AP bold run 1',
            'fmri AP bold run 2',
            'fmri PA bold run 1',
            'fmri PA bold run 2',
        ],
    ]
    .dropna()
    .assign(ses_id_gs=lambda frame: frame['ses_id_gs'].astype(int).astype('str'))
)

In [None]:
def tail_to_query(tail):
    """Convert a filename tail such as ``'dir-AP_run-1_bold'`` into a query dict.

    The tail is split on underscores. The last component becomes the
    ``'suffix'`` entry; every preceding component must be a ``key-value``
    entity and becomes a ``key: value`` entry.

    Two entities get special treatment:

    * ``run`` values are converted to ``int``.
    * ``dir`` is stored under the key ``'direction'`` instead.

    Parameters
    ----------
    tail : str
        Underscore-separated entities followed by a suffix, e.g. ``'T1w'``
        or ``'dir-PA_run-2_bold'``.

    Returns
    -------
    dict
        Mapping with a ``'suffix'`` key plus one key per entity.

    Raises
    ------
    ValueError
        If an entity component contains no hyphen, or if the ``run`` value
        is not an integer literal.
    """
    *entities, suffix = tail.split('_')
    query = {'suffix': suffix}
    for entity in entities:
        # Split on the first hyphen only, so a value that itself contains a
        # hyphen stays intact instead of breaking the key/value pairing.
        key, value = entity.split('-', 1)
        query[key] = value
    if 'run' in query:
        query['run'] = int(query['run'])
    if 'dir' in query:
        query['direction'] = query.pop('dir')
    return query

In [None]:
# Tidy up: rename the identifier columns, give the fMRI rating columns
# entity-style names that tail_to_query() can parse, then melt to long format
# with one row per (subject, session, tail) rating.
ratings = pared_ratings.rename(
    columns={
        'sub_id_gs': 'subject',
        'ses_id_gs': 'session',
        'fmri AP bold run 1': 'dir-AP_run-1_bold',
        'fmri AP bold run 2': 'dir-AP_run-2_bold',
        'fmri PA bold run 1': 'dir-PA_run-1_bold',
        'fmri PA bold run 2': 'dir-PA_run-2_bold',
    }
).melt(id_vars=['subject', 'session'], var_name='tail', value_name='rating')

# Fix run entity for bad run 2: a rating annotated "4 (for Run 3)" belongs to
# run 3, so point the row at run-3 and keep the plain rating.
# The replacement is applied row by row and only to the run entity, so it is a
# no-op when no row carries the annotation, each matching row keeps its own
# direction, and other digits in the tail (e.g. "T2w") are never touched.
fix_index = ratings['rating'] == '4 (for Run 3)'
ratings.loc[fix_index, 'tail'] = ratings.loc[fix_index, 'tail'].str.replace(
    'run-2', 'run-3', regex=False
)
ratings.loc[fix_index, 'rating'] = '4'

# Drop missing ratings ("-"): anything non-numeric is coerced to NaN and removed
ratings['rating'] = pd.to_numeric(ratings['rating'], errors='coerce')
ratings = ratings.dropna()

In [None]:
# Display the long-format ratings table (one row per subject/session/tail).
ratings

In [None]:
# Count the ratings below 3, i.e. the images that will not satisfy the
# "rating > 2" pass criterion applied further down.
sum(ratings['rating'] < 3)

In [None]:
# Index the MRIQC output directory with PyBIDS so the IQM JSON files can be
# looked up by entity.
# NOTE(review): validate=False is presumably needed because the MRIQC output
# tree is not a raw BIDS dataset -- confirm.
# NOTE(review): database_path presumably persists the index on disk so that
# later runs can reuse it -- confirm against the PyBIDS documentation.
layout = bids.BIDSLayout(MRIQC_PATH, validate=False, database_path="./u24mriqc.db")

In [None]:
def make_iqm_record(record, layout):
    """Return a copy of ``record`` extended with the IQMs found for it.

    Parameters
    ----------
    record : dict
        Must contain a ``'tail'`` key (parsed with ``tail_to_query``). All
        other items are passed to ``layout.get`` as filters unchanged.
    layout : object
        Object with a ``get(extension=..., **filters)`` method returning a
        sequence of files that offer ``get_dict()``. The notebook passes a
        ``bids.BIDSLayout``.

    Returns
    -------
    dict
        A new dict; ``record`` itself is never modified or returned. When the
        query matches nothing it holds only the items of ``record``, so the
        IQM columns end up as NaN once the records are put into a DataFrame.
        Otherwise it additionally holds every item of the matched JSON except
        ``'bids_meta'`` and ``'provenance'``.
    """
    out = record.copy()
    query = record.copy()

    tail = query.pop('tail')
    query.update(tail_to_query(tail))
    jsons = layout.get(extension='.json', **query)
    if not jsons:
        # Missing MRIQC results, will just show up as nans
        return out
    # T1w images have norm/non-norm variants, pick norm
    # NOTE(review): this relies on the norm variant being listed last -- confirm.
    jsonfile = jsons[-1]

    iqms = jsonfile.get_dict()
    # Drop the non-IQM sections; tolerate JSON files that lack either of them.
    for meta_key in ('bids_meta', 'provenance'):
        iqms.pop(meta_key, None)

    out.update(iqms)
    return out

In [None]:
# Build one record per rated image by looking up its MRIQC IQMs in the layout.
iqm_records = [
    make_iqm_record(key_fields, layout)
    for key_fields in ratings.loc[:, ['subject', 'session', 'tail']].to_dict('records')
]

In [None]:
# Join the ratings with the IQM records (pd.merge without `on` joins on the
# columns the two frames have in common), then index the result by the
# subject/session/tail values while keeping those columns in the frame.
all_scores = pd.merge(
    ratings,
    pd.DataFrame.from_records(iqm_records),
).pipe(
    lambda merged: merged.set_index(pd.Index(merged[["subject", "session", "tail"]]))
)

# The suffix is whatever follows the last underscore of the tail.
all_scores['suffix'] = all_scores['tail'].map(lambda tail: tail.rsplit('_', 1)[-1])
# Binary label: 1 for ratings above 2, otherwise 0.
all_scores['pass'] = all_scores['rating'].gt(2).astype(int)

In [None]:
# Display the binary pass label (1 when rating > 2, otherwise 0).
all_scores['pass']

In [None]:
# One table per suffix: keep the numeric columns only, drop columns that are
# entirely NaN for that suffix, then drop every row that still contains a NaN.
T1w_scores, T2w_scores, BOLD_scores = (
    all_scores[all_scores.suffix == modality]
    .select_dtypes(include=np.number)
    .dropna(axis=1, how='all')
    .dropna(axis=0)
    for modality in ('T1w', 'T2w', 'bold')
)

In [None]:
# Hold out 10% of each table as a test set; the fixed random_state makes the
# split repeatable.
(T1w_train, T1w_test), (T2w_train, T2w_test), (BOLD_train, BOLD_test) = (
    train_test_split(scores, test_size=0.1, random_state=42)
    for scores in (T1w_scores, T2w_scores, BOLD_scores)
)

In [None]:
# Write the train/test tables to CSV files in the working directory,
# one pair of files per modality.
T1w_train.to_csv('T1w_train.csv')
T1w_test.to_csv('T1w_test.csv')

T2w_train.to_csv('T2w_train.csv')
T2w_test.to_csv('T2w_test.csv')

BOLD_train.to_csv('BOLD_train.csv')
BOLD_test.to_csv('BOLD_test.csv')

In [None]:
# Report the (rows, columns) shape of each per-modality table.
print(*(scores.shape for scores in (T1w_scores, T2w_scores, BOLD_scores)))

In [None]:
# Correlation of every T1w column with the manual rating, sorted ascending;
# entries whose correlation is NaN are dropped.
T1w_scores.corrwith(T1w_scores["rating"]).dropna().sort_values()

In [None]:
# Labels of the correlation result, taken before any NaN entries are dropped.
T1w_scores.corrwith(T1w_scores["rating"]).index

In [None]:
# Correlation of every T1w column with the binary pass label, sorted
# ascending; entries whose correlation is NaN are dropped.
T1w_scores.corrwith(T1w_scores["pass"]).dropna().sort_values()