In [1]:
#### load packages ####
import os

import nibabel as nib
import nilearn
import numpy as np
import pandas as pd

### Model Selection
* **Behavioral metric** : Change in matrix reasoning score (T2 - T1)
* **Morphological metric** : Change in mean cortical thickness (T2 - T1)

We use a LASSO regression as a form of feature selection to determine which sulci, if any, predict change in matrix reasoning performance. We use cross-validation to tune the shrinkage parameter (alpha) and select the model that minimizes cross-validated mean squared error.

In [2]:
# import from sklearn CV and lasso
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.linear_model import Lasso

# Shared leave-one-out splitter: lasso_ uses it as the default `cv` argument
# and lm_loocv iterates over its .split() to build train/test indices.
loo = LeaveOneOut()

In [3]:
## Lasso ##
def lasso_(Xr, yr, alpha_vals, n_folds = loo):
    '''
        Tune the LASSO shrinkage parameter with a cross-validated grid search
        and report the winning model.

        input
        ----------
        Xr = predictors
        yr = DV
        alpha_vals = parameter grid handed to GridSearchCV,
                     e.g. {'alpha': [.01, .1, 1]}
        n_folds = cross-validation strategy passed to GridSearchCV as `cv`.
                  Default is loo

        Returns
        --------
        best_alpha = best parameter setting found (GridSearchCV.best_params_,
                     a dict such as {'alpha': 0.07})
        best_MSE = best mean cross-validated score; the scorer is
                   'neg_mean_squared_error', so this is the *negative* MSE
        coefficients = coef_ of the best estimator
    '''

    # determine the alpha-value that minimizes MSE with GridSearchCV.
    lasso_regressor = GridSearchCV(Lasso(), alpha_vals, scoring = 'neg_mean_squared_error', cv = n_folds)
    lasso_regressor.fit(Xr, yr)

    # best alpha and (negative) MSE
    best_alpha = lasso_regressor.best_params_
    best_MSE = lasso_regressor.best_score_

    # GridSearchCV is left at its default refit=True, so best_estimator_ has
    # already been refit on all of Xr, yr -- no second fit (and no discarded
    # predict call) is needed before reading the coefficients.
    best_model = lasso_regressor.best_estimator_

    return best_alpha, best_MSE, best_model.coef_



In [4]:
## load data (in wide format) for each hemisphere ##

# right hemisphere table is read first, then the left
right_hemi_ct, left_hemi_ct = (
    pd.read_csv('devpfc_matrixr_lasso_rh.csv'),
    pd.read_csv('devpfc_matrixr_lasso_lh.csv'),
)



In [8]:
## set-up model ##

# predictors: one column name per sulcus
Xr = [
    'central',
    'ifs',
    'imfs_h',
    'imfs_v',
    'iprs',
    'sprs',
    'sfs_a',
    'sfs_p',
    'pmfs_a',
    'pmfs_i',
    'pmfs_p',
    'aalf',
    'half',
    'ds',
    'ts',
    'prts',
    'lfms',
]

# DV (kept as a one-item list of column names)
yr = ['MatrixR_change']

# candidate alpha values, laid out as a GridSearchCV parameter grid
alpha = dict(alpha=[0.005, 0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.3, 0.5, 0.7, 1])


#### Right hemisphere lasso

In [9]:
## Lasso on RH sulci ##

# result is the tuple (best alpha, negative MSE, betas) returned by lasso_
rh_lasso = lasso_(Xr=right_hemi_ct[Xr], yr=right_hemi_ct[yr], alpha_vals=alpha)

In [10]:
## best RH model ##
# one item per line: alpha, neg MSE, predictor names, beta-vals
print(rh_lasso[0], rh_lasso[1], Xr, rh_lasso[2], sep='\n')

{'alpha': 0.5}
-26.756235827664405
['central', 'ifs', 'imfs_h', 'imfs_v', 'iprs', 'sprs', 'sfs_a', 'sfs_p', 'pmfs_a', 'pmfs_i', 'pmfs_p', 'aalf', 'half', 'ds', 'ts', 'prts', 'lfms']
[ 0.  0.  0. -0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


##### Right hemisphere sulci are not related:

'alpha' = 0.5

MSE = (-) 26.756235827664405

betas = 0.  0.  0. -0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.

#### Left hemisphere lasso

In [11]:
# result is the tuple (best alpha, negative MSE, betas) returned by lasso_
lh_lasso = lasso_(Xr=left_hemi_ct[Xr], yr=left_hemi_ct[yr], alpha_vals=alpha)

In [12]:
## best LH model ##
# one item per line: alpha, neg MSE, predictor names, beta-vals
print(lh_lasso[0], lh_lasso[1], Xr, lh_lasso[2], sep='\n')

{'alpha': 0.07}
-24.283969026526297
['central', 'ifs', 'imfs_h', 'imfs_v', 'iprs', 'sprs', 'sfs_a', 'sfs_p', 'pmfs_a', 'pmfs_i', 'pmfs_p', 'aalf', 'half', 'ds', 'ts', 'prts', 'lfms']
[ 0.          0.         -0.         -0.          0.          0.
 -0.          0.         -1.49273578  0.         -0.         14.90415914
  0.         -0.          0.          2.88950575 -8.4316517 ]


##### Left hemisphere sulci are related:

'alpha' = .07,
MSE = (-) 24.28

'pmfs_a' = -1.49
'aalf' = 14.9
'prts' = 2.89
'lfms' = -8.43

### Model Evaluation
After determining which sulci are related to change in matrix reasoning performance, we use these sulci to construct a model to determine predictiveness. 
To account for the wide developmental age range and the correlation between baseline age and change in reasoning performance (r = -0.43, p = .004), we also include baseline age as a predictor. 
We compare this lasso-selected model to alternative nested models. 

In [13]:
## load sklearn functions for loocv ##
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.utils import resample

#### Fit linear regression with loocv
We fit all of our models using leave-one-out cross-validation, which is appropriate for small sample sizes.

In [14]:
# LOOCV
def lm_loocv (Xr, yr, mod):
    """
    Linear regression evaluated with a leave-one-subject-out cross-validation procedure.

    input: Xr = matrix or array of predictors from dataframe in format X = df[['X1', 'X2', 'Xn']]
    yr = array of the DV of interest from dataframe in format y = df['yr']
    mod = model name as a string; the measured/predicted values are written to
          "plots/<mod>.csv" for later use (the "plots" folder is created if missing)
    returns: r_squared-cv, mean squared error, dataframe of measured and predicted y-values.

    """

    # the predictions are written under plots/; create the folder up front so
    # a fresh checkout does not fail at the very last step
    os.makedirs("plots", exist_ok=True)

    # held-out targets and their predictions, one entry per left-out row
    ytests = []
    ypreds = []

    # set input arrays as np objects (positional indexing below requires arrays)
    X_array = np.array(Xr)
    y_array = np.array(yr)

    # split into train and test: each iteration holds out a single row
    for train_idx, test_idx in loo.split(X_array):
        X_train, X_test = X_array[train_idx], X_array[test_idx]
        y_train, y_test = y_array[train_idx], y_array[test_idx]

        # fit a fresh model on the training rows only
        lm = linear_model.LinearRegression()
        lm.fit(X_train, y_train)

        # generate the prediction for the held-out row
        y_pred = lm.predict(X_test)

        # there is only one y-test and y-pred per iteration over the loo.split,
        # so we append each score to the respective list
        ytests += list(y_test)
        ypreds += list(y_pred)

    # R2-cv and mean squared error come from comparing the pooled held-out
    # targets with the pooled predictions
    rr = metrics.r2_score(ytests, ypreds)
    ms_error = metrics.mean_squared_error(ytests, ypreds)

    # save predicted/measured scores as a dataframe (and write to csv)
    model_preds = pd.DataFrame({"Measured": ytests, "Predicted": ypreds})
    model_preds.to_csv(path_or_buf= "plots/" + mod + ".csv")

    # return regression fit metrics
    return rr , ms_error, model_preds

#### Setup data
Only left hemisphere sulci were associated with change in matrix reasoning, so we first select only those sulci

#### Model 1a - lasso-selected sulci + baseline age ####
First we fit the model derived from the left hemisphere findings. Four sulci are included as predictors in the model. We also include baseline age as that is significantly related to change in MatrixR (r = -0.43, p = .004).

In [16]:
## set model for selected sulci ##

# predictors: baseline age plus change in thickness of the selected sulci
Xr = ['baseline_age', 'pmfs_a', 'aalf', 'prts', 'lfms']

# outcome: change in matrix reasoning score
yr = 'MatrixR_change'

## predict reasoning score ##
mod_1a = lm_loocv(Xr=left_hemi_ct[Xr], yr=left_hemi_ct[yr], mod="model_1a")

# model fit is judged from the cross-validated R2 and MSE, printed in that order
print(*mod_1a[:2])

0.2188021381072427 19.941036606605927


Model is strongly predictive of change in matrix reasoning scores (R2 = .22, MSE = 19.94). 

#### Model 2 - baseline age alone
To assess the unique variance *not* due to age, we can remove the sulci from the model and use baseline age as the sole predictor. 

In [19]:
## set model for age alone ##
# baseline age is the only predictor in this model
Xr = ['baseline_age']

In [20]:
## predict reasoning score ##
# yr is reused from the cell that set up model 1a
mod_2 = lm_loocv(Xr=left_hemi_ct[Xr], yr=left_hemi_ct[yr], mod="model_2")

In [21]:
# cross-validated R2 first, then MSE
print(*mod_2[:2])

0.09766803446319794 23.033133644892366


Removing the tertiary sulci from the model decreases the model fit (R2 = .098, MSE = 23.03). Thus, the change in thickness of these four sulci is explaining additional variance beyond what is explained by baseline age. 

#### Model 3 - all left hemisphere sulci + baseline age
Finally, we can test a model that includes all left hemisphere LPFC sulci and baseline age.

In [22]:
## set model for all sulci + baseline age ##
Xr = [
    'baseline_age',
    'central',
    'ifs',
    'imfs_h',
    'imfs_v',
    'iprs',
    'sprs',
    'sfs_a',
    'sfs_p',
    'pmfs_a',
    'pmfs_i',
    'pmfs_p',
    'aalf',
    'half',
    'ds',
    'ts',
    'prts',
    'lfms',
]

In [23]:
## predict reasoning score ##
# NOTE(review): the predictions are saved under the name "model_1c" although
# the result is stored in mod_3 -- confirm the file name is the intended one.
mod_3 = lm_loocv(Xr=left_hemi_ct[Xr], yr=left_hemi_ct[yr], mod="model_1c")

# cross-validated R2 first, then MSE
print(*mod_3[:2])

-0.6331228579465626 41.687470334971266


Model with all sulci is notably worse (R2 = -0.63, MSE = 41.69). 

#### Correlate model predictions
To further characterize the fit of our best model (model 1a) we can correlate our predicted scores from the model with the subjects' actual matrix reasoning scores. 

In [24]:
## model 1a ##
# last item returned by lm_loocv is the measured/predicted dataframe
mod_1a_preds = mod_1a[-1]
# rank correlation between the Measured and Predicted columns
mod_1a_preds.corr('spearman')

Unnamed: 0,Measured,Predicted
Measured,1.0,0.461514
Predicted,0.461514,1.0


Predicted change in reasoning scores correlates strongly with the actual measured scores (Spearman's r = .46).