## Import and Clean Data

In [None]:
#import
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.metrics import explained_variance_score, r2_score
from sklearn.linear_model import Ridge

#read the connectivity matrices from disk
#fc and sc hold one upper-triangular vector per subject:
#rows are subjects, columns are pairwise connectivity values
fc = pd.read_csv('fc.csv', header=None).values
sc = pd.read_csv('sc.csv', header=None).values

#hybrid connectivity: sc columns first, fc columns appended after them
hc = np.hstack((sc, fc))

#read the cognition data; rows are subjects
cognition = pd.read_csv('cognition.csv', header=None).values

#feature matrix that is fed into the model (swap in another matrix here if desired)
data = hc

In [None]:
#flag the subjects whose cognition row contains no NaN at all
complete_rows = ~np.isnan(cognition).any(axis=1)

#keep only those subjects in both the features and the cognition scores,
#so the two arrays stay aligned row by row
data_clean = data[complete_rows]
cog_clean = cognition[complete_rows]

## Make Model Choices

In [None]:
#number of permutations (random train/test splits) and number of
#cross-validation loops repeated inside each permutation
permutations = 100
cv_loops = 10

#hyperparameter grid-search space: alpha = 10, 20, ..., 100
alphas = [x*10+10 for x in range(10)]

#model to tune and the parameter grid handed to the grid search
#NOTE(review): Ridge's `normalize` argument was deprecated in scikit-learn 1.0
#and removed in 1.2, so this line needs an older scikit-learn -- confirm the
#version this notebook is pinned to before upgrading
regr = Ridge(max_iter=1000000, normalize=True)
paramGrid = {'alpha': alphas}

#cognition metric to predict: the first column of the cleaned cognition data
cog_metric = cog_clean[:,0]

## Run Prediction Model

In [None]:
#dimensions shared by the result arrays below:
#size of the 20% test split (rounded up) and number of features per subject
n_test = int(np.ceil(data_clean.shape[0]*0.2))
n_features = data_clean.shape[1]

#one value per permutation
r2 = np.zeros(permutations)
var = np.zeros(permutations)
correlation = np.zeros(permutations)
opt_alpha = np.zeros(permutations)

#one row per permutation
tuned_alphas = np.zeros((permutations, cv_loops))
predictions = np.zeros((permutations, n_test))
cog_test = np.zeros((permutations, n_test))
feat_imp = np.zeros((permutations, n_features))

#iterate through the permutations; each one draws its own train/test split,
#tunes alpha on the training set and evaluates a refit model on the test set
for perm in range(permutations):

    print("Permutation %d" % perm)
    print(time.localtime(time.time()))

    #split data into training and testing set (the permutation index seeds the shuffle)
    x_train, x_test, y_train, y_test = train_test_split(
        data_clean, cog_metric, test_size=0.2, shuffle=True, random_state=perm)

    #create variables to store scores and params from cross-validation loop
    #(nested_scores is collected for inspection only; nothing later in this cell reads it)
    nested_scores = []
    best_params = []

    #iterate through cross-validation loops
    for i in range(cv_loops):

        #set parameters for inner and outer loops for CV (the loop index seeds the shuffle)
        inner_cv = KFold(n_splits=5, shuffle=True, random_state=i)
        outer_cv = KFold(n_splits=5, shuffle=True, random_state=i)

        #define regressor with grid-search CV for inner loop
        gridSearch = GridSearchCV(estimator=regr, param_grid=paramGrid, n_jobs=-1, verbose=0, cv=inner_cv)

        #fit regression model using inner cross-validation
        gridSearch.fit(x_train, y_train)

        #save parameters corresponding to the best score
        best_params.append(list(gridSearch.best_params_.values()))

        #evaluate model trained on inner loop for CV using outer loop
        nested_score = cross_val_score(gridSearch, X=x_train, y=y_train, cv=outer_cv,
                                       scoring='r2', verbose=0)

        #save r2 scores from outer loop
        nested_scores.append(np.median(nested_score))

    #save best params (the file is overwritten on every permutation)
    with open('best_params.txt', 'w') as file:
        for listitem in best_params:
            file.write('{}\n'.format(listitem))

    #extract the tuned hyperparameter of each loop straight from memory:
    #the first entry of every saved parameter list, as a float
    cv_alphas = [float(params[0]) for params in best_params]

    #save tuned hyperparameters
    tuned_alphas[perm,:] = np.asarray(cv_alphas)

    #choose optimised hyperparameter based on median of tuned hyperparameters
    opt_alpha[perm] = np.median(cv_alphas)

    #fit model with optimised hyperparameters on entire training set
    #NOTE(review): `normalize` was removed from Ridge in scikit-learn 1.2 -- confirm pinned version
    model = Ridge(alpha=opt_alpha[perm], max_iter=1000000, normalize=True)
    model.fit(x_train, y_train)

    #evaluate test r2
    r2[perm] = model.score(x_test,y_test)

    #store test y values
    cog_test[perm,:] = y_test

    #store model predictions
    predictions[perm,:] = model.predict(x_test)

    #compute explained variance
    var[perm] = explained_variance_score(cog_test[perm,:], predictions[perm,:])

    #compute Pearson correlation between true and predicted y
    correlation[perm] = np.corrcoef(cog_test[perm,:], predictions[perm,:])[1,0]

    #store feature importance from model
    feat_imp[perm,:] = model.coef_
    

## Save Results

In [None]:
#write each result array to its own comma-delimited text file
#(these statements sit outside the permutation loop, so they run after it has finished)
#NOTE(review): tuned_alphas is not written out here -- confirm that is intended
result_files = [
    ('r2.txt', r2),
    ('var.txt', var),
    ('correlation.txt', correlation),
    ('opt_alpha.txt', opt_alpha),
    ('predictions.txt', predictions),
    ('cog_test.txt', cog_test),
    ('feat_imp.txt', feat_imp),
]
for out_name, out_array in result_files:
    np.savetxt(out_name, out_array, delimiter=',')