In [1]:
import pandas as pd
import numpy as np
import os
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler

In [2]:
# Global structural brain measures; any row containing a missing value is discarded.
global_data = (
    pd.read_csv('raw_data/global_brain_measures.csv')
    .dropna(axis=0, how='any')
)

In [3]:
# Scanning positions (x, y, z); rows with any missing value are discarded.
pos_data = (
    pd.read_csv('raw_data/scanning_positions.csv')
    .dropna(axis=0, how='any')
)

# Covariates: the age/sex table plus the columns PC1..PC25 of the PC file.
age_sex = (
    pd.read_csv('raw_data/ukbb_age_sex.csv')
    .dropna(axis=0, how='any')
)
cols = ['eid'] + ['PC' + str(k) for k in range(1, 26)]
PCs = (
    pd.read_csv('raw_data/top_100_PCs.csv')
    .loc[:, cols]
    .dropna(axis=0, how='any')
)

In [4]:
# load PRS data
# Rows with a missing score are dropped, exactly as for the other input tables:
# a NaN left in a PRS column would propagate through the least-squares fit and
# turn that trait's correlations and p-values (and the pooled FDR step) into NaN.
PRS_data = pd.read_csv('raw_data/multi_PRSs.csv').dropna(how='any')

In [5]:
# merge data
# Restrict every table to the subjects ('eid') present in all five inputs and
# put the rows of all tables in the same order.

def _indexed_by_eid(frame):
    """Return `frame` indexed by 'eid'; a frame already indexed that way is returned unchanged."""
    if frame.index.name == 'eid':
        return frame
    return frame.set_index('eid')

# Re-binding (instead of set_index(..., inplace=True)) keeps this cell re-runnable:
# the in-place version raises KeyError on a second run because 'eid' is no
# longer a column by then.
global_data = _indexed_by_eid(global_data)
pos_data = _indexed_by_eid(pos_data)
age_sex = _indexed_by_eid(age_sex)
PCs = _indexed_by_eid(PCs)
PRS_data = _indexed_by_eid(PRS_data)

# Subjects available in every table. Sorting gives a reproducible row order
# rather than one that depends on set iteration order.
l = sorted(
    set(global_data.index)
    & set(pos_data.index)
    & set(age_sex.index)
    & set(PCs.index)
    & set(PRS_data.index)
)

# Same id list for every table, so row k refers to the same subject everywhere;
# reset_index() turns 'eid' back into column 0.
final_global_measures = global_data.loc[l].reset_index()
final_pos = pos_data.loc[l].reset_index()
final_age_sex = age_sex.loc[l].reset_index()
final_PCs = PCs.loc[l].reset_index()
final_PRSs = PRS_data.loc[l].reset_index()

In [6]:
# Assemble the covariate matrix `co` used to residualise every variable.
# Column 0 of each final_* table is 'eid', hence the positional slices below.
# NOTE(review): the slices assume ukbb_age_sex.csv is laid out as eid, sex, age — confirm.
PCs_25 = final_PCs.iloc[:, 1:].values
postions = final_pos.iloc[:, 1:].values
age = final_age_sex.iloc[:, 2:3].values
sex = final_age_sex.iloc[:, 1:2].values + 1  # sex coding shifted up by one before building interactions
age_sq = age * age

# Column order: PCs, age, sex, age^2, age*sex, age^2*sex, scanning positions.
co = np.hstack((PCs_25, age, sex, age_sq, age * sex, age_sq * sex, postions))

In [7]:
# Global structural measures, pulled out of the merged table as plain arrays.
ACT, TCSA, ICV = (
    final_global_measures[measure].values
    for measure in ('ACT', 'TCSA', 'ICV')
)

In [8]:
def regression_covariant(covariant_matrix, y, standard_scale=False):
    """Remove the linear effect of the covariates from ``y``.

    A least-squares model of ``y`` on the covariate columns plus a constant
    column is fitted. The covariate part of the fit is then subtracted from
    ``y``; the fitted constant is *not* subtracted.

    Parameters
    ----------
    covariant_matrix : 2-D array, one row per observation
    y : array with one value per observation
    standard_scale : bool
        If True, the residual is passed through ``StandardScaler`` before
        being returned.

    Returns
    -------
    residual : float64 array
    w : fitted coefficients; the last entry belongs to the constant column
    """
    n_obs = covariant_matrix.shape[0]
    intercept_column = np.ones((n_obs, 1))
    design = np.hstack((covariant_matrix, intercept_column))
    w, *_ = np.linalg.lstsq(design, y, rcond=None)

    # w[:-1] are the covariate slopes; the trailing intercept weight is left out on purpose.
    covariate_effect = covariant_matrix.dot(w[:-1])
    residual = (y - covariate_effect).astype('float64')

    if not standard_scale:
        return residual, w

    scaled = StandardScaler().fit_transform(residual.reshape(-1, 1)).flatten()
    return scaled, w

In [9]:
from scipy import stats

def pearsonr_ci(x,y,alpha=0.05):
    ''' calculate Pearson correlation along with the confidence interval using scipy and numpy

    The interval is obtained with the Fisher z-transformation: r is mapped
    through arctanh, a normal interval with standard error 1/sqrt(n-3) is
    built around it, and the bounds are mapped back with tanh.

    Parameters
    ----------
    x, y : iterable object such as a list or np.array
      Input for correlation calculation
    alpha : float
      Significance level. 0.05 by default
    Returns
    -------
    r : float
      Pearson's correlation coefficient
    pval : float
      The corresponding p value
    lo, hi : float
      The lower and upper bound of confidence intervals
    '''

    # Convert up front so plain lists/tuples work as documented; the sample
    # size below is read from `.size`, which Python lists do not have.
    x = np.asarray(x)
    y = np.asarray(y)

    r, p = stats.pearsonr(x,y)
    r_z = np.arctanh(r)
    # Standard error of Fisher's z; only meaningful for more than 3 observations.
    se = 1/np.sqrt(x.size-3)
    z = stats.norm.ppf(1-alpha/2)
    lo_z, hi_z = r_z-z*se, r_z+z*se
    lo, hi = np.tanh((lo_z, hi_z))
    return r, p, lo, hi

In [27]:
# Partial correlations between each PRS and the three global brain measures:
# both sides are residualised on the covariate matrix `co` and standardised,
# then correlated.

# PRS matrix = every column of final_PRSs after the leading 'eid' column.
# Take a copy: `.values` may hand back a view onto the DataFrame, in which case
# the in-place sign flips below would also modify final_PRSs and would flip
# back again every time this cell is re-run.
X = final_PRSs.iloc[:,1:].values.copy()

# NOTE(review): columns 7, 8 and 11 (0-based, 'eid' excluded) are negated —
# presumably these scores are coded in the opposite direction; confirm against
# the header of multi_PRSs.csv.
flipped_cols = [7, 8, 11]
X[:, flipped_cols] = -X[:, flipped_cols]

n_traits = X.shape[1]

# One row per PRS; columns are TCSA, ACT, ICV (in that order).
R = np.empty([n_traits,3])
P = np.empty([n_traits,3])
Low = np.empty([n_traits,3])
High = np.empty([n_traits,3])

# Residualised, standardised brain measures (the fitted weights are not used).
rTCSA, w = regression_covariant(co,TCSA,standard_scale=True)
rACT, w = regression_covariant(co,ACT,standard_scale=True)
rICV, w = regression_covariant(co,ICV,standard_scale=True)
brain_residuals = (rTCSA, rACT, rICV)

for i in range(n_traits):
    rx, w1 = regression_covariant(co,X[:,i],standard_scale=True)
    for j, r_brain in enumerate(brain_residuals):
        r, p, lo, hi = pearsonr_ci(rx, r_brain)
        R[i,j] = r
        P[i,j] = p
        # Interval bounds are stored rounded to 4 decimals for reporting.
        Low[i,j] = round(lo, 4)
        High[i,j] = round(hi, 4)

In [28]:
# Render every confidence interval as the text "(low , high)".
# CIs1 / CIs2 / CIs3 are built from columns 0 / 1 / 2 of Low and High, one entry per trait.
CIs1, CIs2, CIs3 = (
    ['(' + str(Low[row, col]) + ' , ' + str(High[row, col]) + ')' for row in range(n_traits)]
    for col in range(3)
)

In [40]:
from statsmodels.stats import multitest

def fdr_correction(P):
    """Adjust an array of p-values with ``multipletests(method='fdr_bh')``.

    All entries of ``P`` are flattened and corrected together as one family
    (alpha=0.05); the adjusted p-values are returned in the shape of ``P``.
    """
    original_shape = P.shape
    # multipletests returns a tuple; element 1 holds the adjusted p-values.
    adjusted = multitest.multipletests(P.flatten(), alpha=0.05, method='fdr_bh')[1]
    return adjusted.reshape(original_shape)

  import pandas.util.testing as tm


In [41]:
# Save the correlation coefficients, the raw p-values and the FDR-adjusted
# p-values as CSV tables (one row per PRS, one column per brain measure).
out_dir = 'results/global_structural_measures'
# to_csv does not create missing folders; make sure the target exists so the
# notebook also runs on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

re_cols = ['TCSA','ACT','ICV']

re_R = pd.DataFrame(data=R,columns=re_cols)
re_R.to_csv(out_dir + '/re_R.csv',index=False)

re_P = pd.DataFrame(data=P,columns=re_cols)
re_P.to_csv(out_dir + '/re_P.csv',index=False)

# Correction is applied across the whole P matrix at once.
correct_P = fdr_correction(P)
re_P_corrected = pd.DataFrame(data=correct_P,columns=re_cols)
re_P_corrected.to_csv(out_dir + '/re_P_corrected.csv',index=False)

In [29]:
# Save the formatted confidence intervals, one text column per brain measure.
ci_dir = 'results/global_structural_measures'
# to_csv does not create missing folders; make sure the target exists first.
os.makedirs(ci_dir, exist_ok=True)

CI_data = pd.DataFrame({'TCSA': CIs1, 'ACT': CIs2, 'ICV': CIs3})
CI_data.to_csv(ci_dir + '/CIs.csv',index=False)