In [1]:
# Prepare

%matplotlib notebook

import numpy as np
import pandas as pd
import scipy
import time
import warnings

from sklearn.linear_model import LinearRegression

In [2]:
efeatures_NMA=pd.read_csv('../data/efeatures_NMA_20.csv')
Xe=np.array(efeatures_NMA.drop(columns=['data','ttype','cellnames']))

geneExp_NMA=pd.read_csv('../data/geneExp_NMA_20.csv')
Xg=np.array(geneExp_NMA.drop(columns=['data','ttype','cellnames']))

Xe = Xe - np.mean(Xe, axis=0)
Xe = Xe / np.std(Xe, axis=0)

Xg = Xg - np.mean(Xg, axis=0)
Xg = Xg / np.std(Xg, axis=0)

print('Shape of Xe:', Xe.shape, '\nShape of Xg:', Xg.shape)

Shape of Xe: (1208, 20) 
Shape of Xg: (1208, 20)


In [3]:
Xg_3=Xg[:,0:3:]
Xe_3=Xe[:,0:3:]

reg = LinearRegression().fit(Xg_3,Xe_3)
r2 = reg.score(Xg_3, Xe_3, sample_weight=None)

r2

0.9769418404940772

## Cross Validation

In [4]:
def cv(X, Y, reps=20, folds=10, dims=np.array([3,5,7,9,11,13,15,20]), seed=42):

    r2 = np.zeros((folds, reps, len(dims))) * np.nan
    
    np.random.seed(seed)
    t = time.time()
    n=X.shape[0]
    for m,dim in enumerate(dims):
            X = Xg[:,0:dim:]
            Y = Xe[:,0:dim:]

            for rep in range(reps):
                #print(rep+1, end='')
                ind = np.random.permutation(n)
                X = X[ind,:]
                Y = Y[ind,:]
            
                # CV folds
                for cvfold in range(folds):
                    #print('.', end='')
    
                    indtest  = np.arange(cvfold*int(n/folds), (cvfold+1)*int(n/folds))
                    indtrain = np.setdiff1d(np.arange(n), indtest)
                    Xtrain = np.copy(X[indtrain,:])
                    Ytrain = np.copy(Y[indtrain,:])
                    Xtest  = np.copy(X[indtest,:])
                    Ytest  = np.copy(Y[indtest,:])
                
                    # mean centering
                    X_mean = np.mean(Xtrain, axis=0)
                    Xtrain -= X_mean
                    Xtest  -= X_mean
                    Y_mean = np.mean(Ytrain, axis=0)
                    Ytrain -= Y_mean
                    Ytest  -= Y_mean
                    
                    fit = LinearRegression().fit(Xtrain,Ytrain)
                    r2[cvfold, rep, m] =1-np.sum((fit.predict(Xtest)-Ytest)**2)/np.sum(Xtest**2)
                
    t = time.time() - t
    min,s = divmod(t, 60)
    h,min = divmod(min, 60)
    print('Time: {}h {:2.0f}min {:2.0f}s'.format(h,min,s))    
    
    return r2

In [5]:
cv_result = cv(X=Xg,Y=Xe)
cv_mean = np.nanmean(cv_result, axis=(0,1))

Time: 0.0h  0min  1s


In [6]:
cv_result

array([[[0.97816906, 0.97726467, 0.97077795, ..., 0.95144781,
         0.96104923, 0.94764038],
        [0.9711292 , 0.96924817, 0.96092604, ..., 0.94411148,
         0.96832167, 0.95793476],
        [0.98068407, 0.97359963, 0.96535019, ..., 0.96433461,
         0.95667512, 0.94533254],
        ...,
        [0.97444722, 0.96357513, 0.96254073, ..., 0.95628798,
         0.95047713, 0.94328988],
        [0.97611957, 0.96627017, 0.97753102, ..., 0.96262242,
         0.95667156, 0.95106707],
        [0.9781545 , 0.97166498, 0.95611437, ..., 0.95135001,
         0.95188547, 0.9534309 ]],

       [[0.97358047, 0.96904416, 0.95673923, ..., 0.95638089,
         0.94966538, 0.95271288],
        [0.97046037, 0.96904715, 0.97174882, ..., 0.96610511,
         0.96012551, 0.94785334],
        [0.98073256, 0.96288765, 0.95412499, ..., 0.9618297 ,
         0.95691972, 0.94698089],
        ...,
        [0.98515507, 0.95540698, 0.97038493, ..., 0.95425763,
         0.95522904, 0.95902335],
        [0.9

In [7]:
cv_mean

array([0.97673486, 0.96703491, 0.96409212, 0.96161062, 0.95876097,
       0.95733931, 0.95539223, 0.95234821])