# Use DeepCCA to transform ADNI features

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import torch
import torch.nn as nn
import numpy as np
from linear_cca import linear_cca
from torch.utils.data import BatchSampler, SequentialSampler
from DeepCCAModels import DeepCCA
from main import Solver
from utils import load_data, svm_classify
from objectives import cca_loss
try:
    import cPickle as thepickle
except ImportError:
    import _pickle as thepickle
from IPython.display import display
import pickle
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import sklearn.model_selection as model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

### Read the database, examine it:

Instead of reading the whole database, we read only the data that's useful to us. That is, we read only specific columns of data, and we take only the row containing the first scan for each person. 

In "ADNI Regressional Analysis.ipynb" we have done that exactly, as well as performed linear regression transformation to the imaging data, in order to remove any age, sex, and DLICV_baseline effect. 

Furthermore, in "ADNI OPNMF.ipynb" we have performed dimensionality reduction through the OPNMF method, reducing the number of the ROIs from 145 to just 18. (Hasn't been done so this does not apply)

The data is located at "./DATA/Reduced_Linearly_Transformed_Unique_Dataset.pkl" 

(Need to run the RA code if data is not found)

In [3]:
unique = pd.read_pickle("./DATA/Linearly_Transformed_Unique_Dataset.pkl")
print(unique.shape)
unique.head(15)

(1302, 208)


Unnamed: 0,PTID,Date,Age,Sex,DLICV_baseline,APOE4_Alleles,APOE_Genotype,Diagnosis_nearest_2.0,MUSE_Volume_4,MUSE_Volume_11,...,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358,Diagnosis_nearest_2.0_cat
0,002_S_0295,2006-04-18,84.742466,0,1485405.375,1.0,E3/E4,CN,-440.777069,-507.297168,...,1,1,1,0,0,0,0,0,1,0
9,002_S_0413,2006-05-02,76.283562,1,1364116.0,0.0,E3/E3,CN,577.755137,-188.813792,...,0,1,1,0,1,0,0,0,0,0
24,002_S_0559,2006-05-23,79.223288,0,1570479.625,1.0,E3/E4,CN,198.499249,1080.290951,...,0,0,0,0,1,0,0,0,0,0
31,002_S_0619,2006-06-01,77.447945,0,1859348.25,2.0,E4/E4,Dementia,2623.687012,649.558822,...,0,0,0,1,1,0,0,0,2,1
45,002_S_0729,2006-07-17,65.056164,1,1166961.75,1.0,E3/E4,MCI,246.226215,628.340793,...,0,0,0,1,1,0,0,0,1,2
64,002_S_0816,2006-08-30,70.767123,0,1444128.125,2.0,E4/E4,Dementia,-145.138564,-193.593195,...,0,0,0,0,1,0,0,0,2,1
69,002_S_0938,2006-10-05,82.167123,1,1309685.0,0.0,E3/E3,Dementia,169.421397,-610.085153,...,0,1,1,0,1,0,0,0,0,1
74,002_S_0954,2006-10-10,69.19863,1,1075661.5,1.0,E3/E4,MCI,-81.66421,1343.833768,...,2,1,1,0,1,0,0,0,1,2
81,002_S_0955,2006-10-11,78.161644,1,1363607.0,1.0,E3/E4,Dementia,1036.385233,-353.324662,...,1,0,0,0,1,0,0,0,1,1
84,002_S_1018,2006-11-29,70.658904,1,1355603.0,0.0,E3/E3,Dementia,-495.01885,486.447691,...,1,1,1,0,0,0,0,0,0,1


### Create the 2 views:

The first view consists of the imaging data, that are in the form of 145 real numbers. Those numbers are based on a prediction from a Linear Regression estimator trained only on the Cognitive Normal datapoints. The predictions then are subtracted from the actual values, and the remaining value (residual) is the datapoint for each ROI.

The second view consists of the 54 SNP (Single Nucleotide Polymorphism, "snip"), for each individual. They are either 0 or 1. 

The 2 views are the most basic views that can be used for the Deep CCA, and in further tests more features will be included.

In [4]:
# View 1:
view_1 = unique.loc[:,"MUSE_Volume_4":"MUSE_Volume_207"]

# View 2:
view_2 = unique.loc[:,"rs4575098":"rs429358"]

In [5]:
print("View 1:")
display(view_1.head())
print("View 2:")
display(view_2.head())

View 1:


Unnamed: 0,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,...,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207
0,-440.777069,-507.297168,-35.171797,-14.510211,90.244138,140.075639,299.828133,63.88968,56.259492,4434.963481,...,745.557312,-188.95447,-1594.432454,-1648.308374,798.003198,-468.672456,-81.798945,283.990527,-134.708868,-102.291612
9,577.755137,-188.813792,35.574764,-39.881572,40.161648,58.255314,-909.95651,-107.325098,118.445748,-932.590538,...,1336.384182,2631.004114,1410.754665,30.295558,-1258.071206,115.187177,-175.177715,-533.517736,-37.990106,-475.586534
24,198.499249,1080.290951,137.416288,142.58683,121.231074,41.449232,1825.886437,-267.694901,6.605333,-947.176391,...,-1764.15837,-2206.292278,1473.087979,532.054466,1714.763199,2469.640085,209.533224,-49.858132,-206.268764,-117.520261
31,2623.687012,649.558822,-162.939446,-122.19178,-329.934406,-351.510297,-3426.992838,-826.297201,-713.213854,-355.750507,...,-641.454806,583.322773,-701.560285,-1369.412583,-2919.253412,-2766.270514,-757.912814,-822.7715,-347.672981,-131.863034
45,246.226215,628.340793,10.979183,24.346908,-165.999584,-114.587813,171.517739,628.498317,88.70557,-521.590388,...,-346.626209,-670.579403,163.045892,1008.186971,-1557.957769,-1396.447884,-146.49525,-188.233592,-200.821122,-254.208574


View 2:


Unnamed: 0,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,...,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
0,0,0,0,0,1,1,1,0,0,1,...,0,1,1,1,0,0,0,0,0,1
9,1,0,0,0,0,0,0,1,0,0,...,1,0,1,1,0,1,0,0,0,0
24,0,0,0,0,1,0,1,0,0,1,...,2,0,0,0,0,1,0,0,0,0
31,0,1,1,1,0,0,0,2,1,1,...,1,0,0,0,1,1,0,0,0,2
45,0,0,0,0,1,1,0,1,0,0,...,1,0,0,0,1,1,0,0,0,1


### Parameters:

In [6]:
# if a gpu exists, torch.device should be 'gpu'
if torch.cuda.is_available():
    device = torch.device('gpu')
else:
    device = torch.device('cpu')
# print("Using", torch.cuda.device_count(), "GPUs")

# the path to save the final learned features
save_to = './DATA/ADNI_initial_DCCA_features.pkl'

# the size of the new space learned by the model (number of the new features)
outdim_size = 50

# size of the input for view 1 and view 2
input_shape1 = 145 # view_1.shape[1]
input_shape2 = 54  # view_2.shape[2]

# number of layers with nodes in each one
# this apparently can be different for each network, some experimentation is needed!
layer_sizes1 = [256, 1024, 1024, outdim_size]
layer_sizes2 = [256, 1024, 1024, outdim_size]
# layer_sizes1 = [64, 128, outdim_size]
# layer_sizes2 = [64, 128, outdim_size]
# the parameters for training the network
learning_rate = 1e-3
epoch_num = 200
batch_size = 1000

# the regularization parameter of the network
# seems necessary to avoid the gradient exploding especially when non-saturating activations are used
reg_par = 1e-3

# specifies if all the singular values should get used to calculate the correlation or just the top 
# outdim_size ones
# if one option does not work for a network or dataset, try the other one
use_all_singular_values = False

# if a linear CCA should get applied on the learned features extracted from the networks
# it does not affect the performance on noisy MNIST significantly
apply_linear_cca = True

###  Building, training, and producing the new features by DCCA

In [7]:
# Convert the pandas dataframe to numpy arrays for pytorch:
view_1_n = view_1.to_numpy()
view_2_n = view_2.to_numpy()

In [8]:
# Scramble the datapoints for randomness:
indices = np.arange(view_1_n.shape[0])
# np.random.shuffle(indices)
view_1_n = view_1_n[indices]
view_2_n = view_2_n[indices].astype(np.float64) # DeepCCA MLP requires double type

print(view_1_n.shape, type(view_1_n), view_1_n.dtype)
print(view_2_n.shape, type(view_2_n), view_2_n.dtype)

view_1_t = torch.from_numpy(view_1_n)
print(view_1_t.shape, type(view_1_t))
view_2_t = torch.from_numpy(view_2_n)
print(view_2_t.shape, type(view_2_t))

(1302, 145) <class 'numpy.ndarray'> float64
(1302, 54) <class 'numpy.ndarray'> float64
torch.Size([1302, 145]) <class 'torch.Tensor'>
torch.Size([1302, 54]) <class 'torch.Tensor'>


In [9]:
data1 = view_1_t
data2 = view_2_t

model = DeepCCA(layer_sizes1, layer_sizes2, input_shape1,
                input_shape2, outdim_size, use_all_singular_values, device=device).double()
l_cca = None
if apply_linear_cca:
    l_cca = linear_cca()
    
    
solver = Solver(model, l_cca, outdim_size, epoch_num, batch_size,
                learning_rate, reg_par, device=device, epoch_log_freq=20)
s_1, s_2 = data1.shape[0], data2.shape[0]

# Split the dataset into training, validation and testing (75%-15%-10%):
train1, train2 = data1[0:int(s_1 * 0.75)], data2[0:int(s_2 * 0.75)]
val1, val2 = data1[int(s_1 * 0.75):int(s_1 * 0.9)], data2[int(s_2 * 0.75):int(s_2 * 0.9)]
test1, test2 = data1[int(s_1 * 0.9):], data2[int(s_2 * 0.9):]

solver.fit(train1, train2, val1, val2, test1, test2)
# TODO: Save linear_cca model if needed

[ INFO : 2022-01-11 17:00:16,600 ] - DataParallel(
  (module): DeepCCA(
    (model1): MlpNet(
      (layers): ModuleList(
        (0): Sequential(
          (0): Linear(in_features=145, out_features=256, bias=True)
          (1): Sigmoid()
          (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
        )
        (1): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): Sigmoid()
          (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
        )
        (2): Sequential(
          (0): Linear(in_features=1024, out_features=1024, bias=True)
          (1): Sigmoid()
          (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
        )
        (3): Sequential(
          (0): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
          (1): Linear(in_features=1024, out_features=50, bias=True)
       

[ INFO : 2022-01-11 17:01:21,576 ] - Epoch 81: val_loss did not improve from -23.0617
[ INFO : 2022-01-11 17:01:21,577 ] - Epoch 81/200 - time: 0.79 - training_loss: -43.7028 - val_loss: -22.7452
[ INFO : 2022-01-11 17:01:37,474 ] - Epoch 101: val_loss did not improve from -23.0617
[ INFO : 2022-01-11 17:01:37,474 ] - Epoch 101/200 - time: 0.79 - training_loss: -44.8188 - val_loss: -22.5855
[ INFO : 2022-01-11 17:01:53,426 ] - Epoch 121: val_loss did not improve from -23.0617
[ INFO : 2022-01-11 17:01:53,427 ] - Epoch 121/200 - time: 0.80 - training_loss: -45.5785 - val_loss: -22.6083
[ INFO : 2022-01-11 17:02:10,228 ] - Epoch 141: val_loss did not improve from -23.0617
[ INFO : 2022-01-11 17:02:10,229 ] - Epoch 141/200 - time: 0.88 - training_loss: -46.1268 - val_loss: -22.7041
[ INFO : 2022-01-11 17:02:26,721 ] - Epoch 161: val_loss did not improve from -23.0617
[ INFO : 2022-01-11 17:02:26,722 ] - Epoch 161/200 - time: 0.81 - training_loss: -46.5432 - val_loss: -22.6211
[ INFO : 202

In [10]:
set_size = [0, 
            train1.size(0), 
            train1.size(0) + val1.size(0), 
            train1.size(0) + val1.size(0) + test1.size(0)]

losses, outputs = solver._get_outputs(data1, data2)
losses = np.round(losses,3)
print(losses)
print(np.mean(losses))

[-48.005 -17.84 ]
-32.9225


In [11]:
print(type(outputs[0]))
print(outputs[0].shape)
print(type(outputs[1]))
print(outputs[1].shape)

<class 'numpy.ndarray'>
(1302, 50)
<class 'numpy.ndarray'>
(1302, 50)


### Saving the new features:

In [12]:
# Saving new features in a gzip pickled file specified by save_to
with open(save_to, 'wb') as f:
    pickle.dump(outputs, f)

### Loading the model:

In [13]:
d = torch.load('checkpoint.model')
solver.model.load_state_dict(d)
solver.model.parameters()

<generator object Module.parameters at 0x7fcaab88fac0>

### Testing the Correlation between inputs and outputs of the deep Network:

In [14]:
l_i = cca_loss(outdim_size, use_all_singular_values, device)
loss_i = l_i.loss(data1[:,0:54], data2).item() # first 54 columns of imaging data
l_o = cca_loss(outdim_size, use_all_singular_values, device)
loss_o = l_o.loss(torch.from_numpy(outputs[0]), torch.from_numpy(outputs[0])).item()

print("Deep Network input correlation: ",  loss_i)
print("Deep Network output correlation: ", loss_o)

Deep Network input correlation:  -9.61433374246316
Deep Network output correlation:  -49.89852184343137


In [15]:
from sklearn.cross_decomposition import CCA

print("CCA on input data:")
X = data1
Y = data2
cca = CCA(n_components=50)
cca.fit(X, Y)
X_c, Y_c = cca.transform(X, Y)
print(cca.score(X, Y))

print("CCA on output data:")
X = outputs[0]
Y = outputs[1]
cca = CCA(n_components=50,max_iter=10000)
cca.fit(X, Y)
X_c, Y_c = cca.transform(X, Y)
print(cca.score(X, Y))

CCA on input data:
-0.4337692736170777
CCA on output data:
0.5283700944219817


### Training and testing of SVM with linear kernel on the view 1 with new features vs old features: (Imaging)

In [36]:
s = svm.LinearSVC(dual=False)
X , Y = view_1, unique["Diagnosis_nearest_2.0_cat"]
parameters = {'C':[0.0001, 0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(s, parameters, n_jobs=-1, cv=10)
clf.fit(X,Y)
print("Best Parameters for untrained data:",clf.best_params_)
acc_untrained = clf.best_score_
print("Untrained Accuracy: ", round(np.mean(acc_untrained)*100,3))

s = svm.LinearSVC(dual=False)
X , Y = outputs[0], unique["Diagnosis_nearest_2.0_cat"]
parameters = {'C':[0.0001, 0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(s, parameters, n_jobs=-1, cv=10)
clf.fit(X,Y)
print("Best Parameters for trained data:",clf.best_params_)
acc_untrained = clf.best_score_
print("Trained Accuracy:   ", round(np.mean(acc_trained)*100,3))

Best Parameters for untrained data: {'C': 1}
Untrained Accuracy:  51.688
Best Parameters for trained data: {'C': 0.001}
Trained Accuracy:    49.162


### Training and testing of SVM with linear kernel on the view 2 with new features vs old features: (Genetic)

In [37]:
s = svm.LinearSVC(dual=False)
X , Y = view_2, unique["Diagnosis_nearest_2.0_cat"]
parameters = {'C':[0.0001, 0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(s, parameters, n_jobs=-1, cv=10)
clf.fit(X,Y)
print("Best Parameters for untrained data:",clf.best_params_)
acc_untrained = clf.best_score_
print("Untrained Accuracy: ", round(np.mean(acc_untrained)*100,3))

s = svm.LinearSVC(dual=False)
X , Y = outputs[1], unique["Diagnosis_nearest_2.0_cat"]
parameters = {'C':[0.0001, 0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(s, parameters, n_jobs=-1, cv=10)
clf.fit(X,Y)
print("Best Parameters for trained data:",clf.best_params_)
acc_untrained = clf.best_score_
print("Trained Accuracy:   ", round(np.mean(acc_trained)*100,3))

Best Parameters for untrained data: {'C': 0.0001}
Untrained Accuracy:  48.08
Best Parameters for trained data: {'C': 10}
Trained Accuracy:    49.162


### Training and testing of SVM with linear kernel on both views with new features vs old features: (Imaging + Genetic)

In [31]:
print(outputs[0].shape)
print(outputs[1].shape)
both = np.concatenate((outputs[0], outputs[1]), axis=1)
print(both.shape)

(1302, 50)
(1302, 50)
(1302, 100)


In [38]:
c = list(unique.columns)
MRI_columns = c[c.index("MUSE_Volume_4"):c.index("MUSE_Volume_207")+1]
genetic_columns = c[c.index("rs4575098"):c.index("rs429358")+1]
columns_of_interest = []
columns_of_interest += MRI_columns + genetic_columns





s = svm.LinearSVC(dual=False)
X , Y = unique[columns_of_interest] , unique["Diagnosis_nearest_2.0_cat"]
parameters = {'C':[0.0001, 0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(s, parameters, n_jobs=-1, cv=10)
clf.fit(X,Y)
print("Best Parameters for untrained data:",clf.best_params_)
acc_untrained = clf.best_score_
print("Untrained Accuracy: ", round(np.mean(acc_untrained)*100,3))

s = svm.LinearSVC(dual=False)
X , Y = both, unique["Diagnosis_nearest_2.0_cat"]
parameters = {'C':[0.0001, 0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(s, parameters, n_jobs=-1, cv=10)
clf.fit(X,Y)
print("Best Parameters for trained data:",clf.best_params_)
acc_untrained = clf.best_score_
print("Trained Accuracy:   ", round(np.mean(acc_trained)*100,3))

Best Parameters for untrained data: {'C': 0.1}
Untrained Accuracy:  52.912
Best Parameters for trained data: {'C': 10}
Trained Accuracy:    49.162
