# Testing DCCA with a different objective function

## Importing dependencies:

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import torch
import torch.nn as nn
from linear_cca import linear_cca
from torch.utils.data import BatchSampler, SequentialSampler
from DeepCCAModels import DeepCCA
from main import Solver
from utils import load_data, svm_classify
from objectives import cca_loss
try:
    import cPickle as thepickle
except ImportError:
    import _pickle as thepickle
from IPython.display import display
import pickle
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import sklearn.model_selection as model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import pprint
from sklearn import preprocessing

## Reading the data:

In [12]:
# Load the pickled dataset into a DataFrame, report its dimensions and preview
# the first ten rows.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
dataset_path = "./DATA/Linearly_Transformed_Unique_Dataset.pkl"
unique = pd.read_pickle(dataset_path)
print(unique.shape)
unique.head(10)

(1302, 209)


Unnamed: 0,PTID,MRID,Date,Age,Sex,DLICV_baseline,APOE4_Alleles,APOE_Genotype,Diagnosis_nearest_2.0,MUSE_Volume_4,...,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358,Diagnosis_nearest_2.0_cat
0,002_S_0295,002_S_0295_2006-04-18,2006-04-18,84.742466,0,1485405.375,1.0,E3/E4,CN,-401.428503,...,1,1,1,0,0,0,0,0,1,0
9,002_S_0413,002_S_0413_2006-05-02,2006-05-02,76.283562,1,1364116.0,0.0,E3/E3,CN,596.355045,...,0,1,1,0,1,0,0,0,0,0
24,002_S_0559,002_S_0559_2006-05-23,2006-05-23,79.223288,0,1570479.625,1.0,E3/E4,CN,224.87456,...,0,0,0,0,1,0,0,0,0,0
31,002_S_0619,002_S_0619_2006-06-01,2006-06-01,77.447945,0,1859348.25,2.0,E4/E4,Dementia,2633.277779,...,0,0,0,1,1,0,0,0,2,1
45,002_S_0729,002_S_0729_2006-07-17,2006-07-17,65.056164,1,1166961.75,1.0,E3/E4,MCI,256.289641,...,0,0,0,1,1,0,0,0,1,2
64,002_S_0816,002_S_0816_2006-08-30,2006-08-30,70.767123,0,1444128.125,2.0,E4/E4,Dementia,-126.260419,...,0,0,0,0,1,0,0,0,2,1
69,002_S_0938,002_S_0938_2006-10-05,2006-10-05,82.167123,1,1309685.0,0.0,E3/E3,Dementia,200.102369,...,0,1,1,0,1,0,0,0,0,1
74,002_S_0954,002_S_0954_2006-10-10,2006-10-10,69.19863,1,1075661.5,1.0,E3/E4,MCI,-60.539913,...,2,1,1,0,1,0,0,0,1,2
81,002_S_0955,002_S_0955_2006-10-11,2006-10-11,78.161644,1,1363607.0,1.0,E3/E4,Dementia,1058.028132,...,1,0,0,0,1,0,0,0,1,1
84,002_S_1018,002_S_1018_2006-11-29,2006-11-29,70.658904,1,1355603.0,0.0,E3/E3,Dementia,-485.048304,...,1,1,1,0,0,0,0,0,0,1


### Scaling the data:

This is needed for the SKL CCA objective function to work properly:

In [13]:
# Standardise the MRI volume columns in place; every other column is left as-is.
c = list(unique.columns)
# The MRI features are the contiguous column run from "MUSE_Volume_4" through
# "MUSE_Volume_207" (both ends included, hence the +1 on the slice end).
first_mri = c.index("MUSE_Volume_4")
last_mri = c.index("MUSE_Volume_207")
MRI_columns = c[first_mri:last_mri + 1]

# NOTE(review): the scaler is fit on every row, i.e. before the
# train/validation/test split performed further down -- confirm this is intended.
scaler = preprocessing.StandardScaler()
unique_scaled = scaler.fit_transform(unique[MRI_columns])
unique[MRI_columns] = unique_scaled
unique.head(10)

Unnamed: 0,PTID,MRID,Date,Age,Sex,DLICV_baseline,APOE4_Alleles,APOE_Genotype,Diagnosis_nearest_2.0,MUSE_Volume_4,...,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358,Diagnosis_nearest_2.0_cat
0,002_S_0295,002_S_0295_2006-04-18,2006-04-18,84.742466,0,1485405.375,1.0,E3/E4,CN,-0.854457,...,1,1,1,0,0,0,0,0,1,0
9,002_S_0413,002_S_0413_2006-05-02,2006-05-02,76.283562,1,1364116.0,0.0,E3/E3,CN,0.712451,...,0,1,1,0,1,0,0,0,0,0
24,002_S_0559,002_S_0559_2006-05-23,2006-05-23,79.223288,0,1570479.625,1.0,E3/E4,CN,0.129082,...,0,0,0,0,1,0,0,0,0,0
31,002_S_0619,002_S_0619_2006-06-01,2006-06-01,77.447945,0,1859348.25,2.0,E4/E4,Dementia,3.911211,...,0,0,0,1,1,0,0,0,2,1
45,002_S_0729,002_S_0729_2006-07-17,2006-07-17,65.056164,1,1166961.75,1.0,E3/E4,MCI,0.178416,...,0,0,0,1,1,0,0,0,1,2
64,002_S_0816,002_S_0816_2006-08-30,2006-08-30,70.767123,0,1444128.125,2.0,E4/E4,Dementia,-0.422337,...,0,0,0,0,1,0,0,0,2,1
69,002_S_0938,002_S_0938_2006-10-05,2006-10-05,82.167123,1,1309685.0,0.0,E3/E3,Dementia,0.09018,...,0,1,1,0,1,0,0,0,0,1
74,002_S_0954,002_S_0954_2006-10-10,2006-10-10,69.19863,1,1075661.5,1.0,E3/E4,MCI,-0.31913,...,2,1,1,0,1,0,0,0,1,2
81,002_S_0955,002_S_0955_2006-10-11,2006-10-11,78.161644,1,1363607.0,1.0,E3/E4,Dementia,1.437457,...,1,0,0,0,1,0,0,0,1,1
84,002_S_1018,002_S_1018_2006-11-29,2006-11-29,70.658904,1,1355603.0,0.0,E3/E3,Dementia,-0.985773,...,1,1,1,0,0,0,0,0,0,1


### Check for NaN values:

In [16]:
# True if at least one cell anywhere in the frame is missing (NaN/None).
unique.isna().to_numpy().any()

False

## Building the Model:

In [14]:
# Standard parameters that shouldn't be changed:
device = torch.device('cpu')
outdim_size = 145
layer_sizes1 = [25, 100, 100, outdim_size]
layer_sizes2 = [25, 100, 100, outdim_size]
learning_rate = 1e-4
epoch_num = 100
epoch_log_freq = 1
# NOTE(review): with the 1302 rows printed when the data was loaded, the 75%
# training split has 976 rows, which is fewer than batch_size -- confirm how
# Solver batches in that case.
batch_size = 1000
reg_par = 1e-3
use_all_singular_values = False
l_cca = linear_cca()

# Data: view 1 is the MUSE_Volume_* (MRI) column range, view 2 is the
# "rs4575098".."rs429358" column range; .loc label slices include both ends.
view_1 = unique.loc[:, "MUSE_Volume_4":"MUSE_Volume_207"]
view_2 = unique.loc[:, "rs4575098":"rs429358"]

# Take the network input widths from the data rather than hard-coding them, so
# they cannot drift out of sync with the selected columns.
input_shape1 = view_1.shape[1]
input_shape2 = view_2.shape[1]

# Convert the pandas dataframes to numpy arrays for pytorch:
view_1_n = view_1.to_numpy()
view_2_n = view_2.to_numpy()
# Identity ordering: shuffling is deliberately left disabled, so the split
# below follows the row order of `unique`.
indices = np.arange(view_1_n.shape[0])
# np.random.shuffle(indices)
# The model is built with .double(), so both views are cast to float64.
view_1_n = view_1_n[indices].astype(np.float64)
view_2_n = view_2_n[indices].astype(np.float64)

view_1_t = torch.from_numpy(view_1_n)
view_2_t = torch.from_numpy(view_2_n)

data1 = view_1_t
data2 = view_2_t

model = DeepCCA(layer_sizes1, layer_sizes2, input_shape1, input_shape2, outdim_size, use_all_singular_values, device=device).double()
solver = Solver(model, l_cca, outdim_size, epoch_num, batch_size, learning_rate, reg_par, device=device, epoch_log_freq=epoch_log_freq, log=True)

s_1, s_2 = data1.shape[0], data2.shape[0]
# Row i of view 1 must pair with row i of view 2; fail loudly if they ever
# disagree instead of silently producing misaligned splits.
if s_1 != s_2:
    raise ValueError(f"views have different numbers of rows: {s_1} vs {s_2}")

# Split the dataset into training, validation and testing (75%-15%-10%):
train_end = int(s_1 * 0.75)
val_end = int(s_1 * 0.9)
train1, train2 = data1[:train_end], data2[:train_end]
val1, val2 = data1[train_end:val_end], data2[train_end:val_end]
test1, test2 = data1[val_end:], data2[val_end:]

[ INFO : 2022-04-07 09:35:19,462 ] - DataParallel(
  (module): DeepCCA(
    (model1): MlpNet(
      (layers): ModuleList(
        (0): Sequential(
          (0): Linear(in_features=145, out_features=25, bias=True)
          (1): Sigmoid()
          (2): BatchNorm1d(25, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
        )
        (1): Sequential(
          (0): Linear(in_features=25, out_features=100, bias=True)
          (1): Sigmoid()
          (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
        )
        (2): Sequential(
          (0): Linear(in_features=100, out_features=100, bias=True)
          (1): Sigmoid()
          (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
        )
        (3): Sequential(
          (0): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
          (1): Linear(in_features=100, out_features=145, bias=True)
        )
      

## Training the model:

In [15]:
# Fit on the training pair, passing the validation and test pairs along;
# no checkpoint is supplied.
# NOTE(review): the run recorded below stopped after epoch 2 with
# "LinAlgError: SVD did not converge".
loss = solver.fit(
    train1, train2,
    val1, val2,
    test1, test2,
    checkpoint=None,
)
# Retrieve the loss histories kept by the solver.
training_losses, val_losses = solver.get_losses()

[ INFO : 2022-04-07 09:35:29,879 ] - Epoch 1: val_loss did not improve from 0.0000
[ INFO : 2022-04-07 09:35:29,879 ] - Epoch 1/100 - time: 9.73 - training_loss: 1.2555 - val_loss: 1.2851
[ INFO : 2022-04-07 09:35:39,540 ] - Epoch 2: val_loss did not improve from 0.0000
[ INFO : 2022-04-07 09:35:39,540 ] - Epoch 2/100 - time: 9.66 - training_loss: 1.2555 - val_loss: 1.1003


LinAlgError: SVD did not converge