In [1]:
import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore',category=RRuntimeWarning)
import sys
import os

In [2]:
import pandas as pd
import numpy as np

In [3]:
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [4]:
# Enable R magic
%load_ext rpy2.ipython

In [6]:
HERE = os.path.dirname(os.path.realpath('__file__'))
PROJECT = os.path.abspath(os.path.join(HERE, '..'))
DATA = os.path.join(PROJECT, 'data')

### Our AD PD based training data
SUBGRAPH_15_RDATA = os.path.join(DATA, 'subgraph15_snpset148.RData')
ADPD_AUTOENCODER_MODEL = os.path.join(DATA, 'models/')                              
AUTOENCODER_TRAINED_MATRIX = os.path.join(DATA, "autoencoder_trained_matrix.RData")
TRAINED_PATIENT_CLUSTERS = os.path.join(DATA, "trained_patient_clusters.RData")

## USER Data
USER_FILE = os.path.join(DATA, 'Asif_Genotype_Disease_Only_ROSMAP.csv')
OUTPUT_FILE = os.path.join(DATA, "patient_clusters.csv")

In [7]:
%%R

library(h2o)
library(dplyr)

R[write to console]: 
----------------------------------------------------------------------

Your next step is to start H2O:
    > h2o.init()

For H2O package documentation, ask for help:
    > ??h2o

After starting H2O, you can use the Web UI at http://localhost:54321
For more information visit http://docs.h2o.ai

----------------------------------------------------------------------


R[write to console]: 
Attache Paket: ‘h2o’


R[write to console]: The following objects are masked from ‘package:stats’:

    cor, sd, var


R[write to console]: The following objects are masked from ‘package:base’:

    %*%, %in%, &&, apply, as.factor, as.numeric, colnames, colnames<-,
    ifelse, is.character, is.factor, is.numeric, log, log10, log1p,
    log2, round, signif, trunc, ||


R[write to console]: 
Attache Paket: ‘dplyr’


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

## Prepare input file for the predcition of the user data based on our trained autoencoder model

In [9]:
%%R -i=SUBGRAPH_15_RDATA,USER_FILE

# load SNPs for each Meachnisms
load(SUBGRAPH_15_RDATA)

#load User data
userSNPs <- read.csv(USER_FILE, row.names = 1, stringsAsFactors=FALSE)
userSNPs <- as.data.frame(t(userSNPs), stringsAsFactors = FALSE)

# converting user snps data into mechanisms*patients*SNPs matrices
snp_mat = list()
k = 1
for (i in subgraph15.snps){
  snp_mat[[k]] = select(userSNPs, i)
  k = k + 1
}

names(snp_mat) <- names(subgraph15.snps)


## Predcit Mechanism Scores for the user data with our autoencoder model

In [10]:
%%R -i=ADPD_AUTOENCODER_MODEL

library(h2o)
h2o.init()

# load our autoenocder models (#15) trained with AD-PD data
modelnames <-list.files(path = ADPD_AUTOENCODER_MODEL)
models = lapply(paste0(ADPD_AUTOENCODER_MODEL,"/", modelnames), h2o.loadModel)

# load user data into h2o environment
user_data <- snp_mat
user_snps <- sapply(1:length(user_data), function(x) as.h2o(user_data[[x]]))

# create an empty list to hold predicted mechanisms score
predicted_user_matrix = list()
                    
# predict mechanism socres for user data
for(i in 1:length(models)){
  if (length(models[[i]]@parameters[["hidden"]]) == 1) {
    predicted_user_matrix[[i]] = as.data.frame(h2o.deepfeatures(models[[i]], user_snps[[i]], 1))
  } else if (length(models[[i]]@parameters[["hidden"]]) == 2) {
    predicted_user_matrix[[i]] = as.data.frame(h2o.deepfeatures(models[[i]], user_snps[[i]], 2))
  } else if (length(models[[i]]@parameters[["hidden"]]) == 3) {
    predicted_user_matrix[[i]] = as.data.frame(h2o.deepfeatures(models[[i]], user_snps[[i]], 3))
  }
}

# convert it to data frame (Patients*Mechanism scores)
predicted_user_matrix = do.call(cbind,predicted_user_matrix)

names(predicted_user_matrix) <- names(user_data) #change column names to Mechanism names
rownames(predicted_user_matrix) <- rownames(user_data[[1]]) # change rownames to patient IDs

 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         2 hours 14 minutes 
    H2O cluster timezone:       Europe/Berlin 
    H2O data parsing timezone:  UTC 
    H2O cluster version:        3.26.0.2 
    H2O cluster version age:    6 months and 4 days !!! 
    H2O cluster name:           H2O_started_from_R_danieldomingo_ypt039 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   10.49 GB 
    H2O cluster total cores:    4 
    H2O cluster allowed cores:  4 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4 
    R Version:                  R version 3.6.2 (2019-12-12) 



## Predcit Cluster assignements for the user data with our autoencoder model

In [None]:
%%R -i=AUTOENCODER_TRAINED_MATRIX,TRAINED_PATIENT_CLUSTERS,OUTPUT_FILE

### Start up a 1-node H2O server on the local machine, and allow it to use all CPU cores and up to 6GB of memory:
h2o.init(nthreads=-1, min_mem_size="6G")

### Import autoencoder data set #subgraph15
load(AUTOENCODER_TRAINED_MATRIX)
main_data <- data.frame(autoen)

#### read the cluster assignments of each patients
load(TRAINED_PATIENT_CLUSTERS)
rownames(clusters) <- clusters[,1]
clusters[,1] <- NULL


##### merge cluster assignment to the dataset
fin_data <- merge(clusters,main_data,by="row.names")
fin_data$clusters <- as.factor(fin_data$clusters)
rownames(fin_data) = fin_data$Row.names
fin_data <- fin_data[,-1]
fullD <- as.h2o(fin_data) # get complete data set into h2o frame for cross validation approach

##### Training Classifier

y = "clusters" # response variable
#x = names(trainData)
x = names(fullD)
x = x[-which(x==y)] # predictor variables

### Train Model
snpModel = h2o.glm(training_frame = fullD, 
                   #training_frame = trainD, # keep it commented while using cross validation
                   #validation_frame = validD, # keep it commented while using cross validation
                   x = x, 
                   y = y,
                   nfolds = 10, 
                   family='multinomial',
                   solver='L_BFGS',
                   lambda_search=TRUE)

testh2o <- as.h2o(predicted_user_matrix)

### prediction on test data set
prediction = h2o.predict(snpModel, newdata=testh2o)

predicted.cl = as.data.frame(prediction$predict)
names(predicted.cl) <- "clusters"
predicted.cl$clusters <- as.integer(as.character(gsub("Cluster_", "", predicted.cl$clusters)))

# merge predicted clusters to test data
# so all patients in test data set get a cluster assignment based on their mechanism profile (predictro variables)
predicted_cl.testdata <- cbind(predicted.cl,predicted_user_matrix)
save(predicted_cl.testdata, OUTPUT_FILE)


 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         2 hours 17 minutes 
    H2O cluster timezone:       Europe/Berlin 
    H2O data parsing timezone:  UTC 
    H2O cluster version:        3.26.0.2 
    H2O cluster version age:    6 months and 4 days !!! 
    H2O cluster name:           H2O_started_from_R_danieldomingo_ypt039 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   10.49 GB 
    H2O cluster total cores:    4 
    H2O cluster allowed cores:  4 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4 
    R Version:                  R version 3.6.2 (2019-12-12) 



In [13]:
%%R 

predicted_cl.testdata

            clusters    Subgraph_1   Subgraph_2    Subgraph_3  Subgraph_4
MAP01797756        3  0.0644176607 -0.301283429 -0.3503637632  0.23711466
MAP05522533        3 -0.0368432265 -0.100267459 -0.0284039771  0.20004510
MAP07265221        4 -0.0800137993  0.061516305 -0.0553551507 -0.05363518
MAP21362537        4  0.1477402080 -0.234971334 -0.2334151778 -0.06375135
MAP24073245        2 -0.0933926303  0.035249820 -0.1826242670  0.25483258
MAP24680888        1 -0.1398851799 -0.008879545 -0.4892198105 -0.10287407
MAP25939172        3  0.1156601213 -0.259462207 -0.1229125445 -0.09159378
MAP26631069        2 -0.0336057342  0.102344768 -0.4131752007  0.23887739
MAP29629849        3 -0.0627749605 -0.004820880  0.0876857922 -0.08903225
MAP30663686        3  0.0235194556 -0.226288467  0.0042813258  0.10502849
MAP31504924        3 -0.1180547223 -0.302503021  0.1678699626 -0.16043719
MAP33332646        3 -0.0236805584 -0.057858936 -0.2202071921 -0.05998048
MAP33501827        3 -0.2605067677 -0.

ROS20202806        3  0.0162672100 -0.234515760  0.1026427328  0.25233017
ROS20223617        3 -0.0299850895 -0.264874181 -0.0117431355  0.27970540
ROS20139562        4  0.0476792011  0.066601065  0.0767142027 -0.14312152
ROS21135554        4  0.1043680487  0.031431871 -0.1261729538 -0.08454788
MAP96892753        1  0.2266814648 -0.139529184 -0.4807788219 -0.05363518
MAP59150662        2 -0.0911344634  0.058690677 -0.3108154799  0.56873454
MAP15286377        3 -0.0471731743  0.203737282 -0.1086838573  0.21438705
MAP85171938        3 -0.1858099897  0.378178015  0.0764864285 -0.08649569
MAP15387421        2 -0.0430587227  0.058690677 -0.1856510085 -0.15344567
MAP42988567        2 -0.0064575715 -0.229473321 -0.2435054257 -0.10339846
MAP27481601        4  0.0817658825 -0.264874181 -0.4008624911 -0.10454161
MAP40611041        3  0.1001622546 -0.067257518  0.1851604841 -0.10483110
MAP74753465        3 -0.0472257516  0.058169033 -0.4813069965  0.22631672
MAP83001827        3  0.1134383120  0.

MAP50106442 0.04868871 -0.25519988  0.0322675885 -0.132252764  0.258504497
MAP50108462 0.28127117  0.14885752  0.2645867333 -0.188192297  0.243043844
MAP50108912 0.04767156 -0.09490703  0.1871535891  0.106907429  0.086248933
MAP50304024 0.05216620 -0.11309753 -0.1077582020 -0.197369426  0.061324287
MAP50304998 0.32202314 -0.25519988  0.1708915152  0.040182114  0.122576244
MAP50400259 0.27553198  0.26003278  0.1268021391 -0.208725573  0.318181713
MAP50401390 0.04767156 -0.25519988  0.0851814948  0.079509527  0.241550518
MAP50406633 0.22125815  0.00258822  0.2079557055  0.028378814  0.244483519
MAP50409956 0.27586488 -0.11309753  0.0705141724  0.043408059  0.149410608
MAP51864085 0.33678466  0.00258822  0.2621165199  0.364695428  0.291639226
MAP51903261 0.18245871 -0.25519988  0.1408030804 -0.188192297 -0.011847852
MAP53355949 0.13929808  0.00258822  0.1358981114  0.169759805  0.326934197
MAP58458351 0.27887597  0.00258822 -0.0847085677 -0.188192297  0.241550518
MAP62985554 0.05485219  0

MAP70669392 0.04524255  0.26003278  0.4199619750 -0.202905198  0.120340507
MAP57597479 0.13497471 -0.11309753  0.3678300058 -0.209058466  0.118422217
MAP16513683 0.04714495 -0.23792851 -0.0154198527 -0.170741339 -0.012798656
MAP30819298 0.04435451  0.00258822  0.3815695380 -0.221131504  0.288999965
MAP97264179 0.04889941 -0.25519988 -0.0532030995  0.087845535  0.520617952
MAP62301938 0.13692334 -0.25519988 -0.0410276591  0.040182114  0.090633762
MAP50302392 0.05488274  0.00258822  0.0261452823 -0.223605488 -0.011054380
MAP83984043 0.57457682  0.02097509 -0.0086260367  0.070993369  0.200480378
MAP50402693 0.05015619 -0.25519988  0.1155342060 -0.202868819  0.124783551
MAP72205714 0.32305124 -0.25519988  0.3786875694  0.033268475 -0.012784976
MAP39800111 0.04805883 -0.25519988 -0.0147689542 -0.224380083 -0.011518267
MAP39393581 0.28507763  0.00258822  0.3499458554 -0.210937391 -0.010551796
MAP23791808 0.27751626 -0.25519988  0.1000569839 -0.213103363  0.028403625
MAP60961592 0.57590543 -0

MAP85980779 0.232809346  0.26351810  0.04514476  0.30886639  0.2604974289
MAP86177506 0.194458630  0.01232568  0.04869189  0.29348964  0.3334406606
MAP92023910 0.020749402  0.21956317 -0.23894418  0.12153803  0.1859887800
MAP98096223 0.044377033  0.26351810  0.04869189  0.27795951  0.1426762694
MAP98388248 0.019610804  0.17113273 -0.23569111  0.29348964  0.1113972955
MAP99110004 0.203890678  0.20869901 -0.23569111  0.29348964 -0.0612770792
ROS10221262 0.222756043  0.01549774  0.04514476  0.25357478  0.0578910847
ROS10415168 0.198890336  0.01549774 -0.23894418  0.12153803 -0.0631865569
ROS10686233 0.019610804  0.32060431  0.04514476 -0.04116245 -0.0631865569
ROS10488101 0.044401894  0.32060431 -0.23569111  0.06199999 -0.0012304118
ROS11240247 0.015236455  0.10930187  0.34468378  0.12153803  0.1450064138
ROS11259428 0.009641434  0.10175932  0.34772466  0.30886639  0.1732788465
ROS11233086 0.014067171  0.10531537 -0.23894418  0.13816367  0.2010176344
ROS20537160 0.465573498  0.26351810  0

MAP58199939 0.222756043  0.16874415 -0.23569111  0.07882694  0.1646618283
MAP81874628 0.039131326  0.05951671 -0.23569111  0.01892635  0.0517052274
MAP51442191 0.014067171  0.12401161 -0.23894418  0.30886639  0.0578910847
MAP46547648 0.019610804  0.26351810  0.04514476  0.13816367  0.3143722524
MAP68778359 0.227543522  0.32060431 -0.23569111  0.30886639 -0.0631865569
MAP50402729 0.009641434  0.21584195 -0.23894418  0.06199999  0.0781497666
MAP45984063 0.034859514  0.21262224  0.04869189  0.29348964  0.0775593100
MAP50108048 0.034859514  0.26103129  0.04514476  0.30886639 -0.0440774007
MAP50407486 0.203890678  0.03819946  0.34772466  0.23768434 -0.0631865569
MAP90977195 0.014067171  0.06102744  0.04869189  0.10484391 -0.0631865569
ROS20254452 0.015230514  0.26351810  0.04514476 -0.05803165  0.0775593100
ROS21401001 0.034859514  0.33648283 -0.23894418  0.19657498  0.0578910847
ROS59371363 0.019610804  0.16687015  0.04514476  0.30886639  0.2010176344
ROS20380831 0.009641434  0.11990732  0