In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import binary_crossentropy,mean_squared_error, categorical_crossentropy
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import plot_model,to_categorical
from tensorflow.keras.callbacks import Callback, CSVLogger, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tensorflow.python.framework.graph_util import convert_variables_to_constants
from tensorflow.python.client.device_lib import list_local_devices
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os,sys, gc

In [None]:
# Show the devices TensorFlow reports for this runtime.
tf.config.experimental.list_physical_devices()

In [None]:
#GTEx
# Expression table: gzip-compressed, tab-separated file whose first two lines are skipped.
df = pd.read_csv('https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz', skiprows=2, compression='gzip', sep='\t')
# Row key: the first 15 characters of 'Name'
# (presumably the unversioned Ensembl gene id -- TODO confirm).
df['ensg'] = df['Name'].str[:15]
# Index by 'ensg' and discard the two annotation columns so only the other
# columns remain. Columns are dropped by keyword: the positional form
# drop(labels, 1) was removed in pandas 2.0.
df = df.set_index('ensg').drop(columns=['Name', 'Description'])
# Sample annotations, restricted to the id column and the two 'SMTS*' label
# columns, indexed by sample id.
df_file = (
    pd.read_csv("https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep='\t')
    .loc[:, ['SAMPID', 'SMTS', 'SMTSD']]
    .set_index('SAMPID')
)

In [None]:
# Run a garbage-collection pass after loading the tables above.
gc.collect()

In [None]:
# Directory against which the relative "topsbm/..." paths, log.csv and model.h5
# are resolved. Set the GTEX_LOG_DIR environment variable to point somewhere
# else; the default is the original location.
os.chdir(os.environ.get("GTEX_LOG_DIR", "/home/fvalle/phd/datasets/gtex/log/3000_random-5000hvg/"))

In [None]:
# Level-1 topic table: drop the 'i_doc' column, remove duplicated rows and
# index the result by the 'doc' column.
df_topics = pd.read_csv(f"topsbm/topsbm_level_{1}_topic-dist.csv").drop('i_doc', axis=1).drop_duplicates().set_index('doc')


def get_name(partial):
    """Return the df_file row of the first sample id that contains `partial`.

    Parameters
    ----------
    partial : str
        Substring to look for in the sample ids (the index of df_file).

    Returns
    -------
    The matching row of df_file (its `.name` is the full sample id), or
    None when no sample id contains `partial`.
    """
    for sample_id in df_file.index:
        if partial in sample_id:
            return df_file.loc[sample_id, :]
    return None


# Replace the document names by full sample ids. Fail with an explicit message
# when a document cannot be matched, rather than with an AttributeError on None.
matched_rows = [get_name(name) for name in df_topics.index]
unmatched = [name for name, row in zip(df_topics.index, matched_rows) if row is None]
if unmatched:
    raise KeyError(f"no sample id in df_file contains: {unmatched[:10]} ({len(unmatched)} unmatched in total)")
df_topics.index = [row.name for row in matched_rows]
#df=df.reindex(index=df_labels.index)
# Labels of the topic-model samples, in the row order of df_topics
# (reindex already returns a new frame, so no prior copy is needed).
df_labels = df_file.reindex(index=df_topics.index)

In [None]:
# Count of distinct 'SMTS' values among the labelled samples.
uniq = df_labels['SMTS'].unique().size
# Count of columns in the topic table.
inputs = len(df_topics.columns)

In [None]:
# Features: every topic column divided by its mean over the samples.
X_train = df_topics.divide(df_topics.mean(axis=0), axis=1).values.astype(float)
# classes[k] is the label that column k of Y_train stands for; both come from
# the same np.unique call so they cannot drift apart. (The previous filter
# compared sample ids with df.index, which holds gene ids, so it selected
# nothing and left `classes` empty.)
classes, label_codes = np.unique(df_labels['SMTS'], return_inverse=True)
Y_train = to_categorical(label_codes)
print(X_train.shape, Y_train.shape)

In [None]:
K.clear_session()

# Start from an empty training log: CSVLogger is opened in append mode, so a
# log.csv left over from an earlier run would otherwise be extended.
if os.path.exists('log.csv'):
    os.remove('log.csv')
csv_logger = CSVLogger('log.csv', append=True, separator=',')
# Stop when val_loss has not improved by at least 0.1 for 25 epochs.
es = EarlyStopping(monitor='val_loss', min_delta=1e-1, mode='min', patience=25)

# One hidden layer of 100 sigmoid units followed by one output unit per class.
model = Sequential()
model.add(Dense(units=100, input_dim=inputs, use_bias=True, bias_initializer='ones', activation="sigmoid"))
# input_dim is only meaningful on the first layer, so it is not repeated here.
# NOTE(review): the targets are one-hot and the loss is categorical
# cross-entropy; a softmax output is the usual pairing -- confirm sigmoid is intended.
model.add(Dense(units=uniq, activation="sigmoid"))
# `learning_rate` is the current name of the argument formerly spelled `lr`.
model.compile(loss=categorical_crossentropy, optimizer=SGD(learning_rate=0.7), metrics=['accuracy', 'AUC'])
plot_model(model, show_shapes=True)

In [None]:
# Print the layer-by-layer description of the model built above.
model.summary()

In [None]:
# Set the Keras backend learning-phase flag to 0.
# NOTE(review): this runs before model.fit -- confirm it is intended at this point.
K.set_learning_phase(0)
#model=load_model("model.h5")

In [None]:
# Split the training arrays into a fitting part and an evaluation part.
# No test_size is given, so train_test_split's default proportions apply;
# random_state=42 makes the split repeatable.
X_train_train, X_train_test, Y_train_train, Y_train_test = train_test_split(X_train, Y_train, random_state=42)

In [None]:
# Train for up to 500 epochs; a further 20% of the fitting part is held out for
# validation, which feeds both the CSV log and the early-stopping callback.
# `use_multiprocessing` and `workers` were dropped: they only apply to
# generator / Sequence input, not to in-memory arrays, and newer Keras
# releases reject them.
model.fit(X_train_train, Y_train_train, epochs=500, batch_size=400, verbose=1, validation_split=0.2, callbacks=[csv_logger, es], shuffle=True)

In [None]:
# Loss and compiled metrics on the part set aside by train_test_split.
model.evaluate(X_train_test, Y_train_test)

In [None]:
# Write the trained model to model.h5 in the current working directory.
model.save("model.h5")

In [None]:
# Plot the training and validation loss recorded in log.csv.
training_log = pd.read_csv("log.csv", sep=",")
training_log[['loss','val_loss']].plot()

# Evaluate on samples not used for topsbm training

In [None]:
# Release a df_test left over from an earlier run, if there is one. On a fresh
# kernel the name does not exist yet, so a plain `del` would raise NameError.
# `model` is kept on purpose: later cells still evaluate and predict with it.
globals().pop('df_test', None)
gc.collect()

In [None]:
# Level-1 word table, indexed by its first column; rows containing any
# missing value are discarded.
word_dist_path = f"topsbm/topsbm_level_{1}_word-dist.csv"
df_Pwt = pd.read_csv(word_dist_path, index_col=0).dropna(how='any', axis=0)

In [None]:
# Release a df_test left over from an earlier run, if there is one. On a fresh
# kernel the name does not exist yet, so a plain `del` would raise NameError.
globals().pop('df_test', None)
gc.collect()

#get only HV genes
# Gene rows are taken in df_Pwt's own row order (ids cut to 15 characters) so
# that the matrix product below pairs every gene with its own row of df_Pwt.
# Going through np.unique would sort and deduplicate the ids, which can
# misalign the rows or change their number.
pwt_genes = [g[:15] for g in df_Pwt.index]
# Only the samples that are not rows of df_topics.
held_out_samples = df.columns[~df.columns.isin(df_topics.index)]
# log2(x + 1), applied to the whole frame at once.
log_expr = np.log2(df.reindex(index=pwt_genes, columns=held_out_samples) + 1)

# (samples x genes) @ (genes x topics) -> one row of topic weights per sample.
df_test = pd.DataFrame(data=np.matmul(log_expr.transpose().values, df_Pwt.values), index=log_expr.columns, columns=df_Pwt.columns)
df_test = df_test.divide(df_test.sum(axis=1), axis=0) #normalize P(t|d)
df_test.head()

In [None]:
# Use the label encoding of the training targets (sorted unique 'SMTS' values
# of df_labels) so that column k of Y_test means the same class as column k of
# Y_train and as output unit k of the model.
classes = np.unique(df_labels['SMTS'])
class_index = {tissue: k for k, tissue in enumerate(classes)}
test_tissues = df_file.reindex(index=df_test.index)['SMTS']
# A sample whose label never occurs among the training labels has no output
# unit to be compared with; such samples are left out of the evaluation.
known = test_tissues.isin(classes).values
# Apply the per-column mean scaling that X_train received. Once scaled, every
# column mean is 1, so this gives the same result whether or not df_test has
# already been scaled.
X_test = df_test.divide(df_test.mean(axis=0), axis=1).values[known]
Y_test = to_categorical([class_index[t] for t in test_tissues[known]], num_classes=len(classes))

In [None]:
# Shape of the training feature matrix, for comparison with the test one.
X_train.shape

In [None]:
# Shape of the test feature matrix.
X_test.shape

In [None]:
# Divide every column of df_test by that column's mean.
column_means = df_test.mean(axis=0)
df_test = df_test.divide(column_means, axis=1)

In [None]:
# Column means of df_test over the samples annotated with SMTS == 'Brain'.
brain_sample_ids = df_file[df_file['SMTS']=='Brain'].index
is_brain_test = df_test.index.isin(brain_sample_ids)
df_test[is_brain_test].mean(0)

In [None]:
# Same summary for the training side: scale df_topics by its column means,
# then average over the samples annotated with SMTS == 'Brain'.
topics_scaled = df_topics.divide(df_topics.mean(0),1)
brain_ids = df_file[df_file['SMTS']=='Brain'].index
topics_scaled[topics_scaled.index.isin(brain_ids)].mean(0)

In [None]:
# Loss and compiled metrics on the held-out samples; needs `model`, X_test and
# Y_test to be defined by the cells above.
model.evaluate(X_test, Y_test)

In [None]:
# Pair the true label with the predicted label for every test sample.
# Class indices are turned into names through class lists, not through the
# per-sample label array (indexing that array by a class index returns the
# label of an unrelated sample):
#   - Y_test columns follow `classes`;
#   - model outputs follow the encoding of the training targets, i.e. the
#     sorted unique 'SMTS' values of df_labels.
train_classes = np.unique(df_labels['SMTS'])
# argmax over the output units replaces Sequential.predict_classes, which is no
# longer available in recent tf.keras releases.
predicted_codes = np.argmax(model.predict(X_test), axis=-1)
[(classes[Y_test[i].argmax()], train_classes[class_pred]) for i, class_pred in enumerate(predicted_codes)]