In [0]:
# Mount Google Drive at /content/drive; the paths used in the following cells live under it.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import pandas as pd
import numpy as np
import os,sys, gc
import matplotlib.pyplot as plt
# The project directories are appended to sys.path before the hsbmpy import below.
sys.path.append("/content/drive/My Drive/phd")
sys.path.append("/content/drive/My Drive/phd/hsbm-occam")
from hsbmpy import get_max_available_L

# Set here, before the cell that imports TensorFlow.
# NOTE(review): presumably switches off cuDNN autotuning to skip the GPU warm-up — confirm.
os.environ["TF_CUDNN_USE_AUTOTUNE"]="0" #to avoid Nvidia GPU warming up

In [0]:
# Run configuration: name of the topic-model output and the dataset directory.
algorithm = "topsbm"
directory='/content/drive/My Drive/phd/datasets/merged'
# Hierarchy level used below: two less than the value returned by the helper.
# NOTE(review): presumably the helper returns the deepest level stored on disk — confirm against hsbmpy.
L = get_max_available_L(directory, algorithm)-2
# Every relative path in the following cells is resolved from the dataset directory.
os.chdir(directory)

In [0]:
# Metadata column used as the class label in the rest of the notebook.
label = 'primary_site'

# Topic table, indexed by document id; the 'i_doc' column is dropped.
df_topics = pd.read_csv("%s/%s_level_%d_topic-dist.csv"%(algorithm,algorithm,L))
df_topics = df_topics.set_index('doc')
df_topics = df_topics.drop('i_doc', axis=1)

# Word table; row identifiers are truncated to their first 15 characters.
df_words = pd.read_csv("%s/%s_level_%d_word-dist.csv"%(algorithm,algorithm,L), index_col=0)
df_words.index = [identifier[:15] for identifier in df_words.index]

# Main table restricted to the rows of df_words, each column divided by its
# own sum, then transposed; missing values become 0.
df = pd.read_csv("mainTable.csv", index_col=0)
df = df.reindex(index=df_words.index)
df = df.divide(df.sum(0), 1)
df = df.transpose()
df = df.fillna(0)

# File metadata; the label of every document is added as the first column
# of the topic table and used to average the topics per label value.
df_files = pd.read_csv("files.dat", index_col=0)
df_topics.insert(0, 'tissue', df_files.reindex(index=df_topics.index)[label])
df_topic_tissue = df_topics.groupby('tissue').mean()
df_files.head()

## Predictor

In [0]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import binary_crossentropy,mean_squared_error, categorical_crossentropy
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import plot_model,to_categorical
from tensorflow.keras.callbacks import Callback, CSVLogger, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tensorflow.python.client.device_lib import list_local_devices
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import os,sys, gc
# Last expression of the cell: the list of local devices is rendered as the cell output.
list_local_devices()

In [0]:
# Metadata rows aligned with the rows of the topic table.
df_labels=df_files.copy()
df_labels=df_labels.reindex(index=df_topics.index)

# Number of distinct label values; reset to 1 below for the two-class setup
# (a single sigmoid output unit).
uniq = len(df_labels[label].unique())

# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected by pandas >= 2.0.
X_train = df_topics.drop('tissue', axis=1)
# Centre every column on its mean and divide by half of its range.
X_train = X_train.subtract(X_train.mean(0), axis=1).divide(0.5*(X_train.max(0)-X_train.min(0)), axis=1).values.astype(float) #SGD transform
# One-hot targets; class indices follow the sorted order returned by np.unique.
Y_train = to_categorical(np.unique(df_labels[label], return_inverse=True)[1])
inputs = X_train.shape[1]

if uniq==2: #hidden=(1941,100), bs=50 
  # Two classes: collapse the one-hot targets to a single 0/1 column.
  Y_train = np.argmax(Y_train, axis=-1)
  uniq = 1
  activation_func = "sigmoid"
  lr = 0.01
  bs = 50
  momentum = 0.9
  l1 = 0.01
  l2 = 0.0001
  loss=binary_crossentropy
else: #hidden (1941,100), bs=500, l1=0.001, l2=1e-9
  activation_func = "softmax"
  lr = 0.03
  bs = 500
  momentum = 0.95
  l1 = 0.001
  l2 = 1e-9
  loss=categorical_crossentropy


# Sorted class names of the labelled documents that also appear in df.
classes=np.unique(df_labels[df_labels.index.isin(df.index)][label], return_inverse=True)[0]

# Hold out 5% of the documents; the split is reproducible through random_state.
X_tm_train, X_tm_test, Y_tm_train, Y_tm_test = train_test_split(X_train, Y_train, random_state=42, train_size=0.95)

print(uniq, X_train.shape, Y_train.shape, X_tm_train.shape, Y_tm_train.shape)

# projections

In [0]:
import tensorflow as tf
from sklearn.metrics import accuracy_score, roc_auc_score

class ProjectorClassifier():
    """Nearest-profile classifier.

    ``fit`` stores the mean feature vector of every class. ``predict`` assigns
    each sample to the class whose mean vector gives the smallest value of the
    configured loss (Kullback-Leibler divergence by default).
    """
    def __init__(self):
        self.labels = None      # per-class mean vectors, one row per class
        self.classes_ = None    # class id belonging to each row of self.labels
        self._isfitted = False
        self._entropy = tf.keras.losses.KLDivergence()
        #self._entropy = tf.keras.losses.MeanSquaredError()
    
    def fit(self, X, Y):
        """Compute the mean vector of every class.

        Args:
            X: 2-D floating point tensor, one row per sample.
            Y: 1-D tensor of class ids, one per row of X.
        """
        # tf.unique lists the classes in order of first appearance, not sorted,
        # so the ids are kept next to the means to translate row indices back.
        classes, class_idx = tf.unique(Y)
        self.classes_ = classes
        # Segment mean: row i is the mean of the samples whose class is classes[i].
        # The result keeps the dtype of X.
        self.labels = tf.math.unsorted_segment_mean(X, class_idx, tf.size(classes))
        self._isfitted = True

    def predict(self, X):
        """Return the predicted class id of every row of X.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if not self._isfitted:
            raise ValueError("Call Projector.fit() first")
        # divergences[i, j] is the loss between sample j and the mean of class row i.
        divergences = tf.map_fn(lambda label: tf.map_fn(lambda x: self._entropy(x, label), X), self.labels)
        nearest = tf.argmin(divergences, axis=0)
        # Translate the row index of the closest mean back into its class id.
        predictions = tf.gather(self.classes_, nearest)
        return predictions

    def evaluate(self, X, Y):
        """Print and return ``[accuracy, AUC]`` of the predictions for X against Y.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if not self._isfitted:
            raise ValueError("Call Projector.fit() first")
        Y = np.asarray(Y)
        Y_pred = np.asarray(self.predict(X))
        if Y.max() > 1:
            # More than two classes: one-hot encode both vectors with the same
            # width so that their shapes always agree.
            num_classes = int(max(Y.max(), Y_pred.max())) + 1
            Y = to_categorical(Y, num_classes=num_classes)
            Y_pred = to_categorical(Y_pred, num_classes=num_classes)
        acc = accuracy_score(Y, Y_pred)
        auc = roc_auc_score(Y, Y_pred, average="weighted", multi_class="ovr")
        print(f"Accuracy: {acc}, AUC:{auc}")
        return [acc, auc]

In [0]:
# Topic-space features and training targets as tensors (used by the
# commented-out split below).
# `axis` is passed by keyword: the positional form is rejected by pandas >= 2.0.
X = tf.convert_to_tensor(df_topics.drop('tissue', axis=1))
Y = tf.convert_to_tensor(Y_train)

#X_pj_train, X_pj_test, Y_pj_train, Y_pj_test = train_test_split(X.numpy(), Y.numpy(), random_state=42, train_size=0.8)
#Y_pj_train = tf.argmax(Y_pj_train, 1)
#Y_pj_test = tf.argmax(Y_pj_test, 1)

#X_pj_train, X_pj_test, Y_pj_train, Y_pj_test = train_test_split(df.values, np.unique(df_files.reindex(index=df.index)[label], return_inverse=True)[1], random_state=42, train_size=0.8)
# Features: log2(value + 1) of the main table restricted to the rows of
# df_words, transposed so that every row is one column of the table.
# np.log2 is applied to the whole frame at once instead of cell by cell.
# Targets: integer class ids in the sorted order returned by np.unique.
X_pj_train, X_pj_test, Y_pj_train, Y_pj_test = train_test_split(np.log2(pd.read_csv("mainTable.csv", index_col=0).reindex(index=df_words.index).fillna(0) + 1).values.T, np.unique(df_files.reindex(index=df.index)[label], return_inverse=True)[1], random_state=42, train_size=0.8)

X_pj_train, X_pj_test, Y_pj_train, Y_pj_test = list(map(tf.convert_to_tensor, (X_pj_train, X_pj_test, Y_pj_train, Y_pj_test)))
print(uniq, X_pj_train.shape, Y_pj_train.shape, X_pj_test.shape, Y_pj_test.shape, Y_pj_train[0])

import time

# Fit the projector on 80% of the samples and report its scores on the rest.
with tf.device("GPU"):
    model_proj = ProjectorClassifier()
    model_proj.fit(X_pj_train, Y_pj_train)
    model_proj.evaluate(X_pj_test, Y_pj_test)

# K-NN

In [0]:
from sklearn.neighbors import KNeighborsClassifier

class KNNClassifier(KNeighborsClassifier):
    """KNeighborsClassifier with an ``evaluate`` method that reports accuracy and AUC."""
    def __init__(self, n_neighbors=10, **kwargs):
        # Remaining keyword arguments are forwarded unchanged to KNeighborsClassifier.
        super().__init__(n_neighbors=n_neighbors, **kwargs)

    def fit(self,  X, y):
        """Fit the underlying KNeighborsClassifier and return the fitted estimator."""
        return super().fit(X, y)

    def evaluate(self, X, Y):
        """Print and return ``[accuracy, AUC]`` of the predictions for X against Y."""
        Y = np.asarray(Y)
        Y_pred = np.asarray(self.predict(X))
        if Y.max() > 1:
            # More than two classes: one-hot encode both vectors with the same
            # width so that their shapes always agree.
            num_classes = int(max(Y.max(), Y_pred.max())) + 1
            Y = to_categorical(Y, num_classes=num_classes)
            Y_pred = to_categorical(Y_pred, num_classes=num_classes)
        acc = accuracy_score(Y, Y_pred)
        auc = roc_auc_score(Y, Y_pred, average="weighted", multi_class="ovr")
        print(f"Accuracy: {acc}, AUC:{auc}")
        return [acc, auc]

    
#X_knn_train, X_knn_test, Y_knn_train, Y_knn_test = train_test_split(X.numpy(), Y.numpy(), random_state=42, train_size=0.8)
#Y_knn_train = tf.argmax(Y_knn_train, 1)
#Y_knn_test = tf.argmax(Y_knn_test, 1)

#X_knn_train, X_knn_test, Y_knn_train, Y_knn_test = train_test_split(df.values, np.unique(df_files.reindex(index=df.index)[label], return_inverse=True)[1], random_state=42, train_size=0.8)
# Features: log2(value + 1) of the main table restricted to the rows of
# df_words, transposed so that every row is one column of the table.
# np.log2 is applied to the whole frame at once instead of cell by cell.
# Targets: integer class ids in the sorted order returned by np.unique.
X_knn_train, X_knn_test, Y_knn_train, Y_knn_test = train_test_split(np.log2(pd.read_csv("mainTable.csv", index_col=0).reindex(index=df_words.index).fillna(0) + 1).values.T, np.unique(df_files.reindex(index=df.index)[label], return_inverse=True)[1], random_state=42, train_size=0.8)

# The K-NN model consumes the NumPy arrays directly; the X_pj_* / Y_pj_*
# variables of the projector are left untouched by this cell.
print(uniq, X_knn_train.shape, Y_knn_train.shape, X_knn_test.shape, Y_knn_test.shape, Y_knn_train[0])
import time
# scikit-learn runs on the CPU, so no TensorFlow device scope is needed here.
model_knn = KNNClassifier(n_neighbors=5, n_jobs=4, metric="euclidean")
model_knn.fit(X_knn_train, Y_knn_train)
model_knn.evaluate(X_knn_test, Y_knn_test)

# Neural Net

In [0]:
K.clear_session()

# CSVLogger appends to log.csv, so a log left by a previous run is removed first.
if os.path.isfile("log.csv"):
    os.remove("log.csv")
csv_logger = CSVLogger('log.csv', append=True, separator=',')
# Stop once the validation loss has not improved for 25 epochs.
es = EarlyStopping(monitor='val_loss', min_delta=1e-10, mode='min', patience=25)

# One hidden layer of 100 ReLU units with L1/L2 weight penalties, followed by
# the output layer (`uniq` units with the activation chosen earlier).
model=Sequential()
model.add(Dense(units=100, input_dim=inputs, use_bias=True, bias_initializer="ones", activation="relu", kernel_regularizer=l1_l2(l1=l1, l2=l2)))
model.add(Dense(units=uniq, activation=activation_func))
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# The model contains only Dense layers, which behave identically in training
# and inference, so no global learning phase is set.
model.compile(loss=loss, optimizer=SGD(learning_rate=lr, momentum=momentum), metrics=['accuracy', 'AUC'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()
plot_model(model, to_file=f"model_{label}.png", dpi=600, show_shapes=True)


In [0]:
# Train for at most 1000 epochs; 25% of the training rows are held out for
# validation, which drives early stopping, and every epoch is logged to log.csv.
# The inputs are in-memory arrays, so the generator-only options `workers` and
# `use_multiprocessing` are not passed.
with tf.device("GPU"):
    model.fit(X_tm_train, Y_tm_train, epochs=1000, batch_size=bs, verbose=1, validation_split=0.25, callbacks=[csv_logger, es], shuffle=True)

In [0]:
# Plot the 'loss' and 'val_loss' columns of the training log written by CSVLogger.
pd.read_csv("log.csv", sep=",")[['loss','val_loss']].plot()

In [0]:
# Loss and metrics on the 5% of documents held out by train_test_split.
model.evaluate(X_tm_test, Y_tm_test)

In [0]:
#model.save(f"model_{label}.h5")
#model = load_model(f"model_{label}.h5")
#print(model.summary())
#plot_model(model, to_file=f"model_{label}.png", dpi=600, show_shapes=True)

In [0]:
# Force a garbage-collection pass before the next tables are loaded.
gc.collect()

# Evaluate on samples not used for topsbm training


In [0]:
# Table of the test samples, read from mainTable_test.csv.
df_test_table = pd.read_csv("mainTable_test.csv", index_col = 0)
# Keep entries that are < 1e5 and replace every other entry by 1e5.
# NaN entries fail the comparison, so they are replaced by 1e5 as well.
df_test_table = df_test_table.where(df_test_table<1e5,1e5)

In [0]:
#get only HV genes
df_topic_test = df_test_table.reindex(index=df_words.index)
# NOTE(review): rows absent from the test table are filled with -1, which then
# enters the matrix product below as a negative value — confirm this is intended.
df_topic_test = df_topic_test.transpose().fillna(-1).astype(int)

# Project every test sample onto the topics: (samples x rows of df_words) @ (rows of df_words x topics).
df_topic_test = pd.DataFrame(data=np.matmul(df_topic_test.values,df_words.values), index=df_topic_test.index, columns=df_words.columns)
df_test=df_topic_test.divide(df_topic_test.mean(axis=0), axis=1) #normalize P(t|d)

# Apply the transform that was fitted on the training topics: subtract the
# per-topic mean and divide by half of the per-topic range. The statistics are
# taken from the raw topic table, because X_train already holds the
# transformed values and its per-topic range no longer reflects the data.
topics_train = df_topics.drop("tissue", axis=1)
df_test = df_test.subtract(topics_train.mean(0), axis=1).divide(0.5*(topics_train.max(0)-topics_train.min(0)), axis=1) #SGD transform

In [0]:
# Encode the test labels with the class order used for the training targets
# (sorted unique values of the training labels), so that column i of Y_test
# and output i of the network refer to the same class even when a class is
# missing from the test samples.
classes = np.unique(df_labels[label])
test_labels = df_files.reindex(index=df_test.index)[label].values.ravel()
unknown = set(test_labels) - set(classes)
if unknown:
  raise ValueError(f"test labels not seen during training: {unknown}")
class_index = {name: position for position, name in enumerate(classes)}

X_test = df_test.values.astype(float)
Y_test = to_categorical([class_index[t] for t in test_labels], num_classes=len(classes))
if uniq==1:
  # Two-class setup: a single 0/1 column, like the training targets.
  Y_test = np.argmax(Y_test, axis=-1)
#np.savetxt("classes.txt", classes, fmt="%s")
#np.savetxt("X_test.txt", X_test)
#np.savetxt("Y_test.txt", Y_test)

In [0]:
# The network was trained on the scaled topic features.
# The inputs are in-memory arrays, so the generator-only options `workers` and
# `use_multiprocessing` are not passed.
with tf.device("GPU"):
    model.evaluate(X_test, Y_test, verbose=2)

# model_proj and model_knn were fitted on log2(value + 1) of the main table
# restricted to the rows of df_words, so the test samples are given the same
# representation here; the topic-space matrix has a different number of
# columns and cannot be fed to them.
# NOTE(review): df_test_table was capped at 1e5 when it was loaded while the
# training table was not — confirm this difference is intended.
X_test_genes = np.log2(df_test_table.reindex(index=df_words.index).fillna(0) + 1).values.T
# Integer class ids: Y_test already holds them in the two-class setup,
# otherwise it is one-hot encoded.
Y_test_ids = np.asarray(Y_test) if uniq == 1 else np.argmax(Y_test, axis=1)
Y_test_ids = tf.convert_to_tensor(Y_test_ids, dtype=tf.int64)
with tf.device("GPU"):
    model_proj.evaluate(tf.convert_to_tensor(X_test_genes), Y_test_ids)
    model_knn.evaluate(tf.convert_to_tensor(X_test_genes), Y_test_ids)

In [0]:
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
import seaborn as sns

In [0]:
# Raw network outputs for the test samples.
y_pred_p = model.predict(X_test)
if uniq <= 1:
  # Single output unit: flatten the scores and threshold them at 0.5.
  y_pred_p = y_pred_p.ravel()
  y_true = Y_test
  y_pred = np.where(y_pred_p < 0.5, 0.0, 1.0)
else:
  # Several output units: the class is the position of the largest value.
  y_true = np.argmax(Y_test, axis=-1)
  y_pred = np.argmax(y_pred_p, axis=-1)
# Confusion matrix with every row (true class) normalised to sum to 1.
results = confusion_matrix(y_true, y_pred, normalize="true")

In [0]:
# Draw the normalised confusion matrix as a heat map. Clustering is switched
# off for rows and columns, so the class order of `results` is kept.
cm = sns.clustermap(results, 
                    vmax=1,  
                    row_cluster=False, 
                    col_cluster=False, 
                    xticklabels=classes, 
                    yticklabels=classes, 
                    annot=False,
                    annot_kws={"fontsize":15},
                    cbar_pos=(0.99,0.05,0.05,0.7))
ax = cm.ax_heatmap
fig = ax.get_figure()
# Rows are the true classes; ticks and axis label are moved to the left side.
ax.set_ylabel("real", fontsize=35, rotation=90)
ax.set_yticklabels(labels=classes, rotation=0)
ax.yaxis.tick_left()
ax.yaxis.set_label_position("left")

# Columns are the predicted classes.
ax.set_xticklabels(labels=classes, rotation=80)
ax.set_xlabel("predicted",fontsize=35)
ax.tick_params(labelsize=35)

# Colour bar styling.
cax = cm.ax_cbar
cax.tick_params(labelsize=30)
cax.set_title("P()", fontsize=30)
plt.tight_layout()
# The figure is written to disk before it is shown.
cm.savefig(f"predict_{label}.pdf")

plt.show()

In [0]:
fig, ax= plt.subplots(figsize=(18,15))

if uniq > 1:
  # 1 - (output of the first unit), taken once from the predictions computed
  # earlier instead of re-running the network for every class.
  complement_first = 1 - y_pred_p.T[0]
  true_class = Y_test.argmax(1)
  for c in range(uniq):
    # One histogram per true class, labelled so that the legend is not empty.
    class_name = str(classes[c]) if c < len(classes) else str(c)
    ax.hist(complement_first[true_class==c], histtype="step", lw=15, bins=10, density=True, label=class_name)
else:
  # Two-class setup: score distributions of the two true classes.
  ax.hist(y_pred_p[Y_test==1], histtype="step", lw=15, bins=10, density=False, label="healthy")
  ax.hist(y_pred_p[Y_test==0], histtype="step", lw=15, bins=10, density=False, label="tumor")

ax.tick_params(labelsize=35)
ax.legend(fontsize=35, loc="upper left")
ax.set_title("", fontsize=35)

ax.set_xlim(-0.05,1.05)

ax.set_xlabel("Z", fontsize=35)
ax.set_ylabel("pdf", fontsize=35)

plt.show()

In [0]:
# ROC curve of the true labels against the raw network scores.
# NOTE(review): roc_curve expects binary targets and 1-D scores, so this cell
# presumably applies to the two-class setup (uniq == 1) only — confirm.
fpr, tpr, thresholds = roc_curve(y_true, y_pred_p)
#fpr, tpr, thresholds = roc_curve(Y_train, model.predict(X_train))

In [0]:
fig, ax = plt.subplots(figsize=(18,15))

# ROC curve in gray and the chance diagonal as a dashed black line.
ax.plot(fpr, tpr, c="gray", lw=15)
ax.plot([0, 1], [0, 1], 'k--', lw=15)

ax.set_xlabel('False positive rate', fontsize=35)
ax.set_ylabel('True positive rate', fontsize=35)

# Write the decision threshold next to every 100th point of the curve.
for f, t, thr in list(zip(fpr, tpr, thresholds))[::100]:
    ax.annotate(thr, (f, t), fontsize=25)

# Area under the curve, printed inside the axes.
auc_text = "Area Under Curve = %.4f"%roc_auc_score(y_true, y_pred_p)
ax.annotate(auc_text, xy=(0.6, 0.5), fontsize=25)

ax.tick_params(labelsize=35)

plt.show()
fig.savefig(f"roc_{label}.pdf")

In [0]:
fig, ax = plt.subplots(figsize=(15,18))
for itissue, tissue in enumerate(classes):
  # One-vs-rest target: 1 for samples of this class, 0 for all others.
  tissue_true = (y_true == itissue).astype(int)
  fpr, tpr, thresholds = roc_curve(tissue_true, y_pred_p.T[itissue])

  ax.plot(fpr,tpr, lw=15, alpha=0.8)

# Chance diagonal and axis styling do not depend on the class, so they are
# drawn once after the loop instead of once per curve.
ax.plot([0, 1], [0, 1], 'k--', lw=15)
ax.set_xlabel('False positive rate', fontsize=35)
ax.set_ylabel('True positive rate', fontsize=35)

ax.tick_params(labelsize=35)

# Zoom on the top part of the curves.
ax.set_ylim(0.8,1)
plt.show()
fig.savefig(f"roc_{label}.pdf")