In [None]:
# Mount Google Drive at /content/drive; every path used below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os,sys, gc
import matplotlib.pyplot as plt
from scipy.stats import entropy
# Add the Drive project folders to the module search path
# (presumably where hsbmpy lives -- it is imported right after).
sys.path.append("/content/drive/My Drive/phd")
sys.path.append("/content/drive/My Drive/phd/hsbm-occam")
from hsbmpy import get_max_available_L

# Set TF_CUDNN_USE_AUTOTUNE to "0" in the environment before TensorFlow is imported further down.
os.environ["TF_CUDNN_USE_AUTOTUNE"]="0"

In [None]:
# Run configuration: algorithm name and level index are interpolated into the
# input file names read below; `directory` is the dataset folder.
algorithm = "topsbm"
directory='/content/drive/My Drive/phd/datasets/cancers/breast'
L = 0
# Relative paths used later (topsbm_all/..., mainTable.csv, files.dat, log.csv)
# are resolved against this working directory.
os.chdir(directory)

In [None]:
label = 'Subtype_Selected_Lum'

# Per-document topic distribution, indexed by document name.
df_topics = (
    pd.read_csv("topsbm_all/%s_level_%d_topic-dist.csv" % (algorithm, L))
    .set_index('doc')
    .drop('i_doc', axis=1)
)

# Per-word topic distribution; row names are cut to their first 15 characters.
df_words = pd.read_csv(
    "topsbm_all/%s_level_%d_word-dist.csv" % (algorithm, L),
    index_col=0,
)
df_words.index = [name[:15] for name in df_words.index]

# Main table restricted to the rows of df_words, each column divided by its
# own sum, then transposed; missing values become 0.
df = pd.read_csv("mainTable.csv", index_col=0).reindex(index=df_words.index)
df = (
    df.divide(df.sum(0), 1)
    .transpose()
    .fillna(0)
)

# File metadata; the `label` column becomes the 'tissue' column of df_topics.
df_files = pd.read_csv("files.dat", index_col=0)
df_topics.insert(
    0,
    'tissue',
    df_files.reindex(index=df_topics.index)[label],
)

# Mean topic distribution per tissue value.
df_topic_tissue = df_topics.groupby('tissue').mean()
df_files.head()

## NN-based predictor

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import binary_crossentropy,mean_squared_error, categorical_crossentropy
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.utils import plot_model,to_categorical
from tensorflow.keras.callbacks import Callback, CSVLogger, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tensorflow.python.client.device_lib import list_local_devices
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import os,sys, gc
# Display the compute devices TensorFlow reports (e.g. to check whether a GPU is visible).
list_local_devices()

In [None]:
def preprocess_data(verbose=True):
  """Build the train/test arrays for the classifier from the global topic table.

  Reads the module-level ``df_topics``, ``df_files`` and ``label``. Rows whose
  ``tissue`` equals "unknown" are removed and the global ``df_topics`` is
  rebound to the filtered table, so later cells see the filtered version.

  Args:
    verbose: when true, print the shapes of the four returned arrays.

  Returns:
    ``(inputs, uniq, df_labels, X_train, X_test, Y_train, Y_test)`` where
    ``inputs`` is the number of feature columns, ``uniq`` the number of
    distinct labels, ``df_labels`` the rows of ``df_files`` aligned with the
    retained documents, and the remaining four the 90/10 split of the scaled
    features and one-hot labels (fixed ``random_state=42``).
  """
  global df_topics
  df_topics = df_topics[df_topics["tissue"]!="unknown"]

  # Metadata aligned row-by-row with the retained documents (reindex returns
  # a new frame, so df_files itself is left untouched).
  df_labels = df_files.reindex(index=df_topics.index)

  # A single np.unique call yields the sorted class names and the integer code
  # of each sample, so the one-hot width and `uniq` cannot disagree.
  class_names, codes = np.unique(df_labels[label], return_inverse=True)
  uniq = len(class_names)
  Y = to_categorical(codes)

  # `axis` must be passed by keyword: pandas 2.x rejects the positional form.
  features = df_topics.drop('tissue', axis=1)
  centered = features.subtract(features.mean(axis=0), axis=1)
  half_range = 0.5*(features.max(axis=0)-features.min(axis=0))
  # A constant column has zero range; dividing 0 by 0 would fill it with NaN
  # and poison training. Dividing by 1 instead leaves the column at 0.
  half_range = half_range.replace(0, 1)
  X = centered.divide(half_range, axis=1).values.astype(float) #SGD transform
  inputs = X.shape[1]

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, train_size=0.9)

  if verbose:
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
  return inputs, uniq, df_labels, X_train, X_test, Y_train, Y_test

In [None]:
# Build the data split once at notebook level; df_labels is reused by a later cell.
inputs, uniq, df_labels, X_train, X_test, Y_train, Y_test = preprocess_data(True)

In [None]:
def recall_m(y_true, y_pred):
    """Recall over the whole batch: rounded true positives / rounded actual positives."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    # K.epsilon() keeps the ratio finite when the batch has no positives.
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the whole batch: rounded true positives / rounded predicted positives."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    # K.epsilon() keeps the ratio finite when nothing is predicted positive.
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

def f1(y_true, y_pred):
    """F1 score: twice the product of precision and recall over their sum (plus epsilon)."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
def create_model(opt, l1, l2, hidden, loss=categorical_crossentropy, activation_func = "softmax",  verbose=True):
  """Build and compile a one-hidden-layer classifier and return it with fresh data splits.

  Side effects: clears the Keras session, re-runs ``preprocess_data`` and
  deletes an existing ``log.csv`` in the working directory so the next
  training log starts empty.

  Args:
    opt: optimizer passed to ``model.compile``.
    l1, l2: L1 / L2 kernel-regularisation factors of the hidden layer.
    hidden: number of units in the hidden (ReLU) layer.
    loss: loss function passed to ``model.compile``.
    activation_func: activation of the output layer.
    verbose: when true, print the model summary and write
      ``model_<label>.png`` with the architecture diagram.

  Returns:
    ``(model, X_train, X_test, Y_train, Y_test)``.
  """
  K.clear_session()

  inputs, uniq, _, X_train, X_test, Y_train, Y_test = preprocess_data(verbose)

  # Remove the previous training log without spawning a shell; a missing file
  # is the normal case on a first run.
  try:
    os.remove("log.csv")
  except FileNotFoundError:
    pass

  model=Sequential()
  model.add(Dense(units=hidden, input_dim=inputs, use_bias=True, bias_initializer="ones", activation="relu", kernel_regularizer=l1_l2(l1=l1, l2=l2)))
  model.add(Dense(units=uniq, activation=activation_func))
  model.compile(loss=loss, optimizer=opt, metrics=['accuracy', 'AUC', f1])
  # NOTE(review): set_learning_phase is deprecated in recent TensorFlow
  # releases; kept as-is here -- confirm it is still needed.
  K.set_learning_phase(0)

  if verbose:
    # summary() prints by itself and returns None; wrapping it in print()
    # would emit a stray "None" line.
    model.summary()
    plot_model(model, to_file=f"model_{label}.png", dpi=600, show_shapes=True)
  return model, X_train, X_test, Y_train, Y_test

In [None]:
def train_GS():
  """Grid-search SGD hyper-parameters and keep the model with the best validation F1.

  Every combination of learning rate, L1/L2 penalty, batch size, momentum and
  hidden-layer width is trained with early stopping on ``val_loss``. The score
  of a run is the ``val_f1`` value of its last epoch.

  Returns:
    ``(best_model, lr, l1, l2, bs, momentum, hidden)`` where the six
    hyper-parameters are the ones ``best_model`` was trained with. If no run
    scored above ``-inf`` (e.g. every ``val_f1`` was NaN), ``best_model`` is
    ``None`` and the hyper-parameters of the last run are returned.
  """
  from itertools import product

  csv_logger = CSVLogger('log.csv', append=True, separator=',')
  es = EarlyStopping(monitor='val_loss', min_delta=1e-3, mode='min', patience=25)

  # Same values and visiting order as six nested loops (right-most varies fastest).
  grid = product(
      [0.001, 0.0003],     # lr
      [0.001, 0.0001],     # l1
      [0.001, 0.0001],     # l2
      [50, 100],           # bs
      [0.95, 0.98, 0.99],  # momentum
      [50, 75, 100],       # hidden
  )

  best_model = None
  best_score = -np.inf
  best_params = None
  params = None
  for params in grid:
    lr, l1, l2, bs, momentum, hidden = params
    print(lr, l1, l2, bs, momentum, hidden)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    opt = SGD(learning_rate=lr, momentum=momentum)
    model, X_train, X_test, Y_train, Y_test  = create_model(opt, l1, l2, hidden, verbose=False)
    with tf.device("GPU"):
      # NOTE(review): validation_split=0.8 holds out 80% of the training data
      # for validation -- confirm this is intended rather than 0.2.
      hist = model.fit(X_train, Y_train, epochs=500, batch_size=bs, verbose=0, validation_split=0.8, callbacks=[csv_logger, es], shuffle=True, use_multiprocessing=True, workers=-1)
    f1_score = hist.history["val_f1"][-1]
    if f1_score > best_score:
      loss = hist.history["val_loss"][-1]
      acc = hist.history["val_accuracy"][-1]
      auc = hist.history["val_auc"][-1]
      print(f"loss: {loss}, accuracy: {acc}, AUC: {auc}, f1: {f1_score}")
      best_score = f1_score
      best_model=model
      # Remember the winning combination: the loop variables keep changing
      # after this point and end up holding the last grid point.
      best_params = params

  if best_params is None:
    best_params = params
  return (best_model,) + tuple(best_params)

# Run the full grid search and print the hyper-parameters it returns.
model, lr, l1, l2, bs, momentum, hidden = train_GS()
print(lr, l1, l2, bs, momentum, hidden)

In [None]:
#L=1
#model, X_train, X_test, Y_train, Y_test = create_model(SGD(0.001, momentum=0.95), l1=0.001, l2=0.01, hidden=75, loss=categorical_crossentropy, activation_func = "softmax",  verbose=True)
#hist = model.fit(X_train, Y_train, epochs=500, batch_size=50, verbose=1, validation_split=0.8, callbacks=[CSVLogger('log.csv', append=True, separator=','), EarlyStopping(monitor='val_loss', min_delta=1e-3, mode='min', patience=25)], shuffle=True, use_multiprocessing=True, workers=-1)

#L=0
# Rebuild the model with the hyper-parameters hard-coded for level 0.
model, X_train, X_test, Y_train, Y_test = create_model(
    SGD(0.001, momentum=0.99),
    l1=0.0001,
    l2=0.0001,
    hidden=50,
    loss=categorical_crossentropy,
    activation_func="softmax",
    verbose=True,
)
# Train with per-epoch CSV logging and early stopping on the validation loss.
# NOTE(review): validation_split=0.8 holds out 80% of the training data for
# validation -- confirm this is intended rather than 0.2.
hist = model.fit(
    X_train,
    Y_train,
    epochs=500,
    batch_size=50,
    verbose=1,
    validation_split=0.8,
    callbacks=[
        CSVLogger('log.csv', append=True, separator=','),
        EarlyStopping(monitor='val_loss', min_delta=1e-3, mode='min', patience=25),
    ],
    shuffle=True,
    use_multiprocessing=True,
    workers=-1,
)

In [None]:
# Plot the training and validation loss columns recorded in log.csv.
pd.read_csv("log.csv", sep=",")[['loss','val_loss']].plot()

In [None]:
# Score the trained model on the held-out test split.
model.evaluate(X_test, Y_test)

In [None]:
# Save the trained model to a file named after the label column;
# the commented line below reloads it instead of retraining.
model.save(f"model_{label}.h5")
#model = load_model(f"model_{label}.h5")

In [None]:
# Run a garbage-collection pass to release unreferenced objects.
gc.collect()

# Evaluate on data not used in topsbm training


In [None]:
# Class names must be in the same (sorted) order np.unique produced when the
# labels were one-hot encoded in preprocess_data, so that classes[k] names
# column k of Y_test and row/column k of the confusion matrix.
# Series.unique() returns order of first appearance, which does not match.
classes = np.unique(df_labels[label])
np.savetxt("classes.txt", classes, fmt="%s")
# Dump the held-out split next to the class list.
np.savetxt("X_test.txt", X_test)
np.savetxt("Y_test.txt", Y_test)
print(X_test.shape, Y_test.shape)

In [None]:
# Re-evaluate on the test split, printing one line of metrics (verbose=2).
model.evaluate(X_test, Y_test, verbose=2, workers=-1, use_multiprocessing=True)

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
import seaborn as sns

In [None]:
# Class probabilities for the test split, then hard labels via argmax.
y_pred_p = model.predict(X_test)
y_true = np.argmax(Y_test, axis=1)
y_pred = np.argmax(y_pred_p, axis=1)
# Pin the label set to every one-hot column. Without `labels`, a class absent
# from both y_true and y_pred is dropped, and the matrix no longer lines up
# with the full list of class names used for the tick labels.
results = confusion_matrix(
    y_true,
    y_pred,
    labels=np.arange(Y_test.shape[1]),
    normalize="true",
)

In [None]:
# Draw the confusion matrix (normalised over true labels) as a heatmap.
# Row and column clustering are both disabled, so the class order is preserved.
cm = sns.clustermap(results, 
                    vmax=1,  
                    row_cluster=False, 
                    col_cluster=False, 
                    xticklabels=classes, 
                    yticklabels=classes, 
                    annot=False,
                    annot_kws={"fontsize":20})
ax = cm.ax_heatmap
fig = ax.get_figure()
# Rows are the true classes: put the label and ticks on the left side.
ax.set_ylabel("real", fontsize=35, rotation=90)
ax.set_yticklabels(labels=classes, rotation=0)
ax.yaxis.tick_left()
ax.yaxis.set_label_position("left")

# Columns are the predicted classes.
ax.set_xticklabels(labels=classes, rotation=90)
ax.set_xlabel("predicted",fontsize=35)
ax.tick_params(labelsize=35)

# Colour-bar styling.
cax = cm.ax_cbar
cax.tick_params(labelsize=30)
cax.set_title("P()", fontsize=30)
plt.tight_layout()
# Save the figure to a PDF named after the label column, then display it.
cm.savefig(f"predict_{label}.pdf")

plt.show()