In [None]:
import pandas as pd
import numpy as np
import os,sys, gc
import matplotlib.pyplot as plt
from scipy.stats import entropy
sys.path.append("/home/fvalle/phd/master_thesis/")
sys.path.append("/home/fvalle/phd/master_thesis/hsbm/")
from hsbmpy import get_max_available_L

In [None]:
# Run configuration: which topic-model output to analyse and where it lives.
#label = 'disease_type'
algorithm = "topsbm"
# NOTE(review): absolute, machine-specific path — consider making this configurable.
directory='/home/fvalle/phd/datasets/merged'
# Hierarchy level used by the cells below: one less than the value returned by get_max_available_L.
L = get_max_available_L(directory, algorithm)-1
# Relative paths in later cells are resolved against the dataset directory.
os.chdir(directory)

In [None]:
label = 'tissue_hd'

# Per-document topic table for the selected level, indexed by document id.
topic_dist_csv = "%s/%s_level_%d_topic-dist.csv"%(algorithm,algorithm,L)
df_topics = pd.read_csv(topic_dist_csv)
df_topics = df_topics.set_index('doc')
df_topics = df_topics.drop('i_doc', axis=1)

# Per-word topic table; row labels are cut to their first 15 characters.
word_dist_csv = "%s/%s_level_%d_word-dist.csv"%(algorithm,algorithm,L)
df_words = pd.read_csv(word_dist_csv, index_col=0)
df_words.index = [row_id[:15] for row_id in df_words.index]

# Training table restricted to the rows of df_words, each column divided by
# its own sum, then transposed; missing values become 0.
train_table = pd.read_csv("mainTable_train.csv", index_col=0)
train_table = train_table.reindex(index=df_words.index)
column_totals = train_table.sum(0)
df = train_table.divide(column_totals,1).transpose().fillna(0)

# Attach the chosen label to every document and average the topic columns per label.
df_files = pd.read_csv("files.dat", index_col=0)
doc_labels = df_files.reindex(index=df_topics.index)[label]
df_topics.insert(0,'tissue', doc_labels)
df_topic_tissue = df_topics.groupby('tissue').mean()
df_files.head()

## Projection-based predictions

In [None]:
# Project every row of df onto the columns of df_words, then rescale each
# row of the result so that it sums to 1.
projected = np.matmul(df.values, df_words.values)
df_Pst = pd.DataFrame(data=projected, index=df.index, columns=df_words.columns)
df_Pst = df_Pst.divide(df_Pst.sum(1), 0)

# predictions[i, j] is scipy's entropy(sample_i, profile_j), i.e. the relative
# entropy (KL divergence) between the projected sample i and the mean topic
# profile stored in row j of df_topic_tissue. Smaller means closer.
tissue_profiles = df_topic_tissue.astype(float).values
sample_profiles = df_Pst.astype(float).values
predictions = np.array([[entropy(sample, profile) for profile in tissue_profiles] for sample in sample_profiles])

df_Pst.insert(0,'tissue', df_files.reindex(index=df_Pst.index)[label])
# Column j of `predictions` corresponds to df_topic_tissue.index[j], so the
# true labels must be encoded against that same ordering. Encoding them from
# the labels present in df_Pst alone would silently shift the indices whenever
# the two label sets differ.
tissue_codes = pd.Categorical(df_Pst.tissue, categories=df_topic_tissue.index).codes
if (tissue_codes < 0).any():
    raise ValueError("some projected samples have a label that is missing from df_topic_tissue")
reals = tissue_codes.astype(int)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# The predicted class for each sample is the column with the smallest value in
# `predictions` (argmin along axis 1), compared against the encoded labels `reals`.
print(f"Accuracy projecting score: {accuracy_score(reals, np.argmin(predictions, axis=1))}")

## NN-based predictor

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import binary_crossentropy,mean_squared_error, categorical_crossentropy
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import plot_model,to_categorical
from tensorflow.keras.callbacks import Callback, CSVLogger, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.python.framework.graph_util import convert_variables_to_constants
from tensorflow.python.client.device_lib import list_local_devices
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import os,sys, gc
# Display the devices TensorFlow reports for this machine (cell output only).
list_local_devices()

In [None]:
# Metadata rows aligned with the documents in df_topics (reindex already
# returns a new frame, so no explicit copy is needed).
df_labels = df_files.reindex(index=df_topics.index)

uniq = len(df_labels[label].unique())

# Features: every topic column divided by its own mean over the training documents.
# `columns=` is used because a positional axis argument to drop() is not
# accepted by recent pandas releases.
topic_features = df_topics.drop(columns='tissue')
X_train = topic_features.divide(topic_features.mean(0), axis=1).values.astype(float)

# One np.unique call yields both the class names and the integer codes, so
# `classes[i]` is guaranteed to name column i of Y_train (and therefore
# output unit i of a network trained on it).
classes, class_codes = np.unique(df_labels[label], return_inverse=True)
Y_train = to_categorical(class_codes)

inputs = X_train.shape[1]

# Hold out 5% of the documents; the seed is fixed for reproducibility.
X_tm_train, X_tm_test, Y_tm_train, Y_tm_test = train_test_split(X_train, Y_train, random_state=42, train_size=0.95)

print(X_train.shape, Y_train.shape, X_tm_train.shape, Y_tm_train.shape)

In [None]:
K.clear_session()

# Start from a fresh training log: the CSVLogger below is opened in append mode.
if os.path.isfile("log.csv"):
    os.remove("log.csv")
csv_logger = CSVLogger('log.csv', append=True, separator=',')
# Stop when the validation loss has not improved for 25 consecutive epochs.
es = EarlyStopping(monitor='val_loss', min_delta=1e-10, mode='min', patience=25)

# One hidden ReLU layer followed by a softmax over the `uniq` classes.
model = Sequential()
model.add(Dense(units=100, input_dim=inputs, use_bias=True, bias_initializer='ones', activation="relu"))
model.add(Dense(units=uniq, activation="softmax"))
# The targets are one-hot vectors over mutually exclusive classes and the
# output is a softmax, so the matching loss is categorical cross-entropy.
# With binary cross-entropy Keras also resolves the 'accuracy' metric to
# binary accuracy, which overstates multi-class performance.
model.compile(loss=categorical_crossentropy, optimizer=SGD(learning_rate=0.01, momentum=0.4), metrics=['accuracy', 'AUC'])
# NOTE(review): kept from the original; forcing learning phase 0 before fit()
# looks unintended — confirm whether it is still needed.
K.set_learning_phase(0)

# summary() prints by itself; wrapping it in print() only adds a trailing "None".
model.summary()
plot_model(model, show_shapes=True)

In [None]:
# Train on the in-sample split; 10% of it is used for validation, which is
# what the early-stopping callback monitors. Every epoch is logged to log.csv.
model.fit(
    X_tm_train,
    Y_tm_train,
    epochs=1000,
    batch_size=500,
    verbose=1,
    validation_split=0.1,
    callbacks=[csv_logger, es],
    shuffle=True,
    use_multiprocessing=True,
    workers=-1,
)

In [None]:
# Plot the 'loss' and 'val_loss' columns of the training log, one point per logged row.
pd.read_csv("log.csv", sep=",")[['loss','val_loss']].plot()

In [None]:
# Score the network on the hold-out part of the train/test split.
model.evaluate(X_tm_test, Y_tm_test)

In [None]:
# Persist the trained network to model.h5 (relative to the current working directory).
model.save("model.h5")

In [None]:
# Trigger a garbage-collection pass before loading the next tables.
gc.collect()

# Evaluate on samples not used for topSBM training


In [None]:
# Load the test table; the first CSV column becomes the row index.
df_test_table = pd.read_csv("mainTable_test.csv", index_col = 0)

In [None]:
#get only HV genes
df_test = df_test_table.reindex(index=df_words.index)

df_test = df_test.divide(df_test.sum(0),1).transpose().fillna(0)

df_test = pd.DataFrame(data=np.matmul(df_test.values,df_words.values), index=df_test.index, columns=df_words.columns)
df_test=df_test.divide(df_test.mean(axis=0), axis=1) #normalize P(t|d)
df_test.head()

In [None]:
# Encode the test labels with the SAME class ordering that produced Y_train
# (np.unique over df_labels[label]). Output unit i of the network then means
# classes[i], and Y_test has as many columns as the network has outputs even
# when some class is absent from the test set.
classes = np.unique(df_labels[label])
test_labels = df_files.reindex(index=df_test.index)[label]
test_codes = pd.Categorical(test_labels, categories=classes).codes
if (test_codes < 0).any():
    # -1 marks a label (or a missing value) that has no training class.
    raise ValueError("some test samples have a label that was not seen during training")
X_test = df_test.values
Y_test = to_categorical(test_codes, num_classes=len(classes))

In [None]:
# Score the network on the projected test samples.
model.evaluate(X_test, Y_test, verbose=2, workers=12)

In [None]:
# Confusion-matrix accumulator: rows and columns are both labelled by `classes`.
# Built directly as an integer frame of zeros rather than an all-NaN object
# frame that relies on fillna() to downcast it.
results = pd.DataFrame(0, index=classes, columns=classes)

In [None]:
# Reset the counts so that re-running this cell does not double count.
results.loc[:, :] = 0
# Predicted class = index of the largest softmax output. This replaces
# Sequential.predict_classes, which recent TensorFlow releases no longer provide.
predicted_idx = np.argmax(model.predict(X_test), axis=1)
true_idx = Y_test.argmax(axis=1)
# Rows hold the true class, columns the predicted class.
for true_i, pred_i in zip(true_idx, predicted_idx):
    results.at[classes[true_i], classes[pred_i]] += 1

In [None]:
import seaborn as sns

In [None]:
# Divide every row by its total so each true-class row shows the fraction of
# its samples assigned to each predicted class.
row_totals = results.sum(1)
row_normalised = results.divide(row_totals,0)
ax = sns.heatmap(row_normalised, annot=False)

# Save the figure that owns the heatmap axes.
fig = ax.get_figure()
fig.savefig("predict.pdf")

In [None]:
fig, ax = plt.subplots(1,2, figsize=(20,10))
tissue='Breast'

# Topic columns of the documents whose primary_site equals `tissue`,
# transposed so that topics run along the x axis. Computed once and reused;
# `columns=` replaces the positional axis argument that recent pandas rejects.
tissue_docs = df_files[df_files['primary_site']==tissue].index
tissue_topics = df_topics[df_topics.index.isin(tissue_docs)].drop(columns='tissue').transpose()

# Left panel: mean topic profile of every label group.
df_topic_tissue.transpose().plot(ax=ax[0], marker='o', ms=5)
# Right panel: one line per selected document, plus their mean as a thick dotted line.
tissue_topics.plot(ax=ax[1])
tissue_topics.mean(1).plot(ax=ax[1], lw=9, ls=':')
ax[1].set_title(tissue)
#ax[0].get_legend().remove()
ax[1].get_legend().remove()
ax[0].set_ylim(0,0.8)
ax[1].set_ylim(0,0.8)
plt.show()

In [None]:
import pickle
from sbmtm import sbmtm

In [None]:
# Load the pickled object stored under topsbm/.
# SECURITY: pickle.load executes arbitrary code from the file — only open trusted pickles.
# NOTE(review): this rebinds `model`, which earlier cells use for the Keras
# network; cells that need the network must not be re-run after this one.
with open("topsbm/topsbm.pkl", "rb") as f:
    model = pickle.load(f)

In [None]:
# Rebuild a words x documents table from the edges of the loaded graph.
df_hsbm = pd.DataFrame(index=model.words, columns=model.documents).fillna(0)
graph_edges = model.g.get_edges()
edge_counts = model.g.properties[('e', 'count')].get_array()
for edge, count in zip(graph_edges, edge_counts):
    # NOTE(review): the offset 1000 presumably equals the number of document
    # vertices that precede the word vertices in the graph — confirm, and
    # derive it from the model instead of hard-coding it.
    word_label = df_hsbm.index[edge[1]-1000]
    doc_label = df_hsbm.columns[edge[0]]
    df_hsbm.at[word_label, doc_label] = count

In [None]:
# Display the rebuilt words x documents table.
df_hsbm

In [None]:
# Display df restricted and reordered to df_hsbm's documents and words, transposed to
# the same words x documents layout (presumably for comparison with df_hsbm — confirm).
df.reindex(index=df_hsbm.columns, columns=df_hsbm.index).transpose()

In [None]:
# Trigger a final garbage-collection pass.
gc.collect()