In [None]:
import pandas as pd
import numpy as np
import logging
# Configure the "gtex" logger so that DEBUG-and-above records reach a stream handler.
log = logging.getLogger("gtex")
log.setLevel(logging.DEBUG)
# Loggers are process-wide singletons: re-running this cell used to attach one
# more StreamHandler each time, duplicating every message. Reuse the existing
# handler when there is one, and only create it on the first run.
hdl = next((h for h in log.handlers if isinstance(h, logging.StreamHandler)), None)
if hdl is None:
    hdl = logging.StreamHandler()
    log.addHandler(hdl)
hdl.setLevel(logging.DEBUG)

In [None]:
# Gene identifiers used as a whitelist below (one id per line, no header).
# NOTE(review): the filter comment says "protein coding" - presumably this list
# holds protein-coding gene ids; confirm against the upstream file.
genelist = pd.read_csv(
    "https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt",
    header=None,
).values.ravel()

# Median expression per gene and tissue; the first two lines of the .gct file
# precede the column header and are skipped.
url = "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
df_tissues = pd.read_csv(
    url, compression="gzip", sep="\t", skiprows=2, index_col=0
).drop(columns="Description")  # keyword form: positional `axis` was removed in pandas 2.0

# filter protein-coding genes: keep the first 15 characters of each id
# (drops any suffix after them) so ids can be matched against `genelist`
df_tissues.index = [g[:15] for g in df_tissues.index]
df_tissues = df_tissues[df_tissues.index.isin(genelist)]

# group by tissue: rows become tissues, sub-tissues ("X - Y") are averaged under "X"
df_tissues = df_tissues.transpose()
df_tissues["tissue"] = [t.split(" -")[0] for t in df_tissues.index]
df_tissues = df_tissues.groupby("tissue").mean()

# log-transform (vectorised, replaces the deprecated element-wise applymap),
# then center every gene (column) on its mean across tissues
df_tissues = np.log2(df_tissues + 1)
df_tissues = df_tissues.subtract(df_tissues.mean(0), axis=1)

print(df_tissues.shape)
df_tissues.head(2)

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import MDS

In [None]:
# Project the centered tissue-by-gene matrix onto 31 principal components.
# Alternatives tried earlier and left disabled:
#   mds = MDS(int(31/0.138))
#   data = (df_tissues.values>0).astype(int)
pca = PCA(n_components=31)
data = pca.fit_transform(df_tissues.to_numpy())

In [None]:
from hopfield4py import Hopfield
from hopfield4py.hopfield_helper import *
import tensorflow as tf

In [None]:
# Binarise every coordinate by sign: strictly positive -> +1, otherwise -> -1.
data_tf = tf.convert_to_tensor(np.where(data > 0, 1, -1), dtype=tf.int8)

In [None]:
import matplotlib.pyplot as plt
# One point per row of `data`: first coordinate vs. second coordinate.
plt.scatter([row[0] for row in data],
            [row[1] for row in data])

In [None]:
# One unit per column of `data`; the binarised rows of `data_tf` are handed to
# the model before training.
# NOTE(review): load_Kanter_Sompolinsky87 presumably applies the Kanter &
# Sompolinsky (1987) learning rule - confirm in hopfield4py.
n_units = data.shape[1]
model = Hopfield(n_units)
model.load_Kanter_Sompolinsky87(data_tf)
model.train()
print(model)

In [None]:
# Sanity check on the training patterns themselves: the same tensor is passed
# both as the patterns to classify and as the reference memories.
reals = df_tissues.index
preds = get_predicted_labels(reals, data_tf, data_tf, model)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
# Tick labels: the part of each tissue name that precedes the first "-".
classes = [name.partition("-")[0] for name in df_tissues.index]

In [None]:
@tf.function
def get_distance_matrix(data_tensor, model, distance=hamming):
    """Pairwise distances between stored patterns and reconstructed patterns.

    Entry ``[i, j]`` is ``distance(pattern_j, model.reconstruct(pattern_i))``,
    where the patterns are the rows of ``data_tensor`` cast to float64.

    Args:
        data_tensor: tensor whose first axis enumerates the patterns.
        model: object exposing ``reconstruct(pattern)``.
        distance: callable ``distance(a, b)``; defaults to ``hamming``.

    Returns:
        float64 tensor with one row per reconstructed pattern and one column
        per reference pattern.
    """
    patterns = tf.cast(data_tensor, tf.float64)

    def _row(A):
        # Distances from the reconstruction of A to every reference pattern B.
        def _entry(B):
            value = tf.cast(distance(B, model.reconstruct(A)), tf.float64)
            # reduce_min over a one-element list collapses the value to a scalar.
            return tf.reduce_min([value])

        return tf.map_fn(_entry, patterns, parallel_iterations=6)

    return tf.map_fn(_row, patterns, parallel_iterations=6)

dist_matrix = get_distance_matrix(data_tf, model, dilutedhamming)

In [None]:
# Heatmap of the memory-vs-memory distance matrix, colour scale fixed to [0, 1].
ax = sns.heatmap(
    dist_matrix,
    vmin=0,
    vmax=1,
    xticklabels=classes,
    yticklabels=classes,
    cbar_kws={"label": "hamming"},
)

# Large fonts: the figure is exported as a 15x15 inch PDF.
for set_label in (ax.set_xlabel, ax.set_ylabel):
    set_label("memories", fontsize=35)
ax.tick_params(labelsize=30)

ax.get_figure().set_figwidth(15)
ax.get_figure().set_figheight(15)
plt.tight_layout()
plt.show()
ax.get_figure().savefig("hammings_after_pca.pdf")

In [None]:
# Row-normalised confusion matrix (each row of true labels sums to 1), drawn
# with clustermap but with clustering disabled on both axes.
cm = sns.clustermap(
    confusion_matrix(reals, preds, normalize="true"),
    vmin=0,
    vmax=1,
    row_cluster=False,
    col_cluster=False,
    xticklabels=classes,
    yticklabels=classes,
    annot=False,
    annot_kws={"fontsize": 15},
)
ax = cm.ax_heatmap
fig = ax.get_figure()

# y axis: true labels, ticks and axis label moved to the left side
ax.set_ylabel("real", fontsize=35, rotation=90)
ax.set_yticklabels(labels=classes, rotation=0)
ax.yaxis.tick_left()
ax.yaxis.set_label_position("left")

# x axis: predicted labels
ax.set_xticklabels(labels=classes, rotation=90)
ax.set_xlabel("predicted", fontsize=35)
ax.tick_params(labelsize=15)

# colour bar
cax = cm.ax_cbar
cax.tick_params(labelsize=30)
cax.set_title("P()", fontsize=30)
plt.tight_layout()
cm.savefig("predict.pdf")

plt.show()

# From the original dataset

In [None]:
# Per-sample annotation table (tab-separated), indexed by its first column.
df_files = pd.read_csv(
    "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
    sep="\t",
    index_col=0,
)

[GTEx.ipynb](../GTEx.ipynb)

In [None]:
# Expression matrix read from a local CSV; its first column becomes the index.
# NOTE(review): presumably produced by the linked GTEx notebook - confirm.
df = pd.read_csv(
    "gtex.csv",
    index_col=0,
)

## Preprocess

In [None]:
# NOTE: this cell rebinds `df`; re-run the loading cell before re-running it,
# otherwise the data is transposed and transformed a second time.

# transpose: the rows of `df` become the index used for the annotation lookup below
log.info("transpose")
df = df.transpose()

# log-transform and center
# np.log2 on the whole frame is vectorised; the previous element-wise
# applymap(lambda ...) is deprecated in recent pandas and far slower.
log.info("center")
df = np.log2(df + 1)
df = df.subtract(df.mean(0), axis=1)  # subtract each column's mean

# annotate every row with the "SMTS" value of the matching annotation row
df["tissue"] = df_files.reindex(index=df.index)["SMTS"]

df.head(2)

## PCA on the whole log(FPKM+1) dataset

In [None]:
import pickle

# Fit a 200-component PCA on every column except the "tissue" label.
# `drop(columns=...)` replaces `drop("tissue", 1)`: the positional axis argument
# was removed in pandas 2.0.
pca = PCA(200)
data = pca.fit_transform(df.drop(columns="tissue").values)

# Persist the fitted projection to disk.
with open("pca.pkl", "wb") as file:
    pickle.dump(pca, file)

## Train Hopfield in low-dimensional-space

In [None]:
# Average the PCA coordinates of all rows that share the same "SMTS" label:
# one row per tissue, one column per principal component.
df_tissues = pd.DataFrame(data=data, index=df.index)
tissue_of_sample = df_files.reindex(index=df_tissues.index)["SMTS"]
df_tissues["tissue"] = tissue_of_sample
df_tissues = df_tissues.groupby("tissue").mean()

In [None]:
# Cache the per-tissue coordinates on disk.
# To resume from the cache instead of recomputing:
#   df_tissues = pd.read_csv("tissues_pca.csv", index_col=0)
df_tissues.to_csv("tissues_pca.csv")
data = df_tissues.to_numpy()

In [None]:
# Sign-binarise the per-tissue coordinates: strictly positive -> +1, otherwise -> -1.
data_tf = tf.convert_to_tensor(np.where(data > 0, 1, -1), dtype=tf.int8)

In [None]:
# Rebuild the network for the new patterns: one unit per column of `data`,
# with the rows of `data_tf` handed to the model before training.
n_units = data.shape[1]
model = Hopfield(n_units)
model.load_Kanter_Sompolinsky87(data_tf)
model.train()
print(model)

### Quick check with the stored memories

In [None]:
# `get_distance_matrix` is defined once, in the earlier distance-matrix cell of
# this notebook; the verbatim copy that used to live here silently shadowed it
# and has been removed. Only the call is repeated, now on the new memories/model.
dist_matrix = get_distance_matrix(data_tf, model, dilutedhamming)

In [None]:
# Tick labels: the part of each tissue name that precedes the first "-".
classes = [name.partition("-")[0] for name in df_tissues.index]

In [None]:
# Heatmap of the memory-vs-memory distance matrix, colour scale fixed to [0, 1].
ax = sns.heatmap(
    dist_matrix,
    vmin=0,
    vmax=1,
    xticklabels=classes,
    yticklabels=classes,
    cbar_kws={"label": "hamming"},
)

for set_label in (ax.set_xlabel, ax.set_ylabel):
    set_label("memories")
plt.show()

## Draw a random subset of samples

In [None]:
#sample
log.info("sample")
df = df.drop("tissue", 1).sample(n=1000, axis=1)

## Test on new samples

In [None]:
new_data = pca.fit_transform(df)
new_data_tf=tf.convert_to_tensor((new_data>0).astype(int)*2-1, tf.int8)
reals = df_files.reindex(index=df.index)["SMTS"]
preds = get_predicted_labels(classes, new_data_tf, data_tf, model)

In [None]:
# First 10 entries of the first binarised sample, next to the first 10 entries
# of what the model reconstructs from it.
(
    new_data_tf[0][:10],
    model.reconstruct(new_data_tf[0])[:10],
)

In [None]:
# Per-tissue means in the plane of the first two coordinates ...
plt.scatter(data[:, 0], data[:, 1])
# ... overlaid with the sampled rows of the first five classes, one colour each.
# The samples were previously drawn with coordinate 3 on the y axis while the
# means used coordinate 1, so the two layers were not comparable.
for label in classes[:5]:
    mask = np.asarray(reals == label)
    plt.scatter(new_data[mask, 0], new_data[mask, 1], alpha=0.5)

In [None]:
# Pick one random row of new_data_tf. The index must be bounded by the number
# of rows (shape[0]); the previous bound, shape[1], is the number of columns
# and restricted the draw to the wrong range.
point = new_data_tf[np.random.randint(0, new_data_tf.shape[0])]
# Position of the memory with the smallest distance to the reconstruction of `point`.
np.argmin([hamming(model.reconstruct(point), memory).numpy() for memory in data_tf])

In [None]:
# Row-normalised confusion matrix of true vs. predicted tissue.
# `labels=classes` pins the matrix to one row/column per class, in the same
# order as the tick labels. Without it the matrix only covers the labels that
# happen to occur in this random subset, so its size (and the meaning of each
# tick) could silently disagree with `classes`.
cm = sns.clustermap(confusion_matrix(reals, preds, labels=classes, normalize="true"),
                    vmin = 0,
                    vmax=0.6,
                    row_cluster=False,
                    col_cluster=False,
                    xticklabels=classes,
                    yticklabels=classes,
                    annot=False,
                    annot_kws={"fontsize":15})
ax = cm.ax_heatmap
fig = ax.get_figure()

# y axis: true labels, ticks and axis label on the left side
ax.set_ylabel("real", fontsize=35, rotation=90)
ax.set_yticklabels(labels=classes, rotation=0)
ax.yaxis.tick_left()
ax.yaxis.set_label_position("left")

# x axis: predicted labels
ax.set_xticklabels(labels=classes, rotation=90)
ax.set_xlabel("predicted",fontsize=35)
ax.tick_params(labelsize=15)

# colour bar
cax = cm.ax_cbar
cax.tick_params(labelsize=30)
cax.set_title("P()", fontsize=30)
plt.tight_layout()
#cm.savefig(f"predict_{label}.pdf")

plt.show()