In [None]:
from importlib import reload
from load_cluster_data import load_cluster_data
from pca_kmeans_init import pca_kmeans_init
# Reload the model module *before* star-importing from it. A star import that
# runs before reload() keeps the notebook's names bound to the pre-reload
# objects, so edits to the module would not be picked up on re-running this cell.
import betabinomo_LDA_singlecells_kinit
reload(betabinomo_LDA_singlecells_kinit)
# NOTE(review): np, pd, plt, sns, make_torch_data and calculate_CAVI are used
# below without an explicit import; presumably they all arrive through this
# star import -- confirm, and consider importing them explicitly.
from betabinomo_LDA_singlecells_kinit import *
import torch
import sklearn.manifold 
import plotnine as p9

## Settings

In [None]:
# Seed torch's RNG up front so torch-side randomness is repeatable.
torch.manual_seed(42)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Path handed to load_cluster_data in the "Load data" cell.
input_file = '/gpfs/commons/groups/knowles_lab/Karin/parse-pbmc-leafcutter/leafcutter/junctions/PBMC_input_for_LDA.h5'

# Switch for the t-SNE cells that embed the PCs (they are skipped when False).
MAKE_PCA_TSNE = True

# Tensor-creation keyword arguments shared by the data loader and the model.
# Single precision is used to save memory.
float_type = dict(device=device, dtype=torch.float)

# Prior hyper-parameters passed to calculate_CAVI.
hypers = dict(
    eta=1.,
    alpha_prior=1.,  # karin had 0.65
    pi_prior=1.,
)

# Number of clusters / factors. TODO: should also be an argument that gets fed in.
K = 15

## Load data

In [None]:
# Load everything from `input_file`; load_cluster_data hands back five objects.
# NOTE: a `celltypes = ["B", "MemoryCD4T"]` argument is left disabled here;
# presumably it would restrict loading to those cell types -- confirm against
# load_cluster_data before re-enabling.
(
    final_data,
    coo_counts_sparse,
    coo_cluster_sparse,
    cell_ids_conversion,
    junction_ids_conversion,
) = load_cluster_data(input_file)

# N and J are the first and second dimension of the sparse cluster matrix.
N, J = coo_cluster_sparse.shape[:2]

# Wrap the table in the torch container the model consumes, using the
# device/dtype chosen in the settings cell.
my_data = make_torch_data(final_data, **float_type)

Optionally plot reads per cell

## PCA initialization

In [None]:
# PCA + K-means initialisation: yields the per-cell PCs, `pc_sd` and one
# initial label per cell (the labels later seed calculate_CAVI).
cell_pcs, pc_sd, init_labels = pca_kmeans_init(
    final_data,
    my_data.junc_index,
    my_data.cell_index,
    K,
    float_type,
)

Look at the PCs

In [None]:
# Standardise along axis 1 (centre, then divide by the std of the centred
# values) without touching cell_pcs itself, then histogram every entry.
pcs_scaled = cell_pcs - cell_pcs.mean(1, keepdims=True)
pcs_scaled = pcs_scaled / pcs_scaled.std(1, keepdims=True)
_ = plt.hist(pcs_scaled.ravel(), bins=100)

tSNE on scaled PCs (takes 5-10min)

In [None]:
if MAKE_PCA_TSNE: 
    # Weight the PCs by pc_sd before embedding.
    # NOTE(review): pc_sd is presumably the per-PC standard deviation returned
    # by pca_kmeans_init -- confirm.
    pcs_sd_scaled = cell_pcs * pc_sd

    # init='random' makes t-SNE stochastic; pin random_state (same value as the
    # torch seed in the settings cell) so the embedding is reproducible.
    PCs_embedded = sklearn.manifold.TSNE(
        n_components=2, 
        learning_rate='auto',
        init='random', 
        perplexity=30,
        random_state=42).fit_transform(pcs_sd_scaled)

    # One row per cell: 2-D coordinates plus the cell-type annotation.
    PC_embed_df = pd.DataFrame(PCs_embedded, columns = ["x","y"])
    PC_embed_df["cell_type"] = cell_ids_conversion["cell_type"].to_numpy()

    #plt.figure(figsize=[8,6]) # for pdf
    plt.figure(figsize=[12,8])
    sns.scatterplot(x = "x",y = "y", hue="cell_type", data= PC_embed_df, edgecolor = 'none', alpha = 0.1)
    plt.xlabel("tSNE 1")
    plt.ylabel("tSNE 2")
    # t-SNE axes carry no interpretable scale, so hide the ticks.
    ax = plt.gca()
    ax.set_xticks([])
    ax.set_yticks([])
    #plt.savefig("pca_eig_scaled.pdf")

Label by K-means clustering

In [None]:
# Pool of marker shapes; one is assigned per K-means label below.
markers = [',', '.', 'o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']
    
if MAKE_PCA_TSNE: 
    PC_embed_df["label"] = init_labels

    # Size the palette and marker list from the labels actually present rather
    # than a hard-coded 15, so the plot keeps working when K changes. Markers
    # are cycled if there are more labels than shapes in the pool.
    n_labels = PC_embed_df["label"].nunique()
    label_markers = [markers[i % len(markers)] for i in range(n_labels)]

    plt.figure(figsize=[12,8])
    sns.scatterplot(
        x = "x",y = "y", hue="label", style="label", data= PC_embed_df, 
        edgecolor = 'none', alpha = 0.1, markers = label_markers,
        palette=sns.color_palette("cubehelix", n_labels))
    plt.xlabel("tSNE 1")
    plt.ylabel("tSNE 2")
    # t-SNE axes carry no interpretable scale, so hide the ticks.
    ax = plt.gca()
    ax.set_xticks([])
    ax.set_yticks([])

## Fit LDA

In [None]:
num_trials = 1 # can't currently run more than 1 or overflow GPU memory :( 
num_iters = 300 # should also be an argument that gets fed in

# loop over the number of trials (for now just testing using one trial but in general need to evaluate how performance is affected by number of trials)
# NOTE: every trial overwrites the previous trial's results; only the last one survives the loop.
for t in range(num_trials):

    # run coordinate ascent VI
    print(K)

    ALPHA_f, PI_f, GAMMA_f, PHI_f, elbos_all = calculate_CAVI(K, my_data, float_type, hypers = hypers, init_labels = init_labels, num_iterations = num_iters)
    elbos_all = np.array(elbos_all)
    juncs_probs = ALPHA_f / (ALPHA_f+PI_f)
    #theta_f = distributions.Dirichlet(GAMMA_f).sample()
    # z_f = distributions.Categorical(PHI_f).sample() # this would be pretty big! 

    # Normalise each row of GAMMA_f to sum to 1.
    theta_f = GAMMA_f / GAMMA_f.sum(1,keepdim=True)

    # Convert to a NumPy array before building the DataFrame (as the later
    # cells do); handing a torch tensor straight to pandas can leave tensor
    # objects in the cells instead of plain floats.
    theta_f_plot = pd.DataFrame(theta_f.cpu().numpy())
    # NOTE: the column is called 'cell_id' but holds the cell *type* annotation.
    theta_f_plot['cell_id'] = cell_ids_conversion["cell_type"].to_numpy()
    # Mean of every theta column within each cell type.
    theta_f_plot_summ = theta_f_plot.groupby('cell_id').mean()
    print(theta_f_plot_summ)
    
    # save the learned variational parameters
    #np.savez('variational_params.npz', ALPHA_f=ALPHA_f, PI_f=PI_f, GAMMA_f=GAMMA_f, PHI_f=PHI_f, juncs_probs=juncs_probs, theta_f=theta_f, z_f=z_f)


    # plot ELBOs. With K=15 PCA-Kmeans init: -25159712.0
    # With random initialization: -25259360.0 (so somewhat worse)
    # The first two ELBO entries are left out of the plot.
    plt.plot(elbos_all[2:]); plt.show()

In [None]:
# Show the last entry of the ELBO trace from the fit above.
elbos_all[-1]

In [None]:
# Standardise theta_f along axis 1 (centre, then divide by the std).
# Take an explicit copy first: on a CPU run `theta_f.cpu()` returns theta_f
# itself and `.numpy()` shares its memory, so the in-place operations below
# would otherwise silently overwrite theta_f.
x = theta_f.cpu().numpy().copy()
x -= x.mean(1,keepdims=True)
x /= x.std(1,keepdims=True)
plt.hist(x.flatten(),100)
# Cross-tabulate the annotated cell type against the column with the largest
# standardised value in each row.
pd.crosstab( cell_ids_conversion["cell_type"], x.argmax(axis=1) )

In [None]:
# Embed the standardised theta matrix `x` in 2-D.
# init='random' makes t-SNE stochastic; pin random_state (same value as the
# torch seed in the settings cell) so the embedding is reproducible.
X_embedded = sklearn.manifold.TSNE(
    n_components=2, 
    learning_rate='auto',
    init='random', 
    perplexity=100,
    random_state=42).fit_transform(x)
# One row per cell: 2-D coordinates plus the cell-type annotation.
X_embed_df = pd.DataFrame(X_embedded, columns = ["x","y"])
X_embed_df["cell_type"] = cell_ids_conversion["cell_type"].to_numpy()

In [None]:
# Scatter of the theta-based t-SNE embedding, coloured by cell type.
plt.figure(figsize=(12, 8))
ax = plt.gca()
sns.scatterplot(
    data=X_embed_df,
    x="x",
    y="y",
    hue="cell_type",
    edgecolor='none',
    alpha=0.1,
    ax=ax,
)
ax.set_xlabel("tSNE 1")
ax.set_ylabel("tSNE 2")
# t-SNE axes carry no interpretable scale, so hide the ticks.
ax.set_xticks([])
ax.set_yticks([])
# To export: plt.savefig("pca_eig_scaled.pdf")
