In [None]:
import sys
sys.path.append("..")
import argparse
import numpy as np
import dgl
from dgl import DGLGraph
import torch
import torch.nn.functional as F
import time
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from collections import Counter
import pickle
import h5py
import random
import glob2
import seaborn as sns

import train
import models

%load_ext autoreload
%autoreload 2

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
device = train.get_device()

In [None]:
# Settings shared by the experiment cells below.
category = "real_data"

epochs = 10
batch_size = 128
pca_size = 50
path = "../"

# Discover the HDF5 datasets and keep only their bare names, i.e. the part of
# each path between the directory prefix and the ".h5" extension.
# The prefix includes the trailing "/" explicitly; the previous version relied
# on a stray quote character in the f-string to account for that separator.
data_prefix = f"{path}real_data/"
files = glob2.glob(f"{data_prefix}*.h5")
files = [f[len(data_prefix):-len(".h5")] for f in files]
print(files)


In [None]:
# Model and preprocessing configuration for this experiment.
model_name = "GraphConv"
normalize_weights = "log_per_cell"
node_features = "scale"
same_edge_values = False
edge_norm = True
hidden_relu = False
hidden_bn = False
n_layers = 1
hidden_dim = 200
hidden = [300]
nb_genes = 3000
activation = F.relu

# One record per (dataset, run). Records are collected in a list and turned
# into a DataFrame, because DataFrame.append was removed in pandas 2.0.
score_records = []
results = pd.DataFrame()

for dataset in files:
    print(f">> {dataset}")

    # The context manager closes the HDF5 handle once both arrays have been
    # copied into memory; the original code left the file open.
    with h5py.File(f"{path}/real_data/{dataset}.h5", "r") as data_mat:
        Y = np.array(data_mat['Y'])
        X = np.array(data_mat['X'])
    n_clusters = len(np.unique(Y))

    # Keep only the cells and genes selected by train.filter_data.
    genes_idx, cells_idx = train.filter_data(X, highly_genes=nb_genes)
    X = X[cells_idx][:, genes_idx]
    Y = Y[cells_idx]

    # t0 -> t1 brackets graph construction and data-loader setup; this is
    # reported as "time_graph" and added to both total-time columns.
    t0 = time.time()
    graph = train.make_graph(
        X,
        Y,
        dense_dim=pca_size,
        node_features=node_features,
        normalize_weights=normalize_weights,
    )

    # Nodes labelled -1 are excluded from the set of seed nodes.
    labels = graph.ndata["label"]
    train_ids = np.where(labels != -1)[0]

    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(n_layers)

    dataloader = dgl.dataloading.NodeDataLoader(
        graph,
        train_ids,
        sampler,
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=1,
    )
    print(
        f"INPUT: {model_name}  {hidden_dim}, {hidden}, {same_edge_values}, {edge_norm}"
    )
    t1 = time.time()

    for run in range(3):
        t_start = time.time()
        # Reseed every RNG with the run index so each repetition is reproducible.
        torch.manual_seed(run)
        torch.cuda.manual_seed_all(run)
        np.random.seed(run)
        random.seed(run)

        model = models.GCNAE(
            in_feats=pca_size,
            n_hidden=hidden_dim,
            n_layers=n_layers,
            activation=activation,
            dropout=0.1,
            hidden=hidden,
            hidden_relu=hidden_relu,
            hidden_bn=hidden_bn,
        ).to(device)
        if run == 0:
            # Show the architecture once per dataset.
            print(">", model)

        optim = torch.optim.Adam(model.parameters(), lr=1e-5)

        scores = train.train(model,
                             optim,
                             epochs,
                             dataloader,
                             n_clusters,
                             plot=False,
                             cluster=["KMeans", "Leiden"])
        scores["dataset"] = dataset
        scores["run"] = run
        scores["nb_genes"] = nb_genes
        scores["hidden"] = str(hidden)
        scores["hidden_dim"] = str(hidden_dim)

        # NOTE(review): scores['ae_end'] is presumably the timestamp at which
        # training finished - confirm against train.train.
        time_graph = t1 - t0
        time_training = scores['ae_end'] - t_start
        scores["tot_kmeans_time"] = time_graph + time_training + scores['kmeans_time']
        scores["tot_leiden_time"] = time_graph + time_training + scores['leiden_time']
        scores["time_graph"] = time_graph
        scores["time_training"] = time_training

        # Store a copy of the record and rebuild the frame after every run so
        # that partial results remain available if the loop is interrupted.
        score_records.append(dict(scores))
        results = pd.DataFrame(score_records)

        # Persisting these results is currently disabled:
        # results.to_pickle(
        #     f"../output/pickle_results/{category}/{category}_gae.pkl")
        # print("Done")

# Average of the numeric score columns over all datasets and runs; string
# columns such as "dataset" are skipped explicitly.
results.mean(numeric_only=True)

In [None]:
category = "real_data"

# Load previously saved scores from disk; this replaces any `results`
# computed earlier in the notebook.
# NOTE(review): unpickling executes arbitrary code - only load files that
# were produced by this project.
results = pd.read_pickle(
    f"../output/pickle_results/{category}/{category}_gae.pkl")

# Average of the numeric columns only; without numeric_only=True, pandas 2.0+
# raises on string columns such as "dataset".
results.mean(numeric_only=True)

In [None]:
# Per-dataset mean ARI of the two result sets, shown side by side.
# The aggregates get their own names instead of overwriting `results`, so the
# cell can be re-run without a KeyError on the "dataset" column.
ari_columns = ["kmeans_ari", "leiden_ari"]

gae_ari = results.groupby("dataset")[ari_columns].mean()

# NOTE(review): unpickling executes arbitrary code - only load files that
# were produced by this project.
results1 = pd.read_pickle(
    f"../output/pickle_results/{category}/{category}_gae_corr.pkl")
gae_corr_ari = results1.groupby("dataset")[ari_columns].mean()

# Inner join on the dataset index; explicit suffixes show which pickle each
# column came from (the default would be the uninformative "_x" / "_y").
pd.merge(
    gae_ari,
    gae_corr_ari,
    left_index=True,
    right_index=True,
    suffixes=("_gae", "_gae_corr"),
)

# CPU analysis

In [None]:
# Request the CPU device for the timing comparison below.
device = train.get_device(use_cpu = True)
print(device)
category = "real_data"

epochs = 10
batch_size = 128
pca_size = 50
path = "../"

# Discover the HDF5 datasets and keep only their bare names, i.e. the part of
# each path between the directory prefix and the ".h5" extension.
# The prefix includes the trailing "/" explicitly; the previous version relied
# on a stray quote character in the f-string to account for that separator.
data_prefix = f"{path}real_data/"
files = glob2.glob(f"{data_prefix}*.h5")
files = [f[len(data_prefix):-len(".h5")] for f in files]
print(files)


In [None]:
# Model and preprocessing configuration for the CPU run.
model_name = "GraphConv"
normalize_weights = "log_per_cell"
node_features = "scale"
same_edge_values = False
edge_norm = True
hidden_relu = False
hidden_bn = False
n_layers = 1
hidden_dim = 200
hidden = [300]
nb_genes = 3000
activation = F.relu

# One record per (dataset, run). Records are collected in a list and turned
# into a DataFrame, because DataFrame.append was removed in pandas 2.0.
score_records = []
results = pd.DataFrame()

for dataset in files:
    print(f">> {dataset}")

    # The context manager closes the HDF5 handle once both arrays have been
    # copied into memory; the original code left the file open.
    with h5py.File(f"{path}/real_data/{dataset}.h5", "r") as data_mat:
        Y = np.array(data_mat['Y'])
        X = np.array(data_mat['X'])
    n_clusters = len(np.unique(Y))

    # Keep only the cells and genes selected by train.filter_data.
    genes_idx, cells_idx = train.filter_data(X, highly_genes=nb_genes)
    X = X[cells_idx][:, genes_idx]
    Y = Y[cells_idx]

    # t0 -> t1 brackets graph construction and data-loader setup; this is
    # reported as "time_graph" and added to both total-time columns.
    t0 = time.time()
    graph = train.make_graph(
        X,
        Y,
        dense_dim=pca_size,
        node_features=node_features,
        normalize_weights=normalize_weights,
    )

    # Nodes labelled -1 are excluded from the set of seed nodes.
    labels = graph.ndata["label"]
    train_ids = np.where(labels != -1)[0]

    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(n_layers)

    dataloader = dgl.dataloading.NodeDataLoader(
        graph,
        train_ids,
        sampler,
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=1,
    )
    print(
        f"INPUT: {model_name}  {hidden_dim}, {hidden}, {same_edge_values}, {edge_norm}"
    )
    t1 = time.time()

    for run in range(3):
        t_start = time.time()
        # Reseed every RNG with the run index so each repetition is reproducible.
        torch.manual_seed(run)
        torch.cuda.manual_seed_all(run)
        np.random.seed(run)
        random.seed(run)

        model = models.GCNAE(
            in_feats=pca_size,
            n_hidden=hidden_dim,
            n_layers=n_layers,
            activation=activation,
            dropout=0.1,
            hidden=hidden,
            hidden_relu=hidden_relu,
            hidden_bn=hidden_bn,
        ).to(device)
        if run == 0:
            # Show the architecture once per dataset.
            print(">", model)

        optim = torch.optim.Adam(model.parameters(), lr=1e-5)

        scores = train.train(model, optim, epochs, dataloader, n_clusters, plot=False,
                            cluster=["KMeans", "Leiden"], use_cpu = True)
        scores["dataset"] = dataset
        scores["run"] = run
        scores["nb_genes"] = nb_genes
        scores["hidden"] = str(hidden)
        scores["hidden_dim"] = str(hidden_dim)

        # NOTE(review): scores['ae_end'] is presumably the timestamp at which
        # training finished - confirm against train.train.
        time_graph = t1 - t0
        time_training = scores['ae_end'] - t_start
        scores["tot_kmeans_time"] = time_graph + time_training + scores['kmeans_time']
        scores["tot_leiden_time"] = time_graph + time_training + scores['leiden_time']
        scores["time_graph"] = time_graph
        scores["time_training"] = time_training

        # Store a copy of the record and rebuild the frame after every run so
        # the checkpoint written below always contains all finished runs.
        score_records.append(dict(scores))
        results = pd.DataFrame(score_records)

        # Checkpoint after every run so an interrupted session keeps its results.
        results.to_pickle(
            f"../output/pickle_results/{category}/{category}_gae_cpu.pkl")
        print("Done")

In [None]:
# Average of the numeric score columns for the CPU run; numeric_only=True
# skips string columns such as "dataset", which pandas 2.0+ would otherwise
# reject with a TypeError.
results.mean(numeric_only=True)