In [None]:
# !pip install dgl-cu101==0.5.3
# !pip install scikit_learn==0.22.2.post1
# !pip install xlrd==1.2.0
# !pip install leidenalg

In [None]:
import sys
sys.path.append("..")
import argparse
import os
import numpy as np
import dgl
from dgl import DGLGraph
import dgl.function as fn
from dgl.nn.pytorch import GraphConv
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from dgl.data import register_data_args, load_data
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.nn.functional import binary_cross_entropy_with_logits as BCELoss
from torch.utils.data import DataLoader
from tqdm import tqdm
from collections import Counter
import pickle
import h5py
import random
import train
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import glob2
import scanpy.api as sc
import models
import seaborn as sns
%load_ext autoreload
%autoreload 2

In [None]:
# Seed every random number generator used in this notebook with the same
# value, then ask the project's `train` module which device to run on.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed,
                torch.cuda.manual_seed):
    seed_fn(42)
device = train.get_device()

In [None]:
# Result-set name: used below as both the sub-directory and the file-name
# prefix of the results pickle under ../output/pickle_results/.
category = "real_data"

In [None]:
# Run configuration shared by the cells below.
epochs = 10
batch_size = 128
pca_size = 50
path = "../"

# Dataset names are the `.h5` file names under `real_data/`, without directory
# or extension. They are derived with os.path helpers instead of slicing by a
# hand-counted prefix length, and sorted so the iteration order does not
# depend on the filesystem's listing order.
files = sorted(
    os.path.splitext(os.path.basename(h5_file))[0]
    for h5_file in glob2.glob(f'{path}real_data/*.h5')
)
print(files)

# NOTE: the sweep cell further down reassigns nb_genes = 3000 before using it,
# so this value only matters if that cell is changed.
nb_genes = 2000

In [None]:
# Sweep over `hidden_dim` (passed to models.GCNAE as `n_hidden`) and the
# `hidden` layer layout for every dataset in `files`, training 3 seeded runs
# per configuration and appending each run's scores to the results pickle.
results_path = f"../output/pickle_results/{category}/{category}_gae_layers.pkl"

# Resume from previously saved results when they exist; otherwise start from
# an empty frame so the notebook also runs on a fresh checkout.
if os.path.exists(results_path):
    results = pd.read_pickle(results_path)
else:
    results = pd.DataFrame()
# Make sure the checkpoint written after every run has somewhere to go.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

model_name = "GraphConv"
normalize_weights = "log_per_cell"
node_features = "scale"
same_edge_values = False
edge_norm = True
hidden_relu = False
hidden_bn = False
n_layers = 1
hidden_dim = 200  # overwritten by the sweep loop below
hidden = [300]  # overwritten by the sweep loop below
nb_genes = 3000
activation = F.relu

# Candidate `hidden` layouts. The original list was missing commas between
# several entries (e.g. `[300][50, 50]`), which indexes a list literal and
# raises before any training starts.
# NOTE(review): `[300]` appears twice once the commas are restored - confirm
# whether the repeat is intended.
hidden_candidates = [
    [300], [50, 50], [50], [300], [400], [200, 200], [300, 300],
    [400, 400], [200], [100], None, [100, 100],
]

for dataset in files:
    print(f">> {dataset}")

    # Copy both arrays into memory, then close the HDF5 handle right away.
    with h5py.File(f"{path}/real_data/{dataset}.h5", "r") as data_mat:
        Y = np.array(data_mat['Y'])
        X = np.array(data_mat['X'])

    # Keep only the genes / cells selected by the project's filter.
    genes_idx, cells_idx = train.filter_data(X, highly_genes=nb_genes)
    X = X[cells_idx][:, genes_idx]
    Y = Y[cells_idx]
    n_clusters = len(np.unique(Y))

    for hidden_dim in [200, 100, 300, 150]:
        for hidden in hidden_candidates:
            t0 = time.time()
            graph = train.make_graph(
                X,
                Y,
                dense_dim=pca_size,
                node_features=node_features,
                normalize_weights=normalize_weights,
            )

            # Only nodes whose label is not -1 are used as seed nodes.
            # Presumably -1 marks nodes without a ground-truth label -
            # confirm in train.make_graph.
            labels = graph.ndata["label"]
            train_ids = np.where(labels != -1)[0]

            sampler = dgl.dataloading.MultiLayerFullNeighborSampler(n_layers)

            dataloader = dgl.dataloading.NodeDataLoader(
                graph,
                train_ids,
                sampler,
                batch_size=batch_size,
                shuffle=True,
                drop_last=False,
                num_workers=1,
            )
            print(
                f"INPUT: {model_name}  {hidden_dim}, {hidden}, {same_edge_values}, {edge_norm}"
            )
            t1 = time.time()

            for run in range(3):
                t_start = time.time()
                # Re-seed every RNG with the run index so each run of a
                # configuration starts from a known state.
                torch.manual_seed(run)
                torch.cuda.manual_seed_all(run)
                np.random.seed(run)
                random.seed(run)

                model = models.GCNAE(
                    in_feats=pca_size,
                    n_hidden=hidden_dim,
                    n_layers=n_layers,
                    activation=activation,
                    dropout=0.1,
                    hidden=hidden,
                    hidden_relu=hidden_relu,
                    hidden_bn=hidden_bn,
                ).to(device)
                if run == 0:
                    # Show the architecture once per configuration.
                    print(">", model)

                optim = torch.optim.Adam(model.parameters(), lr=1e-5)

                scores = train.train(model,
                                     optim,
                                     epochs,
                                     dataloader,
                                     n_clusters,
                                     plot=False,
                                     cluster=["KMeans", "Leiden"])
                # Tag the scores with the configuration that produced them.
                scores["dataset"] = dataset
                scores["run"] = run
                scores["nb_genes"] = nb_genes
                scores["hidden"] = str(hidden)
                scores["hidden_dim"] = str(hidden_dim)

                # NOTE: DataFrame.append was removed in pandas 2.0. It is kept
                # here so the column dtypes match what the filtering cell at
                # the end of the notebook expects; port to pd.concat when the
                # pinned environment is upgraded.
                results = results.append(scores, ignore_index=True)

                # Checkpoint after every run so an interrupted sweep keeps
                # everything finished so far.
                results.to_pickle(results_path)
                print("Done")

In [None]:
# Reload the sweep results from disk so the analysis below can be run without
# repeating the training cell.
layers_results_file = f"../output/pickle_results/{category}/{category}_gae_layers.pkl"
results = pd.read_pickle(layers_results_file)

In [None]:
# Drop every row of the (nb_genes=3000.0, hidden=[300], hidden_dim=200)
# configuration. The key is built as a standalone Series and used as a mask,
# so no temporary column is written into `results` and no in-place drop is
# performed on a filtered slice.
config_key = (results["nb_genes"].astype(str)
              + results["hidden"]
              + results["hidden_dim"].astype(str))
results = results[config_key != '3000.0[300]200']
