In [5]:
import numpy as np
import pandas as pd
import scanpy as sc
import pymn
import anndata as ad
import time
import os, re, glob
from pyprojroot import here
import resource
import datetime

In [6]:
base_data_folder = "/vault/lfrench/mouse_brain_cluster_replicability/data/"

In [7]:
n_part = 2021488 #from split_into_halves

In [4]:
h5ad_files = [
    base_data_folder + "/whole_mouse_brain/processed/zeng/subsets/AIT21.0.merged.with_multiome.A."+str(n_part)+".h5ad",
    base_data_folder + "/whole_mouse_brain/processed/zeng/subsets/AIT21.0.merged.with_multiome.B."+str(n_part)+".h5ad",
    base_data_folder + "/whole_mouse_brain/processed/macosko/subsets/Macosko_Mouse_Atlas_Single_Nuclei.A."+str(n_part)+".h5ad",
    base_data_folder + "/whole_mouse_brain/processed/macosko/subsets/Macosko_Mouse_Atlas_Single_Nuclei.B."+str(n_part)+".h5ad"    
]

In [5]:
#takes about 1Tb of memory
pattern = re.compile(r'\.(A|B)\.')

for i, f1 in enumerate(h5ad_files):
    for j, f2 in enumerate(h5ad_files):
        if i <= j:
            continue
        start_time = time.time()
        s1 = pattern.search(os.path.basename(f1)).group(1)
        s2 = pattern.search(os.path.basename(f2)).group(1)
        # do your work with ordered pair (f1 -> f2)
        print(s1, " - ", s2, ".", os.path.basename(f1), " - ", os.path.basename(f2))
        
        #load the two h5ad files
        #set study_ids based on filenames
        #set results folder
        adata_a = sc.read_h5ad(f1, backed=True)
        adata_b = sc.read_h5ad(f2, backed=True)
        
        adata_a.obs["study_id"] = adata_a.obs["study_id"].astype(str) + "_" + s1
        adata_b.obs["study_id"] = adata_b.obs["study_id"].astype(str) + "_" + s2
        
#        frac = 0.05
#        rng = np.random.default_rng(0)  # change seed if desired
#        n = adata_a.n_obs
#        m = max(1, int(np.floor(frac * n)))
#        idx = rng.choice(n, size=m, replace=False)
#        idx.sort()  # helps sequential I/O from HDF5
#        adata_a = adata_a[idx, :].to_memory()
        adata_a = adata_a.to_memory()
        print("Loaded A into memory")
#        adata_b = adata_b[idx, :].to_memory()
        adata_b = adata_b.to_memory()
        print("Done loading files")
        
        merged=ad.concat([adata_a, adata_b], join="inner")
        
        pymn.variableGenes(merged, study_col='study_id')
        merged = merged[:, merged.var.highly_variable]
        
        merged.obs['cell.type'] = merged.obs['cell.type'].to_numpy(dtype="str")
        merged.obs['study_id'] = merged.obs['study_id'].to_numpy(dtype="str")
        merged.obs.index = merged.obs.index.to_numpy(dtype="str")
        #use numpy types instead of pandas
        merged.var.highly_variable = merged.var.highly_variable.to_numpy(dtype="bool")
        merged.var.index = merged.var.index.to_numpy(dtype="str")
        merged.obs_names_make_unique()
        
        
        #write merged?
        merged.write(base_data_folder + '/whole_mouse_brain/processed/' +"split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] +'.merged.h5ad')
        



B  -  A . AIT21.0.merged.with_multiome.B.2021488.h5ad  -  AIT21.0.merged.with_multiome.A.2021488.h5ad
Loaded A into memory
Done loading files


  merged.obs['cell.type'] = merged.obs['cell.type'].to_numpy(dtype="str")
  result_folder = "split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] + "." + str(round(time.time()))
  merged.write(base_data_folder + '/whole_mouse_brain/processed/' +"split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] +'.merged.h5ad')


/vault/lfrench/mouse_brain_cluster_replicability/results/split_half.Zeng_B_to_Zeng_A.1761157153
A  -  A . Macosko_Mouse_Atlas_Single_Nuclei.A.2021488.h5ad  -  AIT21.0.merged.with_multiome.A.2021488.h5ad
Loaded A into memory
Done loading files


  merged.obs['cell.type'] = merged.obs['cell.type'].to_numpy(dtype="str")
  result_folder = "split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] + "." + str(round(time.time()))
  merged.write(base_data_folder + '/whole_mouse_brain/processed/' +"split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] +'.merged.h5ad')


/vault/lfrench/mouse_brain_cluster_replicability/results/split_half.Macosko_A_to_Zeng_A.1761161679
A  -  B . Macosko_Mouse_Atlas_Single_Nuclei.A.2021488.h5ad  -  AIT21.0.merged.with_multiome.B.2021488.h5ad
Loaded A into memory
Done loading files


  merged.obs['cell.type'] = merged.obs['cell.type'].to_numpy(dtype="str")
  result_folder = "split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] + "." + str(round(time.time()))
  merged.write(base_data_folder + '/whole_mouse_brain/processed/' +"split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] +'.merged.h5ad')


/vault/lfrench/mouse_brain_cluster_replicability/results/split_half.Macosko_A_to_Zeng_B.1761165552
B  -  A . Macosko_Mouse_Atlas_Single_Nuclei.B.2021488.h5ad  -  AIT21.0.merged.with_multiome.A.2021488.h5ad
Loaded A into memory
Done loading files


  merged.obs['cell.type'] = merged.obs['cell.type'].to_numpy(dtype="str")
  result_folder = "split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] + "." + str(round(time.time()))
  merged.write(base_data_folder + '/whole_mouse_brain/processed/' +"split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] +'.merged.h5ad')


/vault/lfrench/mouse_brain_cluster_replicability/results/split_half.Macosko_B_to_Zeng_A.1761169986
B  -  B . Macosko_Mouse_Atlas_Single_Nuclei.B.2021488.h5ad  -  AIT21.0.merged.with_multiome.B.2021488.h5ad
Loaded A into memory
Done loading files


  merged.obs['cell.type'] = merged.obs['cell.type'].to_numpy(dtype="str")
  result_folder = "split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] + "." + str(round(time.time()))
  merged.write(base_data_folder + '/whole_mouse_brain/processed/' +"split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] +'.merged.h5ad')


/vault/lfrench/mouse_brain_cluster_replicability/results/split_half.Macosko_B_to_Zeng_B.1761175182
B  -  A . Macosko_Mouse_Atlas_Single_Nuclei.B.2021488.h5ad  -  Macosko_Mouse_Atlas_Single_Nuclei.A.2021488.h5ad
Loaded A into memory
Done loading files


  merged.obs['cell.type'] = merged.obs['cell.type'].to_numpy(dtype="str")
  result_folder = "split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] + "." + str(round(time.time()))
  merged.write(base_data_folder + '/whole_mouse_brain/processed/' +"split_half." + adata_a.obs['study_id'][0] + "_to_" + adata_b.obs['study_id'][0] +'.merged.h5ad')


/vault/lfrench/mouse_brain_cluster_replicability/results/split_half.Macosko_B_to_Macosko_A.1761181416


In [6]:
        mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # print the memory usage in megabytes
        print("Peak memory use in Gb  " + str(round(mem_usage / 1024 / 1024,2 )) + " PID  " + str(os.getpid()))


Peak memory use in Gb  998.87 PID  1276392


In [None]:
#####################################################
#restart kernel for memory - still runs out of memory on 1.2Tb machine, below code ran on larger mem machine
#Ran on large memory node separatley
#####################################################

In [None]:
pattern = base_data_folder + '/whole_mouse_brain/processed/split_half.*'

# Using glob to find matching files
for filepath in glob.glob(pattern):
    print(f"Processing: {filepath}")
    merged = sc.read_h5ad(filepath)
    study_ids = merged.obs['study_id'].unique()
    study_id_1, study_id_2 = study_ids
    #print(f"Study ID 1: {study_id_1}")
    #print(f"Study ID 2: {study_id_2}")
    result_folder = "split_half." + str(round(time.time()))
    result_folder = os.path.join(here(), "results", result_folder)
    os.mkdir(result_folder)
    
    merged.obs['cell.type'] = merged.obs['cell.type'].astype(str)
    merged.obs['study_id'] = merged.obs['study_id'].astype(str)
    
    print(result_folder)
    pymn.MetaNeighborUS(merged,
                study_col='study_id',
                ct_col='cell.type',
                fast_version=True, symmetric_output=True)
    print("After running metaneighbor all vs all")
    aurocs = merged.uns["MetaNeighborUS"]
    aurocs.to_csv(result_folder + "/aurocs_full.csv.gz", compression="gzip")

    #run 1 vs best
    pymn.MetaNeighborUS(merged,
                        study_col='study_id',
                        ct_col='cell.type', one_vs_best=True,
                        fast_version=True, symmetric_output=True)
    aurocs = merged.uns["MetaNeighborUS_1v1"]
    aurocs.to_csv(result_folder + "/aurocs_1v1.csv.gz", compression="gzip")

    cell_counts = merged.obs.groupby("study_id").size()
    cell_counts.to_csv(result_folder + "/cell_study_counts.csv")
    cell_type_counts = merged.obs[["study_id", "cell.type"]].drop_duplicates().groupby("study_id").size()
    cell_type_counts.to_csv(result_folder + "/cell_type_per_study_counts.csv")

    for set_threshold in [0.95, 0.99, 0.999]:
        print(set_threshold)
        pymn.topHits(merged, threshold=set_threshold)
        tophit_table = merged.uns['MetaNeighborUS_topHits']
        tophit_table.to_csv(result_folder + "/top_hits."+str(set_threshold)+".csv")

    merged.obs.to_csv(result_folder + "/merged.obs.csv.zip", compression="gzip")
    merged.var.to_csv(result_folder + "/merged.var.csv.zip", compression="gzip")

    #write out peak memory at end
    mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # print the memory usage in megabytes
    print("Peak memory use in Gb  " + str(round(mem_usage / 1024 / 1024,2 )) + " PID  " + str(os.getpid()))

    os.mkdir(os.path.join(result_folder, "Peak memory use in Gb " + str(round(mem_usage / 1024 / 1024 ,2))))

    end_time = time.time()
    print("Time taken h_m_s " + str(datetime.timedelta(seconds=end_time-start_time)).replace(':', '_').split('.')[0])
    os.mkdir(os.path.join(result_folder, "Time taken h_m_s " + str(datetime.timedelta(seconds=end_time-start_time)).replace(':', '_').split('.')[0]))

    del merged    

Processing: /vault/lfrench/mouse_brain_cluster_replicability/data//whole_mouse_brain/processed/split_half.Macosko_A_to_Zeng_B.merged.h5ad
/vault/lfrench/mouse_brain_cluster_replicability/results/split_half.1761233591
