In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import pymn
import anndata as ad
import time
import datetime
import os
from pyprojroot import here
import resource

In [2]:
# Resolve the project root once up front (the value is not used in this cell).
here()
# Location of the input data on the analysis host; later cells append
# sub-paths to this string.
base_data_folder = "/vault/lfrench/mouse_brain_cluster_replicability/data/"
# Wall-clock reference used by the final cell to report total run time.
start_time = time.time()


In [3]:
# Make sure <project root>/results exists before any run folder is created.
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate isdir() test followed by mkdir().
os.makedirs(os.path.join(here(), "results"), exist_ok=True)

In [4]:
# Each run writes into its own folder, made unique by the current Unix
# timestamp rounded to whole seconds.
run_stamp = str(round(time.time()))
result_folder = os.path.join(here(), "results", "full_run_ZengAWS." + run_stamp)
os.mkdir(result_folder)
print(result_folder)

/home/lfrench/projects/mouse_brain_comparison/results/full_run_ZengAWS.1718116036


In [7]:
# Read the merged AnnData object that every later cell operates on.
merged_h5ad_path = base_data_folder + '/whole_mouse_brain/processed/merged_Zeng_AWS.Oct2023.h5ad'
merged = sc.read_h5ad(merged_h5ad_path)


In [8]:
# Progress marker: emitted once the merged h5ad file has been read.
print("After loading merged h5ad")

After loading merged h5ad


In [9]:
# Cast the cell-type and study labels to plain Python strings; these are the
# two obs columns passed to MetaNeighborUS as ct_col / study_col below.
for obs_column in ('cell.type', 'study_id'):
    merged.obs[obs_column] = merged.obs[obs_column].astype(str)

In [10]:
# Progress marker: the 'cell.type' and 'study_id' obs columns are now strings.
print("Done changing obs columns to strings")

Done changing obs columns to strings


In [None]:
# All-vs-all MetaNeighborUS run across studies and cell types. The next cell
# reads the result from merged.uns["MetaNeighborUS"].
pymn.MetaNeighborUS(
    merged,
    study_col='study_id',
    ct_col='cell.type',
    fast_version=True,
    symmetric_output=True,
)

In [None]:
print("After running metaneighbor all vs all")
# Pull the all-vs-all result out of the AnnData object and persist it as a
# gzip-compressed CSV inside this run's result folder.
aurocs = merged.uns["MetaNeighborUS"]
aurocs_full_path = result_folder + "/aurocs_full.csv.gz"
aurocs.to_csv(aurocs_full_path, compression="gzip")

In [None]:
# Second MetaNeighborUS pass, this time with one_vs_best=True. A later cell
# reads the result from merged.uns["MetaNeighborUS_1v1"].
pymn.MetaNeighborUS(
    merged,
    study_col='study_id',
    ct_col='cell.type',
    one_vs_best=True,
    fast_version=True,
    symmetric_output=True,
)

In [None]:
# Rebind `aurocs` to the result stored under the "MetaNeighborUS_1v1" key;
# the all-vs-all table previously held by this name was already written out.
aurocs = merged.uns["MetaNeighborUS_1v1"]

In [None]:
# Persist the table currently bound to `aurocs` as a gzip-compressed CSV in
# this run's result folder.
aurocs.to_csv(result_folder + "/aurocs_1v1.csv.gz", compression="gzip")

In [None]:
# Number of cells (obs rows) per study.
cell_counts = merged.obs.groupby("study_id").size()
cell_counts.to_csv(f"{result_folder}/cell_study_counts.csv")

# Number of distinct cell types per study: reduce obs to the unique
# (study, cell type) pairs first, then count the pairs within each study.
study_type_pairs = merged.obs[["study_id", "cell.type"]].drop_duplicates()
cell_type_counts = study_type_pairs.groupby("study_id").size()
cell_type_counts.to_csv(f"{result_folder}/cell_type_per_study_counts.csv")

In [None]:
# Export one top-hits table per threshold. pymn.topHits stores its result on
# merged.uns['MetaNeighborUS_topHits'], which is overwritten on every pass, so
# each table is written to its own file before the next call.
for set_threshold in (0.95, 0.99, 0.999):
    print(set_threshold)
    pymn.topHits(merged, threshold=set_threshold)
    tophit_table = merged.uns['MetaNeighborUS_topHits']
    top_hits_path = f"{result_folder}/top_hits.{set_threshold}.csv"
    tophit_table.to_csv(top_hits_path)

In [None]:
# Export the per-cell (obs) and per-gene (var) annotation tables.
# The payload is gzip-compressed, so the files carry a ".csv.gz" suffix like
# the aurocs outputs. The previous ".csv.zip" name did not match the content
# and broke readers that infer the compression from the extension.
# NOTE(review): update any downstream script that still expects "*.csv.zip".
merged.obs.to_csv(result_folder + "/merged.obs.csv.gz", compression="gzip")
merged.var.to_csv(result_folder + "/merged.var.csv.gz", compression="gzip")

In [None]:
# Report the peak resident set size of this process at the end of the run.
mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
# The raw value is divided by 1024 twice and labelled "Gb".
# NOTE(review): this presumes ru_maxrss is reported in kilobytes (as on
# Linux) -- confirm before running on another platform.
peak_memory_gb = str(round(mem_usage / 1024 / 1024, 2))
print("Peak memory use in Gb  " + peak_memory_gb + " PID  " + str(os.getpid()))

# Leave an empty directory whose name carries the figure, so it is visible
# when listing the result folder.
os.mkdir(os.path.join(result_folder, "Peak memory use in Gb " + peak_memory_gb))

In [None]:
# Total wall-clock time since the configuration cell set start_time.
end_time = time.time()
elapsed = datetime.timedelta(seconds=end_time - start_time)
# Swap ':' for '_' so the text is usable as a directory name, and drop the
# fractional seconds after the '.'.
elapsed_label = "Time taken h_m_s " + str(elapsed).replace(':', '_').split('.')[0]
print(elapsed_label)
# Leave an empty directory named after the elapsed time in the result folder.
os.mkdir(os.path.join(result_folder, elapsed_label))