# Notes

# Load Modules

In [3]:
import os
from optparse import Values

In [4]:
import numpy as np
import pandas as pd

from scipy.stats import spearmanr, pearsonr
from scipy.spatial import Delaunay

import matplotlib as mpl
# Font configuration applied to matplotlib before pyplot is imported.
# NOTE(review): per matplotlib's rcParams reference, fonttype 42 should select
# TrueType fonts for the PDF/PS backends (instead of Type 3) — confirm.
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
# Use Arial as the sans-serif family.
mpl.rcParams['font.sans-serif'] = 'Arial'
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
%matplotlib inline

# Load data

In [None]:
# Full simulation metadata; the first CSV column becomes the index.
meta_df = pd.read_csv(
    filepath_or_buffer='simulation_linear_ontrac_input.csv',
    index_col=0,
)

In [None]:
# Export only the Sample, Cell_Type, x and y columns; the index is written
# under the 'Cell_ID' header.
(
    meta_df
    .loc[:, ['Sample', 'Cell_Type', 'x', 'y']]
    .to_csv('simulated_input.csv', index=True, index_label='Cell_ID')
)

# Generate LSF run scripts

- Our HPC uses the Load Sharing Facility (LSF) job scheduler; please modify the following code to fit your own system.
- Please also adjust the paths to match your own environment.

In [5]:
# Directory that receives the generated LSF scripts; created on first run
# and left untouched if it already exists.
run_lsf_dir = '../../run_lsf'
os.makedirs(name=run_lsf_dir, exist_ok=True)

In [None]:
# LSF submission script for a single ONTraC run, rendered with str.format.
# Placeholders: {name}, {meta_input}, {NN_dir}, {GNN_dir}, {NT_dir},
# {n_neighbors}, {hidden_feats}, {n_gcn_layers} and {k}.
# The project account, queue, GPU resource request and the sourced profile
# path are specific to the authors' cluster and need to be adapted elsewhere.
# The script redirects ONTraC's stdout to log/{name}.log.
templete = '''#!/bin/bash
#BSUB -J {name}
#BSUB -n 12
#BSUB -P acc_YuanLab
#BSUB -q gpuexpress
#BSUB -W 2:00
#BSUB -R "rusage[mem=10000] span[hosts=1]"
#BSUB -gpu num=1
#BSUB -R h100nvl

#BSUB -oo log/job_{name}.out
#BSUB -eo log/job_{name}.err

JOBID=$1

source /hpc/users/wangw32/.bash_profile

mkdir -p output log

conda activate ONTraC
ONTraC --meta-input {meta_input} --NN-dir {NN_dir} --GNN-dir {GNN_dir} --NT-dir {NT_dir} --n-cpu 12 --n-neighbors {n_neighbors} --device cuda --epochs 1000 --batch-size 10 -s 42 --patience 100 --min-delta 0.001 --min-epochs 50 --lr 0.03 --hidden-feats {hidden_feats} --n-gcn-layers {n_gcn_layers} -k {k} --modularity-loss-weight 0.3 --purity-loss-weight 300 --regularization-loss-weight 0.1 --beta 0.03 > log/{name}.log

'''

In [6]:
# LSF submission script for the ONTraC_analysis step of a single run,
# rendered with str.format.
# Placeholders: {name}, {NN_dir}, {GNN_dir} and {NT_dir}.
# The script passes log/{name}.log via -l and writes to analysis_output/{name}.
# The project account, queue and the sourced profile path are specific to the
# authors' cluster and need to be adapted elsewhere.
analysis_templete = '''#!/bin/bash
#BSUB -J {name}_analysis
#BSUB -n 1
#BSUB -P acc_YuanLab
#BSUB -q express
#BSUB -W 2:00
#BSUB -R "rusage[mem=10000] span[hosts=1]"

#BSUB -oo log/job_{name}_analysis.out
#BSUB -eo log/job_{name}_analysis.err

JOBID=$1

source /hpc/users/wangw32/.bash_profile

mkdir -p analysis_output

conda activate ONTraC

ONTraC_analysis --NN-dir {NN_dir} --GNN-dir {GNN_dir} --NT-dir {NT_dir} -o analysis_output/{name} -l log/{name}.log -s --suppress-cell-type-composition
'''

In [21]:
# ------------------------------------
# Hyper Parameters
# ------------------------------------
# Candidate values for each swept option; position 0 of every list is the
# default used whenever that option is not the one being varied.
test_params = dict(
    n_neighbors=[50, 10, 20, 100],
    hidden_feats=[4, 2, 8, 16],
    k=[6, 4, 8, 10],
    n_gcn_layers=[2, 1, 3, 4, 5, 6],
)

In [None]:
from typing import Dict


def gen_run_lsf(name: str,
                meta_input: str,
                params: Dict,
                template: str,
                output_dir: str):
    """Write one LSF run script per hyper-parameter setting.

    A ``{name}_base`` script is rendered with the first (default) value of
    every entry in ``params``. Then, for each parameter in turn, one
    ``{name}_{key}_{value}`` script is rendered for every non-default value
    while all other parameters stay at their defaults.

    Parameters
    ----------
    name : str
        Prefix shared by all run names.
    meta_input : str
        Value substituted for ``{meta_input}`` in the template.
    params : Dict
        Maps a template field name to its list of candidate values; the first
        value of each list is the default.
    template : str
        ``str.format`` template. Besides the keys of ``params`` it may use
        ``{name}``, ``{meta_input}``, ``{NN_dir}``, ``{GNN_dir}`` and
        ``{NT_dir}``.
    output_dir : str
        Directory receiving the ``<run name>_run_lsf.sh`` files. It is created
        (including parents) when it does not exist yet.
    """
    defaults = {key: values[0] for key, values in params.items()}

    # (run name, overrides) pairs: the base run first, then the
    # one-parameter-at-a-time sweeps in the order of ``params``.
    runs = [(f'{name}_base', {})]
    for key, values in params.items():
        runs.extend((f'{name}_{key}_{value}', {key: value}) for value in values[1:])

    # Do not rely on an earlier cell having created the target directory.
    os.makedirs(output_dir, exist_ok=True)

    for run_name, overrides in runs:
        fields = {
            **defaults,
            **overrides,
            'name': run_name,
            'meta_input': meta_input,
            'NN_dir': f'output/{run_name}_NN',
            'GNN_dir': f'output/{run_name}_GNN',
            'NT_dir': f'output/{run_name}_NT',
        }
        with open(f'{output_dir}/{run_name}_run_lsf.sh', 'w') as f:
            f.write(template.format(**fields))


# Emit the ONTraC run scripts for the 'simulated_linear' dataset: the base
# configuration plus one script per non-default value in test_params.
gen_run_lsf(
    name='simulated_linear',
    meta_input='raw_data/simulated_linear/simulated_input.csv',
    params=test_params,
    template=templete,
    output_dir=run_lsf_dir,
)

In [None]:
from typing import Dict


def gen_analysis_lsf(name: str,
                     meta_input: str,
                     params: Dict,
                     template: str,
                     output_dir: str):
    """Write one LSF analysis script per hyper-parameter setting.

    Run names follow the same scheme as the run scripts: ``{name}_base`` for
    the defaults (first value of every entry in ``params``) and
    ``{name}_{key}_{value}`` for each non-default value of each parameter,
    with all other parameters held at their defaults.

    Parameters
    ----------
    name : str
        Prefix shared by all run names.
    meta_input : str
        Value made available to the template as ``{meta_input}``.
    params : Dict
        Maps a template field name to its list of candidate values; the first
        value of each list is the default.
    template : str
        ``str.format`` template. Besides the keys of ``params`` it may use
        ``{name}``, ``{meta_input}``, ``{NN_dir}``, ``{GNN_dir}`` and
        ``{NT_dir}``.
    output_dir : str
        Directory receiving the ``<run name>_analysis_lsf.sh`` files. It is
        created (including parents) when it does not exist yet.
    """
    defaults = {key: values[0] for key, values in params.items()}

    # Enumerate every run once: base first, then each single-parameter change.
    overrides_by_run = {f'{name}_base': {}}
    for key, values in params.items():
        for value in values[1:]:
            overrides_by_run[f'{name}_{key}_{value}'] = {key: value}

    # Do not rely on an earlier cell having created the target directory.
    os.makedirs(output_dir, exist_ok=True)

    for run_name, overrides in overrides_by_run.items():
        fields = dict(defaults)
        fields.update(overrides)
        fields.update(
            name=run_name,
            meta_input=meta_input,
            NN_dir=f'output/{run_name}_NN',
            GNN_dir=f'output/{run_name}_GNN',
            NT_dir=f'output/{run_name}_NT',
        )
        with open(f'{output_dir}/{run_name}_analysis_lsf.sh', 'w') as f:
            f.write(template.format(**fields))


# Emit the ONTraC_analysis scripts for the 'simulated_linear' dataset, using
# the same run names and hyper-parameter grid as the run scripts.
gen_analysis_lsf(
    name='simulated_linear',
    meta_input='raw_data/simulated_linear/simulated_input.csv',
    params=test_params,
    template=analysis_templete,
    output_dir=run_lsf_dir,
)

# Metric

In [22]:
# Load the full simulation metadata rather than the trimmed
# 'simulated_input.csv' export: that export keeps only Sample, Cell_Type, x
# and y, while the metric computed below joins on the `r` column.
input_df = pd.read_csv('simulation_linear_ontrac_input.csv', index_col=0)
input_df.head()

Unnamed: 0_level_0,Sample,UMAP1,UMAP2,time,lineage,Cell_Type,r,theta,x,y
Cell_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
cell562,Simulation,3.141247,6.648345,0.325221,1,A,17.108442,92.387241,-0.712621,17.093594
cell779,Simulation,3.083348,6.590264,0.457619,1,A,20.294256,323.012103,16.210293,-12.209964
cell637,Simulation,3.039063,6.706306,0.459809,1,A,20.342765,184.818052,-20.270883,-1.708626
cell168,Simulation,2.943072,6.653493,0.491919,1,A,21.041083,250.849126,-6.90267,-19.876627
cell393,Simulation,3.054525,6.533715,0.491919,1,A,21.041083,162.404762,-20.056692,6.360523


In [None]:
def get_results_df(params=None, metadata=None, name='simulated_linear', output_root='../../output'):
    """Yield one metrics row per ONTraC run (base run plus parameter sweeps).

    For every run, ``{output_root}/{run_name}_NT/NTScore.csv.gz`` is read,
    joined on its index with the ``r`` column of ``metadata``, and the absolute
    Spearman correlation between ``r`` and ``Cell_NTScore`` is computed.

    Run names match the generated scripts: ``{name}_base`` for the defaults and
    ``{name}_{key}_{value}`` for each non-default value of each parameter.

    Parameters
    ----------
    params : dict, optional
        Maps a parameter name to its candidate values, default first. Must
        contain ``n_neighbors``, ``hidden_feats``, ``k`` and ``n_gcn_layers``.
        Defaults to the notebook-level ``test_params``.
    metadata : pandas.DataFrame, optional
        Frame indexed like the NTScore tables and holding an ``r`` column.
        Defaults to the notebook-level ``input_df``.
    name : str
        Run-name prefix.
    output_root : str
        Directory containing the ``<run name>_NT`` result folders.

    Yields
    ------
    tuple
        ``(run_name, 'ONTraC', varied_param, n_neighbors, hidden_feats, k,
        n_gcn_layers, correlation)`` where ``varied_param`` is ``'base'`` for
        the default run.
    """
    if params is None:
        params = test_params
    if metadata is None:
        metadata = input_df

    reported = ('n_neighbors', 'hidden_feats', 'k', 'n_gcn_layers')

    def _abs_spearman(run_name):
        # Absolute value: only the strength of the monotonic relation is reported.
        scores = pd.read_csv(f'{output_root}/{run_name}_NT/NTScore.csv.gz', index_col=0)
        scores = scores.join(metadata['r'])
        return abs(spearmanr(scores['r'], scores['Cell_NTScore'])[0])

    defaults = {key: values[0] for key, values in params.items()}

    base_run = f'{name}_base'
    yield (base_run, 'ONTraC', 'base',
           *(defaults[col] for col in reported),
           _abs_spearman(base_run))

    for key, values in params.items():
        for value in values[1:]:
            # Vary exactly one parameter; everything else stays at its default.
            settings = {**defaults, key: value}
            run_name = f'{name}_{key}_{value}'
            yield (run_name, 'ONTraC', key,
                   *(settings[col] for col in reported),
                   _abs_spearman(run_name))




# Materialise the per-run metric rows into a table; column order mirrors the
# tuples produced by get_results_df().
results_df = pd.DataFrame(
    data=get_results_df(),
    columns=[
        'run_name',
        'Method',
        'Params',
        'n_neighbors',
        'hidden_feats',
        'k',
        'n_gcn_layers',
        'correlation',
    ],
)
results_df.head()

In [37]:
# Persist the metrics table next to the notebook, without the row index.
results_df.to_csv(
    'metrics.csv',
    index=False,
)