# Quick overview of the dataset we chose

In [11]:
import scanpy as sc, pandas as pd, numpy as np
from utils.plotting import plot_obs_barchart, plot_obs_treemap


# Dataset layout on disk: <ANNDATA_FOLDER>/<QUERY_DATASET_NAME>/<QUERY_DATASET_NAME>.h5ad
ANNDATA_FOLDER = 'datasets'
QUERY_DATASET_NAME = 'LCA'
# Despite the name, this path is relative to the current working directory.
ABS_FILE_PATH = '/'.join([ANNDATA_FOLDER, QUERY_DATASET_NAME, QUERY_DATASET_NAME + '.h5ad'])

# The "LCA.h5ad" file already carries annotations that the authors produced
# with a human-in-the-loop strategy.
query_adata = sc.read_h5ad(ABS_FILE_PATH)

# Bar-chart figure for the loaded query dataset.
fig = plot_obs_barchart(
    query_adata,
    max_unique=50,
    dataset_name=QUERY_DATASET_NAME,
)
fig.show()

In [10]:
# Treemap figure for the same query dataset.
fig = plot_obs_treemap(
    query_adata,
    max_unique=50,
    dataset_name=QUERY_DATASET_NAME,
)
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [2]:
# query_adata.obs['cell_ontology_type'].unique().tolist()

In [3]:
# NOTE(review): dead code kept for reference. `smartseq_adata` and `tenx_adata`
# are not defined in any cell visible here -- TODO restore the step that loads
# them or delete this cell.
# print(
#     len(query_adata.obs['cell_ontology_type'].unique().tolist()),
#         len(smartseq_adata.obs['cell_type'].unique().tolist()),
#             len(tenx_adata.obs['cell_type'].unique().tolist()),
#                 len(set(smartseq_adata.obs['cell_type'].unique().tolist() + tenx_adata.obs['cell_type'].unique().tolist()))
#         )

In [4]:
# NOTE(review): dead code kept for reference. It relies on `smartseq_adata` and
# `tenx_adata`, which are not defined in any cell visible here -- TODO restore
# the step that loads them or delete this cell.
# cell_compartment_lookup_df = pd.concat([smartseq_adata.obs[['compartment','cell_type']], tenx_adata.obs[['compartment','cell_type']]]).drop_duplicates()
# cell_compartment_lookup_df.sort_values(by=['cell_type'])

# Creating individual predictions

In [6]:
from utils.utiltity_functions import run_external_script

## 1. CellTypist

In [7]:
# CellTypist run configuration. Each value below is interpolated into the
# option string handed to the pipeline script.
PYTHON_CMD = 'python'
SCRIPT_NAME = 'celltypist/celltypist_prediction_pipeline.py'
MOUNT_GOOGLE_DRIVE = 'False'
EXISTING_ANNOTATIONS_COLUMN = ''
OUTPUT_PREDICTIONS_FILE = 'celltypist_preds.csv'
CELLTYPIST_MODEL_NAME = 'Human_Lung_Atlas.pkl'

# One space-separated string of `--option "value"` pairs.
# NOTE(review): the output recorded for this cell shows the script rejecting
# these options as "unrecognized arguments" -- presumably the whole string
# reaches the script as a single argv entry; verify how run_external_script
# tokenizes `args`.
args = ' '.join([
    f'--mount-google-drive "{MOUNT_GOOGLE_DRIVE}"',
    f'--existing-annotations-column "{EXISTING_ANNOTATIONS_COLUMN}"',
    f'--folder-name "{ANNDATA_FOLDER}"',
    f'--dataset-name "{QUERY_DATASET_NAME}"',
    f'--output-predictions-file "{OUTPUT_PREDICTIONS_FILE}"',
    f'--model-name "{CELLTYPIST_MODEL_NAME}"',
])

run_external_script(PYTHON_CMD, SCRIPT_NAME, args)

Python OUTPUT:

2023041719

Python ERROR:

usage: celltypist_prediction_pipeline.py [-h]
                                         [--mount-google-drive MOUNT_GOOGLE_DRIVE]
                                         [--existing-annotations-column [EXISTING_ANNOTATIONS_COLUMN]]
                                         [--folder-name [FOLDER_NAME]]
                                         [--dataset-name [DATASET_NAME]]
                                         [--output-predictions-file [OUTPUT_PREDICTIONS_FILE]]
                                         [--model-name [MODEL_NAME]]
celltypist_prediction_pipeline.py: error: unrecognized arguments: --mount-google-drive "False" --existing-annotations-column "" --folder-name "datasets" --dataset-name "LCA" --output-predictions-file "celltypist_preds_temp.csv" --model-name "Human_Lung_Atlas.pkl"

If the script fails due to usage issues, try the following command directly in a terminal:
Python celltypist/celltypist_prediction_pipeline.py 

## 2. Azimuth

In [8]:
# Command used to launch the R pipeline. The executable is spelled `Rscript`
# (compare the absolute path kept below); the earlier 'RScript' spelling only
# resolves on case-insensitive filesystems.
# RSCRIPT_CMD = '/N/soft/rhel7/r/4.1.1/lib64/R/bin/Rscript'
RSCRIPT_CMD = 'Rscript'
SCRIPT_NAME = 'azimuth/azimuth_prediction_pipeline.R'
OUTPUT_PREDICTIONS_FILE = 'azimuth_preds.tsv'

# Choose one of Azimuth's references:
#     "adiposeref", "bonemarrowref", "fetusref", "heartref",
#     "humancortexref", "kidneyref", "lungref", "mousecortexref",
#     "pancreasref", "pbmcref", "tonsilref"
REFERENCE = 'lungref'

# Positional arguments in the order: folder, dataset name, output file, reference.
# NOTE(review): the output recorded for this cell stops at
# `dir.exists(reference)` with "invalid filename argument" -- TODO confirm how
# the R script consumes REFERENCE.
args = f'{ANNDATA_FOLDER} {QUERY_DATASET_NAME} {OUTPUT_PREDICTIONS_FILE} {REFERENCE}'

# Echo the exact command so it can be re-run by hand in a terminal.
print(RSCRIPT_CMD, SCRIPT_NAME, args)
run_external_script(RSCRIPT_CMD, SCRIPT_NAME, args)

RScript azimuth/azimuth_prediction_pipeline.R datasets LCA azimuth_preds_temp.tsv lungref
Rscript OUTPUT:

[1] "loaded Seurat"
[1] "loaded Azimuth"
[1] "loaded SeuratData"
[1] "loaded patchwork"
[1] "loaded logr"
[1] "azimuth_preds_2023421719.log"
[1] "logs/log/azimuth_preds_2023421719.log"
[1] "2023-04-17 19:42:00 : datasets"              
[2] "2023-04-17 19:42:00 : LCA"                   
[3] "2023-04-17 19:42:00 : azimuth_preds_temp.tsv"
[4] "2023-04-17 19:42:00 : lungref"               
[1] "2023-04-17 19:42:00 : Initializing arguments"
[1] "2023-04-17 19:42:00 : C:/Users/HP/Desktop/Vikrant/Github_Repositories/ct-ann-predictive-analytics"
[1] "2023-04-17 19:42:00 : Loading the query dataset : datasets/LCA/LCA.h5ad"
[1] "****  Loading the query dataset : datasets/LCA/LCA.h5ad"
[1] "2023-04-17 19:42:18 : Loaded the query dataset"
[1] "2023-04-17 19:42:18 : Running Azimuth using the reference [lungref] dataset."
[1] "Error in dir.exists(reference) : invalid filename argument\nCalls: R

## 3. PopV

Modularizing the PopV tutorial is tricky, so we are setting it aside for now.

In [9]:
# For Lung datasets, run the PopV tutorial ipynb file for now.