# The goal here is to put all of our mapping calls in one place
- It is all bash scripting, but I think putting the code into a notebook is visually helpful
- Hopefully breaking things down into cells will make it more clear what is what

# Code Used for Running CountASAP for ASAPseq data:

In [None]:
%%bash
# -awl True is needed here to say we are pulling barcodes directly from
# the given whitelist (specifically an h5ad file).
# If using whitelist_ex.csv as -wl, set -awl False instead.

#python asap_process.py \
#-cr  $datPath/ASAPYD101_S33_R2_001.fastq.gz \
#-br $datPath/ASAPYD101_S33_R3_001.fastq.gz \
#-wl ../ATACprocOut/atac_processTest.h5.h5ad \
#-ref ex_inputs/asapSeq_barcodes.csv \
#-tol 1 -mis 0.95 -proc 24 -awl True \
#-umi True \
#-out proc_ASAPtest.out > procASAPTime.out

# Scaling on the linux machine:
- Note: all speed benchmarking (Figure 1) was reported using Lane 4 data (not full dataset)
- At least for countASAP, it shouldn't matter, as scaling was linear
- Also, data was CITEseq data, since CellRanger does not support ASAPseq.

In [None]:
%%bash

# Time asap_process.py on the lane-4 (L004) CITEseq reads once per -proc value.
# Each run gets its own -out target (proc<N>_test.out); the run's stdout is
# redirected to proc<N>.time.out.
proc_counts=(1 3 6 9 18 27 36)

for nproc in "${proc_counts[@]}"; do
	python asap_process.py \
		-cr CSYGXD101_S41_L004_R1_001.fastq.gz \
		-br CSYGXD101_S41_L004_R2_001.fastq.gz \
		-wl ../CITEseq/d101_barcodes.csv \
		-ref ../CITEseq/citeSeq_codes.csv \
		-tol 1 -mis 0.95 -proc "${nproc}" -awl False \
		-ass CITE -umi True \
		-out "proc${nproc}_test.out" > "proc${nproc}.time.out"
done

# Same scaling, but for cellranger:

In [None]:
%%bash

# This was done using cellranger 9.0.1

# Run cellranger multi once per core count, using the same runRanger.csv
# config each time. Each run gets a distinct --id (multi<N>) and its stdout
# is redirected to cellRanger<N>.out.
# --localmem must stay on the same logical command line as the other flags:
# if the flag is split across lines, bash passes a truncated option to
# cellranger and treats the remainder as a separate variable assignment.
for i in 1 3 6 9 18 27 36; do
	./cellranger multi --id="multi${i}" --csv=runRanger.csv \
		--localcores="${i}" --localmem=128 > "cellRanger${i}.out"
done


In [None]:
# Defining runRanger.csv here:
# Need to have it as a separate file if you want to run it
# Ref does not come with cellranger; it needs to be downloaded separately
[gene-expression]
reference,cellranger-9.0.1/refdata-gex-mm10-2020-A
create-bam,false
[feature]
reference,citeSeq_codes.csv
[libraries]
fastq_id,fastqs,lanes,feature_types,subsample_rate
CSYGXD101,CITEseq_surface,4,antibody capture,


# Same scaling, but for kallisto
- Also needed to run this with all 4 lanes concatenated into 1 file per read (megaR1, megaR2)
- Also note, actually generating the idx, t2g, and barcodes_kb.csv files was a bit of a pain. The documentation is pretty scattered, and the expected formatting of these files is not clear

In [None]:
%%bash

# Run kb count (kite workflow, -x 10xv3) once per -t value, sending each run
# to its own output directory out_adt<N>.
t_values=(1 3 6 9 18 27 36)

for n in "${t_values[@]}"; do
	kb count \
		-i FeaturesMismatch.idx \
		-g FeaturesMismatch.t2g \
		--workflow kite \
		-x 10xv3 \
		-o "out_adt${n}" \
		-t "${n}" \
		-w barcodes_kb.csv \
		CSYGXD101_S41_L004_R1_001.fastq.gz \
		CSYGXD101_S41_L004_R2_001.fastq.gz
done

In [None]:
# Convert output mtx file to a csv for easier comparison
import pandas  # required by the DataFrame export at the end of this cell
from scipy.io import mmread
from scipy.sparse import coo_matrix

# Specify the path to your .mtx file
mtx_file_path = 'out_adt18/counts_unfiltered/cells_x_features.mtx'

# Read the .mtx file into a sparse matrix
sparse_matrix = mmread(mtx_file_path)
print(f"Successfully loaded the MTX file: {mtx_file_path}")
print(f"Matrix format: {type(sparse_matrix)}")
print(f"Matrix dimensions: {sparse_matrix.shape}")
print(f"Number of stored elements: {sparse_matrix.nnz}")

# Expand the sparse matrix to a dense NumPy array so it can be written as a
# plain table (this materializes every entry, including zeros)
dense_array = sparse_matrix.toarray()

# Write the dense counts to kb_count.csv without the DataFrame row index
pandas.DataFrame(dense_array).to_csv('kb_count.csv',index=False)

# Notes on the Windows machine test capabilities:
- 238 GB storage, 128 MB graphics, 8 GB RAM
- Intel Core i5-9300H CPU - 2.4 GHz
- 64-bit, Windows 11 Home, Dell XPS 15
- MIGHT MAX OUT AT 4 CORES - who knows about hyperthreading...

# Next up, the scale testing
- Note: the scale of the reference is a single lane, not all 4 lanes

In [None]:
%%bash

# Run asap_process.py on each testr1_<scale>/testr2_<scale> input pair with
# -proc fixed at 24. Each run gets its own -out target
# (proc<scale>_Scaletest.out); stdout is redirected to proc<scale>.ScaleTime.out.
scales=(0.25 0.5 0.75 1.25 1.5 1.75 2)

for scale in "${scales[@]}"; do
	python asap_process.py \
		-cr "testr1_${scale}.fastq.gz" \
		-br "testr2_${scale}.fastq.gz" \
		-wl d101_barcodes.csv \
		-ref citeSeq_codes.csv \
		-tol 1 -mis 0.95 -proc 24 -awl False \
		-ass CITE -umi True \
		-out "proc${scale}_Scaletest.out" > "proc${scale}.ScaleTime.out"
done

# Now, do the analysis on all 4 lanes at once:
- Wanted to confirm that running the lanes separately does not alter the reads
- There are slight differences, but not significant

In [None]:
%%bash
# Run asap_process.py on the concatenated-lane reads (megaR1/megaR2) with
# -proc 32; stdout is redirected to proc.catLanes.out.
#
# NOTE(review): datPath is not assigned anywhere in this cell, so it has to be
# present in the environment the cell runs in — confirm the intended directory.
# The guard below stops with a clear message instead of silently resolving
# every input to a path under "/".
: "${datPath:?datPath must be set to the directory holding megaR1/megaR2.fastq.gz, d101_barcodes.csv and citeSeq_codes.csv}"

python asap_process.py \
	-cr "${datPath}/megaR1.fastq.gz" \
	-br "${datPath}/megaR2.fastq.gz" \
	-wl "${datPath}/d101_barcodes.csv" \
	-ref "${datPath}/citeSeq_codes.csv" \
	-tol 1 -mis 0.95 -proc 32 -awl False \
	-ass CITE -umi True \
	-out catLanes.out > proc.catLanes.out


# Lastly, umiDrop testing:

In [None]:
%%bash
# Concatenated-lane run again, this time with -umi False.
# Reads come from benchmarking/sizeTests; the whitelist and reference come
# from ex_inputs. stdout is redirected to procASAP_umiDrop.out.

datPath=benchmarking/sizeTests

asap_args=(
	-cr "${datPath}/megaR1.fastq.gz"
	-br "${datPath}/megaR2.fastq.gz"
	-wl ex_inputs/d101_barcodes.csv
	-ref ex_inputs/citeSeq_codes.csv
	-tol 1 -mis 0.95 -proc 32 -ass CITE
	-umi False -awl False
	-out proc_ASAPtest_umiDrop.csv
)

python asap_process.py "${asap_args[@]}" > procASAP_umiDrop.out

# 