# Vector Subclassification 2.1

Steps for use:
1. enter the input directory (location of the tiling output)
2. enter the barcodes to run as a list
3. enter one payload size for each barcode, in the same order as the barcodes
4. enter the output directory

Output format:
```
output_directory (from step 4 above)
    |-barcode1
        |-subparsed.tsv
        |-subparsed.pdf
        |-categories
            |-one counts file for each category found
    |-barcode2
        |-subparsed.tsv
        |-subparsed.pdf
        |-categories
            |-one counts file for each category found
    |-barcode3... etc
```


# Subparsing tiling data, results are grouped

In [10]:
%%bash
export PATH=/opt/oxb/ngs/binaries/python-3.10.12/bin:$PATH

# Runs the vector subparser (parse_file.py) once per barcode on that barcode's
# tiling counts file. This cell passes "-s -g five" to every run.

# Directory that holds the <barcode>.tile.zmw.counts files
input_directory=../../Inputs/OXB_Data/tiling/
barcodes=(
    bc1012
    bc1020
)
# One payload size per barcode, in the same order as the barcodes array
payload_sizes=(
    1831
    1872
)
# Put the desired output directory into this variable
# NOTE: a trailing "/" is added automatically below if it is missing
output_directory="grouped/"
# Optional subparser arguments: add any further flags to this array
optional_args=(-s -g five)

# Do Not Modify anything below here
vector_subparser="../../../CodeFiles/parse_file.py"

# Both directories are joined to file names by plain concatenation, so make
# sure each one ends with exactly one "/".
input_directory="${input_directory%/}/"
output_directory="${output_directory%/}/"

# payload_sizes is indexed by barcode position; refuse to run on mismatched lists.
if (( ${#barcodes[@]} != ${#payload_sizes[@]} )); then
    printf 'error: %d barcodes but %d payload sizes; enter one payload size per barcode\n' \
        "${#barcodes[@]}" "${#payload_sizes[@]}" >&2
    exit 1
fi

for i in "${!barcodes[@]}"; do
    input_file="${input_directory}${barcodes[$i]}.tile.zmw.counts"
    if python3 "$vector_subparser" -i "$input_file" -o "$output_directory" -p "${payload_sizes[$i]}" "${optional_args[@]}"; then
        # Values are passed as printf arguments so a "%" or "\" in a path is printed literally
        printf 'succesfully run %s with payload size %s\n' "$input_file" "${payload_sizes[$i]}"
    else
        # Name the failing input so a failure can be traced to its barcode
        printf 'failure in vector subparsing for %s\n' "$input_file" >&2
    fi
done

succesfully run ../../Inputs/OXB_Data/tiling/bc1012.tile.zmw.counts with payload size 1831
succesfully run ../../Inputs/OXB_Data/tiling/bc1020.tile.zmw.counts with payload size 1872


# Subparsing tiling data, results are not grouped

In [9]:
%%bash
export PATH=/opt/oxb/ngs/binaries/python-3.10.12/bin:$PATH

# Runs the vector subparser (parse_file.py) once per barcode on that barcode's
# tiling counts file. This cell passes only "-s" to every run.

# Directory that holds the <barcode>.tile.zmw.counts files
input_directory=../../Inputs/OXB_Data/tiling/
barcodes=(
    bc1012
    bc1020
)
# One payload size per barcode, in the same order as the barcodes array
payload_sizes=(
    1831
    1872
)
# Put the desired output directory into this variable
# NOTE: a trailing "/" is added automatically below if it is missing
output_directory="ungrouped/"
# Optional subparser arguments: add any further flags to this array
optional_args=(-s)

# Do Not Modify anything below here
vector_subparser="../../../CodeFiles/parse_file.py"

# Both directories are joined to file names by plain concatenation, so make
# sure each one ends with exactly one "/".
input_directory="${input_directory%/}/"
output_directory="${output_directory%/}/"

# payload_sizes is indexed by barcode position; refuse to run on mismatched lists.
if (( ${#barcodes[@]} != ${#payload_sizes[@]} )); then
    printf 'error: %d barcodes but %d payload sizes; enter one payload size per barcode\n' \
        "${#barcodes[@]}" "${#payload_sizes[@]}" >&2
    exit 1
fi

for i in "${!barcodes[@]}"; do
    input_file="${input_directory}${barcodes[$i]}.tile.zmw.counts"
    if python3 "$vector_subparser" -i "$input_file" -o "$output_directory" -p "${payload_sizes[$i]}" "${optional_args[@]}"; then
        # Values are passed as printf arguments so a "%" or "\" in a path is printed literally
        printf 'succesfully run %s with payload size %s\n' "$input_file" "${payload_sizes[$i]}"
    else
        # Name the failing input so a failure can be traced to its barcode
        printf 'failure in vector subparsing for %s\n' "$input_file" >&2
    fi
done

succesfully run ../../Inputs/OXB_Data/tiling/bc1012.tile.zmw.counts with payload size 1831
succesfully run ../../Inputs/OXB_Data/tiling/bc1020.tile.zmw.counts with payload size 1872


# Calculates the patterns with at least one non-VG (noncanonical) tile in the tile files; in other words, sequences containing an alignment to any database sequence used in tiling other than the GOI sequence

In [49]:
%%bash
export PATH=/opt/oxb/ngs/binaries/python-3.10.12/bin:$PATH

# This code was used to calculate the number of noncanonical sequences in each sample. The subparser by default ignores
# noncanonical sequences, excluding them from all counts and proportions. To include them, use the "-n" flag in the command.
# The subparser will currently classify all of these sequences as "other".
# If you wish to further classify them, the subparser's grammar needs to be extended.

# Directory that holds the <barcode>.tile.zmw.counts files.
# It must be set: input_file below is built from it.
input_directory=../../Inputs/OXB_Data/tiling/
barcodes=(
    bc1012
    bc1020
)
# One payload size per barcode, in the same order as the barcodes array
payload_sizes=(
    1831
    1872
)
# Put the desired output directory into this variable
# NOTE: a trailing "/" is added automatically below if it is missing
output_directory="noncanon_ungrouped/"
# Directory with the subparsing results produced WITHOUT "-n"; the second loop
# reads <canon_directory><barcode>/<barcode>.subparsed.tsv from it.
canon_directory="ungrouped/"
# Optional subparser arguments: add any further flags to this array
optional_args=(-s -n)

# Do Not Modify anything below here
vector_subparser="../../../CodeFiles/parse_file.py"

# Directories are joined to file names by plain concatenation, so make sure
# each one ends with exactly one "/".
input_directory="${input_directory%/}/"
output_directory="${output_directory%/}/"
canon_directory="${canon_directory%/}/"

# payload_sizes is indexed by barcode position; refuse to run on mismatched lists.
if (( ${#barcodes[@]} != ${#payload_sizes[@]} )); then
    printf 'error: %d barcodes but %d payload sizes; enter one payload size per barcode\n' \
        "${#barcodes[@]}" "${#payload_sizes[@]}" >&2
    exit 1
fi

for i in "${!barcodes[@]}"; do
    input_file="${input_directory}${barcodes[$i]}.tile.zmw.counts"
    if python3 "$vector_subparser" -i "$input_file" -o "$output_directory" -p "${payload_sizes[$i]}" "${optional_args[@]}"; then
        # Values are passed as printf arguments so a "%" or "\" in a path is printed literally
        printf 'succesfully run %s with payload size %s\n' "$input_file" "${payload_sizes[$i]}"
    else
        # Name the failing input so a failure can be traced to its barcode
        printf 'failure in vector subparsing for %s\n' "$input_file" >&2
    fi
done

# Calculating the number of noncanonical sequences in each barcode.
# Noncanonical sequences are classified as 'other', so subtracting the count of sequences classified as 'other'
# in the subparsing results without noncanonical sequences from the count in the results with them gives the
# number of noncanonical sequences.
for barcode in "${barcodes[@]}"; do
    canon_tsv="${canon_directory}${barcode}/${barcode}.subparsed.tsv"
    noncanon_tsv="${output_directory}${barcode}/${barcode}.subparsed.tsv"
    if [[ ! -r "$canon_tsv" || ! -r "$noncanon_tsv" ]]; then
        printf 'skipping %s: need both %s and %s\n' "$barcode" "$canon_tsv" "$noncanon_tsv" >&2
        continue
    fi

    # Second whitespace-separated field of the first line containing "other"
    canons_only_other_sequence_count=$(grep -m 1 'other' "$canon_tsv" | awk '{print $2}')
    echo "count of sequences classified as other without noncanon patterns: $canons_only_other_sequence_count"
    noncanons_other_sequence_count=$(grep -m 1 'other' "$noncanon_tsv" | awk '{print $2}')
    echo "count of sequences classified as other with noncanon patterns: $noncanons_other_sequence_count"

    # awk does the subtraction because the counts are printed as decimals (e.g. 158.0);
    # the values go in through -v so they are treated as data, not as awk program text.
    noncanon_seqs=$(awk -v with_noncanon="$noncanons_other_sequence_count" \
                        -v canon_only="$canons_only_other_sequence_count" \
                        'BEGIN {print with_noncanon - canon_only}')
    echo "total noncanon sequences in $barcode: $noncanon_seqs"
done

count of sequences classified as other without noncanon patterns: 158.0
count of sequences classified as other with noncanon patterns: 1934.0
total noncanon sequences in bc1012: 1776
count of sequences classified as other without noncanon patterns: 298.0
count of sequences classified as other with noncanon patterns: 4627.0
total noncanon sequences in bc1020: 4329


# Calculates the non-tiled sequences as (total reads in the fasta file) minus (total tiled sequences). The fasta files are not included, but this is the method that was used

In [15]:
%%bash
# Calculates the untiled sequences in each sample
# fasta_files=("demultiplex.bc1012--bc1012.fasta"
#              "demultiplex.bc1020--bc1020.fasta")
counts_files=("../../Inputs/OXB_Data/tiling/bc1012.tile.zmw.counts"
              "../../Inputs/OXB_Data/tiling/bc1020.tile.zmw.counts")

# Calculate sum sequences in each fasta file; files aren't included but this is how we got the numbers: counting reads in ccs demux files from the Sequel II
# read_counts=()
# for file in "${fasta_files[@]}"; do
#     echo "Total reads $file"
#     read_count=$(grep '^>' "$file" | cut -f2 -d/ | sort -u | wc -l)
#     echo $read_count
#     read_counts+=($read_count)
# done

# fasta read counts since fasta files aren't included
# (one entry per counts file, in the same order as counts_files)
read_counts=(220321 137934)

# Prints the sum of the first whitespace-separated field over all lines of
# the file given as $1 (0 for an empty file). Fails if awk cannot read it.
sum_patterns() {
    awk 'BEGIN {sum=0} {sum += $1} END {print sum+0}' "$1"
}

# read_counts is indexed by counts-file position; refuse to run on mismatched lists.
if (( ${#read_counts[@]} != ${#counts_files[@]} )); then
    printf 'error: %d read counts but %d counts files; enter one read count per counts file\n' \
        "${#read_counts[@]}" "${#counts_files[@]}" >&2
    exit 1
fi

# Get totals of tiled sequences in each counts file
pattern_sums=()
for file in "${counts_files[@]}"; do
    echo "Total patterns $file"
    if pattern_sum=$(sum_patterns "$file"); then
        echo "$pattern_sum"
    else
        printf 'could not sum patterns in %s\n' "$file" >&2
        pattern_sum=""
    fi
    # Quoted append keeps one slot per counts file even when the sum is empty,
    # so indices stay aligned with read_counts.
    pattern_sums+=("$pattern_sum")
done

# Get differences: untiled sequence count = read count - tiled sequences
for i in "${!counts_files[@]}"; do
    # The fasta file names are commented out above, so label by counts file instead
    echo "untiled sequences from ${counts_files[i]}"
    if [[ -z "${pattern_sums[i]}" ]]; then
        printf 'no pattern total available for %s\n' "${counts_files[i]}" >&2
        continue
    fi
    echo "$(( read_counts[i] - pattern_sums[i] ))"
done

Total patterns ../../Inputs/OXB_Data/tiling/bc1012.tile.zmw.counts
219452
Total patterns ../../Inputs/OXB_Data/tiling/bc1020.tile.zmw.counts
137115
untiled sequences from 
869
untiled sequences from 
819


# Unit Testing: Run to test the code if making modifications

In [50]:
%%bash

# Prepend the Python 3.10.12 bin directory so its python3 is found first
export PATH="/opt/oxb/ngs/binaries/python-3.10.12/bin:${PATH}"

# Unit-test script for the vector subparser
test_code="../../../CodeFiles/test_vector_subparser.py"

# Run the test script, forwarding -v to it
python3 "${test_code}" -v

test_FileParser_constructor (__main__.TestFileParser) ... ok
test_bin_tilelines (__main__.TestFileParser) ... ok
test_calculate_bin_proportions (__main__.TestFileParser) ... ok
test_extended_payload_name (__main__.TestFileParser) ... ok
test_group_categories (__main__.TestFileParser) ... ok
test_process_U_line (__main__.TestFileParser) ... ok
test_process_line (__main__.TestFileParser) ... ok
test_process_x_2_line (__main__.TestFileParser) ... ok
test_Tile_constructor (__main__.TestTile) ... ok
test_Tile_payload_sizing (__main__.TestTile) ... ok
test_compare_tiles (__main__.TestTile) ... ok
test_coordinates_are_equal (__main__.TestTile) ... ok
test_equality (__main__.TestTile) ... ok
test_set_is_full (__main__.TestTile) ... ok
test_TileLineBin_constructor (__main__.TestTileBin) ... ok
test_calculate_full_proportions (__main__.TestTileBin) ... ok
test_Tileline_constructor (__main__.TestTileLine) ... ok
test_check_full_payload (__main__.TestTileLine) ... ok
test_check_linearity (__main__