# Find the fastest way to get information from depth files

In [6]:
from lib import variable_alignment

class Args:
    """Hard-coded stand-in for the command-line arguments.

    Provides the attributes that are handed to ``variable_alignment.mep`` so
    the library can be exercised from the notebook without going through the
    command line.
    NOTE(review): the attribute meanings below are inferred from their names
    and values; confirm them against the argument parser of get_alignment.py.
    """

    def __init__(self):
        # Input list (the same file name is passed via -i to get_alignment.py).
        self.input = '100_genomes.txt'
        # Presumably a gap-fraction threshold -- TODO confirm.
        self.gaps = 0.9
        # Presumably the minimum read depth for a position -- TODO confirm.
        self.mindepth = 5
        # Resource files shipped with the project.
        self.repeats = 'resources/regions_blindspots_modlin_farhat.bed'
        self.drug = 'resources/20160911_DR_filter_pos_reseqTB.txt'
        # Empty string: presumably "no outgroup" -- TODO confirm.
        self.outg = ''
        # Presumably the file-name suffix of the per-genome VCF files -- TODO confirm.
        self.suffix = "mutect2.filtered.homo.snps.vcf"
        self.threads = 10
        self.output_prefix = 'test'


# Instantiate under a different name than the class, so that the class is not
# shadowed by its own instance and stays available for re-instantiation.
args = Args()

# Build the `mep` object from the argument container; it is passed to all
# later steps (add_SNPs, test_depth_traversal).
# NOTE(review): what `mep` holds is defined in lib/variable_alignment.py -- not visible here.
mep = variable_alignment.mep(args)


Get variable positions

In [7]:
# Create the container for variable positions and add the SNPs via `mep`.
# NOTE(review): add_SNPs presumably fills `.variants` in place -- confirm in lib/variable_alignment.py.
variable_positions = variable_alignment.variable_positions()
variable_positions.add_SNPs(mep)

# Report how many variable positions were collected.
print('Number of variable positions', len(variable_positions.variants))

Number of variable positions 20897


Now traverse depth files. Try different approaches to make this fast and efficient.

In [8]:
import time

# Object whose test_depth_traversal() method is benchmarked in the cells below.
missing_positions = variable_alignment.missing_positions()


First, simply traverse the whole files in a loop.

In [None]:
# Benchmark: traverse the whole depth files (method='full').
# WARNING: this cell was interrupted because it did not finish for 10 genomes
# within 20 minutes -- skip it when re-running the notebook.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() can jump when the system clock is adjusted.
start_traversal = time.perf_counter()
missing_positions.test_depth_traversal(mep, variable_positions, method='full')
end_traversal = time.perf_counter()
print('Time full traversal:', end_traversal - start_traversal)

Interrupted: the full traversal did not finish for 10 genomes within 20 minutes.

Instead of traversing the whole file, use random access to variable positions.

In [16]:
# Benchmark: random access to the variable positions only (method='random')
# instead of reading the depth files from start to end.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() can jump when the system clock is adjusted.
start_traversal = time.perf_counter()
missing_positions.test_depth_traversal(mep, variable_positions, method='random')
end_traversal = time.perf_counter()
print('Time traversal with random access:', end_traversal - start_traversal)

Time traversal with random access: 1.3417222499847412


# Test get_alignment.py

1 thread vs 10.

In [None]:
%%bash
# Run get_alignment.py on the 100-genome list with -t 1
# (presumably a single thread -- confirm against the script's argument parser).
python get_alignment.py -i 100_genomes.txt -o 100_genomes -t 1

In [3]:
%%bash 
# Same input, but without -t, so the script's default thread count is used
# (presumably 10, per the "1 thread vs 10" comparison -- confirm in get_alignment.py).
python get_alignment.py -i 100_genomes.txt -o 100_genomes.snp_aln

Getting variable positions
Added variable positions in 0.182604 seconds
Adding missing positions
Added missing positions in 9.474384 seconds
Total time: 21.201650 seconds


In [12]:
%%bash
# 1 core: submit example_t1.slrm with the 1k-genome list to the batch scheduler.
# Its output is inspected below via stdout_t1.o.
sbatch example_t1.slrm 1k_genomes.txt

Submitted batch job 7662423


In [3]:
%%bash
# Show the log of the 1-core job (run only after the batch job has finished).
cat stdout_t1.o

Getting variable positions
Added variable positions in 49.980899 seconds
Adding missing positions
Processing chunks: 100.00%
Added missing positions in 36.363978 seconds
Total time: 86.537455 seconds


In [13]:
%%bash
# 20 cores: submit example_t20.slrm with the same 1k-genome list for comparison.
# Its output is inspected below via stdout_t20.o.
sbatch example_t20.slrm 1k_genomes.txt

Submitted batch job 7662426


In [4]:
%%bash
# Show the log of the 20-core job (run only after the batch job has finished).
cat stdout_t20.o

Getting variable positions
Added variable positions in 32.458854 seconds
Adding missing positions
Processing chunks: 100.00%
Added missing positions in 36.369771 seconds
Total time: 68.973356 seconds


-> Parallelization is difficult to implement efficiently on the queuing system!

Instead, traverse the depth files in a simple loop.

# Test and optimize non-parallel version

## Profiling

In [6]:
%%bash

# Run get_alignment.py under cProfile and write the raw profile to
# get_alignment.pstats, which is loaded with pstats in the next cell.
python -m cProfile -o get_alignment.pstats get_alignment.py  -i 100_genomes.txt -o 100_genomes.snp_aln


Getting variable positions
Added variable positions in 0.571934 seconds
Adding missing positions
Added missing positions in 18.230969 seconds
Total time: 77.549257 seconds


In [7]:
import pstats

# Load the profile written by the cProfile run above and rank the entries by
# cumulative time (time spent in a function including everything it calls).
p = pstats.Stats('get_alignment.pstats')
p.sort_stats('cumulative')
# Only show the 20 most expensive entries: the full table is very long and
# buries the relevant lines. The trailing semicolon keeps the notebook from
# echoing the Stats object that print_stats() returns.
p.print_stats(20);

Fri Oct  4 10:00:07 2024    get_alignment.pstats

         173587839 function calls (173573153 primitive calls) in 78.407 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    555/1    0.003    0.000   78.408   78.408 {built-in method builtins.exec}
        1    0.012    0.012   78.408   78.408 get_alignment.py:1(<module>)
        1    0.004    0.004   77.549   77.549 get_alignment.py:82(main)
        1    3.177    3.177   57.898   57.898 /scicore/home/gagneux/stritt0001/github/large_variable_alignment/lib/variable_alignment.py:227(get_seqs)
  2640948    8.144    0.000   49.809    0.000 /scicore/home/gagneux/stritt0001/miniconda3/lib/python3.12/site-packages/pandas/core/indexing.py:1176(__getitem__)
        1    0.004    0.004   18.231   18.231 /scicore/home/gagneux/stritt0001/github/large_variable_alignment/lib/variable_alignment.py:194(traverse_depth_files)
      101    2.234    0.022   18.019    0.178 /scicore/home/gagne

<pstats.Stats at 0x7fa5c6706a50>

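The slowest part is the `get_seqs` function (57.9 s of 78.4 s cumulative time). Most of that time is spent in about 2.6 million pandas `__getitem__` indexing calls (49.8 s cumulative), since many values have to be accessed.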

In [7]:
%%bash
# Same command as in the profiling cell, but without cProfile, so the reported
# times do not include the profiler.
python get_alignment.py  -i 100_genomes.txt -o 100_genomes.snp_aln

Getting variable positions
Added variable positions in 0.270303 seconds
Adding missing positions
Added missing positions in 9.843987 seconds
Total time: 11.100147 seconds


In [8]:
%%bash 
# Submit the batch job defined in run26k_strains.slrm
# (presumably the run on the 26k-strain data set -- confirm in the script).
sbatch run26k_strains.slrm

Submitted batch job 8040839
