In [1]:
import subprocess, time
import pandas as pd
import numpy as np
import regex
import pysam
import csv
from Bio.Seq import Seq
from Bio import SeqIO
from os import listdir
from os.path import exists
from Bio import SeqRecord
import scipy

In [2]:
# written by Peter Culviner, PhD to enable command-line access through Jupyter
def quickshell(command, print_output=True, output_path=None, return_output=False, check=False):
    """Run ``command`` through the shell and capture its stdout and stderr.

    Parameters
    ----------
    command : str
        Command line handed to the shell (``shell=True``), so shell syntax
        such as ``>`` redirection is honoured.
    print_output : bool, default True
        Print the command followed by the captured stdout/stderr report.
    output_path : str or None, default None
        If given, the stdout/stderr report is written to this file
        (overwriting it).
    return_output : bool, default False
        If True, return ``(stdout, stderr)`` as decoded strings.
    check : bool, default False
        If True, raise ``subprocess.CalledProcessError`` when the command
        exits with a non-zero status.

    Returns
    -------
    tuple of (str, str) or None
        ``(stdout, stderr)`` when ``return_output`` is True, otherwise None.

    Raises
    ------
    subprocess.CalledProcessError
        Only when ``check`` is True and the exit status is non-zero.
    """
    process_output = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # errors='replace' keeps a stray non-UTF-8 byte in the captured output
    # from raising UnicodeDecodeError and hiding the real result.
    stdout = process_output.stdout.decode('utf-8', errors='replace')
    stderr = process_output.stderr.decode('utf-8', errors='replace')
    output_string = f'STDOUT:\n{stdout}\nSTDERR:\n{stderr}\n'
    failed = process_output.returncode != 0
    if failed:
        # Always surface a failure, even when the caller asked for quiet
        # output; otherwise a broken pipeline step goes unnoticed.
        print(f'WARNING: command exited with status {process_output.returncode}')
    if print_output or failed:
        print('$ ' + command)
        print(output_string)
    if output_path is not None:
        with open(output_path, 'w') as f:
            f.write(output_string)
    if check and failed:
        raise subprocess.CalledProcessError(
            process_output.returncode, command, output=stdout, stderr=stderr
        )
    if return_output:
        return stdout, stderr

In [4]:
# Threads (CPU x2 is max)
threads = 18

# initializing locations of input .csv files, etc.
# Select the dataset to process by leaving exactly one main_path assignment
# active; the alternatives stay commented out so no value is silently overwritten.
#main_path = '5enrich_CRP'
#main_path = '3enrich_NusAG'
main_path = 'gDNA'

# Root of the read-preparation files for the selected dataset.
readPrep_dir = f'{main_path}/readPrep'

# Pre-processing alignments prior to enriched end calling
# initial_alignments_dir holds the input and intermediate BAMs;
# dedup_alignments_dir receives the final sorted, de-duplicated BAMs.
initial_alignments_dir = f'{readPrep_dir}/initial_alignments'
dedup_alignments_dir = f'{readPrep_dir}/dedup_alignments'

In [None]:
# Samples to process, one name per entry.
sample_list = [
    'NusA_NusG2',
]

# Split a BAM file that is too large for umi-tools to de-duplicate

In [None]:
# Split bams
# Each sample's sorted_<sample>.bam is divided into two files by Picard,
# written into the same directory with the prefix sorted_<sample>.
# NOTE(review): splitting by read count may place the two mates of a pair in
# different files - confirm this is acceptable for the later
# `umi_tools dedup --paired` step.
for sample in sample_list:

    split_command = ' '.join([
        'picard SplitSamByNumberOfReads',
        f'-I {initial_alignments_dir}/sorted_{sample}.bam',
        f'-O {initial_alignments_dir}',
        f'--OUT_PREFIX sorted_{sample}',
        '-N_FILES 2',
    ])

    # With return_output=False quickshell returns None; the command runs for
    # its side effect of writing the split files.
    split = quickshell(split_command, print_output=False, return_output=False)

    print(f'Split initial bams: {sample} done')

In [None]:
# Sort and index split bams
# The two split files sorted_<sample>_0001.bam / _0002.bam are each sorted
# into sorted_<sample>1.bam / sorted_<sample>2.bam and then indexed.
for sample in sample_list:

    # Sort both parts before indexing so the progress messages appear in the
    # same order as the work is done.
    for part in (1, 2):
        sort_command = f'samtools sort -O BAM {initial_alignments_dir}/sorted_{sample}_{part:04d}.bam > ' + \
            f'{initial_alignments_dir}/sorted_{sample}{part}.bam'

        quickshell(sort_command, print_output = False, return_output = False)

    print(f'Sort split bams: {sample} done')

    for part in (1, 2):
        index_command = f'samtools index {initial_alignments_dir}/sorted_{sample}{part}.bam'

        quickshell(index_command, print_output = False, return_output = False)

    print(f'Index split bams: {sample} done')

# De-duplicate each split bam file separately

In [None]:
# Dedup split bams
# umi_tools writes its --output-stats files under dedup_logs/; create the
# directory up front so the run does not fail when it is missing.
dedup_logs_dir = f'{initial_alignments_dir}/dedup_logs'
quickshell(f'mkdir -p {dedup_logs_dir}', print_output = False)

for sample in sample_list:

    # Each half of the split sample is de-duplicated independently.
    # --no-sort-output leaves the result unsorted.
    for part in (1, 2):
        dedup_command = f'umi_tools dedup -I {initial_alignments_dir}/sorted_{sample}{part}.bam '  + \
            f'--paired --output-stats={dedup_logs_dir}/sorted_{sample}{part} --no-sort-output ' + \
            f'-S {initial_alignments_dir}/{sample}{part}_dedup.bam'

        quickshell(dedup_command, print_output = False, return_output = False)

    print(f'Dedup split bams: {sample} done')

In [None]:
# Sort and index split dedup bams
# The de-duplicated halves <sample>1_dedup.bam / <sample>2_dedup.bam are each
# sorted into sorted_<sample><n>_dedup.bam and then indexed.
for sample in sample_list:

    # Sort both parts before indexing so the progress messages appear in the
    # same order as the work is done.
    for part in (1, 2):
        sort_command = f'samtools sort -O BAM {initial_alignments_dir}/{sample}{part}_dedup.bam > ' + \
            f'{initial_alignments_dir}/sorted_{sample}{part}_dedup.bam'

        quickshell(sort_command, print_output = False, return_output = False)

    # Message now matches the matching "Index split dedup bams" message below.
    print(f'Sort split dedup bams: {sample} done')

    for part in (1, 2):
        index_command = f'samtools index {initial_alignments_dir}/sorted_{sample}{part}_dedup.bam'

        quickshell(index_command, print_output = False, return_output = False)

    print(f'Index split dedup bams: {sample} done')

# Merge the de-duplicated bams

In [None]:
# Merge dedup bams
# Recombine the two sorted, de-duplicated halves of each sample into a single
# <sample>_dedup.bam with Picard.
for sample in sample_list:

    merge_command = ' '.join([
        'picard MergeSamFiles',
        f'-I {initial_alignments_dir}/sorted_{sample}1_dedup.bam',
        f'-I {initial_alignments_dir}/sorted_{sample}2_dedup.bam',
        f'-O {initial_alignments_dir}/{sample}_dedup.bam',
    ])

    # With return_output=False quickshell returns None; the command runs for
    # its side effect of writing the merged file.
    merge = quickshell(merge_command, print_output=False, return_output=False)

    print(f'Merge dedup bams: {sample} done')

In [None]:
# Sort and index the merged, de-duplicated bams
# The sorted output is redirected into dedup_alignments_dir; create it first,
# because the shell redirect fails if the directory does not exist.
quickshell(f'mkdir -p {dedup_alignments_dir}', print_output = False)

for sample in sample_list:

    sort_command = f'samtools sort -O BAM {initial_alignments_dir}/{sample}_dedup.bam > ' + \
        f'{dedup_alignments_dir}/sorted_{sample}_dedup.bam'

    quickshell(sort_command, print_output = False, return_output = False)

    print(f'Sort merged dedup bam: {sample} done')

    index_command = f'samtools index {dedup_alignments_dir}/sorted_{sample}_dedup.bam'

    quickshell(index_command, print_output = False, return_output = False)

    # Message now matches the "Sort merged dedup bam" message above.
    print(f'Index merged dedup bam: {sample} done')