In [1]:
import subprocess, time
import pandas as pd
import numpy as np
import regex
import pysam
import csv
from Bio.Seq import Seq
from Bio import SeqIO
from os import listdir
import os
from os.path import exists
from Bio import SeqRecord
import scipy
import re

In [2]:
# written by Peter Culviner, PhD to enable command-line access through Jupyter
def quickshell(command, print_output=True, output_path=None, return_output=False):
    """Run ``command`` through the shell and collect what it prints.

    Parameters
    ----------
    command : str
        Command line handed to the shell (``shell=True``).
    print_output : bool
        When True, echo the command followed by a report of the captured
        stdout and stderr.
    output_path : str or None
        When given, the same report is written to this file, replacing any
        previous content.
    return_output : bool
        When True, return ``(stdout, stderr)`` as UTF-8 decoded strings;
        otherwise nothing is returned.

    The exit status of the command is not inspected, so a failing command
    does not raise here.
    """
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout, stderr = (stream.decode('utf-8')
                      for stream in (completed.stdout, completed.stderr))
    report = 'STDOUT:\n' + stdout + '\nSTDERR:\n' + stderr + '\n'

    if print_output:
        print('$ ' + command)
        print(report)

    if output_path is not None:
        with open(output_path, 'w') as log_file:
            log_file.write(report)

    return (stdout, stderr) if return_output else None

In [3]:
# Directory layout for the analysis; every path below is relative to main_path.
main_path = 'inCellulo'
# Reference genome location (the alignment step reads Eco_Mtb_genome.fasta from here)
genome_dir = f'{main_path}/genomes'

# FastQC reports
fastqc_dir = f'{main_path}/fastqc'

# Cortes 5' end (TSS) data: raw, cutadapt-processed, and ribodepleted fastq files
cortes_TSS_dir = f'{main_path}/cortes_TSS'
cortes_TSS_raw_fastq = f'{cortes_TSS_dir}/fastq'
cortes_TSS_processed_fastq = f'{cortes_TSS_dir}/processed_fastq'
cortes_ribodetect = f'{cortes_TSS_dir}/ribodetect_fastq'

# Arnvig 3' end (TTS) data: raw, cutadapt-processed, and ribodepleted fastq files
arnvig_TTS_dir = f'{main_path}/arnvig_TTS'
arnvig_TTS_raw_fastq = f'{arnvig_TTS_dir}/fastq'
arnvig_TTS_processed_fastq = f'{arnvig_TTS_dir}/processed_fastq'
arnvig_ribodetect = f'{arnvig_TTS_dir}/ribodetect_fastq'

# Outputs of the alignment, downsampling, trimming and read-counting steps
alignments = f'{main_path}/inCellulo_alignments'
downsampled = f'{main_path}/downsampled_alignments'
trimmed = f'{main_path}/trimmed_alignments'
summarizeOverlaps = f'{main_path}/summarized_overlaps'

In [None]:
!mkdir $fastqc_dir
!mkdir $cortes_TSS_dir
!mkdir $cortes_TSS_raw_fastq
!mkdir $cortes_TSS_processed_fastq
!mkdir $cortes_ribodetect
!mkdir $arnvig_TTS_dir
!mkdir $arnvig_TTS_raw_fastq
!mkdir $arnvig_TTS_processed_fastq
!mkdir $arnvig_ribodetect
!mkdir $alignments
!mkdir $downsampled
!mkdir $trimmed
!mkdir $summarizeOverlaps

In [None]:
# Root directories of the CFG 5' end and 3' end experiments; the downsampling
# cells read their BAM files and read-count tables from <root>/readPrep/R2_alignments.
CFG_5end_path = '5enrich_CRP'
CFG_3end_path = '3enrich_NusAG'

In [4]:
# Number of threads passed to fastqc (-t), cutadapt (-j) and bwa (-t)
threads = 18
# cutadapt --minimum-length: reads shorter than this after processing are discarded
minimum_insert_length = 10
# cutadapt -q quality cutoff
quality_threshold = 20

# Fetch raw fastq files from ArrayExpress

In [None]:
# Run accessions of the three 5' end fastq files; each is expanded to its FTP URL
# (<first six characters>/<accession>/<accession>.fastq.gz).
accession_number_list_5end = [
    f'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{accession[:6]}/{accession}/{accession}.fastq.gz'
    for accession in ('ERR262985', 'ERR262979', 'ERR262986')
]

# Download every file into the raw fastq directory (wget output is captured, not shown).
# NOTE(review): wget keeps the ERR*.fastq.gz names, while later cells read
# Exp_R1_TSS / Exp_R2_TSS / Exp_R3_TSS .fastq.gz - presumably the files are
# renamed by hand after download; confirm the accession-to-replicate mapping.
for sample in accession_number_list_5end:
    fetch_5end_command = f'wget -P {cortes_TSS_raw_fastq}/ {sample}'
    fetch_5end_fastq = quickshell(fetch_5end_command,
                                  print_output=False,
                                  return_output=False)

In [None]:
# Run accessions of the three 3' end fastq files; each is expanded to its FTP URL
# (<first six characters>/00<last digit>/<accession>/<accession>.fastq.gz).
accession_number_list_3end = [
    f'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{accession[:6]}/00{accession[-1]}/{accession}/{accession}.fastq.gz'
    for accession in ('ERR9735888', 'ERR9735889', 'ERR9735890')
]

# Download every file into the raw fastq directory (wget output is captured, not shown).
# NOTE(review): wget keeps the ERR*.fastq.gz names, while later cells read
# termseq_expo_r1 / r2 / r3 .fastq.gz - presumably the files are renamed by
# hand after download; confirm the accession-to-replicate mapping.
for sample in accession_number_list_3end:
    fetch_3end_command = f'wget -P {arnvig_TTS_raw_fastq}/ {sample}'
    fetch_3end_fastq = quickshell(fetch_3end_command,
                                  print_output=False,
                                  return_output=False)

# Run fastqc to get an overview of Cortes and Arnvig fastq files

In [None]:
# Raw fastq files: three Cortes 5' end replicates followed by three Arnvig 3' end replicates
fastq_list = (
    [f'{cortes_TSS_raw_fastq}/Exp_R{replicate}_TSS.fastq.gz' for replicate in (1, 2, 3)]
    + [f'{arnvig_TTS_raw_fastq}/termseq_expo_r{replicate}.fastq.gz' for replicate in (1, 2, 3)]
)

# Run fastqc on one file at a time, writing the reports to fastqc_dir
for fq in fastq_list:
    command1 = f'fastqc {fq} -o {fastqc_dir} -t {threads}'
    quickshell(command1, print_output=True, return_output=False)
    print(f'fastqc: {fq} done')

# Cortes: quality filter only

fastqc showed no adapter contamination in Cortes, probably because the read length was shorter (50bp).

In [11]:
# list of reports for storage
dataframe_list = []

# number of reads cutadapt reported as too short, one entry per sample
n_too_short = []

# cutadapt's stdout for every sample is collected in this log file
trimming_log = f'{cortes_TSS_processed_fastq}/split_trim_log.txt'

sample_names = ['Exp_R1_TSS',
                'Exp_R2_TSS',
                'Exp_R3_TSS']

with open(trimming_log, 'w') as f:
    for sample_name in sample_names:
        # prepare input and output titles
        cutadapt_inputs = [
            f'{cortes_TSS_raw_fastq}/{sample_name}.fastq.gz']
        cutadapt_outputs = [
            f'{cortes_TSS_processed_fastq}/{sample_name}.processed.fastq.gz']
        # prepare cutadapt command (quality trimming and length filter only, no adapter given)
        command = f'cutadapt --overlap=1 --minimum-length={minimum_insert_length} -q {quality_threshold} ' + \
                  f'-j {threads} -o {cutadapt_outputs[0]} ' + \
                  f'{cutadapt_inputs[0]}'
        # run cutadapt
        output_trim, stderr_trim = quickshell(command, print_output=True, return_output=True)
        # write the full output to the log first so it is kept even if parsing fails
        f.write(output_trim)
        # parse the output to pull the number of reads below minimum_insert_length;
        # raw string so that \s and \d reach the regex engine unchanged
        too_short_match = regex.search(r'too short:\s*([\d,]+)', output_trim)
        if too_short_match is None:
            raise RuntimeError(
                f'No "too short" count found in the cutadapt output for {sample_name}; '
                f'stderr was:\n{stderr_trim}')
        too_short = int(too_short_match.group(1).replace(',', ''))
        n_too_short.append(too_short)

# Arnvig TTS: quality filter and adapter trim

fastqc showed overrepresentation of Illumina Universal Adapter at 3' end, especially in replicate 1.

In [12]:
# list of reports for storage
dataframe_list = []

# number of reads cutadapt reported as too short, one entry per sample
n_too_short = []

# cutadapt's stdout for every sample is collected in this log file.
# It lives in the Arnvig directory: writing it to the Cortes directory would
# overwrite the Cortes trimming log, since the file is opened in 'w' mode.
trimming_log = f'{arnvig_TTS_processed_fastq}/split_trim_log.txt'

sample_names = ['termseq_expo_r1',
                'termseq_expo_r2',
                'termseq_expo_r3']

# adapter sequence removed from the 3' end of the reads (cutadapt -a)
universal_adapter = 'AGATCGGAAGAG'

with open(trimming_log, 'w') as f:
    for sample_name in sample_names:
        # prepare input and output titles
        cutadapt_inputs = [
            f'{arnvig_TTS_raw_fastq}/{sample_name}.fastq.gz']
        cutadapt_outputs = [
            f'{arnvig_TTS_processed_fastq}/{sample_name}.processed.fastq.gz']
        # prepare cutadapt command
        command = f'cutadapt --overlap=1 --minimum-length={minimum_insert_length} -q {quality_threshold} ' + \
                  f'-j {threads} -a {universal_adapter} -o {cutadapt_outputs[0]} ' + \
                  f'{cutadapt_inputs[0]}'
        # run cutadapt
        output_trim, stderr_trim = quickshell(command, print_output=True, return_output=True)
        # write the full output to the log first so it is kept even if parsing fails
        f.write(output_trim)
        # parse the output to pull the number of reads below minimum_insert_length;
        # raw string so that \s and \d reach the regex engine unchanged
        too_short_match = regex.search(r'too short:\s*([\d,]+)', output_trim)
        if too_short_match is None:
            raise RuntimeError(
                f'No "too short" count found in the cutadapt output for {sample_name}; '
                f'stderr was:\n{stderr_trim}')
        too_short = int(too_short_match.group(1).replace(',', ''))
        n_too_short.append(too_short)

## Pause here, change conda environment to ribodetector, and carry out ribodepletion in ribodetector Jupyter Notebook

# Initial alignment to concatenated genome (Eco_Mtb)

In [14]:
# Use BWA to align to concatenated genomes
fastq_list = [f'{cortes_ribodetect}/Exp_R1_TSS_riboremove.fastq.gz',
              f'{cortes_ribodetect}/Exp_R2_TSS_riboremove.fastq.gz',
              f'{cortes_ribodetect}/Exp_R3_TSS_riboremove.fastq.gz',
              f'{arnvig_ribodetect}/termseq_expo_r1_riboremove.fastq.gz',
              f'{arnvig_ribodetect}/termseq_expo_r2_riboremove.fastq.gz',
              f'{arnvig_ribodetect}/termseq_expo_r3_riboremove.fastq.gz']

# Output names, in the same order as fastq_list
sample_names = ['Exp_R1_TSS', 'Exp_R2_TSS','Exp_R3_TSS',
               'termseq_expo_r1', 'termseq_expo_r2', 'termseq_expo_r3']

# bwa algorithm. Mem is default for most applications, read documentation to decide
bwa_algorithm = 'mem'

# Set print_output = True to see command-line output
# Set return_output = True only if assigning command-line output to a variable
print_output = True
return_output = False

for fastq_path, sample_name in zip(fastq_list, sample_names):

    # bwa writes the alignment to stdout; the shell redirect stores it as a SAM file
    map_command = f'bwa {bwa_algorithm} -t {threads} {genome_dir}/Eco_Mtb_genome.fasta ' + \
                       f'{fastq_path} > {alignments}/{sample_name}.sam'

    # the result is not assigned: with return_output = False there is nothing to keep,
    # and the former variable name `map` shadowed the builtin
    quickshell(map_command, print_output = print_output, return_output = return_output)
    print(f'Initial alignment: {sample_name} Done')

# Sort and index bam files

In [15]:
# Process alignments:
# sort each SAM file into a BAM file, then index the BAM
sample_names = ['Exp_R1_TSS', 'Exp_R2_TSS','Exp_R3_TSS',
               'termseq_expo_r1', 'termseq_expo_r2', 'termseq_expo_r3']

# One SAM file per sample, in the same order as sample_names
sam_list = [f'{alignments}/{name}.sam' for name in sample_names]

# print_output = True shows each command together with its captured output
# return_output = True is only needed when that output is assigned to a variable
print_output = True
return_output = False

for sam_path, sample_name in zip(sam_list, sample_names):
    sorted_bam = f'{alignments}/sorted_{sample_name}.bam'

    # Sort sam file and output as bam
    sort_enrich_command = f'samtools sort -O BAM {sam_path} > {sorted_bam}'
    sort_enrich = quickshell(sort_enrich_command,
                             print_output=print_output,
                             return_output=return_output)

    # Index sorted bam
    index_enrich_command = f'samtools index {sorted_bam}'
    index_enrich = quickshell(index_enrich_command,
                              print_output=print_output,
                              return_output=return_output)

    print(f'Sort and index alignments: {sample_name} done')

# Count number of non-rRNA reads mapping to the Mtb genome

In [None]:
# 5' end samples
sample_names = ['Exp_R1_TSS', 'Exp_R2_TSS','Exp_R3_TSS']

# Sorted, indexed BAM file of each sample, in the same order as sample_names
bam_list = [f'{alignments}/sorted_{name}.bam' for name in sample_names]

# Exclude rRNA region: the two intervals below leave a gap between them
Mtb_region1 = 'Eco_Mtb:4641653-6113234'
Mtb_region2 = 'Eco_Mtb:6118672-9053361'

# Whole genome (kept for reference; the count below uses the two intervals above)
Mtb_region = 'Eco_Mtb:4641653-9053361'

read_counts = []

for sample_name, bam_path in zip(sample_names, bam_list):
    # samtools view -c prints the number of alignments in the requested regions
    count_reads_command = f'samtools view -c {bam_path} "{Mtb_region1}" "{Mtb_region2}"'
    count_stdout = quickshell(count_reads_command,
                              print_output=True,
                              return_output=True)[0]
    count_reads = int(count_stdout.split('\n')[0])
    read_counts.append([sample_name, count_reads])

    print(f'Counting reads: {sample_name} done')

read_counts_DF = pd.DataFrame(read_counts, columns = ['Sample_Name','Read_Counts'])

# Inspect read depths to decide on a minimum read depth to downsample all samples for subsequent analysis
read_counts_DF.to_csv(f'{alignments}/inCellulo_5end_read_counts.csv')

In [None]:
# 3' end samples
sample_names = ['termseq_expo_r1', 'termseq_expo_r2', 'termseq_expo_r3']

# Sorted, indexed BAM file of each sample, in the same order as sample_names
bam_list = [f'{alignments}/sorted_{name}.bam' for name in sample_names]

# Exclude rRNA region: the two intervals below leave a gap between them
Mtb_region1 = 'Eco_Mtb:4641653-6113234'
Mtb_region2 = 'Eco_Mtb:6118672-9053361'

# Whole genome (kept for reference; the count below uses the two intervals above)
Mtb_region = 'Eco_Mtb:4641653-9053361'

read_counts = []

for sample_name, bam_path in zip(sample_names, bam_list):
    # samtools view -c prints the number of alignments in the requested regions
    count_reads_command = f'samtools view -c {bam_path} "{Mtb_region1}" "{Mtb_region2}"'
    count_stdout = quickshell(count_reads_command,
                              print_output=True,
                              return_output=True)[0]
    count_reads = int(count_stdout.split('\n')[0])
    read_counts.append([sample_name, count_reads])

    print(f'Counting reads: {sample_name} done')

read_counts_DF = pd.DataFrame(read_counts, columns = ['Sample_Name','Read_Counts'])

# Inspect read depths to decide on a minimum read depth to downsample all samples for subsequent analysis
read_counts_DF.to_csv(f'{alignments}/inCellulo_3end_read_counts.csv')

# Downsample cell-free, genomic DNA, and in cellulo samples (5' end and 3' end)

In [None]:
# Read depths of the in cellulo 5' end samples (table written by the read-counting cell)
inCellulo_5end_read_counts = pd.read_csv(f'{alignments}/inCellulo_5end_read_counts.csv')

# Location of the CFG 5' end BAM files and of their read-count table
CFG_5end_alignments = f'{CFG_5end_path}/readPrep/R2_alignments'
original_CFG_5end_read_depths = pd.read_csv(f'{CFG_5end_alignments}/R2_read_counts.csv')

# Here, after manual inspection,
# the downsample depth is the minimum read depth across all samples. Can change if need be
# NOTE(review): as written, the minimum is taken over the in cellulo 5' end samples only,
# and the same value is reused by every downsampling cell - confirm this is intended.
downsample_depth = inCellulo_5end_read_counts['Read_Counts'].min()

In [None]:
# Downsample CFG 5' end RNA samples

sample_names = ['noCRP1', 'noCRP2', 'noCRP3',
               'CRP1', 'CRP2', 'CRP3']

# [sample, read count after downsampling] for every sample
downsampled_depths = []

for sample in sample_names:

    # .item() extracts the single matching value (int() on a Series is deprecated)
    # and raises ValueError when the sample is missing or listed more than once
    original_depth = int(original_CFG_5end_read_depths.loc[
        original_CFG_5end_read_depths['Sample_Name'] == sample,
        'R2_read_counts'].item())

    # samtools reads "-s INT.FRAC" as seed.fraction, so a ratio of 1 or more would
    # not mean "keep everything"; in that case the option is left out and all reads are kept
    fraction = downsample_depth / original_depth
    subsample_option = f'-s {fraction} ' if fraction < 1 else ''

    downsample_command = f'samtools view -b {subsample_option}' + \
                         f'{CFG_5end_alignments}/{sample}_R2.bam > ' + \
                         f'{downsampled}/{sample}_downsample.bam'
    downsample = quickshell(downsample_command,
                            print_output = print_output,
                            return_output = return_output)

    # Index the downsampled R2 only bam
    index_downsample_command = f'samtools index {downsampled}/{sample}_downsample.bam'
    index_downsample = quickshell(index_downsample_command,
                                  print_output = print_output,
                                  return_output = return_output)

    # Record how many reads the downsampled bam actually contains
    count_downsample_command = f'samtools view -c ' + \
                                  f'{downsampled}/{sample}_downsample.bam'
    count_downsample = int(quickshell(count_downsample_command,
                                         print_output = False,
                                         return_output=True)[0].split('\n')[0])
    downsampled_depths.append([sample, count_downsample])

    print(f'Downsampling: {sample} done')

downsample_DF = pd.DataFrame(downsampled_depths, columns = ['Sample_Name','Downsampled_read_counts'])
downsample_DF.to_csv(f'{downsampled}/downsampled_depths_CFG_5end.csv')

In [None]:
# Downsample in cellulo 5' end RNA samples

sample_names = ['Exp_R1_TSS', 'Exp_R2_TSS','Exp_R3_TSS']

# [sample, read count after downsampling] for every sample
downsampled_depths = []

for sample in sample_names:

    # .item() extracts the single matching value (int() on a Series is deprecated)
    # and raises ValueError when the sample is missing or listed more than once
    original_depth = int(inCellulo_5end_read_counts.loc[
        inCellulo_5end_read_counts['Sample_Name'] == sample,
        'Read_Counts'].item())

    # samtools reads "-s INT.FRAC" as seed.fraction, so a ratio of 1 or more would
    # not mean "keep everything"; in that case the option is left out and all reads are kept
    fraction = downsample_depth / original_depth
    subsample_option = f'-s {fraction} ' if fraction < 1 else ''

    # The in cellulo alignments are the sorted bam files in the alignments directory;
    # the CFG directory does not hold files for these samples
    downsample_command = f'samtools view -b {subsample_option}' + \
                         f'{alignments}/sorted_{sample}.bam > ' + \
                         f'{downsampled}/{sample}_downsample.bam'
    downsample = quickshell(downsample_command,
                            print_output = print_output,
                            return_output = return_output)

    # Index the downsampled bam
    index_downsample_command = f'samtools index {downsampled}/{sample}_downsample.bam'
    index_downsample = quickshell(index_downsample_command,
                                  print_output = print_output,
                                  return_output = return_output)

    # Record how many reads the downsampled bam actually contains
    count_downsample_command = f'samtools view -c ' + \
                                  f'{downsampled}/{sample}_downsample.bam'
    count_downsample = int(quickshell(count_downsample_command,
                                         print_output = False,
                                         return_output=True)[0].split('\n')[0])
    downsampled_depths.append([sample, count_downsample])

    print(f'Downsampling: {sample} done')

downsample_DF = pd.DataFrame(downsampled_depths, columns = ['Sample_Name','Downsampled_read_counts'])
downsample_DF.to_csv(f'{downsampled}/downsampled_depths_inCellulo_5end.csv')

In [None]:
# Read depths of the in cellulo 3' end samples (table written by the read-counting cell)
inCellulo_3end_read_counts = pd.read_csv(f'{alignments}/inCellulo_3end_read_counts.csv')

# Location of the CFG 3' end BAM files and of their read-count table
CFG_3end_alignments = f'{CFG_3end_path}/readPrep/R2_alignments'
original_CFG_3end_read_depths = pd.read_csv(f'{CFG_3end_alignments}/R2_read_counts.csv')

# Location of the genomic DNA BAM files and of their read-count table
CFG_gDNA_alignments = 'gDNA/readPrep/R2_alignments'
original_gDNA_read_depths = pd.read_csv(f'{CFG_gDNA_alignments}/R2_read_counts.csv')

In [None]:
# Downsample CFG 3' end RNA samples

sample_names = ['noTF1', 'noTF2', 'noTF3',
               'NusA1', 'NusA2', 'NusA3',
               'NusG1', 'NusG2', 'NusG3',
               'NusA_NusG1','NusA_NusG2','NusA_NusG3']

# [sample, read count after downsampling] for every sample
downsampled_depths = []

for sample in sample_names:

    # .item() extracts the single matching value (int() on a Series is deprecated)
    # and raises ValueError when the sample is missing or listed more than once
    original_depth = int(original_CFG_3end_read_depths.loc[
        original_CFG_3end_read_depths['Sample_Name'] == sample,
        'R2_read_counts'].item())

    # samtools reads "-s INT.FRAC" as seed.fraction, so a ratio of 1 or more would
    # not mean "keep everything"; in that case the option is left out and all reads are kept
    fraction = downsample_depth / original_depth
    subsample_option = f'-s {fraction} ' if fraction < 1 else ''

    downsample_command = f'samtools view -b {subsample_option}' + \
                         f'{CFG_3end_alignments}/{sample}_R2.bam > ' + \
                         f'{downsampled}/{sample}_downsample.bam'
    downsample = quickshell(downsample_command,
                            print_output = print_output,
                            return_output = return_output)

    # Index the downsampled R2 only bam
    index_downsample_command = f'samtools index {downsampled}/{sample}_downsample.bam'
    index_downsample = quickshell(index_downsample_command,
                                  print_output = print_output,
                                  return_output = return_output)

    # Record how many reads the downsampled bam actually contains
    count_downsample_command = f'samtools view -c ' + \
                                  f'{downsampled}/{sample}_downsample.bam'
    count_downsample = int(quickshell(count_downsample_command,
                                         print_output = False,
                                         return_output=True)[0].split('\n')[0])
    downsampled_depths.append([sample, count_downsample])

    print(f'Downsampling: {sample} done')

downsample_DF = pd.DataFrame(downsampled_depths, columns = ['Sample_Name','Downsampled_read_counts'])
downsample_DF.to_csv(f'{downsampled}/downsampled_depths_CFG_3end.csv')

In [None]:
# Downsample CFG genomic DNA samples

sample_names = ['gDNA1', 'gDNA2', 'gDNA3']

# [sample, read count after downsampling] for every sample
downsampled_depths = []

for sample in sample_names:

    # .item() extracts the single matching value (int() on a Series is deprecated)
    # and raises ValueError when the sample is missing or listed more than once
    original_depth = int(original_gDNA_read_depths.loc[
        original_gDNA_read_depths['Sample_Name'] == sample,
        'R2_read_counts'].item())

    # samtools reads "-s INT.FRAC" as seed.fraction, so a ratio of 1 or more would
    # not mean "keep everything"; in that case the option is left out and all reads are kept
    fraction = downsample_depth / original_depth
    subsample_option = f'-s {fraction} ' if fraction < 1 else ''

    downsample_command = f'samtools view -b {subsample_option}' + \
                         f'{CFG_gDNA_alignments}/{sample}_R2.bam > ' + \
                         f'{downsampled}/{sample}_downsample.bam'
    downsample = quickshell(downsample_command,
                            print_output = print_output,
                            return_output = return_output)

    # Index the downsampled R2 only bam
    index_downsample_command = f'samtools index {downsampled}/{sample}_downsample.bam'
    index_downsample = quickshell(index_downsample_command,
                                  print_output = print_output,
                                  return_output = return_output)

    # Record how many reads the downsampled bam actually contains
    count_downsample_command = f'samtools view -c ' + \
                                  f'{downsampled}/{sample}_downsample.bam'
    count_downsample = int(quickshell(count_downsample_command,
                                         print_output = False,
                                         return_output=True)[0].split('\n')[0])
    downsampled_depths.append([sample, count_downsample])

    print(f'Downsampling: {sample} done')

downsample_DF = pd.DataFrame(downsampled_depths, columns = ['Sample_Name','Downsampled_read_counts'])
downsample_DF.to_csv(f'{downsampled}/downsampled_depths_gDNA.csv')

In [None]:
# Downsample in cellulo 3' end RNA samples
sample_names = ['termseq_expo_r1', 'termseq_expo_r2', 'termseq_expo_r3']

# [sample, read count after downsampling] for every sample
downsampled_depths = []

for sample in sample_names:

    # .item() extracts the single matching value (int() on a Series is deprecated)
    # and raises ValueError when the sample is missing or listed more than once
    original_depth = int(inCellulo_3end_read_counts.loc[
        inCellulo_3end_read_counts['Sample_Name'] == sample,
        'Read_Counts'].item())

    # samtools reads "-s INT.FRAC" as seed.fraction, so a ratio of 1 or more would
    # not mean "keep everything"; in that case the option is left out and all reads are kept
    fraction = downsample_depth / original_depth
    subsample_option = f'-s {fraction} ' if fraction < 1 else ''

    # The in cellulo alignments are the sorted bam files in the alignments directory;
    # the CFG directory does not hold files for these samples
    downsample_command = f'samtools view -b {subsample_option}' + \
                         f'{alignments}/sorted_{sample}.bam > ' + \
                         f'{downsampled}/{sample}_downsample.bam'
    downsample = quickshell(downsample_command,
                            print_output = print_output,
                            return_output = return_output)

    # Index the downsampled bam
    index_downsample_command = f'samtools index {downsampled}/{sample}_downsample.bam'
    index_downsample = quickshell(index_downsample_command,
                                  print_output = print_output,
                                  return_output = return_output)

    # Record how many reads the downsampled bam actually contains
    count_downsample_command = f'samtools view -c ' + \
                                  f'{downsampled}/{sample}_downsample.bam'
    count_downsample = int(quickshell(count_downsample_command,
                                         print_output = False,
                                         return_output=True)[0].split('\n')[0])
    downsampled_depths.append([sample, count_downsample])

    print(f'Downsampling: {sample} done')

downsample_DF = pd.DataFrame(downsampled_depths, columns = ['Sample_Name','Downsampled_read_counts'])
downsample_DF.to_csv(f'{downsampled}/downsampled_depths_inCellulo_3end.csv')

# Trim all reads mapped to bam files

Since minimum allowed read length during quality-processing was 10bp, trim all reads to 10bp.

In [16]:
sample_names = ['Exp_R1_TSS', 'Exp_R2_TSS','Exp_R3_TSS',
                'noCRP1', 'noCRP2', 'noCRP3',
               'CRP1', 'CRP2', 'CRP3',
               'termseq_expo_r1', 'termseq_expo_r2', 'termseq_expo_r3',
               'gDNA1', 'gDNA2', 'gDNA3',
               'noTF1', 'noTF2', 'noTF3',
               'NusA1', 'NusA2', 'NusA3',
               'NusG1', 'NusG2', 'NusG3',
               'NusA_NusG1','NusA_NusG2','NusA_NusG3']

# All reads are cut down to the shortest length allowed during quality processing.
trimmed_read_length = minimum_insert_length
# reformat.sh's forcetrimright is a 0-based position and the base at that position is
# kept, so keeping N bases requires forcetrimright = N - 1 (forcetrimright=10 leaves 11 bp).
last_kept_position = trimmed_read_length - 1

for sample in sample_names:
    trim_command = f'reformat.sh in={downsampled}/{sample}_downsample.bam out={trimmed}/{sample}_trimmed.bam ' + \
                    f' allowidenticalnames=t overwrite=true forcetrimright={last_kept_position}'
    # stdout and stderr of reformat.sh are printed and also kept in trim_read_lengths
    trim_read_lengths = quickshell(trim_command,
                                   print_output = True,
                                   return_output = True)

In [19]:
# Check whether read lengths are actually 10 using:
# samtools view bamname.bam | awk '{print length($10)}' | head -1000 | sort -u

# Sort and index new trimmed .bam files

In [20]:
# Every downsampled and trimmed sample: in cellulo, CFG 5' end, gDNA and CFG 3' end
sample_names = ['Exp_R1_TSS', 'Exp_R2_TSS','Exp_R3_TSS',
                'noCRP1', 'noCRP2', 'noCRP3',
               'CRP1', 'CRP2', 'CRP3',
               'termseq_expo_r1', 'termseq_expo_r2', 'termseq_expo_r3',
               'gDNA1', 'gDNA2', 'gDNA3',
               'noTF1', 'noTF2', 'noTF3',
               'NusA1', 'NusA2', 'NusA3',
               'NusG1', 'NusG2', 'NusG3',
               'NusA_NusG1','NusA_NusG2','NusA_NusG3']

print_output = True
return_output = False

for sample in sample_names:
    trimmed_bam = f'{trimmed}/{sample}_trimmed.bam'
    sorted_trimmed_bam = f'{trimmed}/sorted_{sample}_trimmed.bam'

    # Sort the trimmed bam into a new bam file
    sort_command = f'samtools sort -O BAM {trimmed_bam} > {sorted_trimmed_bam}'
    sort = quickshell(sort_command,
                      print_output=print_output,
                      return_output=return_output)

    # Index sorted bam
    index_command = f'samtools index {sorted_trimmed_bam}'
    index = quickshell(index_command,
                       print_output=print_output,
                       return_output=return_output)

    print(f'Sort and index alignments: {sample} done')

# Count number of reads mapping to each genomic region

Genomic region = genes and intergenic regions, both strands.
Before running the next cell, generate a .txt file listing the trimmed bam files (`BamList.txt` in the summarized_overlaps directory); it is the input for summarizeOverlaps.

In [None]:
# Count reads per genomic region with the summarizeOverlaps R script:
# -b list of bam files, -g genomic regions table, -o output table
summarizeOverlaps_command = f'Rscript --vanilla Rscripts/CFG_summarizeOverlaps.R ' + \
            f'-b {summarizeOverlaps}/BamList.txt ' + \
            f'-g geneModel_genomicRegions.csv ' + \
            f'-o {summarizeOverlaps}/downsampled_trimmed_genomicRegions.csv'
# Run the command built above (the previous call referenced a name that is only
# defined in a later cell, which raises NameError on a fresh kernel)
quickshell(summarizeOverlaps_command, print_output = True)

# Visualize coverage differences between in cellulo vs. cell-free transcription (Fig. 2C)

In [None]:
# Directory holding the Fig. 2C plotting script; the two png files are written there too
figure_dir = 'fig2_inCellulo_vs_CFG/fig2C_coverage_violinBoxPlots'

# Assemble the Rscript call from its parts:
# -g genomic regions table, -s per-region read counts, -f / -e output png files
summarizeOverlaps_5end_command = ' '.join([
    f'Rscript --vanilla {figure_dir}/fig2C_inCellulo_vs_CFG_coverage.R',
    '-g geneModel_genomicRegions.csv',
    f'-s {summarizeOverlaps}/downsampled_trimmed_genomicRegions.csv',
    f'-f {figure_dir}/log2_5end_coverage.png',
    f'-e {figure_dir}/log2_3end_coverage.png',
])
quickshell(summarizeOverlaps_5end_command, print_output=True)