In [1]:
import subprocess, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib_venn
import upsetplot
import itertools
import plotly.express as px
from scipy import stats
from scipy.signal import find_peaks
from statsmodels.stats.multitest import fdrcorrection
from statistics import mean, stdev, median
import math
import statsmodels.api as sm
from statsmodels.formula.api import ols
from Bio import SeqIO
import csv
import re
import scipy
import RNA

In [2]:
# Silence pandas' chained-assignment warning for the whole notebook
# (several cells below overwrite values on DataFrame slices).
pd.set_option('mode.chained_assignment', None)

In [None]:
terminator_annotations = '3enrich_NusAG/termination_figures/terminator_annotations'
!mkdir $terminator_annotations

# Required functions

In [4]:
# written by Peter Culviner, PhD to enable command-line access through Jupyter
def quickshell(command, print_output=True, output_path=None, return_output=False):
    """
    Run a command through the shell and report what it printed.

    command: command line, executed with shell=True.
    print_output: when True, echo the command followed by its captured STDOUT/STDERR.
    output_path: optional file path; when given, the STDOUT/STDERR report is written there.
    return_output: when True, return (stdout, stderr) as decoded strings; otherwise return None.
    """
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout, stderr = (captured.decode('utf-8') for captured in (completed.stdout, completed.stderr))
    output_string = f'STDOUT:\n{stdout}\nSTDERR:\n{stderr}\n'

    if print_output:
        print('$ ' + command, output_string, sep='\n')

    if output_path is not None:
        with open(output_path, 'w') as report_file:
            report_file.write(output_string)

    return (stdout, stderr) if return_output else None
    

In [None]:
def extract_coordinate_environment(genome_seq, coord_DF, upstream_end, downstream_end, samples,
                                   count_cutoff, output=None):
    """
    Extract the flanking sequence surrounding genomic coordinates in a strand-sensitive manner.

    genome_seq: genome sequence object. It must support len(), slicing, concatenation with
        itself, .lower() and .reverse_complement() (the notebook passes a Bio.Seq).
        The genome is treated as circular: it is doubled so windows may run past either end.
    coord_DF: DataFrame with a 1-based 'coordinate' column and a 'strand' column ('+' or '-').
        Rows with any other strand value are dropped. The caller's frame is not modified.
    upstream_end: number used to size the upstream flank; the extracted upstream part is
        upstream_end + 1 bases long.
    downstream_end: number used to size the downstream flank; the extracted downstream part
        is downstream_end - 2 bases long.
    samples, count_cutoff: used only to build the output file name
        '<samples>_<count_cutoff>counts_seqExtract.csv'.
    output: accepted for backward compatibility and not used; the CSV always goes to the
        path described above, which is where the callers in this notebook read it back from.

    Returns the combined '+'/'-' table sorted by 'coordinate'. 'Nontemplate_strand' holds
    the lower-cased upstream flank, the base at the coordinate (case as in genome_seq) and
    the lower-cased downstream flank, read 5'->3' on the requested strand.
    """
    # Work on a copy so the helper columns do not leak into the caller's DataFrame.
    coord_DF = coord_DF.copy()
    genome_length = len(genome_seq)
    double_genome = genome_seq + genome_seq

    on_plus = coord_DF['strand'] == '+'
    on_minus = coord_DF['strand'] == '-'

    # A window must be shifted by one genome length whenever its slice start would be
    # negative: '+' windows start at coordinate - upstream_end - 2, '-' windows at
    # coordinate - (downstream_end - 1). The previous code tested the two strands against
    # each other's limit, which truncated '+' windows close to the origin. The limits the
    # old code used are kept in the max() so every row it shifted is still shifted.
    plus_wrap_limit = max(downstream_end, upstream_end + 2)
    minus_wrap_limit = max(upstream_end, downstream_end - 1)
    needs_wrap = ((on_plus & (coord_DF['coordinate'] < plus_wrap_limit))
                  | (on_minus & (coord_DF['coordinate'] < minus_wrap_limit)))

    coord_DF['IndexingCoordinate'] = coord_DF['coordinate']
    coord_DF.loc[needs_wrap, 'IndexingCoordinate'] = coord_DF.loc[needs_wrap, 'coordinate'] + genome_length

    def _add_window_columns(strand_DF, start_offset, end_offset):
        # Bookkeeping columns describing the extracted range for one strand.
        strand_DF = strand_DF.copy()
        strand_DF['start'] = strand_DF['IndexingCoordinate'] - start_offset
        strand_DF['end'] = strand_DF['IndexingCoordinate'] + end_offset
        strand_DF['IndexCoordMin1'] = strand_DF['IndexingCoordinate'] - 1
        strand_DF['IndexCoordPlus1'] = strand_DF['IndexingCoordinate'] + 1
        # reset_index() keeps the original row labels in an 'index' column.
        return strand_DF.reset_index()

    plus_DF = _add_window_columns(coord_DF.loc[on_plus], upstream_end, downstream_end)
    minus_DF = _add_window_columns(coord_DF.loc[on_minus], downstream_end - 1, upstream_end + 1)

    # '+' strand: the genome sequence itself is the non-template strand.
    plus_sequences = []
    for start, index_coord, end in zip(plus_DF['start'], plus_DF['IndexingCoordinate'], plus_DF['end']):
        start, index_coord, end = int(start), int(index_coord), int(end)
        upstream = double_genome[start - 2:index_coord - 1].lower()
        coordinate_base = double_genome[index_coord - 1:index_coord]
        downstream = double_genome[index_coord:end - 2].lower()
        # str() so the column holds plain strings rather than sequence objects.
        plus_sequences.append(str(upstream + coordinate_base + downstream))
    plus_DF['Nontemplate_strand'] = plus_sequences

    # '-' strand: take the reverse complement of the extracted range, then split it the
    # same way (upstream flank, coordinate base, downstream flank).
    minus_sequences = []
    for start, end in zip(minus_DF['start'], minus_DF['end']):
        nontemplate = double_genome[int(start):int(end)].reverse_complement()
        upstream = nontemplate[:upstream_end + 1].lower()
        coordinate_base = nontemplate[upstream_end + 1:upstream_end + 2]
        downstream = nontemplate[upstream_end + 2:].lower()
        minus_sequences.append(str(upstream + coordinate_base + downstream))
    minus_DF['Nontemplate_strand'] = minus_sequences

    complete_DF_ordered = pd.concat([plus_DF, minus_DF]).sort_values(by = 'coordinate')
    # The file name is built from the `count_cutoff` parameter (the previous code referenced
    # an undefined name `count_threshold` here).
    complete_DF_ordered.to_csv(f'3enrich_NusAG/termination_figures/terminator_annotations/{samples}_{count_cutoff}counts_seqExtract.csv')
    return complete_DF_ordered

In [None]:
def RNAfold_DF(DF, column_name, start, end):
    """
    Iteratively run RNA.fold on sequence regions of interest on multiple rows of a DataFrame.
    DF: DataFrame containing the relevant data. All rows will be included.
        The result columns are added to DF in place and DF is also returned.
    column_name: the column containing the sequences to be run thru RNA.fold.
    start: start of the desired RNA region.
    end: end of the desired RNA region.

    Adds five columns named 'RNAfold_<column_name>_<suffix>':
        structure: first element returned by RNA.fold for the [start:end] slice.
        deltaG: second element returned by RNA.fold.
        spacerLength: end - (position of the last ')') - 1.
        stemLength: characters from the first ')' after the last '(' through the last ')'.
        loopLength: characters between the last '(' and the first ')' after it.
    The three lengths stay 0 when the structure contains no base pair.
    """
    structures = []
    delta_Gs = []
    spacer_lengths = []
    stem_lengths = []
    loop_lengths = []

    # Slice the column once and fold every region exactly once (the fold is the expensive
    # step); iterating over values also removes the dependence on a 0..n-1 index.
    for region in DF[f'{column_name}'].str[start:end]:
        fold_result = RNA.fold(region)
        structure, delta_G = fold_result[0], fold_result[1]

        spacer_length = stem_length = loop_length = 0
        # Only structures with at least one base pair have a hairpin to measure.
        if '(' in structure and ')' in structure:
            last_close = structure.rindex(')')
            last_open = structure.rindex('(')
            first_close_after_open = structure.index(')', last_open)

            # NOTE(review): positions are relative to the sliced region, so this is the
            # distance to the region's 3' end only when start == 0 (as in all callers here).
            spacer_length = (end - last_close) - 1
            stem_length = (last_close - first_close_after_open) + 1
            loop_length = first_close_after_open - last_open - 1

        structures.append(structure)
        delta_Gs.append(delta_G)
        spacer_lengths.append(spacer_length)
        stem_lengths.append(stem_length)
        loop_lengths.append(loop_length)

    # Whole-column assignment instead of element-wise chained assignment.
    DF[f'RNAfold_{column_name}_structure'] = structures
    DF[f'RNAfold_{column_name}_deltaG'] = delta_Gs
    DF[f'RNAfold_{column_name}_spacerLength'] = spacer_lengths
    DF[f'RNAfold_{column_name}_stemLength'] = stem_lengths
    DF[f'RNAfold_{column_name}_loopLength'] = loop_lengths

    return DF

# Annotating terminator features from different sets of coordinates

Annotate cis-regulatory features for the set of TTSs at a given post-nonparametric resampling count threshold.

In [None]:
def terminator_annotation(count_threshold, genome_fasta, fit_type, outer_up, outer_down, hairpin_region_range):
    """
    For a given count threshold, extract all coordinates 
    and carry out RNAFold structural predictions on upstream region.
    
    Carries out comparisons for different hairpin regions. Lowest deltaG value is designated "most likely hairpin."
    
    count_threshold: raw count threshold for bootstrap-based peak calling.
    genome_fasta: path to the genome FASTA file. If it holds several records, the last one is used.
    fit_type: DESeq2 model fit (mean, parametric or local. Usually local)
    outer_up: how much of the upstream region to extract (relative to coordinate). Positive number.
    outer_down: how much of the downstream region to extract (relative to coordinate). Positive number.
        NOTE: right now, outer_down will set the boundary as the input value - 2 (bug in script).
    hairpin_region_range: the regions to run RNAFold prediction on and compare to find most likely hairpin structure.

    Reads the DESeq2 results CSV for (count_threshold, fit_type); writes the
    '..._seqExtract.csv' (via extract_coordinate_environment), '..._hairpins.csv' and
    '..._annotated.csv' files under 3enrich_NusAG/termination_figures/terminator_annotations.
    Returns the annotated table sorted by 'coordID'.
    """
    seq_extract_csv = f'3enrich_NusAG/termination_figures/terminator_annotations/multifactor_{count_threshold}counts_seqExtract.csv'
    hairpins_csv = f'3enrich_NusAG/termination_figures/terminator_annotations/multifactor_{count_threshold}counts_hairpins.csv'

    DESeq2_DF = pd.read_csv(f'3enrich_NusAG/selectThreshold/DESeq2/multifactor_{count_threshold}counts_ends_results_{fit_type}.csv')
    # Assumes that local is the best fit; change and repeat if need be
    # coordID is split into its last character (strand) and everything before it (coordinate).
    DESeq2_DF['coordinate'] = DESeq2_DF['coordID'].str[:-1].astype(int)
    DESeq2_DF['strand'] = DESeq2_DF['coordID'].str[-1:]
            
    # For 3' end enrichment only: flip the forward and reverse strands for sequence extraction
    # ('strand2' keeps the strand as it appears in coordID).
    DESeq2_DF['strand2'] = DESeq2_DF['strand']
    DESeq2_DF.loc[DESeq2_DF['strand'] == '+','strand'] = '-'
    DESeq2_DF.loc[DESeq2_DF['strand2'] == '-','strand'] = '+'
    
    # Import genome (the last record in the FASTA wins)
    genome = None
    for seq_record in SeqIO.parse(f'{genome_fasta}', 'fasta'):
        genome = seq_record.seq
    if genome is None:
        raise ValueError(f'No FASTA records found in {genome_fasta}')
        
    # Extract sequences. The CSV path is handed over as the trailing `output` argument so
    # the call satisfies the helper's seven-parameter signature.
    extract_coordinate_environment(genome, DESeq2_DF, outer_up, outer_down,
                                   'multifactor', count_threshold, seq_extract_csv)
    # Must re-read seqDF in for character vectors to be correct
    seq_DF = pd.read_csv(seq_extract_csv)
    
    for i in hairpin_region_range:
        column = f'hairpin_region_{i}'
        seq_DF[column] = seq_DF['Nontemplate_strand'].str[(outer_up - i):(outer_up+2)]
        seq_DF = RNAfold_DF(seq_DF, column, 0, i)

    # Round-trip through CSV is kept so the column layout of the outputs is unchanged.
    seq_DF.to_csv(hairpins_csv)
    seq_DF = pd.read_csv(hairpins_csv)

    # Pick, per row, the hairpin region with the lowest deltaG (first region wins ties).
    # The region is taken from hairpin_region_range itself instead of being parsed from the
    # last two characters of a column name, and all five summary columns are created
    # (stemLength/spacerLength used to be silently skipped by a bare except).
    regions = list(hairpin_region_range)
    if regions:
        deltaG_table = seq_DF[[f'RNAfold_hairpin_region_{r}_deltaG' for r in regions]]
        region_by_column = dict(zip(deltaG_table.columns, regions))
        best_regions = [region_by_column[column] for column in deltaG_table.idxmin(axis = 1)]

        def best_values(suffix):
            # Value of RNAfold_hairpin_region_<best region>_<suffix> for every row.
            per_region = {r: seq_DF[f'RNAfold_hairpin_region_{r}_{suffix}'].tolist()
                          for r in set(best_regions)}
            return [per_region[r][row] for row, r in enumerate(best_regions)]

        seq_DF['most_likely_hairpin_deltaG'] = best_values('deltaG')
        seq_DF['most_likely_hairpin_structure'] = best_values('structure')
        seq_DF['most_likely_hairpin_region'] = best_regions
        seq_DF['most_likely_hairpin_stemLength'] = best_values('stemLength')
        seq_DF['most_likely_hairpin_spacerLength'] = best_values('spacerLength')
    else:
        # No candidate regions: keep the placeholder value used before.
        for summary in ('deltaG', 'structure', 'region', 'stemLength', 'spacerLength'):
            seq_DF[f'most_likely_hairpin_{summary}'] = 0

    # NOTE(review): the '-' windows sit 2 nt further downstream than the '+' windows;
    # presumably this compensates for a strand-specific coordinate offset — confirm.
    minus_strand = seq_DF.loc[seq_DF['strand'] == '-'].copy()
    plus_strand = seq_DF.loc[seq_DF['strand'] == '+'].copy()
    minus_strand['Pause_region'] = minus_strand['Nontemplate_strand'].str[(outer_up - 9):(outer_up+5)]
    plus_strand['Pause_region'] = plus_strand['Nontemplate_strand'].str[(outer_up - 11):(outer_up+3)]
    
    minus_strand['Seqlogo_region'] = minus_strand['Nontemplate_strand'].str[(outer_up - 48):(outer_up+7)]
    plus_strand['Seqlogo_region'] = plus_strand['Nontemplate_strand'].str[(outer_up - 50):(outer_up+5)]
    both_DF = pd.concat([minus_strand, plus_strand], axis = 0)

    pause_regex = '[cg][cg]........[ct]g'
    
    both_DF['Pause_lower'] = both_DF['Pause_region'].str.lower()
    both_DF['U_count'] = both_DF['Pause_lower'].str.count('t')
    
    both_DF['Utract_region'] = both_DF['Pause_lower'].str[3:-3]
    both_DF['U_count_tractRegion'] = both_DF['Utract_region'].str.count('t')
    
    # Counts are case-sensitive: only lower-case 't'/'a' in Nontemplate_strand are counted.
    both_DF['downstream_region'] = both_DF['Nontemplate_strand'].str[outer_up+3:outer_up+13]
    both_DF['U_count_downstreamRegion'] = both_DF['downstream_region'].str.count('t')
    both_DF['A_count_downstreamRegion'] = both_DF['downstream_region'].str.count('a')
    
    # str.match anchors the pattern at the start of Pause_lower.
    both_DF['Elemental_pause_match?'] = both_DF['Pause_lower'].str.match(pause_regex)
        
    sorted_DF = both_DF.sort_values(by = ['coordID'])
    sorted_DF.to_csv(f'3enrich_NusAG/termination_figures/terminator_annotations/multifactor_{count_threshold}counts_annotated.csv')
    return sorted_DF

In [None]:
# Annotate the TTS set called at each count threshold (currently only 70).
# NOTE(review): terminator_annotation() declares a required `genome_fasta` parameter that
# is not supplied below, so the call raises TypeError as written; pass the genome FASTA path.
count_cutoff_range = range(70, 71)
for i in count_cutoff_range:
    terminator_annotation(
        count_threshold = i,
        fit_type = 'local',
        outer_up = 50,
        outer_down = 40,
        hairpin_region_range = [44],
    )

### Annotating terminator features from randomly-selected coordinates

In [None]:
def terminator_annotation_randomCoords(count_threshold, fit_type, outer_up, outer_down,
                            hairpin_region_range, num, genome_fasta=None, seed=None):
    """
    Annotate terminator features at randomly drawn control coordinates.

    One random coordinate is drawn for every row of the DESeq2 results table, uniformly
    between the smallest (inclusive) and largest (exclusive) called coordinate, and paired
    with that row's strand. The random set is then annotated like the called coordinates.

    count_threshold: raw count threshold; selects the DESeq2 results CSV and names the outputs.
    fit_type: DESeq2 model fit (mean, parametric or local. Usually local)
    outer_up: how much of the upstream region to extract (relative to coordinate). Positive number.
    outer_down: how much of the downstream region to extract (relative to coordinate). Positive number.
    hairpin_region_range: the regions to run RNAFold prediction on and compare.
    num: replicate label used in the output file names ('multifactor_random<num>_...').
    genome_fasta: optional path to the genome FASTA (last record is used). When omitted the
        function falls back to a module-level `genome` variable, as it did before.
    seed: optional seed for the random draw; when omitted the global NumPy random state is used.

    Writes '..._seqExtract.csv' (via extract_coordinate_environment), '..._hairpins.csv' and
    '..._annotated.csv' for this replicate and returns the annotated table sorted by 'coordinate'.
    """
    seq_extract_csv = f'3enrich_NusAG/termination_figures/terminator_annotations/multifactor_random{num}_{count_threshold}counts_seqExtract.csv'
    # The hairpin table gets its own per-replicate file; it used to be written to
    # 'multifactor_<count_threshold>counts_hairpins.csv', overwriting the table produced
    # for the real coordinates.
    hairpins_csv = f'3enrich_NusAG/termination_figures/terminator_annotations/multifactor_random{num}_{count_threshold}counts_hairpins.csv'

    if genome_fasta is not None:
        genome_seq = None
        for seq_record in SeqIO.parse(f'{genome_fasta}', 'fasta'):
            genome_seq = seq_record.seq
        if genome_seq is None:
            raise ValueError(f'No FASTA records found in {genome_fasta}')
    else:
        try:
            genome_seq = genome
        except NameError:
            raise NameError("No genome available: pass genome_fasta or define a "
                            "module-level `genome` sequence first") from None
    
    DESeq2_DF = pd.read_csv(f'3enrich_NusAG/selectThreshold/DESeq2/multifactor_{count_threshold}counts_ends_results_{fit_type}.csv')
    # Assumes that local is the best fit; change and repeat if need be
    # coordID is split into its last character (strand) and everything before it (coordinate).
    DESeq2_DF['strand'] = DESeq2_DF['coordID'].str[-1:]
    DESeq2_DF['coordinate'] = DESeq2_DF['coordID'].str[:-1].astype(int)

    # A private RandomState is used only when a seed is given, so unseeded calls keep
    # drawing from the global NumPy state exactly as before.
    rng = np.random if seed is None else np.random.RandomState(seed)
    random_coordinates = rng.randint(DESeq2_DF['coordinate'].min(),
                                     DESeq2_DF['coordinate'].max(),
                                     size = len(DESeq2_DF.index))
                                                       
    control_coord_DF = pd.DataFrame(data = {'coordinate':list(random_coordinates),
                                            'strand':list(DESeq2_DF['strand'])})
    
    # Extract sequences. The CSV path is handed over as the trailing `output` argument so
    # the call satisfies the helper's seven-parameter signature.
    extract_coordinate_environment(genome_seq, control_coord_DF, outer_up, outer_down,
                                   f'multifactor_random{num}', count_threshold, seq_extract_csv)
    # Must re-read seqDF in for character vectors to be correct
    seq_DF = pd.read_csv(seq_extract_csv)

    for i in hairpin_region_range:
        column = f'hairpin_region_{i}'
        seq_DF[column] = seq_DF['Nontemplate_strand'].str[(outer_up - i):(outer_up+2)]
        seq_DF = RNAfold_DF(seq_DF, column, 0, i)
        
    # Round-trip through CSV is kept so the column layout of the outputs is unchanged.
    seq_DF.to_csv(hairpins_csv)
    seq_DF = pd.read_csv(hairpins_csv)

    # Pick, per row, the hairpin region with the lowest deltaG (first region wins ties).
    # The region comes from hairpin_region_range itself rather than from the last two
    # characters of a column name, and no exception is swallowed.
    regions = list(hairpin_region_range)
    if regions:
        deltaG_table = seq_DF[[f'RNAfold_hairpin_region_{r}_deltaG' for r in regions]]
        region_by_column = dict(zip(deltaG_table.columns, regions))
        best_regions = [region_by_column[column] for column in deltaG_table.idxmin(axis = 1)]

        def best_values(suffix):
            # Value of RNAfold_hairpin_region_<best region>_<suffix> for every row.
            per_region = {r: seq_DF[f'RNAfold_hairpin_region_{r}_{suffix}'].tolist()
                          for r in set(best_regions)}
            return [per_region[r][row] for row, r in enumerate(best_regions)]

        seq_DF['most_likely_hairpin_deltaG'] = best_values('deltaG')
        seq_DF['most_likely_hairpin_structure'] = best_values('structure')
        seq_DF['most_likely_hairpin_region'] = best_regions
        seq_DF['most_likely_hairpin_stemLength'] = best_values('stemLength')
        seq_DF['most_likely_hairpin_spacerLength'] = best_values('spacerLength')
    else:
        # No candidate regions: keep the placeholder value used before.
        for summary in ('deltaG', 'structure', 'region', 'stemLength', 'spacerLength'):
            seq_DF[f'most_likely_hairpin_{summary}'] = 0
        
    # NOTE(review): the '-' windows sit 2 nt further downstream than the '+' windows;
    # presumably this compensates for a strand-specific coordinate offset — confirm.
    minus_strand = seq_DF.loc[seq_DF['strand'] == '-'].copy()
    plus_strand = seq_DF.loc[seq_DF['strand'] == '+'].copy()
    minus_strand['Pause_region'] = minus_strand['Nontemplate_strand'].str[(outer_up - 9):(outer_up+5)]
    plus_strand['Pause_region'] = plus_strand['Nontemplate_strand'].str[(outer_up - 11):(outer_up+3)]
    
    minus_strand['Seqlogo_region'] = minus_strand['Nontemplate_strand'].str[(outer_up - 48):(outer_up+7)]
    plus_strand['Seqlogo_region'] = plus_strand['Nontemplate_strand'].str[(outer_up - 50):(outer_up+5)]
    both_DF = pd.concat([minus_strand, plus_strand], axis = 0)

    pause_regex = '[cg][cg]........[ct]g'
    
    both_DF['Pause_lower'] = both_DF['Pause_region'].str.lower()
    both_DF['U_count'] = both_DF['Pause_lower'].str.count('t')
    
    both_DF['Utract_region'] = both_DF['Pause_lower'].str[3:-3]
    both_DF['U_count_tractRegion'] = both_DF['Utract_region'].str.count('t')
    
    # Counts are case-sensitive: only lower-case 't'/'a' in Nontemplate_strand are counted.
    both_DF['downstream_region'] = both_DF['Nontemplate_strand'].str[outer_up+3:outer_up+13]
    both_DF['U_count_downstreamRegion'] = both_DF['downstream_region'].str.count('t')
    both_DF['A_count_downstreamRegion'] = both_DF['downstream_region'].str.count('a')

    # str.match anchors the pattern at the start of Pause_lower.
    both_DF['Elemental_pause_match?'] = both_DF['Pause_lower'].str.match(pause_regex)
        
    sorted_DF = both_DF.sort_values(by = ['coordinate'])
    sorted_DF.to_csv(f'3enrich_NusAG/termination_figures/terminator_annotations/multifactor_random{num}_{count_threshold}counts_annotated.csv')
    return sorted_DF

In [None]:
count_cutoff_range = [70]

# Generate 3 replicates of random control coordinates per count threshold
for i in count_cutoff_range:
    for replicate in (1, 2, 3):
        terminator_annotation_randomCoords(i, 'local', 50, 40, [44], replicate)

### Annotating terminator features from Eco, Bsu and Mtb terminators

In [None]:
def terminator_annotation_species(input_DF, bacteria, genome_fasta, outer_up, outer_down,
                                  hairpin_region_range, output):
    """
    Extract the sequence around every terminator in input_DF
    and carry out RNAFold structural predictions on upstream region &
    other terminator cis-regulatory regions.
    
    bacteria: label for the bacterial species (e.g. Bsu or Eco); used in the intermediate file names.
    genome_fasta: genome of bacterial species being examined. If the FASTA holds several
        records, the last one is used.
    input_DF: input DataFrame of TTSs, handed to extract_coordinate_environment_others.
        The extracted table read back from CSV must contain 'Nontemplate_strand',
        'coordinate' and 'direction' ('+'/'-') columns; rows with any other 'direction' are dropped.
    outer_up: how much of the upstream region to extract (relative to TTS). Positive number.
    outer_down: how much of the downstream region to extract (relative to TTS). Positive number.
        NOTE: right now, outer_down will set the boundary as the input value - 2 (bug in script).
    hairpin_region_range: the regions to run RNAFold prediction on
        and compare to find most likely hairpin structure.
    output: output csv file name, written inside
        3enrich_NusAG/termination_figures/terminator_annotations/.

    Returns the annotated table sorted by 'coordinate'.
    """
    hairpins_csv = f'3enrich_NusAG/termination_figures/terminator_annotations/{bacteria}_hairpins.csv'
    
    # Import genome (the last record in the FASTA wins)
    genome = None
    for seq_record in SeqIO.parse(f'{genome_fasta}', 'fasta'):
        genome = seq_record.seq
    if genome is None:
        raise ValueError(f'No FASTA records found in {genome_fasta}')
        
    # Extract sequences
    # NOTE(review): extract_coordinate_environment_others is not defined in the visible part
    # of this notebook — presumably defined elsewhere; verify it writes the CSV read below.
    extract_coordinate_environment_others(genome,
                                          input_DF,
                                          outer_up,
                                          outer_down,
                                          bacteria,
                                          70)
    # Must re-read seqDF in for character vectors to be correct
    seq_DF = pd.read_csv(f'3enrich_NusAG/termination_figures/terminator_annotations/{bacteria}_70counts_seqExtract.csv')
    
    for i in hairpin_region_range:
        column = f'hairpin_region_{i}'
        seq_DF[column] = seq_DF['Nontemplate_strand'].str[(outer_up - i):(outer_up+2)]
        seq_DF = RNAfold_DF(seq_DF, column, 0, i)

    # Round-trip through CSV is kept so the column layout of the outputs is unchanged.
    seq_DF.to_csv(hairpins_csv)
    seq_DF = pd.read_csv(hairpins_csv)

    # Pick, per row, the hairpin region with the lowest deltaG (first region wins ties).
    # The region is taken from hairpin_region_range itself instead of being parsed from the
    # last two characters of a column name, and all five summary columns are created
    # (stemLength/spacerLength used to be silently skipped by a bare except).
    regions = list(hairpin_region_range)
    if regions:
        deltaG_table = seq_DF[[f'RNAfold_hairpin_region_{r}_deltaG' for r in regions]]
        region_by_column = dict(zip(deltaG_table.columns, regions))
        best_regions = [region_by_column[column] for column in deltaG_table.idxmin(axis = 1)]

        def best_values(suffix):
            # Value of RNAfold_hairpin_region_<best region>_<suffix> for every row.
            per_region = {r: seq_DF[f'RNAfold_hairpin_region_{r}_{suffix}'].tolist()
                          for r in set(best_regions)}
            return [per_region[r][row] for row, r in enumerate(best_regions)]

        seq_DF['most_likely_hairpin_deltaG'] = best_values('deltaG')
        seq_DF['most_likely_hairpin_structure'] = best_values('structure')
        seq_DF['most_likely_hairpin_region'] = best_regions
        seq_DF['most_likely_hairpin_stemLength'] = best_values('stemLength')
        seq_DF['most_likely_hairpin_spacerLength'] = best_values('spacerLength')
    else:
        # No candidate regions: keep the placeholder value used before.
        for summary in ('deltaG', 'structure', 'region', 'stemLength', 'spacerLength'):
            seq_DF[f'most_likely_hairpin_{summary}'] = 0

    # NOTE(review): the '-' windows sit 2 nt further downstream than the '+' windows;
    # presumably this compensates for a strand-specific coordinate offset — confirm.
    minus_strand = seq_DF.loc[seq_DF['direction'] == '-'].copy()
    plus_strand = seq_DF.loc[seq_DF['direction'] == '+'].copy()
    minus_strand['Pause_region'] = minus_strand['Nontemplate_strand'].str[(outer_up - 9):(outer_up+5)]
    plus_strand['Pause_region'] = plus_strand['Nontemplate_strand'].str[(outer_up - 11):(outer_up+3)]
    
    minus_strand['Seqlogo_region'] = minus_strand['Nontemplate_strand'].str[(outer_up - 48):(outer_up+7)]
    plus_strand['Seqlogo_region'] = plus_strand['Nontemplate_strand'].str[(outer_up - 50):(outer_up+5)]
    both_DF = pd.concat([minus_strand, plus_strand], axis = 0)

    pause_regex = '[cg][cg]........[ct]g'
    
    both_DF['Pause_lower'] = both_DF['Pause_region'].str.lower()
    both_DF['U_count'] = both_DF['Pause_lower'].str.count('t')
    
    # NOTE(review): this tract window is [5:-1], whereas the other annotation functions in
    # this notebook use [3:-3] — confirm the difference is intended.
    both_DF['Utract_region'] = both_DF['Pause_lower'].str[5:-1]
    both_DF['U_count_tractRegion'] = both_DF['Utract_region'].str.count('t')
    
    # Counts are case-sensitive: only lower-case 't'/'a' in Nontemplate_strand are counted.
    both_DF['downstream_region'] = both_DF['Nontemplate_strand'].str[(outer_up+3):(outer_up+13)]
    both_DF['U_count_downstreamRegion'] = both_DF['downstream_region'].str.count('t')
    both_DF['A_count_downstreamRegion'] = both_DF['downstream_region'].str.count('a')
    
    # str.match anchors the pattern at the start of Pause_lower.
    both_DF['Elemental_pause_match?'] = both_DF['Pause_lower'].str.match(pause_regex)
        
    sorted_DF = both_DF.sort_values(by = ['coordinate'])
    sorted_DF.to_csv(f'3enrich_NusAG/termination_figures/terminator_annotations/{output}')
    return sorted_DF

In [None]:
# Load the three terminator tables that are annotated in the cells below.
bsu_DF, eco_DF, mtb_DF = (
    pd.read_csv(csv_path)
    for csv_path in (
        '3enrich_NusAG/termination_figures/Bsu_terms.csv',
        '3enrich_NusAG/termination_figures/Eco_terms.csv',
        '3enrich_NusAG/termination_figures/intergenic_terminators.csv',
    )
)

In [None]:
# `output` is passed by keyword: a positional argument after keyword arguments is a SyntaxError.
# NOTE(review): terminator_annotation_species() also declares a required `genome_fasta`
# parameter that is not supplied here; add genome_fasta = <path to the Bsu genome FASTA>
# or the call raises TypeError.
terminator_annotation_species(bsu_DF,
                              'bsu',
                              outer_up = 50,
                              outer_down = 40,
                              hairpin_region_range = [44],
                              output = 'Bsu_terms_annotated.csv')

In [None]:
# `output` is passed by keyword: a positional argument after keyword arguments is a SyntaxError.
# NOTE(review): terminator_annotation_species() also declares a required `genome_fasta`
# parameter that is not supplied here; add genome_fasta = <path to the Eco genome FASTA>
# or the call raises TypeError.
terminator_annotation_species(eco_DF,
                              'eco',
                              outer_up = 50,
                              outer_down = 40,
                              hairpin_region_range = [44],
                              output = 'Eco_terms_annotated.csv')

In [None]:
# `output` is passed by keyword: a positional argument after keyword arguments is a SyntaxError.
# NOTE(review): terminator_annotation_species() also declares a required `genome_fasta`
# parameter that is not supplied here; add genome_fasta = <path to the Mtb genome FASTA>
# or the call raises TypeError.
terminator_annotation_species(mtb_DF,
                              'mtb',
                              outer_up = 50,
                              outer_down = 40,
                              hairpin_region_range = [44],
                              output = 'Mtb_terms_annotated.csv')