In [3]:
import subprocess, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import upsetplot
import itertools
from scipy import stats
from scipy.signal import find_peaks
from statsmodels.stats.multitest import fdrcorrection
from statistics import mean, stdev, median
import math
import statsmodels.api as sm
from statsmodels.formula.api import ols
from Bio import SeqIO
import csv
from toolz import interleave
import re
import scipy

In [4]:
# written by Peter Culviner, PhD to enable command-line access through Jupyter
def quickshell(command, print_output=True, output_path=None, return_output=False):
    """
    Run `command` through the shell and capture everything it writes.

    command: string handed to the shell unchanged (shell=True).
    print_output: when truthy, echo the command and the captured streams.
    output_path: optional file path; the captured report is written there.
    return_output: when truthy, return (stdout, stderr) as UTF-8 decoded strings;
        otherwise nothing is returned.
    """
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    captured_out = completed.stdout.decode('utf-8')
    captured_err = completed.stderr.decode('utf-8')

    # Single report string shared by the console echo and the optional log file
    report = 'STDOUT:\n{}\nSTDERR:\n{}\n'.format(captured_out, captured_err)

    if print_output:
        print('$ ' + command)
        print(report)

    if output_path is not None:
        with open(output_path, 'w') as log_file:
            log_file.write(report)

    if not return_output:
        return None
    return captured_out, captured_err

# Initializing inputs and settings

In [1]:
# Main variables to adjust
bootstrap_alpha = 1
bootstrap_sig_cutoff = 0.05
count_cutoff_range = range(10,210,10)
replicates = 3
# Significance cutoff for the DESeq2 results.
# The original cell only *referenced* this name (a bare expression), which raises
# NameError on a fresh kernel. 0.05 mirrors bootstrap_sig_cutoff and is a
# placeholder -- TODO confirm the intended value.
DESeq2_sig_cutoff = 0.05

# Length of spike genome (Eco) - adjust as needed
len_spike_genome = 4641651
# Length of experimental genome (Mtb) - adjust as needed
len_ref_genome = 4411709

# Pairwise comparison
condition_subset = ['noCRP','CRP']
blacklist_sample = 'core'

# Multi-condition comparison.
# NOTE: these two assignments override the pairwise values above, so this is the
# set that is actually in effect. Comment out whichever set does not apply.
condition_subset = ['noTF', 'NusA', 'NusG', 'NusA_NusG']
blacklist_sample = 'gDNA'

# Conditions joined with "_" (used to build file names)
condition_string = "_".join(condition_subset)

# Executable path for the MEME algorithm (depends on where you have it installed)
MEME_path = '/usr/local/bin/meme/bin'

In [None]:
# Motif information
#
# Two alternative parameter sets are listed one after the other. Because both
# are executed, the 3enrich assignments below OVERRIDE outer_up, outer_down and
# the QCmotif* values of the 5enrich set, while the DEmotif* values (assigned
# only in the 5enrich set) stay as they are. Comment out the set that does not
# apply to the current run.
# Presumably outer_* / *motifUp / *motifDown are the window bounds passed to
# meme_validation (outer_up, outer_down, motif_up, motif_down) -- TODO confirm
# against the calling cells.

# 5enrich experiment:
# QC (quality control) motif: -10 element
# DE (differential expression) motif: CRP motif
outer_up = 100
outer_down = 30
QCmotifUp = 16
QCmotifDown = 4
QCmotifName = 'min10'
QCmotifRegex = 'A...U'
DEmotifUp = 100
DEmotifDown = -30
DEmotifName = 'min100plus30'
DEmotifRegex = 'GUG........CAC'

# 3enrich experiment:
# QC motif: U-tract
# TF motif: none (no DEmotif* values are assigned in this set)
outer_up = 50
outer_down = 10
QCmotifUp = -9
QCmotifDown = -4
QCmotifName = 'utract'
QCmotifRegex = 'UUUU'

In [6]:
# Inputs
# Root directory of the experiment being analysed; every other path below is
# derived from it, so switching experiments only requires changing this line.
main_path = '5enrich_CRP'
#main_path = '3enrich_NusAG'
# Root directory of the genomic DNA (gDNA) data set
gDNA_path = 'gDNA'

# Directories from readPrep
readPrep_dir = f'{main_path}/readPrep'
R2_alignments_dir = f'{readPrep_dir}/R2_alignments'

# Directories from identifyEnrichedEnds
identifyEnds_dir = f'{main_path}/identifyEnrichedEnds'
bigWig_dir = f'{identifyEnds_dir}/bigWigs'
bootstrap_calls_dir = f'{identifyEnds_dir}/bootstrap_calls'
ends_dir = f'{identifyEnds_dir}/ends'
coverage_dir = f'{identifyEnds_dir}/coverage'
# Spike-in counterparts of the three directories above
spike_ends_dir = f'{identifyEnds_dir}/ends_spike'
spike_coverage_dir = f'{identifyEnds_dir}/cov_spike'
spike_bootstrap_calls_dir = f'{identifyEnds_dir}/bootstrap_calls_spike'

# Outputs
# Everything below lives under selectThreshold_dir except spike_union_dir,
# which is placed under identifyEnds_dir.
selectThreshold_dir = f'{main_path}/selectThreshold'
preBL_dir = f'{selectThreshold_dir}/transcriptEnds_preBlackList'
postBL_dir = f'{selectThreshold_dir}/transcriptEnds_postBlackList'
MEME_outputs_dir = f'{selectThreshold_dir}/MEME_outputs'
union_dir = f'{selectThreshold_dir}/transcriptEndUnions'
spike_union_dir = f'{identifyEnds_dir}/spikeTranscriptEndUnions'
DESeq2_dir = f'{selectThreshold_dir}/DESeq2'
seqExtract_dir = f'{selectThreshold_dir}/flankingSeqs'
motif_compilation_dir = f'{selectThreshold_dir}/allMotifResults'

In [None]:
!mkdir $selectThreshold_dir
!mkdir $preBL_dir
!mkdir $postBL_dir
!mkdir $MEME_outputs_dir
!mkdir $union_dir
!mkdir $spike_union_dir
!mkdir $DESeq2_dir
!mkdir $seqExtract_dir
!mkdir $motif_compilation_dir

In [7]:
# Sample sheet written during readPrep (must contain a 'Sample_Name' column)
sampleDF = pd.read_csv(f'{readPrep_dir}/downsampled_R2_bams/downsampled_depths.csv')
samples = sampleDF['Sample_Name']

# The last character of each sample name is taken as the replicate label and
# everything before it as the condition name.
sampleDF['condition'] = samples.str.slice(stop=-1)
sampleDF['replicate'] = samples.str.slice(start=-1)

In [8]:
# Import genome
# Every record of the FASTA file is visited and `genome` is overwritten each
# time, so it ends up holding the sequence of the LAST record in the file
# (it stays 0 when the file contains no records).
genome = 0
genome_iterator = SeqIO.parse('genome_files_misc/Eco_Mtb_genome.fasta', 'fasta')

for seq_record in genome_iterator:
    genome = seq_record.seq

# Functions

In [7]:
def consensus_peaks(condition, replicates, count_cutoff):
    """
    Generates a consensus dataframe of called peaks from the replicates of one condition.

    For every replicate the bootstrap calls are read from
    `{bootstrap_calls_dir}/{condition}{replicate}_calls_alpha{bootstrap_alpha}.txt`,
    restricted to rows with `count >= count_cutoff`, FDR-corrected again on the
    remaining p-values and filtered to `qvalue_updated <= bootstrap_sig_cutoff`.
    Only ends (column 'end') that pass in every replicate are kept, matched
    separately for the '+' and the '-' strand.

    condition: string with the name of the experimental condition, e.g. NusG.
        The replicate labels are looked up in the module-level `sampleDF`.
    replicates: number of replicates to read.
    count_cutoff: minimum value (inclusive) required in the 'count' column.

    Returns the per-replicate rows of the consensus ends ('+' rows first, then
    '-' rows) with a 'condition' column added. The same table is written to
    `preBL_dir`.
    """
    # Replicate labels of this condition, re-indexed 0..n-1 so that they can be
    # addressed by loop position.
    replicate_numbers = sampleDF.loc[sampleDF['condition'] == condition, 'replicate'].reset_index(drop=True)

    # Initialize empty lists
    plus_replicate_DF_list = []
    minus_replicate_DF_list = []
    plus_ends_list = []
    minus_ends_list = []

    # For each replicate:
    for i in range(replicates):

        # Read in the called peaks for each replicate
        next_DF = pd.read_table(f'{bootstrap_calls_dir}/{condition}{replicate_numbers[i]}' + \
                                f'_calls_alpha{bootstrap_alpha}.txt')
        # Add the replicate as a column
        next_DF['replicate'] = replicate_numbers[i]

        # Filter for ends that have a certain number of counts.
        # .copy() detaches the filtered table from next_DF so that the new column
        # below is a plain assignment instead of a write to a slice.
        next_DF_highCov = next_DF.loc[next_DF['count'] >= count_cutoff].copy()

        # Adjust p-values again after applying coverage filter
        # ('indep' is the long spelling of the original method="i")
        _, qvals = fdrcorrection(next_DF_highCov['pvalue'], method="indep")
        next_DF_highCov['qvalue_updated'] = qvals

        # Filter for significant peaks (q-val = post-multiple hypothesis testing correction)
        next_DF_sig = next_DF_highCov.loc[next_DF_highCov['qvalue_updated'] <= bootstrap_sig_cutoff]

        # Separate + and - coordinates
        next_DF_sig_plus = next_DF_sig.loc[next_DF_sig['strand'] == '+']
        next_DF_sig_minus = next_DF_sig.loc[next_DF_sig['strand'] == '-']

        # Collect the per-strand tables and the sets of called ends
        plus_replicate_DF_list.append(next_DF_sig_plus)
        minus_replicate_DF_list.append(next_DF_sig_minus)
        plus_ends_list.append(set(next_DF_sig_plus['end']))
        minus_ends_list.append(set(next_DF_sig_minus['end']))

    # Put all individual replicates per condition into one large DataFrame
    plus_replicate_DFs = pd.concat(plus_replicate_DF_list)
    minus_replicate_DFs = pd.concat(minus_replicate_DF_list)

    # Only include enriched ends that were called in all replicates
    plus_ends_intersection = set.intersection(*plus_ends_list)
    minus_ends_intersection = set.intersection(*minus_ends_list)
    plus_replicate_DF_allReps = plus_replicate_DFs.loc[plus_replicate_DFs['end'].isin(plus_ends_intersection)]
    minus_replicate_DF_allReps = minus_replicate_DFs.loc[minus_replicate_DFs['end'].isin(minus_ends_intersection)]

    # Combine + and - DFs (pd.concat returns a new frame, so adding a column is safe)
    replicate_DF_allReps = pd.concat([plus_replicate_DF_allReps, minus_replicate_DF_allReps])
    replicate_DF_allReps['condition'] = condition

    # Write out final dataframe
    replicate_DF_allReps.to_csv(f'{preBL_dir}/' + \
                    f'preBlackList_{condition}_consensus_calls_{replicates}reps_{count_cutoff}counts.csv')
    print(f'Generating consensus peaks: {condition} done')
    return replicate_DF_allReps

In [2]:
def consensus_peaks_gDNA(replicates, count_cutoff):
    """
    Generates a consensus dataframe for genomic DNA peaks.

    For every replicate the bootstrap calls are read from
    `{gDNA_path}/identifyEnrichedEnds/bootstrap_calls/gDNA{replicate}_calls_alpha{bootstrap_alpha}.txt`,
    restricted to rows with `count >= count_cutoff`, FDR-corrected again and
    filtered to `qvalue_updated <= bootstrap_sig_cutoff`. Only ends (column
    'end') that pass in every replicate are kept, matched per strand.

    replicates: number of replicates to read.
    count_cutoff: minimum value (inclusive) required in the 'count' column.

    Returns the per-replicate rows of the consensus ends with 'condition' set
    to 'gDNA'. The same table is written below
    `{gDNA_path}/selectThreshold/enrichedEnds/`.
    NOTE(review): that output directory is assumed to exist already -- confirm.
    """
    # The original body used `gDNA_dir` and `replicate_numbers`, neither of which
    # is defined in this notebook (NameError on a fresh kernel). The settings
    # cell defines `gDNA_path`, which is used here instead.
    #
    # Replicate labels: use the ones recorded for 'gDNA' in the sample sheet when
    # it lists enough of them; otherwise number the replicates 1..replicates.
    # NOTE(review): the fallback numbering is an assumption -- confirm that it
    # matches the gDNA file names.
    sheet_labels = sampleDF.loc[sampleDF['condition'] == 'gDNA', 'replicate'].tolist()
    if len(sheet_labels) >= replicates:
        replicate_numbers = sheet_labels[:replicates]
    else:
        replicate_numbers = list(range(1, replicates + 1))

    # Initialize empty lists
    plus_replicate_DF_list = []
    minus_replicate_DF_list = []
    plus_ends_list = []
    minus_ends_list = []

    # For each replicate:
    for i in range(replicates):

        # Read in the called peaks for each replicate
        next_DF = pd.read_table(f'{gDNA_path}/identifyEnrichedEnds/bootstrap_calls/gDNA{replicate_numbers[i]}' + \
                                f'_calls_alpha{bootstrap_alpha}.txt')
        # Add the replicate as a column
        next_DF['replicate'] = replicate_numbers[i]

        # Filter for ends that have a certain number of counts.
        # .copy() detaches the filtered table from next_DF so that the new column
        # below is a plain assignment instead of a write to a slice.
        next_DF_highCov = next_DF.loc[next_DF['count'] >= count_cutoff].copy()

        # Adjust p-values again after applying coverage filter
        # ('indep' is the long spelling of the original method="i")
        _, qvals = fdrcorrection(next_DF_highCov['pvalue'], method="indep")
        next_DF_highCov['qvalue_updated'] = qvals

        # Filter for significant peaks (q-val = post-multiple hypothesis testing correction)
        next_DF_sig = next_DF_highCov.loc[next_DF_highCov['qvalue_updated'] <= bootstrap_sig_cutoff]

        # Separate + and - coordinates
        next_DF_sig_plus = next_DF_sig.loc[next_DF_sig['strand'] == '+']
        next_DF_sig_minus = next_DF_sig.loc[next_DF_sig['strand'] == '-']

        # Collect the per-strand tables and the sets of called ends
        plus_replicate_DF_list.append(next_DF_sig_plus)
        minus_replicate_DF_list.append(next_DF_sig_minus)
        plus_ends_list.append(set(next_DF_sig_plus['end']))
        minus_ends_list.append(set(next_DF_sig_minus['end']))

    # Put all individual replicates into one large DataFrame
    plus_replicate_DFs = pd.concat(plus_replicate_DF_list)
    minus_replicate_DFs = pd.concat(minus_replicate_DF_list)

    # Only include enriched ends that were called in all replicates
    plus_ends_intersection = set.intersection(*plus_ends_list)
    minus_ends_intersection = set.intersection(*minus_ends_list)
    plus_replicate_DF_allReps = plus_replicate_DFs.loc[plus_replicate_DFs['end'].isin(plus_ends_intersection)]
    minus_replicate_DF_allReps = minus_replicate_DFs.loc[minus_replicate_DFs['end'].isin(minus_ends_intersection)]

    # Combine + and - DFs (pd.concat returns a new frame, so adding a column is safe)
    replicate_DF_allReps = pd.concat([plus_replicate_DF_allReps, minus_replicate_DF_allReps])
    replicate_DF_allReps['condition'] = 'gDNA'

    # Write out final dataframe
    replicate_DF_allReps.to_csv(f'{gDNA_path}/selectThreshold/enrichedEnds/' + \
                    f'gDNA_consensus_calls_{replicates}reps_{count_cutoff}counts.csv')
    print(f'Generating consensus peaks: gDNA done')
    return replicate_DF_allReps

In [8]:
def remove_blacklist_coordinates(coordinate_DF, blacklist_DF, column):
    '''
    Drops every row of coordinate_DF whose value in `column` also occurs in the
    same column of blacklist_DF.

    coordinate_DF, blacklist_DF: Pandas DataFrames that both contain `column`.
    column: name of the column to compare; only this column is compared.

    Returns the surviving rows of coordinate_DF with their original index
    labels. Neither input is modified.
    '''
    column_name = f'{column}'
    is_blacklisted = coordinate_DF[column_name].isin(blacklist_DF[column_name])
    return coordinate_DF.loc[~is_blacklisted]

In [9]:
def extract_coordinate_environment(genome_seq, coord_DF, upstream_end, downstream_end, samples, count_cutoff):
    """
    Extract the non-template-strand sequence around every coordinate.

    genome_seq: genome sequence; must support len(), slicing, `+`, `.lower()`
        and `.reverse_complement()` (e.g. a Bio.Seq.Seq). It is treated as
        circular: windows reaching past either end wrap around.
    coord_DF: DataFrame with 'coordinate' and 'strand' ('+' or '-') columns.
        NOTE: an 'IndexingCoordinate' column is added to it in place.
    upstream_end: number of bases to extract upstream of the coordinate.
    downstream_end: the extracted window ends `downstream_end - 1` bases
        downstream of the coordinate.
    samples: sample label used in the output file name.
    count_cutoff: count threshold used in the output file name.

    Returns the rows of both strands sorted by 'coordinate', with the columns
    'index', 'IndexingCoordinate', 'start', 'end', 'IndexCoordMin1',
    'IndexCoordPlus1' and 'Nontemplate_strand' added. The sequence is a plain
    string: flanks in lower case, the base at the coordinate as found in the
    genome. The table is also written to
    `{seqExtract_dir}/{samples}_{count_cutoff}counts_seqExtract.csv`.
    """
    genome_length = len(genome_seq)
    # Two copies back to back, so that a window crossing the end of the genome
    # is a plain slice.
    double_genome = genome_seq + genome_seq

    coord_DF['IndexingCoordinate'] = coord_DF['coordinate']

    # A window that would start before position 0 is taken from the second
    # genome copy instead. The original tested `coordinate < upstream_end` for
    # '-' and `coordinate < downstream_end` for '+', which left '+' coordinates
    # in [downstream_end, upstream_end) with a negative slice start and
    # therefore without their upstream flank. Shifting every coordinate closer
    # to the origin than the larger of the two bounds covers both strands;
    # shifting a row that did not strictly need it yields the same sequence.
    near_origin = coord_DF['coordinate'] < max(upstream_end, downstream_end)
    coord_DF.loc[near_origin, 'IndexingCoordinate'] = coord_DF.loc[near_origin, 'coordinate'] + genome_length

    # .copy() so that the helper columns are added to independent tables
    plus_DF = coord_DF.loc[coord_DF['strand'] == "+"].copy()
    minus_DF = coord_DF.loc[coord_DF['strand'] == "-"].copy()

    # + strand: the window runs from `upstream_end` bases before the coordinate
    # up to (not including) coordinate + downstream_end
    plus_DF['start'] = plus_DF['IndexingCoordinate'] - upstream_end
    plus_DF['end'] = plus_DF['IndexingCoordinate'] + downstream_end
    plus_DF['IndexCoordMin1'] = plus_DF['IndexingCoordinate'] - 1
    plus_DF['IndexCoordPlus1'] = plus_DF['IndexingCoordinate'] + 1
    plus_DF = plus_DF.reset_index()

    plus_sequences = []
    for start, site, stop in zip(plus_DF['start'], plus_DF['IndexingCoordinate'], plus_DF['end']):
        start, site, stop = int(start), int(site), int(stop)
        # Flanks are lower-cased so that the base at the coordinate stands out
        upstream = double_genome[start:site].lower()
        site_base = double_genome[site:site + 1]
        downstream = double_genome[site + 1:stop].lower()
        plus_sequences.append(str(upstream + site_base + downstream))
    # One column assignment instead of cell-by-cell chained assignment, which
    # pandas does not guarantee to write through.
    plus_DF['Nontemplate_strand'] = pd.Series(plus_sequences, index=plus_DF.index, dtype=object)

    # - strand: mirror image of the + strand window on the forward sequence
    minus_DF['start'] = minus_DF['IndexingCoordinate'] - (downstream_end-1)
    minus_DF['end'] = minus_DF['IndexingCoordinate'] + (upstream_end+1)
    minus_DF['IndexCoordMin1'] = minus_DF['IndexingCoordinate'] - 1
    minus_DF['IndexCoordPlus1'] = minus_DF['IndexingCoordinate'] + 1
    minus_DF = minus_DF.reset_index()

    minus_sequences = []
    for start, stop in zip(minus_DF['start'], minus_DF['end']):
        # The reverse complement of the forward-strand window is the
        # non-template strand; after it the coordinate sits at offset upstream_end.
        nontemplate = double_genome[int(start):int(stop)].reverse_complement()
        upstream = nontemplate[:upstream_end].lower()
        site_base = nontemplate[upstream_end:upstream_end+1]
        downstream = nontemplate[upstream_end+1:].lower()
        minus_sequences.append(str(upstream + site_base + downstream))
    minus_DF['Nontemplate_strand'] = pd.Series(minus_sequences, index=minus_DF.index, dtype=object)

    complete_DF = pd.concat([plus_DF, minus_DF])
    complete_DF_ordered = complete_DF.sort_values(by = 'coordinate')
    complete_DF_ordered.to_csv(f'{seqExtract_dir}/{samples}_{count_cutoff}counts_seqExtract.csv')
    return complete_DF_ordered

In [10]:
def meme_formatting(DF, region_column, sample_name, MEME_dir_counts):
    """
    Write the sequences of a DataFrame as FASTA records for MEME.

    DF: DataFrame with 'coordinate' and 'strand' columns plus `region_column`.
        A 'fasta_ID' column ('>' + coordinate + strand) is added to it in place.
    region_column: name of the column holding the sequences of interest.
        For MEME, sequences need to be >8bp.
    sample_name: file name (without extension) of the FASTA file.
    MEME_dir_counts: string with the local directory to write into.

    The file is produced through to_csv with quoting disabled and '(' as the
    escape character, so the writer may insert '(' characters into the file
    (e.g. before the newline inside each record); they have to be stripped
    from the file before use.
    """
    record_headers = '>' + DF['coordinate'].astype('str') + DF['strand']
    DF['fasta_ID'] = record_headers

    # Header line and sequence line of each record, joined into one field
    records = record_headers + '\n' + DF[f'{region_column}']

    destination = '{}/{}.fasta'.format(MEME_dir_counts, sample_name)
    records.to_csv(
        destination,
        header=None,
        index=False,
        escapechar='(',
        quoting=csv.QUOTE_NONE,
    )

In [11]:
def meme_validation(exp_DF, control_DF, outer_up, outer_down, motif_up, motif_down,
                    condition, count_cutoff, meme_algorithm, motif, MEME_dir_counts):
    """
    Extract sequences & carry out MEME validation of desired motif.

    exp_DF: a dataframe with enriched coordinates. Requires a 'coordinate' and 'strand' column.
    control_DF: a dataframe with control coordinates.
        For bootstrap call threshold validation, likely to be a set of random coordinates.
        For differential expression validation, likely to be the union of coordinates called in the condition set.
    outer_up: an integer with the amount upstream of the coordinate to extract.
    outer_down: an integer with the amount downstream of the coordinate to extract
        (negative numbers to go upstream of the coordinate.)
    motif_up: upstream bound of the motif region (extracted via outer_up - motif_up)
    motif_down: downstream bound of the motif region. (extracted via outer_up - motif_down)
    condition: a string with the sample name (can be one condition, or multiple conditions joined from a list).
    count_cutoff: integer with the end count threshold for bootstrap calling.
    meme_algorithm: XSTREME, MEME or STREME (upper or lower case). XSTREME encompasses MEME and STREME,
        so it is recommended. For any other value the FASTA files are still written,
        but no motif search is run and a warning is printed.
    motif: string describing the motif being examined for output naming (e.g. min10, CRP).
    MEME_dir_counts: string with path to local directory for specific MEME instance.

    Returns nothing; the results are the files written to `seqExtract_dir` and `MEME_dir_counts`.
    """
    # Extract sequence environment
    extract_coordinate_environment(genome, exp_DF, outer_up, outer_down, condition, count_cutoff)
    extract_coordinate_environment(genome, control_DF, outer_up, outer_down, f'{condition}_control', count_cutoff)
    # Have to re-read DFs in for correct sequence formatting
    exp_seqDF = pd.read_csv(f'{seqExtract_dir}/{condition}_{count_cutoff}counts_seqExtract.csv')
    control_seqDF = pd.read_csv(f'{seqExtract_dir}/{condition}_control_{count_cutoff}counts_seqExtract.csv')

    # The extracted sequence starts outer_up bases upstream of the coordinate,
    # so offsets relative to the coordinate are shifted by outer_up.
    motif_window = slice(outer_up - motif_up, outer_up - motif_down)
    exp_seqDF['motif_region'] = exp_seqDF['Nontemplate_strand'].str[motif_window]
    control_seqDF['motif_region'] = control_seqDF['Nontemplate_strand'].str[motif_window]

    # Write out FASTA files for MEME input
    meme_formatting(exp_seqDF, 'motif_region', f'{condition}_{motif}_{count_cutoff}counts', MEME_dir_counts)
    meme_formatting(control_seqDF, 'motif_region', f'{condition}_{motif}_control_{count_cutoff}counts', MEME_dir_counts)
    # meme_formatting writes with '(' as escape character; strip those characters
    # from every FASTA file in the directory. (Backslash doubled so that the
    # Python literal is valid; the command sent to the shell is unchanged.)
    replace_command = f'sed -i s/\\(//g {MEME_dir_counts}/*.fasta'
    quickshell(replace_command, print_output = False, return_output = False)

    exp_fasta = f'{MEME_dir_counts}/{condition}_{motif}_{count_cutoff}counts.fasta'
    control_fasta = f'{MEME_dir_counts}/{condition}_{motif}_control_{count_cutoff}counts.fasta'
    output_prefix = f'{MEME_dir_counts}/{condition}_{count_cutoff}counts_{motif}'

    # One command line per supported algorithm
    commands = {
        "meme": f'{MEME_path}/meme {exp_fasta} ' + \
                f'-neg {control_fasta} ' + \
                f'-objfun de -oc {output_prefix}_meme -evt 0.05',
        "streme": f'{MEME_path}/streme --p {exp_fasta} ' + \
                  f'--n {control_fasta} ' + \
                  f'--objfun de --oc {output_prefix}_streme --rna',
        "xstreme": f'{MEME_path}/xstreme --p {exp_fasta} ' + \
                   f'--n {control_fasta} ' + \
                   f'--rna --oc {output_prefix}_xstreme',
    }

    # Call MEME from local installation. The name is matched case-insensitively
    # because the documentation above spells the algorithms in upper case.
    algorithm = str(meme_algorithm).lower()
    if algorithm in commands:
        quickshell(commands[algorithm], print_output = True)
    else:
        print(f'meme_validation: unknown meme_algorithm {meme_algorithm!r}; ' +
              'expected meme, streme or xstreme - no motif search was run')

In [12]:
def import_singleBp_info(condition, replicates, strand):
    """
    Load the per-position end counts and coverage counts of every replicate of
    one condition.

    condition: string naming a condition of the sequencing experiment, e.g. "NusG";
        its replicate labels are looked up in the module-level `sampleDF`.
    replicates: integer number of replicates to read.
    strand: string, "minus" or "plus" (part of the file names).

    Files are read from `ends_dir` and `coverage_dir`. The first
    `len_spike_genome` lines of each file are skipped; because header=0 is
    combined with `names`, the next line is treated as a header row and
    replaced by the column names 'chr', 'coordinate', 'counts'.

    Returns (ends_DFs, cov_DFs): two lists with one DataFrame per replicate,
    each carrying a 'replicate' column numbered 1..replicates.
    """
    belongs_to_condition = sampleDF['condition'] == condition
    replicate_labels = sampleDF.loc[belongs_to_condition, 'replicate'].reset_index(drop=True)

    column_names = ['chr', 'coordinate', 'counts']
    ends_DFs, cov_DFs = [], []

    for position in range(replicates):
        label = replicate_labels[position]

        ends_table = pd.read_table(f'{ends_dir}/{condition}{label}_ends_{strand}.txt',
                                   skiprows=len_spike_genome, header=0, names=column_names)
        cov_table = pd.read_table(f'{coverage_dir}/{condition}{label}_coverage_{strand}.txt',
                                  skiprows=len_spike_genome, header=0, names=column_names)

        # Replicates are numbered by position, not by their sample-sheet label
        ends_table['replicate'] = position + 1
        cov_table['replicate'] = position + 1

        ends_DFs.append(ends_table)
        cov_DFs.append(cov_table)

    return ends_DFs, cov_DFs

In [13]:
def spike_import_singleBp_info(condition, replicates, strand):
    """
    Spike-in counterpart of the single-bp import: load the per-position end
    counts and coverage counts of every replicate of one condition from
    `spike_ends_dir` and `spike_coverage_dir`.

    condition: string naming a condition of the sequencing experiment, e.g. "NusG";
        its replicate labels are looked up in the module-level `sampleDF`.
    replicates: integer number of replicates to read.
    strand: string, "minus" or "plus" (part of the file names).

    The first `len_ref_genome` lines of each file are skipped; because header=0
    is combined with `names`, the next line is treated as a header row and
    replaced by the column names 'chr', 'coordinate', 'counts'.
    NOTE(review): this assumes the reference-genome rows come first in the
    spike files -- confirm against how those files are generated.

    Returns (ends_DFs, cov_DFs): two lists with one DataFrame per replicate,
    each carrying a 'replicate' column numbered 1..replicates.
    """
    labels = sampleDF.loc[sampleDF['condition'] == condition, 'replicate'].reset_index(drop=True)

    def load_track(directory, label, track):
        # One single-bp table; `track` is the file-name part 'ends' or 'coverage'
        return pd.read_table(f'{directory}/{condition}{label}_{track}_{strand}.txt',
                             skiprows=len_ref_genome,
                             header = 0, names = ['chr','coordinate','counts'])

    ends_DFs = []
    cov_DFs = []
    for rep_index in range(replicates):
        replicate_number = rep_index + 1

        ends_table = load_track(spike_ends_dir, labels[rep_index], 'ends')
        cov_table = load_track(spike_coverage_dir, labels[rep_index], 'coverage')

        ends_table['replicate'] = replicate_number
        cov_table['replicate'] = replicate_number

        ends_DFs.append(ends_table)
        cov_DFs.append(cov_table)

    return ends_DFs, cov_DFs

In [14]:
def union_DF(condition_list, count_cutoff, multifactor):
    """
    Generate one DataFrame with information for the union of all coordinates called
    in any condition, broken down by individual replicate.

    condition_list: list of condition names. For each one the post-blacklist calls
        `{postBL_dir}/postBlackList_{condition}_{count_cutoff}counts.csv` are read
        ('end' and 'strand' columns) and the single-bp tables are loaded with
        import_singleBp_info.
    count_cutoff: count threshold used in the input and output file names.
    multifactor: "Yes" writes `union_DF_multifactor_{count_cutoff}counts.csv`;
        any other value writes `union_DF_{conditions joined by _}_{count_cutoff}counts.csv`.

    Uses the module-level `replicates`, `postBL_dir` and `union_dir`.

    Returns the table that is also written to `union_dir`: one row per coordinate ID
    (end + strand, sorted so that the row order is reproducible) with, per
    condition and replicate n, the columns `{condition}{n}_ends`,
    `{condition}{n}_cov` and `{condition}{n}_ER` (ends / cov), followed by one
    `{condition}_called` column ("Yes"/"No") per condition.

    Raises KeyError when a coordinate has no count in a single-bp table.
    """
    def counts_at(single_bp_DF, wanted_positions, label):
        # Look all positions up in one pass. The table is first restricted to
        # the requested positions so that the lookup index stays small, then
        # aligned to the requested order.
        hits = single_bp_DF.loc[single_bp_DF['coordinate'].isin(wanted_positions), ['coordinate', 'counts']]
        aligned = hits.set_index('coordinate')['counts'].reindex(wanted_positions)
        if aligned.isna().any():
            missing = aligned.index[aligned.isna()].tolist()
            raise KeyError(f'{label}: no count found for coordinate(s) {missing[:10]}')
        return aligned.to_numpy()

    # Read in consensus dataframes
    print('Reading in consensus DFs')

    # One set of coordinate IDs (end + strand) per condition
    coord_IDs = []
    for condition in condition_list:
        condition_DF = pd.read_csv(f'{postBL_dir}/postBlackList_{condition}_{count_cutoff}counts.csv')
        coord_IDs.append(set(condition_DF['end'].astype(str) + condition_DF['strand']))

    # Union of the coordinates of all conditions, sorted because the iteration
    # order of a set of strings differs between interpreter runs.
    all_coords_list = sorted(set().union(*coord_IDs))

    # Split every coordinate ID into its position and its strand. As before,
    # every ID that does not end in "-" is looked up in the plus-strand tables.
    positions = np.array([int(coord_ID[:-1]) for coord_ID in all_coords_list], dtype='int64')
    on_minus = np.array([coord_ID[-1] == "-" for coord_ID in all_coords_list], dtype=bool)
    on_plus = ~on_minus

    # Initialize union dataframe
    union_table = pd.DataFrame(data = all_coords_list, columns = ['coordID'])

    # Conditions are handled one at a time so that only the single-bp tables of
    # the current condition are held in memory.
    print('Reading in single-bp resolution information')
    for condition in condition_list:
        print(f'Single-bp res: reading in {condition}')
        minus_ends_DFs, minus_cov_DFs = import_singleBp_info(condition, replicates, "minus")
        plus_ends_DFs, plus_cov_DFs = import_singleBp_info(condition, replicates, "plus")

        # For each replicate:
        for j in range(replicates):
            print(f'Iterating through {condition} replicate {j+1} to extract counts of ends and total coverage')
            column_prefix = f'{condition}{j+1}'

            ends_counts = np.empty(len(all_coords_list), dtype='int64')
            cov_counts = np.empty(len(all_coords_list), dtype='int64')
            ends_counts[on_minus] = counts_at(minus_ends_DFs[j], positions[on_minus], f'{column_prefix} minus ends')
            ends_counts[on_plus] = counts_at(plus_ends_DFs[j], positions[on_plus], f'{column_prefix} plus ends')
            cov_counts[on_minus] = counts_at(minus_cov_DFs[j], positions[on_minus], f'{column_prefix} minus coverage')
            cov_counts[on_plus] = counts_at(plus_cov_DFs[j], positions[on_plus], f'{column_prefix} plus coverage')

            union_table[f'{column_prefix}_ends'] = ends_counts
            union_table[f'{column_prefix}_cov'] = cov_counts
            union_table[f'{column_prefix}_ER'] = union_table[f'{column_prefix}_ends'] / union_table[f'{column_prefix}_cov']

    print("Annotating conditions in which coordinate was called")
    # "Yes" where the coordinate ID is among the calls of the condition
    for condition, called_IDs in zip(condition_list, coord_IDs):
        union_table[f'{condition}_called'] = np.where(union_table['coordID'].isin(called_IDs), 'Yes', 'No')

    condition_string = "_".join(condition_list)
    if multifactor == "Yes":
        union_table.to_csv(f'{union_dir}/union_DF_multifactor_{count_cutoff}counts.csv')
    else:
        union_table.to_csv(f'{union_dir}/union_DF_{condition_string}_{count_cutoff}counts.csv')
    print(' ')
    return union_table

In [15]:
def spike_union_DF(condition_list, count_cutoff, multifactor=None):
    """
    Generate one DataFrame with information for the union of all spike-in coordinates
    called in any condition, broken down by individual replicate.

    condition_list: list of condition names. For each one the consensus calls
        `{spike_bootstrap_calls_dir}/spike_{condition}_consensus_calls_{replicates}reps_2counts.csv`
        are read ('end' and 'strand' columns) and the single-bp tables are loaded
        with spike_import_singleBp_info.
    count_cutoff: accepted for symmetry with union_DF but not used; the input and
        output file names are fixed to "2counts".
    multifactor: "Yes" writes `union_DF_multifactor_2counts.csv`; any other value
        writes `union_DF_{conditions joined by _}_2counts.csv`. The original body
        read this name without receiving it as an argument, so when it is left at
        None a module-level `multifactor` is used if one exists, otherwise "No".

    Uses the module-level `replicates`, `spike_bootstrap_calls_dir` and `spike_union_dir`.

    Returns the table that is also written to `spike_union_dir`: one row per
    coordinate ID (end + strand, prefixed with "Eco_", sorted so that the row order
    is reproducible) with, per condition and replicate n, the columns
    `{condition}{n}_ends`, `{condition}{n}_cov` and `{condition}{n}_ER` (ends / cov),
    followed by one `{condition}_called` column ("Yes"/"No") per condition.

    Raises KeyError when a coordinate has no count in a single-bp table.
    """
    if multifactor is None:
        # Backward compatibility with notebooks that set `multifactor` globally
        multifactor = globals().get('multifactor', 'No')

    def lookup_counts(single_bp_DF, wanted_positions, label):
        # Look all positions up in one pass. The table is first restricted to
        # the requested positions so that the lookup index stays small, then
        # aligned to the requested order.
        subset = single_bp_DF.loc[single_bp_DF['coordinate'].isin(wanted_positions), ['coordinate', 'counts']]
        aligned = subset.set_index('coordinate')['counts'].reindex(wanted_positions)
        if aligned.isna().any():
            missing = aligned.index[aligned.isna()].tolist()
            raise KeyError(f'{label}: no count found for coordinate(s) {missing[:10]}')
        return aligned.to_numpy()

    # Read in consensus dataframes
    print('Reading in consensus DFs')

    # One set of coordinate IDs (end + strand) per condition
    coord_IDs = []
    for condition in condition_list:
        condition_DF = pd.read_csv(f'{spike_bootstrap_calls_dir}/' + \
                                   f'spike_{condition}_consensus_calls_' + \
                                   f'{replicates}reps_2counts.csv')
        coord_IDs.append(set(condition_DF['end'].astype(str) + condition_DF['strand']))

    # Union of the coordinates of all conditions, sorted because the iteration
    # order of a set of strings differs between interpreter runs.
    all_coords_list = sorted(set().union(*coord_IDs))

    # Split every coordinate ID into its position and its strand. As before,
    # every ID that does not end in "-" is looked up in the plus-strand tables.
    positions = np.array([int(coord_ID[:-1]) for coord_ID in all_coords_list], dtype='int64')
    on_minus = np.array([coord_ID[-1] == "-" for coord_ID in all_coords_list], dtype=bool)
    on_plus = ~on_minus

    # Initialize union dataframe
    union_table = pd.DataFrame(data = all_coords_list, columns = ['coordID'])

    # Conditions are handled one at a time so that only the single-bp tables of
    # the current condition are held in memory.
    print('Reading in single-bp resolution information')
    for condition in condition_list:
        print(f'Single-bp res: reading in {condition}')
        minus_ends_DFs, minus_cov_DFs = spike_import_singleBp_info(condition, replicates, "minus")
        plus_ends_DFs, plus_cov_DFs = spike_import_singleBp_info(condition, replicates, "plus")

        # For each replicate:
        for j in range(replicates):
            print(f'Iterating through {condition} replicate {j+1} to extract counts of ends and total coverage')
            column_prefix = f'{condition}{j+1}'

            ends_counts = np.empty(len(all_coords_list), dtype='int64')
            cov_counts = np.empty(len(all_coords_list), dtype='int64')
            ends_counts[on_minus] = lookup_counts(minus_ends_DFs[j], positions[on_minus], f'{column_prefix} minus ends')
            ends_counts[on_plus] = lookup_counts(plus_ends_DFs[j], positions[on_plus], f'{column_prefix} plus ends')
            cov_counts[on_minus] = lookup_counts(minus_cov_DFs[j], positions[on_minus], f'{column_prefix} minus coverage')
            cov_counts[on_plus] = lookup_counts(plus_cov_DFs[j], positions[on_plus], f'{column_prefix} plus coverage')

            union_table[f'{column_prefix}_ends'] = ends_counts
            union_table[f'{column_prefix}_cov'] = cov_counts
            union_table[f'{column_prefix}_ER'] = union_table[f'{column_prefix}_ends'] / union_table[f'{column_prefix}_cov']

    print("Annotating conditions in which coordinate was called")
    # "Yes" where the coordinate ID is among the calls of the condition
    for condition, called_IDs in zip(condition_list, coord_IDs):
        union_table[f'{condition}_called'] = np.where(union_table['coordID'].isin(called_IDs), 'Yes', 'No')

    condition_string = "_".join(condition_list)

    # Mark the coordinates as spike-in; done after the annotation above, which
    # matches on the unprefixed IDs.
    union_table['coordID'] = "Eco_" + union_table['coordID']
    if multifactor == "Yes":
        union_table.to_csv(f'{spike_union_dir}/union_DF_multifactor_2counts.csv')
    else:
        union_table.to_csv(f'{spike_union_dir}/union_DF_{condition_string}_2counts.csv')
    print(' ')
    return union_table

In [16]:
def DESeq2_design_matrix(name_array, replicates, test):
    """
    Build the DESeq2 sample table for a Wald test and save it as
    `{DESeq2_dir}/<conditions joined by '_'>_Wald_design_matrix.csv`.

    name_array = condition names; each condition contributes `replicates` consecutive rows.
    replicates = number of replicates per condition (numbered 1..replicates).
    test = must be 'ends' or 'readthru'; recorded in the 'count_type' column.

    Returns the design matrix DataFrame (columns: condition, replicate, count_type).
    Raises ValueError when `test` is not one of the two supported values.
    """
    if test not in ('ends', 'readthru'):
        raise ValueError(f"test must be 'ends' or 'readthru', got {test!r}")

    # One row per (condition, replicate), conditions in the order given
    conditions = np.repeat(name_array, replicates, axis=0)
    design_matrix = pd.DataFrame(data=conditions, columns=['condition'])
    design_matrix['replicate'] = np.tile(np.arange(1, replicates + 1), len(name_array))
    # Previously both branches wrote 'ends' here (copy-paste); label the rows with the
    # count type that was actually requested.
    design_matrix['count_type'] = test

    name_string = '_'.join(name_array)
    design_matrix.to_csv(f'{DESeq2_dir}/{name_string}_Wald_design_matrix.csv')
    return design_matrix

In [18]:
def DESeq2_countdata_readthru(design_matrix, name_array, union_DF, count_cutoff, multifactor):
    """
    Build the read-through count table (total coverage minus end counts) for DESeq2.

    design_matrix = sample table with 'condition' and 'replicate' columns; its row order
        defines the column order of the output. It is not modified.
    name_array = condition names; only used to build the output file name.
    union_DF = table with a 'coordID' column plus '<sample>_ends' and '<sample>_cov' columns.
    count_cutoff = count threshold; only used in the output file name.
    multifactor = "Yes" (any capitalisation) to also write the multifactor copy of the table.

    Returns the read-through DataFrame indexed by coordID, one column per sample.
    Writes `{DESeq2_dir}/<conditions>_countdata_readthru_<cutoff>counts.csv` and, for
    multifactor runs, `{DESeq2_dir}/multifactor_countdata_readthru_<cutoff>counts.csv`.
    """
    # Strip the suffixes so the two tables share sample names and subtract column-by-column
    ends_only = union_DF.filter(like='_ends').rename(columns=lambda name: name.replace('_ends', ''))
    cov_only = union_DF.filter(like='_cov').rename(columns=lambda name: name.replace('_cov', ''))

    readthru = cov_only.subtract(ends_only)
    indexed_DF = readthru.set_index(union_DF['coordID'])

    # Sample names in design-matrix order. Columns are matched exactly: the old '^name'
    # prefix regex also picked up e.g. 'cond10' when asked for 'cond1'.
    sample_names = [f'{condition}{replicate}'
                    for condition, replicate in zip(design_matrix['condition'],
                                                    design_matrix['replicate'])]
    missing = [name for name in sample_names if name not in indexed_DF.columns]
    if missing:
        # Same best-effort behaviour as before (the sample is left out), but no longer silent
        print('Warning: no read-through column found for sample(s): ' + ', '.join(missing))
    final_DF = indexed_DF[[name for name in sample_names if name in indexed_DF.columns]]

    name_string = '_'.join(name_array)
    final_DF.to_csv(f'{DESeq2_dir}/{name_string}_countdata_readthru_{count_cutoff}counts.csv')
    # Callers in this notebook pass both "Yes" and "yes"; accept either
    if str(multifactor).lower() == "yes":
        final_DF.to_csv(f'{DESeq2_dir}/multifactor_countdata_readthru_{count_cutoff}counts.csv')

    return final_DF

In [19]:
def DESeq2_countdata_ends(design_matrix, name_array, union_DF, count_cutoff, multifactor):
    """
    Build the end-count table for DESeq2, with the reference sample's columns first.

    design_matrix = accepted for signature parity with DESeq2_countdata_readthru; not used.
    name_array = condition names; name_array[0] is the reference sample.
    union_DF = table with a 'coordID' column plus '<condition><replicate>_ends' columns.
    count_cutoff = count threshold; only used in the output file name.
    multifactor = "Yes" (any capitalisation) to also write the multifactor copy of the table.

    Returns the end-count DataFrame indexed by coordID.
    Writes `{DESeq2_dir}/<conditions>_countdata_ends_<cutoff>counts.csv` and, for
    multifactor runs, `{DESeq2_dir}/multifactor_countdata_ends_<cutoff>counts.csv`.
    """
    ends_only = union_DF.filter(like='_ends')
    ref_sample = name_array[0]

    # Match '<ref><replicate>_ends' exactly: a plain substring test would also treat
    # 'NusA_NusG1_ends' as a column of the reference sample 'NusA'.
    ref_pattern = re.compile(re.escape(ref_sample) + r'\d+_ends')
    ref_sample_columns = [col for col in ends_only.columns if ref_pattern.fullmatch(col)]
    if not ref_sample_columns:
        # Column names do not follow the expected scheme; keep the original substring rule
        ref_sample_columns = [col for col in ends_only.columns if ref_sample in col]
    other_columns = [col for col in ends_only.columns if col not in ref_sample_columns]

    sorted_DF = ends_only[ref_sample_columns + other_columns]
    indexed_DF = sorted_DF.set_index(union_DF['coordID'])

    name_string = '_'.join(name_array)
    indexed_DF.to_csv(f'{DESeq2_dir}/{name_string}_countdata_ends_{count_cutoff}counts.csv')
    # Callers in this notebook pass both "Yes" and "yes"; accept either so the
    # multifactor table is always written when requested
    if str(multifactor).lower() == "yes":
        indexed_DF.to_csv(f'{DESeq2_dir}/multifactor_countdata_ends_{count_cutoff}counts.csv')

    return indexed_DF

In [21]:
def _best_dispersion_fit(results_prefix):
    """Load `<prefix>_local/_parametric/_mean.csv` and return the table with the smallest median residual."""
    fit_tables = [pd.read_csv(f'{results_prefix}_{fit_type}.csv')
                  for fit_type in ('local', 'parametric', 'mean')]

    def median_residual(fit_table):
        value = float(fit_table['median_residuals'][0])
        # A NaN residual must never win the comparison
        return math.inf if math.isnan(value) else value

    # min() keeps the first of equal candidates, i.e. local > parametric > mean on ties
    return min(fit_tables, key=median_residual)


def flexible_DESeq2(condition_subset, test, design, unionDF, count_cutoff):
    """
    Run DESeq2 on input counts.
    condition_subset = a list of conditions to compare. 
        For pairwise, reference sample should come first (denominator in log2 fold change calculations).
    test = must be 'ends' or 'readthru'
    design = must be 'pairwise' or 'multifactor'
    unionDF = table of coordinates and per-sample counts handed to the count-data writers.
    count_cutoff = count threshold; used in the input/output file names.

    The R script is run with three dispersion fit types (local, parametric, mean); the
    result table with the smallest 'median_residuals' value is returned.
    Raises ValueError for an unsupported `test` or `design`.
    """
    if test not in ('ends', 'readthru'):
        raise ValueError(f"test must be 'ends' or 'readthru', got {test!r}")
    if design not in ('pairwise', 'multifactor'):
        raise ValueError(f"design must be 'pairwise' or 'multifactor', got {design!r}")

    condition_string = '_'.join(condition_subset)
    # Written for every design, as before (the multifactor run replaces it below)
    design_matrix = DESeq2_design_matrix(condition_subset, replicates, test)
    write_countdata = DESeq2_countdata_ends if test == 'ends' else DESeq2_countdata_readthru

    if design == 'pairwise':
        write_countdata(design_matrix, condition_subset, unionDF, count_cutoff, "No")

        design_formula = '~ condition'
        # Results are read back from the files of the SAME test that was just run
        # (the ends branch used to read the readthru results).
        results_prefix = f'{DESeq2_dir}/{condition_string}_{count_cutoff}counts_{test}_results_Wald'

        DESeq2_command = f'Rscript --vanilla Rscripts/CFG_DESeq2_pairwise.R ' + \
            f'-t "Wald" ' + \
            f'-r "{condition_subset[0]}" ' + \
            f'-n "{condition_subset[1]}" ' + \
            f'-d {DESeq2_dir}/{condition_string}_Wald_design_matrix.csv ' + \
            f'-c {DESeq2_dir}/{condition_string}_countdata_{test}_{count_cutoff}counts.csv ' + \
            f'-f "{design_formula}" ' + \
            f'-s "{bigWig_dir}/{condition_string}_sizeFactors_{test}.csv" ' + \
            f'-l {results_prefix}_local.csv ' + \
            f'-p {results_prefix}_parametric.csv ' + \
            f'-m {results_prefix}_mean.csv'
    else:
        design_matrix = DESeq2_design_matrix_interaction(condition_subset, replicates, "raw_ends_counts", True)
        # "Yes" (not "yes") so the multifactor count table read by the R script is written
        write_countdata(design_matrix, condition_subset, unionDF, count_cutoff, "Yes")

        results_prefix = f'{DESeq2_dir}/multifactor_{count_cutoff}counts_{test}_results'

        DESeq2_command = f'Rscript --vanilla Rscripts/CFG_DESeq2_multifactor.R ' + \
            f'-d {DESeq2_dir}/multifactor_LRT_int_design_matrix.csv ' + \
            f'-c {DESeq2_dir}/multifactor_countdata_{test}_{count_cutoff}counts.csv ' + \
            f'-s "{bigWig_dir}/multifactor_sizeFactors_{test}.csv" ' + \
            f'-l {results_prefix}_local.csv ' + \
            f'-p {results_prefix}_parametric.csv ' + \
            f'-m {results_prefix}_mean.csv'

    quickshell(DESeq2_command, print_output = True)
    return _best_dispersion_fit(results_prefix)

# Run script for range of counts

In [24]:
def _random_control_DF(coord_DF):
    """Draw one random control coordinate per row of `coord_DF` (within its min/max), keeping each row's strand."""
    low = min(coord_DF['coordinate'])
    high = max(coord_DF['coordinate'])
    # np.random.randint samples from [low, high) and raises when low == high
    # (e.g. a single called coordinate), so widen a degenerate range by one.
    random_coordinates = np.random.randint(low, max(high, low + 1), size = len(coord_DF.index))
    control_coord_dict = {'coordinate': list(random_coordinates), 'strand': list(coord_DF['strand'])}
    return pd.DataFrame(data = control_coord_dict)


def motif_recovery_5enrich_pairwise(count_range, outer_up, outer_down,
                                    QCmotifUp, QCmotifDown, QCmotifName,
                                    DEmotifUp, DEmotifDown, DEmotifName):
    """
    count_range: which post-nonparametric resampling count thresholds to test for motif recovery or
        differentially-expressed TSSs.
    outer_up: how much flanking sequence to extract upstream of the TSS coordinate (positive number).
    outer_down: how much flanking sequence to extract downstream of the TSS coordinate (positive number).
    QCmotifUp: how far upstream of the TSS to scan for the quality-control motif.
    QCmotifDown: how far downstream of the TSS to scan for the quality-control motif.
    QCmotifName: name of quality-control motif (for file names).
    DEmotifUp: how far upstream of the TSS to scan for the differentially-expressed motif.
        i.e. motif associated with differentially-expressed genes in response to a TF.
    DEmotifDown: how far downstream of the TSS to scan for the differentially-expressed motif.
    DEmotifName: name of differentially-expressed motif (for file names).

    Uses the notebook-level settings condition_subset, condition_string, blacklist_sample,
    replicates and the *_dir output locations. Returns nothing; results are written to disk
    by the helpers it calls.
    """
    # For each end count cutoff:
    for count_cutoff in count_range:

        print("count_cutoff: " + str(count_cutoff))
        MEME_dir_counts = f'{MEME_dir}/{condition_string}_{count_cutoff}counts'
        # -p: do not complain when the directory is left over from an earlier run
        quickshell(f'mkdir -p {MEME_dir_counts}')

        # Generate consensus peaks for blacklist DF
        # (consensus = present in all 3 replicates and all of them meet the count cutoff)
        if blacklist_sample == 'gDNA':
            blacklist_DF = consensus_peaks_gDNA(replicates, count_cutoff)
        else:
            blacklist_DF = consensus_peaks(blacklist_sample, replicates, count_cutoff)

        # For all conditions examined
        for condition in condition_subset:

            # Generate consensus peaks for experimental samples
            preBL = consensus_peaks(condition, replicates, count_cutoff)
            # Filter out coordinates found in the blacklist sample (e.g. gDNA, core)
            postBL = remove_blacklist_coordinates(preBL, blacklist_DF, 'end')
            postBL_path = f'{postBL_dir}/postBlackList_{condition}_{count_cutoff}counts.csv'
            postBL.to_csv(postBL_path)
            # Continue from the file just written, as the original workflow did
            postBL = pd.read_csv(postBL_path)

            # Prepare for sequence extraction
            # Since each replicate gets its own row, only take replicate 1 for seq extraction
            # (.copy() so the new column is added to an independent frame, not a slice)
            seqExtractDF = postBL.loc[postBL['replicate'] == 1,].copy()
            seqExtractDF['coordinate'] = seqExtractDF['end']
            # Generate DF with random coordinates as control sequences
            control_coord_DF = _random_control_DF(seqExtractDF)

            # Quality-control motif search, driven by the QC* arguments
            # (previously read from undeclared coordMotif* globals)
            meme_validation(seqExtractDF, control_coord_DF, outer_up, outer_down,
                            QCmotifUp, QCmotifDown, condition, count_cutoff,
                            'xstreme', QCmotifName, MEME_dir_counts)

        # Generate union of called coordinates (called for the CSV it writes, then re-read)
        unionDF = union_DF(condition_subset, count_cutoff)
        unionDF = pd.read_csv(f'{union_dir}/union_DF_{condition_string}_{count_cutoff}counts.csv')

        # Separate coordinate and strand for sequence extraction
        coordinate_str = unionDF['coordID'].str[:-1]
        unionDF['coordinate'] = coordinate_str.astype(int) - 2
        unionDF['strand'] = unionDF['coordID'].str[-1:]

        # Generate DF with random coordinates as control sequences
        control_coord_DF = _random_control_DF(unionDF)

        meme_validation(unionDF, control_coord_DF, outer_up, outer_down,
                        QCmotifUp, QCmotifDown, condition_string, count_cutoff,
                        'xstreme', QCmotifName, MEME_dir_counts)

        # Generate union of called coordinates (spike)
        spikeUnionDF = spike_union_DF(condition_subset, 2)
        spikeUnionDF = pd.read_csv(f'{spike_union_dir}/union_DF_{condition_string}_2counts.csv')

        # Spike IDs carry a 4-character "Eco_" prefix in front of the coordinate
        coordinate_str = spikeUnionDF['coordID'].str[4:-1]
        spikeUnionDF['coordinate'] = coordinate_str.astype(int) - 2
        spikeUnionDF['strand'] = spikeUnionDF['coordID'].str[-1:]

        # Add spike coords to experimental coords
        union_addSpike = pd.concat([unionDF, spikeUnionDF], axis = 0)

        # Run DESeq2 on read-thru to filter out coords
        DESeq2_readthru = flexible_DESeq2(condition_subset, 'readthru', 'pairwise', union_addSpike, count_cutoff)
        DESeq2_readthru_sig = DESeq2_readthru.loc[DESeq2_readthru['padj'] <= 0.05,]
        readthru_filtered_DF = remove_blacklist_coordinates(union_addSpike, DESeq2_readthru_sig, 'coordID')

        # Actual differential expression analysis:
        # Run DESeq2
        DESeq2_results = flexible_DESeq2(condition_subset, 'ends', 'pairwise', readthru_filtered_DF, count_cutoff)

        padj_005 = DESeq2_results.loc[DESeq2_results['padj'] < 0.05,]
        # Spike-in IDs ("Eco_...") cannot be parsed as '<coordinate><strand>' below, so leave
        # them out of the motif search, the same way readthru_DF does.
        padj_005 = padj_005[~padj_005['coordID'].str.contains("Eco")].copy()

        # Separate coordinate and strand for sequence extraction
        coordinate_str = padj_005['coordID'].str[:-1]
        padj_005['coordinate'] = coordinate_str.astype(int) - 2
        padj_005['strand'] = padj_005['coordID'].str[-1:]

        readthru_DF = readthru_filtered_DF[~readthru_filtered_DF['coordID'].str.contains("Eco")].copy()
        coordinate_str = readthru_DF['coordID'].str[:-1]
        readthru_DF['coordinate'] = coordinate_str.astype(int) - 2
        readthru_DF['strand'] = readthru_DF['coordID'].str[-1:]

        # Search in general for motif in entire promoter region, driven by the DE* arguments
        # (previously read from undeclared TFmotif* globals)
        meme_validation(padj_005, readthru_DF, outer_up, outer_down,
                        DEmotifUp, DEmotifDown, condition_string, count_cutoff,
                        'xstreme', DEmotifName, MEME_dir_counts)

        # NOTE(review): the two searches below use readthru_filtered_DF (spike coordinates
        # included) as the control set, while the search above uses the spike-free
        # readthru_DF -- confirm which control set is intended.

        # Activation region
        activated_padj005 = padj_005.loc[padj_005['log2FoldChange'] > 0,]
        meme_validation(activated_padj005, readthru_filtered_DF, outer_up, outer_down, DEmotifUp, DEmotifDown,
                        condition_string, count_cutoff, 'xstreme', f'{DEmotifName}_activated', MEME_dir_counts)

        # Inhibition region
        inhibited_padj005 = padj_005.loc[padj_005['log2FoldChange'] < 0,]
        meme_validation(inhibited_padj005, readthru_filtered_DF, outer_up, outer_down, DEmotifUp, DEmotifDown,
                        condition_string, count_cutoff, 'xstreme', f'{DEmotifName}_inhibited', MEME_dir_counts)

In [3]:
# Run the pairwise motif-recovery sweep over every count cutoff configured above.
motif_recovery_5enrich_pairwise(
    count_range=count_cutoff_range,
    outer_up=outer_up,
    outer_down=outer_down,
    QCmotifUp=QCmotifUp,
    QCmotifDown=QCmotifDown,
    QCmotifName=QCmotifName,
    DEmotifUp=DEmotifUp,
    DEmotifDown=DEmotifDown,
    DEmotifName=DEmotifName,
)

In [25]:
def multifactor_count_thresholds(count_range, enrich_type):
    """
    Run the multifactor DESeq2 workflow at every count cutoff in `count_range`.

    count_range: count thresholds to test.
    enrich_type: "5end" or "3end"; selects how the blacklist is built.

    For each cutoff: build the blacklist, write the post-blacklist calls of every condition
    in the notebook-level `condition_subset`, combine the experimental and spike-in unions,
    drop coordinates with significant read-through changes, then run DESeq2 on the end counts.

    Returns a dict mapping each count cutoff to the DESeq2 end-count results with padj < 0.05.
    Raises ValueError for an unsupported `enrich_type`.
    """
    if enrich_type not in ("5end", "3end"):
        # Previously this surfaced later as an unbound `blacklist_DF`
        raise ValueError(f'enrich_type must be "5end" or "3end", got {enrich_type!r}')

    significant_by_cutoff = {}

    # For each end count cutoff:
    for count_cutoff in count_range:

        # Generate consensus peaks for blacklist DF
        # (consensus = present in all 3 replicates and all of them meet the count cutoff)
        if enrich_type == "5end":
            blacklist_DF = consensus_peaks(blacklist_sample, replicates, count_cutoff)
            # NOTE(review): keeps only rows whose 'end' equals the string 'a'; presumably a
            # way to empty the blacklist while keeping its columns -- confirm this is intended.
            blacklist_DF = blacklist_DF.loc[blacklist_DF['end'] == 'a',]
        else:
            blacklist_DF = consensus_peaks_gDNA(replicates, count_cutoff)

        # For all conditions examined (for optimization of motif recovery: +/- CRP):
        for condition in condition_subset:

            # Generate consensus peaks for experimental samples
            preBL = consensus_peaks(condition, replicates, count_cutoff)
            # Filter out coordinates found in the noise/artifact sample (e.g. gDNA, core)
            postBL = remove_blacklist_coordinates(preBL, blacklist_DF, 'end')
            postBL.to_csv(f'{postBL_dir}/' + \
                                    f'postBlackList_{condition}_{count_cutoff}counts.csv')

        # Generate union of called coordinates
        unionDF = union_DF(condition_subset, count_cutoff, "Yes")

        # Separate coordinate and strand for sequence extraction
        coordinate_str = unionDF['coordID'].str[:-1]
        unionDF['coordinate'] = coordinate_str.astype(int) - 2
        unionDF['strand'] = unionDF['coordID'].str[-1:]

        # Generate union of called coordinates (spike)
        spikeUnionDF = spike_union_DF(condition_subset, 2, "Yes")
        # Spike IDs carry a 4-character "Eco_" prefix in front of the coordinate
        coordinate_str = spikeUnionDF['coordID'].str[4:-1]
        spikeUnionDF['coordinate'] = coordinate_str.astype(int) - 2
        spikeUnionDF['strand'] = spikeUnionDF['coordID'].str[-1:]

        # Add spike coords to normal coords
        union_addSpike = pd.concat([unionDF, spikeUnionDF], axis = 0)

        # Run DESeq2 on read-thru to filter out coords
        DESeq2_readthru = flexible_DESeq2(condition_subset, 'readthru', 'multifactor',
                                          union_addSpike, count_cutoff)
        DESeq2_readthru_sig = DESeq2_readthru.loc[DESeq2_readthru['padj'] <= 0.05,]
        readthru_filtered_DF = remove_blacklist_coordinates(union_addSpike, DESeq2_readthru_sig, 'coordID')

        # Actual differential expression analysis:
        # Run DESeq2
        DESeq2_results = flexible_DESeq2(condition_subset, 'ends', 'multifactor',
                                         readthru_filtered_DF, count_cutoff)

        # Keep the significant hits instead of discarding them at the end of the iteration
        significant_by_cutoff[count_cutoff] = DESeq2_results.loc[DESeq2_results['padj'] < 0.05,]

    return significant_by_cutoff

# Compile stats in table

In [8]:
def _median_residual(fit_table):
    """Return the table's first 'median_residuals' value as a float, mapping NaN to +inf so it never wins."""
    value = float(fit_table['median_residuals'][0])
    return math.inf if math.isnan(value) else value


def _record_sea_fimo(sample_dict, xstreme_dir, key_prefix=''):
    """Add the top SEA row's statistics and the FIMO match count of one XSTREME run to `sample_dict`; return the FIMO table."""
    sea_results = pd.read_table(f'{xstreme_dir}/sea_out/sea.tsv')
    sample_dict[f'{key_prefix}SEA_LogPvalue'] = sea_results['LOG_PVALUE'][0]
    # NOTE(review): "significant" is defined here as log p-value < log E-value, kept from
    # the original code -- confirm this is the intended criterion.
    sample_dict[f'{key_prefix}SEA_Sig?'] = sea_results['LOG_PVALUE'][0] < sea_results['LOG_EVALUE'][0]
    sample_dict[f'{key_prefix}TP'] = sea_results['TP'][0]
    sample_dict[f'{key_prefix}TP%'] = sea_results['TP%'][0]
    sample_dict[f'{key_prefix}SEA_EnrichRatio'] = sea_results['ENR_RATIO'][0]

    fimo_results = pd.read_table(f'{xstreme_dir}/fimo_out_1/fimo.tsv')
    # dropna() discards incomplete rows before counting matches
    sample_dict[f'{key_prefix}FIMO_match'] = len(fimo_results.dropna().index)
    return fimo_results


def _record_fimo_peaks(sample_dict, fimo_results, motif_up):
    """Histogram the FIMO match centres (10 bins) and store each local maximum as peak<i>_center / peak<i>_height."""
    # NOTE(review): the motif length is taken as the length of the motif_id string, as in
    # the original code -- confirm the IDs are the motif sequences themselves.
    motif_length = len(fimo_results['motif_id'][0])
    centers = (fimo_results['start'] - (motif_up - (motif_length / 2))).dropna()
    # np.histogram gives the same default 10-bin counts/edges as plt.hist, without
    # drawing a figure as a side effect
    counts, bin_edges = np.histogram(centers)
    counts = counts.astype(float)
    # Pad with zeros so a maximum in the first or last bin is still detected as a peak
    padded = np.concatenate(([0], counts, [0]))
    peaks, _ = find_peaks(padded)
    for i, bin_index in enumerate(peaks - 1):
        sample_dict[f'peak{i+1}_center'] = bin_edges[bin_index]
        sample_dict[f'peak{i+1}_height'] = counts[bin_index]


def compile_results(count_range, QCmotifName, QCmotifRegex,
                   DESeq2_sig_cutoff, DEmotifName, DEmotifRegex,
                   DESeq_test=None, DEmotifUp=None):
    """
    Compile per-cutoff statistics (coordinate counts, DESeq2 summary, motif recovery) into
    one table and save it as
    `{motif_compilation_dir}/<conditions>_<replicates>reps_<DEmotifName>motif_optimization.csv`.

    count_range: count thresholds to summarise (one output row per threshold).
    QCmotifName: name of the quality-control motif (used in directory names and column names).
    QCmotifRegex: regular expression the top per-condition consensus must match.
    DESeq2_sig_cutoff: adjusted p-value cutoff for counting significant coordinates.
    DEmotifName: name of the differential-expression motif run (used in directory names).
    DEmotifRegex: regular expression the top consensus of the DE run must match.
    DESeq_test: which DESeq2 result files to read ('ends' or 'readthru'). Defaults to a
        notebook-level `DESeq_test` if one exists, otherwise 'ends'.
    DEmotifUp: upstream scan distance used to convert FIMO start positions into centres.
        Defaults to a notebook-level `TFmotifUp` or `DEmotifUp` if one exists; without it
        the peak columns are skipped.

    SEA/FIMO/peak statistics are best-effort: when their files are missing or malformed a
    message is printed and the corresponding columns are left empty for that row.

    Returns the compiled DataFrame.
    """
    # Errors expected from missing or malformed MEME-suite outputs
    best_effort_errors = (OSError, KeyError, IndexError, ValueError, TypeError)

    if DESeq_test is None:
        DESeq_test = globals().get('DESeq_test', 'ends')
    if DEmotifUp is None:
        DEmotifUp = globals().get('TFmotifUp', globals().get('DEmotifUp'))

    condition_string = "_".join(condition_subset)

    values_for_DF = []

    for count_cutoff in count_range:
        # MEME results location
        MEME_dir_counts = f'{MEME_dir}/{condition_string}_{count_cutoff}counts'

        sample_dict = {}
        sample_dict['count_cutoff'] = count_cutoff

        for condition in condition_subset:

            # Each coordinate has one row per replicate, hence the division
            preBL = pd.read_csv(f'{preBL_dir}/' + \
                    f'preBlackList_{condition}_consensus_calls_{replicates}reps_{count_cutoff}counts.csv')
            sample_dict[f'{condition}_preBL_numCoords'] = (len(preBL.index)/replicates)

            postBL = pd.read_csv(f'{postBL_dir}/' + \
                                    f'postBlackList_{condition}_{count_cutoff}counts.csv')
            sample_dict[f'{condition}_postBL_numCoords'] = (len(postBL.index)/replicates)

            QC_xstreme_dir = f'{MEME_dir_counts}/{condition}_{count_cutoff}counts_{QCmotifName}_xstreme'
            xstreme_results = pd.read_table(f'{QC_xstreme_dir}/xstreme.tsv')
            consensus = xstreme_results['CONSENSUS'][0]
            sample_dict[f'{condition}_top_motif'] = consensus
            # A missing consensus (NaN) counts as "motif not found"
            QC_found = isinstance(consensus, str) and re.search(QCmotifRegex, consensus) is not None
            # Per-condition flag; the un-prefixed key is kept for existing consumers and,
            # as before, ends up holding the value of the last condition.
            sample_dict[f'{condition}_{QCmotifName}?'] = QC_found
            sample_dict[f'{QCmotifName}?'] = QC_found

            try:
                _record_sea_fimo(sample_dict, QC_xstreme_dir, key_prefix=f'{condition}_')
            except best_effort_errors as err:
                print(f'{count_cutoff} counts, {condition}: SEA/FIMO summary incomplete ' + \
                      f'({type(err).__name__}: {err})')

        union_DF = pd.read_csv(f'{union_dir}/union_DF_{condition_string}_{count_cutoff}counts.csv')
        sample_dict[f'union_numCoords_preBL'] = len(union_DF.index)

        # Same selection rule as flexible_DESeq2: smallest median residual, ties resolved
        # in the order local, parametric, mean
        fit_tables = [pd.read_csv(f'{DESeq2_dir}/{condition_string}_{count_cutoff}counts_{DESeq_test}_' + \
                                  f'results_Wald_{fit_type}.csv')
                      for fit_type in ('local', 'parametric', 'mean')]
        DESeq_results = min(fit_tables, key=_median_residual)

        sample_dict['DESeq_fitType'] = DESeq_results['fitType'][0]
        sample_dict['DESeq_medianAbsResiduals'] = DESeq_results['median_residuals'][0]

        sample_dict[f'union_numCoords_postBL'] = len(DESeq_results.index)
        # Use the cutoff passed in (previously an undeclared `padj_cutoff` was read)
        sig_DESeq = DESeq_results.loc[DESeq_results['padj'] <= DESeq2_sig_cutoff,]
        sample_dict[f'num_padj005'] = len(sig_DESeq.index)
        activated = sig_DESeq.loc[sig_DESeq['log2FoldChange'] > 0,]
        sample_dict[f'num_activated'] = len(activated.index)
        inhibited = sig_DESeq.loc[sig_DESeq['log2FoldChange'] < 0,]
        sample_dict[f'num_inhibited'] = len(inhibited.index)

        DE_xstreme_dir = f'{MEME_dir_counts}/{condition_string}_{count_cutoff}counts_{DEmotifName}_xstreme'
        xstreme_results = pd.read_table(f'{DE_xstreme_dir}/xstreme.tsv')
        consensus = xstreme_results['CONSENSUS'][0]

        sample_dict[f'top_motif'] = consensus
        # Use the regex passed in (previously an undeclared `DE_motif_regex` was read)
        sample_dict[f'{DEmotifName}?'] = (isinstance(consensus, str)
                                          and re.search(DEmotifRegex, consensus) is not None)

        try:
            sample_dict['xstreme_Evalue'] = xstreme_results['EVALUE'][0]
            fimo_results = _record_sea_fimo(sample_dict, DE_xstreme_dir)

            # Find most abundant positions
            if DEmotifUp is None:
                print(f'{count_cutoff} counts: no DEmotifUp available, peak positions skipped')
            else:
                _record_fimo_peaks(sample_dict, fimo_results, DEmotifUp)
        except best_effort_errors as err:
            print(f'{count_cutoff} counts: motif summary incomplete ({type(err).__name__}: {err})')

        # Keep the row even when the optional motif statistics could not be collected
        values_for_DF.append(sample_dict)

    final_DF = pd.DataFrame(values_for_DF)
    final_DF.to_csv(f'{motif_compilation_dir}/{condition_string}_{replicates}reps_' + \
                    f'{DEmotifName}motif_optimization.csv')
    return final_DF

In [7]:
# Compile the optimisation table three times: for all significant coordinates, then for the
# activated and the inhibited subsets (the DE motif name selects the matching XSTREME run).
# The quality-control motif is spelled QCmotif* here, matching the motif-recovery call above;
# this cell previously mixed coordMotif*, QCMotif* and QCmotif* for the same settings.
consensus = compile_results(count_cutoff_range, QCmotifName, QCmotifRegex,
                  DESeq2_sig_cutoff, DEmotifName, DEmotifRegex)

consensus_activated = compile_results(count_cutoff_range, QCmotifName, QCmotifRegex,
                  DESeq2_sig_cutoff, f'{DEmotifName}_activated', DEmotifRegex)

consensus_inhibited = compile_results(count_cutoff_range, QCmotifName, QCmotifRegex,
                  DESeq2_sig_cutoff, f'{DEmotifName}_inhibited', DEmotifRegex)

# Add all motif hits to DESeq2 dataframe

In [None]:
def sea_get_matches(motif_dir, MEME_dir_counts, condition_string, count_cutoff, motif_name):
    """
    Re-run SEA on an existing XSTREME result so the matching sequences are reported
    (xstreme hard-codes --no-seqs, which hides the true positives).

    motif_dir: XSTREME output directory (provides the background model and the MEME/STREME motifs).
    MEME_dir_counts: directory holding the primary and control FASTA files.
    condition_string, count_cutoff, motif_name: used to build the FASTA and output names.
    """
    # Primary and control FASTA files share this stem
    fasta_stem = f'{MEME_dir_counts}/{condition_string}_{motif_name}'
    sea_arguments = [
        f'{MEME_path}/sea',
        '--verbosity 4',
        f'--oc {motif_dir}/{condition_string}_{count_cutoff}counts_{motif_name}_seaSeqs',
        '--qvalue --thresh 1 --order 2',
        f'--bfile {motif_dir}/background',
        '--seed 0 --align center --motif-pseudo 0.01',
        f'--m {motif_dir}/meme_out/meme.xml',
        f'--m {motif_dir}/streme_out/streme.xml',
        f'--p {fasta_stem}_{count_cutoff}counts.fasta',
        f'--n {fasta_stem}_control_{count_cutoff}counts.fasta',
    ]
    quickshell(' '.join(sea_arguments), print_output = True)

In [1]:
def fimo_changeThreshold(motif_dir, MEME_dir_counts, discovery_algorithm, condition_string, count_cutoff, motif,
                        motif_name):
    """Run FIMO again with a looser threshold (5e-2) to recover all SEA true positives.

    ``discovery_algorithm`` selects which ``<algorithm>_out/<algorithm>.xml`` motif file
    under ``motif_dir`` is scanned and ``motif`` is passed to ``--motif``. Output goes to
    ``{motif_dir}/fimo_out_lowThres``. The command is executed (and echoed) through
    ``quickshell``; nothing is returned.
    """
    # Fragments keep their trailing spaces, so a plain concatenation yields the command line.
    command_fragments = (
        f'{MEME_path}/fimo ',
        f'--parse-genomic-coord --verbosity 4 --oc {motif_dir}/fimo_out_lowThres ',
        f'--bfile {motif_dir}/background --motif {motif} --thresh 5e-2 ',
        f'{motif_dir}/{discovery_algorithm}_out/{discovery_algorithm}.xml ',
        f'{MEME_dir_counts}/{condition_string}_{motif_name}_{count_cutoff}counts.fasta',
    )
    quickshell(''.join(command_fragments), print_output=True)

In [None]:
def _select_optimal_count_threshold(optimization_DF):
    """Return the count cutoff of the first row holding the largest true-positive (TP) count."""
    tp_counts = optimization_DF['TP'].dropna()
    if tp_counts.empty:
        raise ValueError('Motif optimization table has no TP values; '
                         'pass optimal_count_threshold explicitly.')
    return int(optimization_DF.loc[tp_counts.idxmax(), 'count_cutoff'])


def _flag_top_scoring_hits(fimo_hits):
    """Return a copy of `fimo_hits` with a boolean 'top_ranked?' column marking each sequence's best score."""
    if fimo_hits.empty:
        is_top = pd.Series(False, index=fimo_hits.index, dtype=bool)
    else:
        best_score = fimo_hits.groupby('sequence_name')['score'].transform('max')
        # Ties within a sequence are all flagged; the caller de-duplicates per coordID later.
        is_top = fimo_hits['score'] == best_score
    return fimo_hits.assign(**{'top_ranked?': is_top})


def integrate_DESeq2_motif(motif_name, optimal_count_threshold=None, sig_cutoff=None):
    """Annotate the DESeq2 results at one count threshold with motif-hit information.

    Steps:
      1. Load the DESeq2 table for ``optimal_count_threshold`` and flag, for every count
         cutoff in ``count_cutoff_range``, whether each coordID is significant there
         (``hit_in_<N>counts`` columns, summed into ``num_thresholds_met``).
      2. Re-run SEA (``sea_get_matches``) and FIMO (``fimo_changeThreshold``) for the
         top SEA motif, keep the best-scoring FIMO hit per sequence among SEA 'tp' and
         'fp' sequences, and left-merge those hits onto the DESeq2 table by coordID.
      3. Write the result to
         ``{motif_compilation_dir}/{condition_string}_{motif_name}_{threshold}counts_DESeq2.csv``.

    Parameters
    ----------
    motif_name : str
        Label used in the xstreme/SEA/FIMO directory and file names.
    optimal_count_threshold : int, optional
        Count cutoff whose DESeq2 and MEME outputs are annotated. When omitted, the cutoff
        with the highest TP count in the motif optimization CSV is used.
    sig_cutoff : float, optional
        Adjusted p-value cutoff for calling a coordinate a hit. Defaults to the
        notebook-level ``DESeq2_sig_cutoff``.

    Returns
    -------
    pandas.DataFrame
        One row per coordID with the DESeq2 columns, ``num_thresholds_met`` and the
        motif columns (NaN where no motif hit was merged).

    Raises
    ------
    ValueError
        If the SEA sequence table is empty, if the top motif's alt ID is neither a MEME
        nor a STREME ID, or if no TP value is available to pick a threshold from.
    """
    if sig_cutoff is None:
        sig_cutoff = DESeq2_sig_cutoff

    # Without a manually chosen threshold, fall back to the cutoff with the most true positives.
    if optimal_count_threshold is None:
        all_counts = pd.read_csv(f'{motif_compilation_dir}/{condition_string}_{replicates}reps_'
                                 f'{DEmotifName}motif_optimization.csv')
        optimal_count_threshold = _select_optimal_count_threshold(all_counts)

    # NOTE(review): this reads the '_Wald_local.csv' table while the loop below reads
    # '_Wald.csv' — confirm both file families are intended.
    all_coords_DF = pd.read_csv(f'{DESeq2_dir}/{condition_string}_{optimal_count_threshold}counts_'
                                f'ends_results_Wald_local.csv')

    # Annotate whether each coordinate is a hit at other count thresholds
    for count in count_cutoff_range:
        count_DF = pd.read_csv(f'{DESeq2_dir}/{condition_string}_{count}counts_'
                               f'ends_results_Wald.csv')
        count_sig_DF = count_DF.loc[count_DF['padj'] <= sig_cutoff]
        all_coords_DF[f'hit_in_{count}counts'] = all_coords_DF['coordID'].isin(count_sig_DF['coordID'])

    count_cols = all_coords_DF.filter(like='hit_in')
    all_coords_DF['num_thresholds_met'] = count_cols.sum(axis=1)

    # Annotate motif information
    MEME_count_dir = f'{MEME_outputs_dir}/{condition_string}_{optimal_count_threshold}counts'
    TFmotif_dir = f'{MEME_count_dir}/{condition_string}_{optimal_count_threshold}counts_{motif_name}_xstreme'
    sea_get_matches(TFmotif_dir, MEME_count_dir, condition_string, optimal_count_threshold, motif_name)
    sea_DF = pd.read_table(f'{TFmotif_dir}/{condition_string}_{optimal_count_threshold}counts_{motif_name}_seaSeqs/'
                           f'sequences.tsv')
    if sea_DF.empty:
        raise ValueError(f'SEA reported no sequences for motif run {motif_name!r}.')

    # The motif in the first row of the SEA table is treated as the top motif.
    top_row = sea_DF.iloc[0]
    motif = top_row['motif_ID']
    motif_alt_id = str(top_row['motif_ALT_ID'])
    sea_DF_onlyTop = sea_DF.loc[sea_DF['motif_ID'] == motif]
    sea_DF_TP = sea_DF_onlyTop.loc[sea_DF_onlyTop['seq_Class'] == 'tp']
    sea_DF_FP = sea_DF_onlyTop.loc[sea_DF_onlyTop['seq_Class'] == 'fp']

    if 'MEME' in motif_alt_id:
        discovery_algorithm = 'meme'
    elif 'STREME' in motif_alt_id:
        discovery_algorithm = 'streme'
    else:
        # Without a FIMO re-run the fimo.tsv read below would be stale or missing.
        raise ValueError(f'Cannot tell whether motif {motif!r} (alt ID {motif_alt_id!r}) '
                         f'came from MEME or STREME.')
    fimo_changeThreshold(TFmotif_dir, MEME_count_dir, discovery_algorithm,
                         condition_string, optimal_count_threshold, motif, motif_name)

    fimo_DF = pd.read_table(f'{TFmotif_dir}/fimo_out_lowThres/'
                            f'fimo.tsv').dropna()
    # .assign returns new frames, so the class labels never touch a slice of fimo_DF.
    fimo_truePositives = fimo_DF.loc[fimo_DF['sequence_name'].isin(sea_DF_TP['seq_ID'])].assign(seq_Class='tp')
    fimo_falsePositives = fimo_DF.loc[fimo_DF['sequence_name'].isin(sea_DF_FP['seq_ID'])].assign(seq_Class='fp')
    fimo_allPos = pd.concat([fimo_truePositives, fimo_falsePositives], axis=0)

    # Flag the best-scoring hit per sequence over TP and FP rows alike.
    fimo_allPos = _flag_top_scoring_hits(fimo_allPos)

    fimo_allPositives_topScore = fimo_allPos.loc[fimo_allPos['top_ranked?']].copy()
    # Sequence names are split into a coordinate (all but the last character) and a strand
    # (last character).
    sequence_names = fimo_allPositives_topScore['sequence_name']
    fimo_allPositives_topScore['coord'] = sequence_names.str[:-1]
    fimo_allPositives_topScore['strand'] = sequence_names.str[-1:]
    # NOTE(review): the +2 offset maps the sequence-name coordinate onto the DESeq2 coordID
    # coordinate; its origin is not visible here — confirm.
    fimo_allPositives_topScore['correct_coord'] = fimo_allPositives_topScore['coord'].astype(int) + 2
    fimo_allPositives_topScore['coordID'] = (fimo_allPositives_topScore['correct_coord'].astype(str)
                                             + fimo_allPositives_topScore['strand'])

    new_DF = pd.merge(all_coords_DF, fimo_allPositives_topScore, on='coordID', how='left')
    motifs = new_DF['motif_id'].dropna()
    if motifs.empty:
        # No motif hit matched any coordinate: there is no position to centre.
        new_DF['motif_center'] = float('nan')
    else:
        # NOTE(review): motif length is taken as the character length of the motif ID;
        # confirm this holds for IDs that carry a prefix.
        motif_length = len(motifs.iloc[0])
        upstream_end = TFmotifUp
        new_DF['motif_center'] = new_DF['start'] - (upstream_end - (motif_length / 2))

    colsToKeep = ['coordID','baseMean','log2FoldChange','padj','num_thresholds_met','motif_id',
                 'score','p-value','q-value','matched_sequence','motif_center']
    filtered_cols = new_DF[new_DF.columns[new_DF.columns.isin(colsToKeep)]]
    # Tied top scores leave several rows per coordinate; the last one is kept.
    unique_DF = filtered_cols.drop_duplicates(subset=['coordID'], keep='last')
    unique_DF.to_csv(f'{motif_compilation_dir}/'
                     f'{condition_string}_{motif_name}_{optimal_count_threshold}counts_DESeq2.csv')
    return unique_DF

In [None]:
# Run the DESeq2/motif integration for motif_name 'min100plus30' with the count
# threshold fixed at 20; as the cell's last expression, the returned frame is displayed.
integrate_DESeq2_motif(
    motif_name='min100plus30',
    optimal_count_threshold=20,
)