# Visualizing/assessing genomic coverage in CFG

This notebook generates Fig. 1B (circos plot), Extended Data Fig. 5c-d (correlation matrices for 5' ends/TSSs),
and Extended Data Fig. 6c-d (correlation matrices for 3' ends/TTSs).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib_venn
import upsetplot
import itertools
import plotly.express as px
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection
from statistics import mean, stdev
import math
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio

In [None]:
# written by Peter Culviner, PhD to enable command-line access through Jupyter
def quickshell(command, print_output=True, output_path=None, return_output=False):
    """Run ``command`` through the shell and capture what it prints.

    Args:
        command: Shell command line, passed as a single string (run with
            ``shell=True``).
        print_output: When true, echo the command (prefixed with ``$ ``)
            followed by the captured stdout/stderr report.
        output_path: Optional file path; when given, the same
            stdout/stderr report is written there (overwriting the file).
        return_output: When true, return the decoded ``(stdout, stderr)``
            pair; otherwise the function returns ``None``.

    Returns:
        ``(stdout, stderr)`` as UTF-8 decoded strings if ``return_output``
        is true, else ``None``.

    Note:
        The command's exit status is not checked; failures only show up in
        the captured stderr text.
    """
    # Imported here because the notebook's import cell does not bring in
    # subprocess; without this the function fails with a NameError.
    import subprocess

    process_output = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout = process_output.stdout.decode('utf-8')
    stderr = process_output.stderr.decode('utf-8')
    output_string = f'STDOUT:\n{stdout}\nSTDERR:\n{stderr}\n'
    if print_output:
        print('$ ' + command)
        print(output_string)
    if output_path is not None:
        with open(output_path, 'w') as f:
            f.write(output_string)
    if return_output:
        return stdout, stderr

# Circos plot (Fig. 1b)

### Set inputs

In [None]:
# Each (condition string, count cutoff) pair is interpolated into the name of
# the end-union table loaded later: union_DF_<condition>_<cutoff>counts.csv

# 5' end settings
condition_string_5end, count_cutoff_5end = 'noCRP_CRP', 20

# 3' end settings
condition_string_3end, count_cutoff_3end = 'multifactor', 70

### Generate coverage file using summarizeOverlaps

Circos visualization works best using evenly-spaced bins of the genome.
Note that the summarizeOverlaps script requires a list of bam files as input.
You need to generate this as a .txt file with the full file path.

In [5]:
# Run the summarizeOverlaps R script on the 5' end BAM list against the
# evenly spaced 550 bp gene model; the coverage CSV lands in the figure folder.
figure_dir = 'fig1B_circos'

summarizeOverlaps_5end_command = ' '.join([
    'Rscript --vanilla Rscripts/CFG_summarizeOverlaps.R',
    f'-b {figure_dir}/BamList_5end.txt',
    '-g genome_files_misc/geneModel_evenSpace_550bp.csv',
    f'-o {figure_dir}/5end_coverage_circos.csv',
])
quickshell(summarizeOverlaps_5end_command, print_output=True)

In [None]:
# Run the summarizeOverlaps R script on the 3' end BAM list against the
# evenly spaced 550 bp gene model.
#
# The folder is 'fig1B_circos' (not 'Rscripts/fig1B_circos') so that it matches
# the 5' end cell and the circos plotting cell, which reads
# fig1B_circos/3end_coverage_circos.csv.
# NOTE(review): confirm BamList_3end.txt lives in fig1B_circos/ as well.
figure_dir = 'fig1B_circos'

summarizeOverlaps_3end_command = f'Rscript --vanilla Rscripts/CFG_summarizeOverlaps.R ' + \
            f'-b {figure_dir}/BamList_3end.txt ' + \
            f'-g genome_files_misc/geneModel_evenSpace_550bp.csv ' + \
            f'-o {figure_dir}/3end_coverage_circos.csv'
quickshell(summarizeOverlaps_3end_command, print_output = True)

### Prepare gene model

In [3]:
# Load the evenly spaced bin model and expand it to one row per coordinate,
# keyed by a '<coordinate><strand>' string (coordID).
geneModel = pd.read_csv('genome_files_misc/geneModel_evenSpace_550bp.csv')

# Drop the first two rows. The explicit .copy() makes this an independent
# frame, so the in-place reset and the column assignments below never write
# into a slice of `geneModel` (avoids SettingWithCopyWarning / lost writes).
geneModel_noEco = geneModel.iloc[2:].copy()
# drop=False (the default) keeps the old row labels in an 'index' column.
geneModel_noEco.reset_index(inplace=True)

# One range object per bin.
# NOTE(review): range(start, end) excludes `end` itself - confirm the bins
# are meant to be half-open.
coord_ranges = [
    range(start, end)
    for start, end in zip(geneModel_noEco['start'], geneModel_noEco['end'])
]

geneModel_noEco['coord_range'] = coord_ranges
geneModel_noEco['counts_per_gene'] = 0

# One row per coordinate in each bin, then build the merge key.
geneModel = geneModel_noEco.explode('coord_range')
geneModel['coordID'] = geneModel['coord_range'].astype(str) + geneModel['strand']

### Add in TSS data to gene model

In [None]:
# Count, for every bin of the gene model, how many coordinates carry a 5' end
# from the union table, and write the annotated bin table for the circos plot.
union_5end = pd.read_csv('5enrich_CRP/selectThreshold/transcriptEndUnions/'
                         f'union_DF_{condition_string_5end}_{count_cutoff_5end}counts.csv')

# Outer merge keeps every coordinate of the gene model and every union row.
merged_DF_5end = union_5end.merge(geneModel,
                                  how='outer',
                                  on='coordID')

bin_ids_5end = merged_DF_5end['unique_row_ID']
# Rows with a non-null 'RC_CRP1_ends' are the ones counted
# (presumably coordinates present in the union table - TODO confirm).
has_5end = merged_DF_5end['RC_CRP1_ends'].notna()

# Single pass instead of re-scanning the whole merged frame once per bin ID.
# value_counts() ignores NaN IDs, so a NaN ID gets 0 below - the same result
# the previous per-ID equality test produced (NaN never compares equal).
ends_per_bin_5end = bin_ids_5end[has_5end].value_counts()

counts_DF_5end = pd.DataFrame({'unique_row_ID': bin_ids_5end.unique()})
# NOTE(review): the column name hard-codes the cutoff (20) rather than using
# count_cutoff_5end; left unchanged in case the R script expects this name.
counts_DF_5end['5end_enriched_end_count_20'] = (
    counts_DF_5end['unique_row_ID'].map(ends_per_bin_5end).fillna(0).astype(int)
)

# Merge onto the un-exploded bin table. The original referenced the undefined
# name geneModel_noEco_5end here, which raised a NameError.
geneModel_noEco_5end = geneModel_noEco.merge(counts_DF_5end,
                                             how='outer',
                                             on='unique_row_ID')
geneModel_noEco_5end.to_csv('fig1B_circos/5end_TSSs_circos.csv')

### Add in TTS data to gene model

In [None]:
# Count, for every bin of the gene model, how many coordinates carry a 3' end
# from the union table, and write the annotated bin table for the circos plot.
#
# The table is stored in union_3end: the original assigned it to union_5end,
# leaving union_3end undefined for the merge below (and clobbering the 5' table).
union_3end = pd.read_csv('3enrich_NusAG/selectThreshold/transcriptEndUnions/'
                         f'union_DF_{condition_string_3end}_{count_cutoff_3end}counts.csv')

# Outer merge keeps every coordinate of the gene model and every union row.
merged_DF_3end = union_3end.merge(geneModel,
                                  how='outer',
                                  on='coordID')

bin_ids_3end = merged_DF_3end['unique_row_ID']
# Rows with a non-null 'noTF1_ends' are the ones counted
# (presumably coordinates present in the union table - TODO confirm).
has_3end = merged_DF_3end['noTF1_ends'].notna()

# Single pass instead of re-scanning the whole merged frame once per bin ID.
# value_counts() ignores NaN IDs, so a NaN ID gets 0 below - the same result
# the previous per-ID equality test produced (NaN never compares equal).
ends_per_bin_3end = bin_ids_3end[has_3end].value_counts()

counts_DF_3end = pd.DataFrame({'unique_row_ID': bin_ids_3end.unique()})
# NOTE(review): the column name hard-codes the cutoff (70) rather than using
# count_cutoff_3end; left unchanged in case the R script expects this name.
counts_DF_3end['3end_enriched_end_count_70'] = (
    counts_DF_3end['unique_row_ID'].map(ends_per_bin_3end).fillna(0).astype(int)
)

geneModel_noEco_3end = geneModel_noEco.merge(counts_DF_3end,
                                             how='outer',
                                             on='unique_row_ID')
geneModel_noEco_3end.to_csv('fig1B_circos/3end_TTSs_circos.csv')

### Call circos plot script in R

In [None]:
# Assemble the call to the circos plotting R script from the coverage and
# per-bin end-count tables produced above, then run it.
figure_dir = 'fig1B_circos'

# NOTE(review): '-s' appears twice below (the TSS table and 4641652). Which
# one the R script honours depends on its option parser - confirm the intended
# flag for the second value against circos_plot.R.
circos_args = [
    f'Rscript --vanilla {figure_dir}/circos_plot.R',
    f'-f {figure_dir}/5end_coverage_circos.csv',
    f'-e {figure_dir}/3end_coverage_circos.csv',
    f'-s {figure_dir}/5end_TSSs_circos.csv',
    f'-t {figure_dir}/3end_TTSs_circos.csv',
    '-g genome_files_misc/geneModel_evenSpace_550bp.csv',
    '-s 4641652',
    f'-o {figure_dir}/circos_plot_TSS.png',
    f'-x {figure_dir}/circos_plot_TTS.png',
]
circos_command = ' '.join(circos_args)
quickshell(circos_command, print_output=True)

# Generating correlation plots (Ext Data Fig. 5c-d and 6c-d)

### Generate coverage file using summarizeOverlaps

The coverage file for the correlation analysis is generated using actual genomic regions, which is more meaningful than evenly spaced bins.

In [None]:
# Run the summarizeOverlaps R script on the 5' end BAM list, this time against
# the genomic-regions gene model; output goes to the correlation figure folder.
figure_dir = 'extDataFig5_Fig6_correlationMatrices'

summarizeOverlaps_5end_command = ' '.join([
    'Rscript --vanilla Rscripts/CFG_summarizeOverlaps.R',
    f'-b {figure_dir}/BamList_5end.txt',
    '-g genome_files_misc/geneModel_genomicRegions.csv',
    f'-o {figure_dir}/5end_coverage_genomicRegions.csv',
])
quickshell(summarizeOverlaps_5end_command, print_output=True)

In [None]:
# Run the summarizeOverlaps R script on the 3' end BAM list against the
# genomic-regions gene model; output goes to the correlation figure folder.
figure_dir = 'extDataFig5_Fig6_correlationMatrices'

summarizeOverlaps_3end_command = ' '.join([
    'Rscript --vanilla Rscripts/CFG_summarizeOverlaps.R',
    f'-b {figure_dir}/BamList_3end.txt',
    '-g genome_files_misc/geneModel_genomicRegions.csv',
    f'-o {figure_dir}/3end_coverage_genomicRegions.csv',
])
quickshell(summarizeOverlaps_3end_command, print_output=True)

### Generate TSS data

In [8]:
# Build one wide table of per-sample 5' end counts: one row per 'end' value,
# one '<sample>_count' column per sample, 0 where a sample has no entry.
nonparametric_calls_5end = '5enrich_CRP/identifyEnrichedEnds/bootstrap_calls'

# Samples merged onto the seed table (core1 is read separately below)
sample_list = ['core2','core3',
               'noCRP1','noCRP2','noCRP3',
               'CRP1','CRP2','CRP3']

# Seed the table with core1
TSS_DF = pd.read_table(f'{nonparametric_calls_5end}/core1_calls_alpha1.txt')[['end', 'count']]
TSS_DF = TSS_DF.rename(columns={'count': 'core1_count'})

# Outer-merge every remaining sample on 'end', in list order
for sample_name in sample_list:
    sample_calls = pd.read_table(
        f'{nonparametric_calls_5end}/{sample_name}_calls_alpha1.txt'
    )[['end', 'count']]
    sample_calls = sample_calls.rename(columns={'count': f'{sample_name}_count'})
    TSS_DF = TSS_DF.merge(sample_calls, how='outer', on='end')

# Ends missing from a sample get a count of 0
TSS_DF = TSS_DF.fillna(0)
TSS_DF.to_csv('extDataFig5_Fig6_correlationMatrices/TSS_corr.csv')

### Generate TTS data

In [7]:
# Build one wide table of per-sample 3' end counts (gDNA samples first, then
# the 3' enrichment samples): one row per 'end' value, one '<sample>_count'
# column per sample, 0 where a sample has no entry.
nonparametric_calls_3end = '3enrich_NusAG/identifyEnrichedEnds/bootstrap_calls'
nonparametric_calls_gDNA = 'gDNA/identifyEnrichedEnds/bootstrap_calls'

# Samples merged onto the seed table (gDNA1 is read separately below)
gDNA_sample_list = ['gDNA2','gDNA3']
sample_list = ['noTF1','noTF2','noTF3','NusA1','NusA2','NusA3',
               'NusG1','NusG2','NusG3','NusA_NusG1','NusA_NusG2','NusA_NusG3']

# Seed the table with gDNA1
TTS_DF = pd.read_table(f'{nonparametric_calls_gDNA}/gDNA1_calls_alpha1.txt')[['end', 'count']]
TTS_DF = TTS_DF.rename(columns={'count': 'gDNA1_count'})

# (directory, sample) pairs in the same order as before: gDNA, then 3' samples
remaining_calls = (
    [(nonparametric_calls_gDNA, name) for name in gDNA_sample_list]
    + [(nonparametric_calls_3end, name) for name in sample_list]
)

for calls_dir, sample_name in remaining_calls:
    sample_calls = pd.read_table(f'{calls_dir}/{sample_name}_calls_alpha1.txt')[['end', 'count']]
    sample_calls = sample_calls.rename(columns={'count': f'{sample_name}_count'})
    TTS_DF = TTS_DF.merge(sample_calls, how='outer', on='end')

# Ends missing from a sample get a count of 0
TTS_DF = TTS_DF.fillna(0)
TTS_DF.to_csv('extDataFig5_Fig6_correlationMatrices/TTS_corr.csv')

### Call correlation plot script in R

In [None]:
# Assemble the call to the correlation-matrix R script from the coverage and
# TSS/TTS count tables produced above, then run it.
figure_dir = 'extDataFig5_Fig6_correlationMatrices'

correlation_fig_command = f'Rscript --vanilla {figure_dir}/CFG_correlation_matrix.R ' + \
            f'-g genome_files_misc/geneModel_genomicRegions.csv ' + \
            f'-f {figure_dir}/5end_coverage_genomicRegions.csv ' + \
            f'-e {figure_dir}/3end_coverage_genomicRegions.csv ' + \
            f'-s {figure_dir}/TSS_corr.csv ' + \
            f'-t {figure_dir}/TTS_corr.csv ' + \
            f'-o {figure_dir}/corr_5end_heatmap.png ' + \
            f'-x {figure_dir}/corr_3end_heatmap.png ' + \
            f'-y {figure_dir}/corr_TSS_heatmap.png ' + \
            f'-z {figure_dir}/corr_TTS_heatmap.png'
# Run the command built above; the original passed circos_command here, which
# re-ran the circos plot and never produced the correlation heatmaps.
quickshell(correlation_fig_command, print_output = True)