### Figure 3D - Histogram of fold changes of early vs late reads for both capsule and lysate

Here we look at the distribution of fold changes for both the capsule and the lysate samples. 

In [1]:
import pandas as pd
import os
import numpy as np
from bokeh.io import push_notebook, show, output_notebook, export_svg
output_notebook()
from bokeh.plotting import figure
from bokeh.layouts import column
import datetime

First we define a helper that filters a sample's data frame and converts its read counts to RPM, and then we read the data frame files from disk:

In [2]:
def into_RPM_theshold_remove_cap(df_dict, name, min_counts = 100,
                                 exclude_names = ('error', 'Capsule', 'Capsule_rev', 'lacI')):
    '''Return a filtered copy of one sample's dataframe with its counts converted to RPM.

    Parameters
    ----------
    df_dict : dict
        Maps sample name -> dataframe. The dataframe must have a 'Name' and a 'Counts' column.
    name : str
        Key of the sample to process. The 'Counts' column is renamed to this in the result.
    min_counts : number, optional
        Rows with 'Counts' <= min_counts are removed (default 100).
    exclude_names : iterable of str, optional
        Rows whose 'Name' is in this collection are removed
        (default: 'error', 'Capsule', 'Capsule_rev' and 'lacI').

    Returns
    -------
    pandas.DataFrame
        A new dataframe (the one stored in df_dict is not modified) holding the surviving rows,
        with column `name` containing reads per million of the surviving reads.
    '''
    raw_df = df_dict[name]
    # A single boolean mask replaces the chain of index-label drops, so the filter is
    # applied row by row even if the index happens to contain duplicate labels.
    is_excluded = raw_df['Name'].isin(list(exclude_names))
    below_threshold = raw_df['Counts'] <= min_counts
    sample_df = raw_df.loc[~(is_excluded | below_threshold)].rename(columns = {'Counts': name})

    # RPM is computed after filtering, so the surviving rows sum to one million.
    reads_in_millions = sample_df[name].sum() / 1000000
    sample_df[name] = sample_df[name] / reads_in_millions

    return sample_df

In [3]:
# Directory containing the per-sample dataframe files.
df_dir = '../../../Processed_Data_Files/dataframe_files/230712LiA/snapshot/'
df_dict = {}
names = []
# Load every '*_dataframe.txt' file in df_dir, keyed by the sample name parsed from its filename.
for file in os.listdir(df_dir):
    if not file.endswith('_dataframe.txt'):
        continue
    # The sample name is everything before the first '_D' in the filename.
    # NOTE(review): str.find returns -1 when '_D' is absent, in which case this slice
    # would only drop the final character of the filename.
    name = file[:file.find('_D')]
    new_df = pd.read_csv(df_dir + file)
    new_df['Length'] = new_df['Stop'] - new_df['Start']
    df_dict[name] = new_df
    names.append(name)
print(f'\nHere is the list of unique samples for which there are dataframe files: \n\n{names}\n')


Here is the list of unique samples for which there are dataframe files: 

['Early_Cap', 'Early_Lys', 'Late_Cap', 'Late_Lys_2', 'Late_Lys']



Now we will generate our "dif-df", which will contain the fold changes we are interested in plotting:

In [5]:
dif_df = pd.DataFrame()  # left empty by this cell; kept so the name stays defined


def _fold_change_table(late_df, early_df, late_name, early_name):
    '''Merge a late/early pair of RPM tables and add log10 fold-change columns.

    Parameters
    ----------
    late_df, early_df : pandas.DataFrame
        RPM tables whose RPM columns are named `late_name` and `early_name` respectively.
    late_name, early_name : str
        Names of the RPM columns in `late_df` and `early_df`.

    Returns
    -------
    pandas.DataFrame
        The merged rows with two extra columns:
        'Log_Dif' = log10(late) - log10(early), and 'Abs_Log_Dif' = |Log_Dif|.
    '''
    # merge() with no `on` argument is an inner join on every column the two tables share,
    # so only rows present in both samples survive.
    merged = late_df.merge(early_df)
    # Drop placeholder 'error' rows and zero values (log10 of zero is undefined).
    keep = (merged['Name'] != 'error') & (merged[late_name] != 0) & (merged[early_name] != 0)
    merged = merged.loc[keep].copy()
    merged['Log_Dif'] = np.log10(merged[late_name]) - np.log10(merged[early_name])
    merged['Abs_Log_Dif'] = merged['Log_Dif'].abs()
    return merged


# Lysate: late vs early
ll_df = into_RPM_theshold_remove_cap(df_dict, 'Late_Lys')
el_df = into_RPM_theshold_remove_cap(df_dict, 'Early_Lys')
lysate_df = _fold_change_table(ll_df, el_df, 'Late_Lys', 'Early_Lys')

# Capsule: late vs early
lc_df = into_RPM_theshold_remove_cap(df_dict, 'Late_Cap')
ec_df = into_RPM_theshold_remove_cap(df_dict, 'Early_Cap')
capsule_df = _fold_change_table(lc_df, ec_df, 'Late_Cap', 'Early_Cap')

Now we plot the distributions of those fold changes as histograms:

In [27]:
# Overlaid histograms of the absolute log10 fold change for the lysate and capsule (MTC) samples.
p = figure(width = 500, height = 200, output_backend = "svg")

# Binning: 10 evenly spaced edges on [0, cutoff] give 9 bins; anything above
# the cutoff is added to the last bin so that no value is left off the plot.
cutoff = 1
bins = np.linspace(0, cutoff, 10)

# Lysate
lys_abs = lysate_df['Abs_Log_Dif']
lys_hist, lys_edges = np.histogram(lys_abs, density=False, bins=bins)
lys_hist[-1] += int((lys_abs > cutoff).sum())
p.quad(left = lys_edges[:-1], right = lys_edges[1:], bottom = 0, top = lys_hist,
       fill_color="darkcyan", fill_alpha = 0.5, line_color="white",
       legend_label="Lysate")

# Capsule (MTC)
cap_abs = capsule_df['Abs_Log_Dif']
cap_hist, cap_edges = np.histogram(cap_abs, density=False, bins=bins)
cap_hist[-1] += int((cap_abs > cutoff).sum())
p.quad(left = cap_edges[:-1], right = cap_edges[1:], bottom = 0, top = cap_hist,
       fill_color="darkorange", fill_alpha = 0.5, line_color="white",
       legend_label="MTC")

# Axes: labels, fonts and fixed y ticks
p.xaxis.axis_label = "Abs Log 10 Fold Change (Post/Pre Stress)"
p.yaxis.axis_label = "Count"
p.yaxis.ticker = [0,1000, 2000]
p.axis.axis_label_text_font = "arial"
p.axis.axis_label_text_font_size = "20px"
p.axis.axis_label_text_font_style = "normal"
p.axis.major_label_text_font = "arial"
p.axis.major_label_text_font_size = "20px"

# Background and grid
p.background_fill_color = "whitesmoke"
p.grid.grid_line_color = "white"
p.grid.grid_line_width = 4

# Legend (exists because both quads were given a legend_label)
p.legend.location = (100, 55)
p.legend.orientation = "horizontal"
p.legend.label_text_font_size = '20pt'
p.legend.label_text_font_style = "normal"

show(p)
export_svg(p, filename = f'./3D_{datetime.date.today()}.svg')

['./3D_2023-08-14.svg']

In [14]:
# Stacked version of the figure: lysate histogram on top (p1), capsule/MTC histogram below (p2).
# 20 evenly spaced edges on [0, cutoff] give 19 bins; values above the cutoff go into the last bin.
cutoff = 1
bins = np.linspace(0, cutoff, 20)

# Top panel: lysate
p1 = figure(width =500, height = 170, output_backend = "svg")
lys_hist, lys_edges = np.histogram(lysate_df['Abs_Log_Dif'], density=False, bins=bins)
lys_hist[-1] += np.count_nonzero(lysate_df['Abs_Log_Dif'] > cutoff)
p1.quad(left = lys_edges[:-1], right = lys_edges[1:], bottom = 0, top = lys_hist,
        fill_color="darkcyan", fill_alpha = 1, line_color="white",
        legend_label="Lysate Sample")

# Bottom panel: capsule (MTC)
p2 = figure(width = 500, height = 170, output_backend = "svg")
cap_hist, cap_edges = np.histogram(capsule_df['Abs_Log_Dif'], density=False, bins=bins)
cap_hist[-1] += np.count_nonzero(capsule_df['Abs_Log_Dif'] > cutoff)
p2.quad(left = cap_edges[:-1], right = cap_edges[1:], bottom = 0, top = cap_hist,
        fill_color="darkorange", fill_alpha = 1, line_color="white",
        legend_label="MTC Sample")

# Styling shared by both panels
for panel in (p1, p2):
    panel.axis.axis_label_text_font_style = "normal"
    panel.axis.major_label_text_font = "arial"
    panel.axis.major_label_text_font_size = "20px"
    panel.axis.axis_label_text_font = "arial"
    panel.axis.axis_label_text_font_size = "20px"

    panel.background_fill_color = "whitesmoke"
    panel.grid.grid_line_color = "white"
    panel.grid.grid_line_width = 4

    panel.legend.label_text_font_size = '20pt'
    panel.legend.label_text_font_style = "normal"

# Panel-specific settings: only the bottom panel shows the x axis,
# and each panel has its own y ticks and legend position.
p1.xaxis.visible = False
p2.xaxis.axis_label = "Abs Log 10 Fold Change (Post Stress/Pre Stress)"

p1.yaxis.ticker = [50, 150, 250]
p2.yaxis.ticker = [500, 1000, 1500]

p1.legend.location = (170, 95)
p2.legend.location = (170, 45)

p3 = column(p1, p2)
show(p3)
export_svg(p3, filename = f'./3D_stacked_{datetime.date.today()}.svg')

['./3D_stacked_2023-08-14.svg']

In [35]:
# Threshold: a 1.5-fold change expressed in log10 units.
dif = abs(np.log10(1.5))

# Genes whose absolute log10 fold change exceeds the threshold.
lys_df = lysate_df.loc[lysate_df['Abs_Log_Dif'] > dif]
cap_df = capsule_df.loc[capsule_df['Abs_Log_Dif'] > dif]

# Percentage of genes in each sample type that do NOT exceed the threshold.
lys_pct_within = 100*(1-len(lys_df)/len(lysate_df))
cap_pct_within = 100*(1-len(cap_df)/len(capsule_df))
print(f'{lys_pct_within}% of Lysate genes are < {dif} orders of magnitide dif versus {cap_pct_within} for the Capsule samples')

41.20836560805577% of Lysate genes are < 0.17609125905568124 orders of magnitide dif versus 85.15943418844401 for the Capsule samples
