### Figure 3D - Histogram of fold changes of early vs late reads for both capsule and lysate

Here we look at the distribution of fold changes for both the capsule and the lysate samples. 

In [3]:
import pandas as pd
import os
import numpy as np
from bokeh.io import push_notebook, show, output_notebook, export_svg
output_notebook()
from bokeh.plotting import figure

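First we will read the per-sample dataframe files into a single data frame, normalize the counts to reads per million (RPM), and keep only the genes that exceed the RPM threshold in every sample: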

In [4]:
# Directory holding one '*dataframe.txt' file per sample.
df_dir = '../../../Processed Data Files/dataframe files/230322Li/'
names = []
df_list = []
# Minimum normalized count (reads per million) a row must exceed in every sample.
RPM_thresh = 10**1

# os.listdir returns entries in arbitrary order; sorting makes the column
# order, and the choice of which file supplies the shared annotation columns,
# reproducible from run to run.
for file in sorted(os.listdir(df_dir)):
    if not file.endswith('dataframe.txt'):
        continue
    # Sample name is everything before the first '_2' in the file name.
    # partition() keeps the whole file name when '_2' is missing, instead of
    # silently chopping off its last character as slicing with find() == -1 would.
    name = file.partition('_2')[0]
    names.append(name)
    new_df = pd.read_csv(os.path.join(df_dir, file))
    new_df['Length'] = new_df['Stop'] - new_df['Start']
    # Normalize raw counts to counts per million of this sample's total.
    new_df['Counts'] = new_df['Counts'] / (new_df['Counts'].sum() / 1000000.)
    new_df = new_df.rename(columns={'Counts': name})
    # The first file contributes every column (annotations + its counts);
    # later files contribute only their own normalized counts column.
    df_list.append(new_df if not df_list else new_df[name])

if not df_list:
    # Fail with a clear message rather than pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No '*dataframe.txt' files found in {df_dir!r}")

# NOTE(review): pd.concat(axis=1) aligns on the row index, so this assumes every
# file lists the same rows in the same order -- TODO confirm.
s_df = pd.concat(df_list, axis=1)
# Drop rows whose Name contains "Capsule" (case-sensitive match).
s_df = s_df.loc[~s_df['Name'].str.contains("Capsule", case=True)].reset_index(drop=True)
# Keep only rows above the RPM threshold in every sample.
s_df = s_df.loc[(s_df[names] > RPM_thresh).all(axis=1)].reset_index(drop=True)
print(f'\nHere is the list of unique samples for which there are dataframe files: \n\n{names}\n')

# A name dictionary for converting between the dataframe name and a form nicer for plots:
names_dict = {'Before-Ind': 'Before Induction', 'Early-Cap': "Early Capsule",
              'Early-Lys': "Early Lysate", 'Late-Cap': "Late Capsule",
              'Late-Lys': "Late Lysate", 'Mid-Stress': "Mid-Stress, Total Lysate",
              'No-Ind': "No Induction", 'Post-Ind': "Post Induction, Total Lysate"}


Here is the list of unique samples for which there are dataframe files: 

['Before-Ind', 'Early-Cap', 'Early-Lys', 'Late-Cap', 'Late-Lys', 'Mid-Stress', 'No-Ind', 'Post-Ind']



Now we will generate our `dif_df`, which will contain the log10 fold changes (late relative to early) that we are interested in plotting:

In [5]:
# Per-gene log10 fold change of late relative to early reads, computed
# separately for the lysate samples and for the capsule samples.
log_rpm = np.log10(s_df[['Early-Lys', 'Late-Lys', 'Early-Cap', 'Late-Cap']])
dif_df = pd.DataFrame({
    'Gene_Name': s_df['Name'],
    'Lysate_Dif': log_rpm['Late-Lys'] - log_rpm['Early-Lys'],
    'Cap_Dif': log_rpm['Late-Cap'] - log_rpm['Early-Cap'],
})

Now we plot the distributions of those fold changes as histograms:

In [19]:
def _overflow_histogram(values, bins, cutoff):
    """Histogram `values` over `bins`, folding out-of-range values into the end bins.

    np.histogram ignores values outside the outermost bin edges, so values
    below -cutoff are added to the first bin and values above +cutoff to the
    last bin; every value therefore appears somewhere in the plot.

    Parameters
    ----------
    values : pandas.Series
        Values to histogram.
    bins : array-like
        Bin edges; expected to span [-cutoff, cutoff].
    cutoff : float
        Magnitude beyond which values are folded into the end bins.

    Returns
    -------
    (counts, edges) : tuple of numpy.ndarray
        Raw (non-density) counts per bin and the bin edges.
    """
    counts, edges = np.histogram(values, density=False, bins=bins)
    counts[0] += (values < -cutoff).sum()
    counts[-1] += (values > cutoff).sum()
    return counts, edges


p = figure(width=1000, height=400,
           title="Distribution of Fold Changes: Lysate vs Capsule",
           output_backend="svg")

# Histograms: both series share the same bin edges so the bars line up.
cutoff = 1.5
bins = np.linspace(-cutoff, cutoff, 75)  # 75 edges -> 74 bins
# Glyph styling common to both histograms; semi-transparent so overlaps stay visible.
bar_style = dict(bottom=0, line_color="white", fill_alpha=0.5)

lys_hist, lys_edges = _overflow_histogram(dif_df['Lysate_Dif'], bins, cutoff)
p.quad(top=lys_hist, left=lys_edges[:-1], right=lys_edges[1:],
       fill_color="darkslateblue", legend_label="Lysate", **bar_style)

cap_hist, cap_edges = _overflow_histogram(dif_df['Cap_Dif'], bins, cutoff)
p.quad(top=cap_hist, left=cap_edges[:-1], right=cap_edges[1:],
       fill_color="orangered", legend_label="Capsule", **bar_style)

# Title styling
p.title.text_font = "arial"
p.title.text_font_size = "35px"
p.title.align = "center"

# Axis labels and tick styling
# NOTE(review): the plotted columns are log10(Late) - log10(Early); confirm that
# "Post/Pre" in the x-axis label is the intended wording for late vs. early.
p.xaxis.axis_label = "Log Fold Change (Post/Pre)"
p.yaxis.axis_label = "Count"
p.axis.major_label_text_font = "arial"
p.axis.major_label_text_font_style = "bold"
p.axis.major_label_text_font_size = "35px"
p.yaxis.ticker = [100, 200, 300]  # fixed tick positions on the count axis
p.axis.axis_label_text_font = "arial"
p.axis.axis_label_text_font_style = "bold"
p.axis.axis_label_text_font_size = "35px"

# Background and grid
p.background_fill_color = "white"
p.grid.grid_line_color = "whitesmoke"
p.grid.grid_line_width = 4

p.legend.label_text_font_size = '25pt'

# Render inline, then write the same figure to an SVG file next to the notebook.
show(p)
export_svg(p, filename='./3D.svg')

['./3D.svg']