### Figure 3D - Histogram of fold changes of early vs late reads for both capsule and lysate

Here we look at the distribution of fold changes for both the capsule and the lysate samples. 

In [1]:
import pandas as pd
import os
import numpy as np
from bokeh.io import push_notebook, show, output_notebook, export_svg
output_notebook()
from bokeh.plotting import figure
import datetime

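First we define a helper that converts raw read counts into reads per million (RPM), and then we read our dataframe files into a dictionary of data frames: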

In [2]:
def into_RPM(df_dict, name):
    '''Return a copy of one sample's dataframe with its counts converted to reads per million (RPM).

    Parameters
    ----------
    df_dict : dict
        Maps a sample name to a DataFrame that has a 'Counts' column.
    name : str
        Key of the sample to convert. It also becomes the name of the RPM column.

    Returns
    -------
    pandas.DataFrame
        A copy of df_dict[name] in which 'Counts' has been renamed to `name` and
        divided by (total counts / 1,000,000). The frame stored in df_dict is
        not modified.

    Raises
    ------
    KeyError
        If `name` is not a key of df_dict, or the frame has no 'Counts' column.
    '''
    sample_df = df_dict[name].copy().rename(columns={'Counts': name})
    # Series.sum() is vectorised and skips NaN entries, so one missing count
    # no longer turns every RPM value in the column into NaN.
    total_counts = sample_df[name].sum()
    sample_df[name] = sample_df[name] / (total_counts / 1000000)  # RPM

    return sample_df

In [3]:
df_dir = '../../../Processed_Data_Files/dataframe_files/230322Li/'
df_suffix = '_dataframe.txt'
df_dict = dict()
names = []
# Fetch all the relevant dataframe files and put them in a useable format.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# order of `names` (and the printed list) reproducible across machines.
for file in sorted(os.listdir(df_dir)):
    if not file.endswith(df_suffix):
        continue
    # The sample name is everything before the first '_2'
    # (presumably the start of a date/run tag in the file name - TODO confirm).
    # str.find() returns -1 when the marker is missing, which would silently
    # chop the last character off the file name; fall back to just removing
    # the suffix in that case.
    marker = file.find('_2')
    name = file[:marker] if marker != -1 else file[:-len(df_suffix)]
    new_df = pd.read_csv(os.path.join(df_dir, file))
    # Requires 'Start' and 'Stop' columns in every dataframe file.
    new_df['Length'] = new_df['Stop'] - new_df['Start']
    df_dict[name] = new_df
    names.append(name)
print(f'\nHere is the list of unique samples for which there are dataframe files: \n\n{names}\n')


Here is the list of unique samples for which there are dataframe files: 

['Before-Ind', 'Early-Cap', 'Early-Lys', 'Late-Cap', 'Late-Lys', 'Mid-Stress', 'No-Ind', 'Post-Ind']



Now we will generate the data frames (`lysate_df` and `capsule_df`) which will contain the fold changes we are interested in plotting:

In [4]:
def _log_fold_change(late_df, early_df, late_col, early_col):
    '''Merge a late and an early RPM frame and add log10 fold-change columns.

    Parameters
    ----------
    late_df, early_df : pandas.DataFrame
        RPM frames whose value columns are named `late_col` and `early_col`.
    late_col, early_col : str
        Names of the RPM columns in the respective frames.

    Returns
    -------
    pandas.DataFrame
        The merged frame without rows named 'error' and without rows whose RPM
        is 0 in either sample, plus two new columns:
        'Log_Dif' = log10(late) - log10(early) and 'Abs_Log_Dif' = |Log_Dif|.
    '''
    # merge() with no `on=` joins on every column the two frames share.
    merged = late_df.merge(early_df)
    # currently dropping any gene without reads in both
    keep = (
        (merged['Name'] != 'error')
        & (merged[late_col] != 0)
        & (merged[early_col] != 0)
    )
    # .copy() so the new columns are written to an independent frame.
    merged = merged[keep].copy()
    merged['Log_Dif'] = np.log10(merged[late_col]) - np.log10(merged[early_col])
    merged['Abs_Log_Dif'] = merged['Log_Dif'].abs()
    return merged


# NOTE(review): dif_df is created empty and never filled in this cell; kept in
# case another cell relies on the name existing.
dif_df = pd.DataFrame()

ll_df = into_RPM(df_dict, 'Late-Lys')
el_df = into_RPM(df_dict, 'Early-Lys')
lysate_df = _log_fold_change(ll_df, el_df, 'Late-Lys', 'Early-Lys')

lc_df = into_RPM(df_dict, 'Late-Cap')
ec_df = into_RPM(df_dict, 'Early-Cap')
capsule_df = _log_fold_change(lc_df, ec_df, 'Late-Cap', 'Early-Cap')

Now we plot the distributions of those fold changes as histograms:

In [9]:
def _capped_histogram(values, bin_edges, cap):
    '''Histogram `values` over `bin_edges`, adding every value above `cap` to the last bin.'''
    counts, edges = np.histogram(values, bins=bin_edges, density=False)
    # np.histogram ignores values beyond the last edge; count them in the final bin instead.
    counts[-1] += len(values.loc[values > cap])
    return counts, edges


p = figure(width=850, height=250, output_backend="svg")

# Histogram: 60 evenly spaced edges between 0 and the cutoff.
cutoff = 1
bins = np.linspace(0, cutoff, 60)

lys_hist, lys_edges = _capped_histogram(lysate_df['Abs_Log_Dif'], bins, cutoff)
cap_hist, cap_edges = _capped_histogram(capsule_df['Abs_Log_Dif'], bins, cutoff)

# Lysate is drawn first and the capsule bars are overlaid on top (both half transparent).
p.quad(
    top=lys_hist, bottom=0,
    left=lys_edges[:-1], right=lys_edges[1:],
    fill_color="darkcyan", line_color="white", fill_alpha=0.5,
    legend_label="Lysate",
)
p.quad(
    top=cap_hist, bottom=0,
    left=cap_edges[:-1], right=cap_edges[1:],
    fill_color="darkorange", line_color="white", fill_alpha=0.5,
    legend_label="MTC",
)

# Axes
# NOTE(review): the plotted quantity is 'Abs_Log_Dif', i.e. the absolute value of the log10 fold change.
p.xaxis.axis_label = "Log 10 Fold Change (Post Stress/Pre Stress)"
p.yaxis.axis_label = "Count"
p.yaxis.ticker = [100, 200, 300]
p.axis.axis_label_text_font = "arial"
p.axis.axis_label_text_font_size = "20px"
p.axis.axis_label_text_font_style = "normal"
p.axis.major_label_text_font = "arial"
p.axis.major_label_text_font_size = "20px"

# Background and grid
p.background_fill_color = "whitesmoke"
p.grid.grid_line_color = "white"
p.grid.grid_line_width = 4

# Legend
p.legend.location = (300, 75)
p.legend.label_text_font_size = '20pt'
p.legend.label_text_font_style = "normal"

show(p)
# Save a dated SVG copy of the figure next to the notebook.
export_svg(p, filename = f'./3D_{datetime.date.today()}.svg')

['./3D_2023-08-02.svg']

In [9]:
# Threshold on |log10 fold change|: log10(2), i.e. a two-fold change in either direction.
dif = np.log10(2)
# Genes whose absolute log10 fold change is strictly above the threshold.
lys_df = lysate_df.loc[lysate_df['Abs_Log_Dif'].gt(dif)]
cap_df = capsule_df.loc[capsule_df['Abs_Log_Dif'].gt(dif)]
# Each percentage is the share of genes that are NOT above the threshold.
print(f'{100*(1-len(lys_df)/len(lysate_df))}% of Lysate genes are < {dif} orders of magnitide dif versus {100*(1-len(cap_df)/len(capsule_df))} for the Capsule samples')

62.78409090909092% of Lysate genes are < 0.3010299956639812 orders of magnitide dif versus 91.49242970439798 for the Capsule samples


In [16]:
# Population standard deviation (ddof=0) of the signed log10 fold changes.
lysate_std = lysate_df['Log_Dif'].std(ddof=0)
capsule_std = capsule_df['Log_Dif'].std(ddof=0)
print(f'std of Lysate samples: {lysate_std}, for the capusle: {capsule_std}')

std of Lysate samples: 0.4217408260874773, for the capusle: 0.1909679602377791
