### Figure 3C - Early vs Late Capsule Scatter Plot

Here we look at a read-by-read scatter plot of the early capsule sample vs. the late capsule sample to see whether the encapsulated reads have changed significantly while the background cellular context is changing.

In [2]:
import pandas as pd
import numpy as np
import os
from bokeh.io import push_notebook, show, output_notebook, export_svg
output_notebook()
from bokeh.plotting import figure
import datetime

First we will read our dataframe files into a dictionary of data frames, keyed by sample name:

In [3]:
df_dir = '../../../Processed_Data_Files/dataframe_files/230712LiA/snapshot/'
df_suffix = '_dataframe.txt'  # only files with this ending are loaded
df_dict = dict()  # sample name -> DataFrame read from that sample's file
names = []
# Fetch all the relevant dataframe files and put them in a useable format.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed sample list stable from run to run.
for file in sorted(os.listdir(df_dir)):
    if not file.endswith(df_suffix):
        continue
    # The sample name is everything before the first '_D' in the file name.
    # str.find() returns -1 when the marker is missing; used directly as a
    # slice bound that would silently chop off the last character, so fall
    # back to stripping the file suffix instead.
    marker_pos = file.find('_D')
    name = file[:marker_pos] if marker_pos != -1 else file[:-len(df_suffix)]
    new_df = pd.read_csv(os.path.join(df_dir, file))
    new_df['Length'] = new_df['Stop'] - new_df['Start']
    df_dict[name] = new_df
    names.append(name)
print(f'\nHere is the list of unique samples for which there are dataframe files: \n\n{names}\n')

# A name dictionary for converting between the dataframe name and a form nicer for plots.
# NOTE(review): these keys use hyphens ('Early-Cap') while the sample names
# derived from the file names above use underscores ('Early_Cap') -- confirm
# the keys before using this as a lookup for df_dict names.
names_dict = {'Before-Ind': 'Before Induction', 'Early-Cap': "Early Capsule",
              'Early-Lys': "Early Lysate", 'Late-Cap': "Late Capsule",
              'Late-Lys': "Late Lysate", 'Mid-Stress': "Mid-Stress, Total Lysate",
              'No-Ind': "No Induction", 'Post-Ind': "Post Induction, Total Lysate"}


Here is the list of unique samples for which there are dataframe files: 

['Early_Cap', 'Early_Lys', 'Late_Cap', 'Late_Lys_2', 'Late_Lys']



A helper function for the plotting: it filters a sample's dataframe and converts its counts to RPM.

In [4]:
def into_RPM_theshold_remove_cap(df_dict, name, min_counts=50,
                                 exclude_names=('error', 'Capsule', 'Capsule_rev', 'lacI')):
    '''Return a filtered copy of one sample's dataframe with its counts converted to RPM.

    Rows whose 'Name' is listed in `exclude_names` and rows whose 'Counts' is
    at or below `min_counts` are removed. The 'Counts' column is then renamed
    to `name` and rescaled to reads per million, where the total is taken over
    the rows that survive the filtering. The dataframe stored in `df_dict` is
    not modified.

    Parameters
    ----------
    df_dict : dict
        Maps sample names to dataframes that have 'Name' and 'Counts' columns.
    name : str
        Key of the sample in `df_dict`; also the name given to the RPM column.
    min_counts : number, optional
        Exclusive lower bound on 'Counts'; rows with Counts <= min_counts are
        dropped. Defaults to 50.
    exclude_names : iterable of str, optional
        'Name' values to remove before normalising. Defaults to the 'error'
        rows, the capsule reads (both orientations) and lacI.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, keeping their original index labels, with the
        'Counts' column replaced by an RPM column called `name`.

    Raises
    ------
    KeyError
        If `name` is not in `df_dict`, or a required column is missing.
    '''
    raw_df = df_dict[name]

    # Build one mask instead of mutating the frame with repeated in-place drops.
    is_excluded = raw_df['Name'].isin(list(exclude_names))
    is_low_count = raw_df['Counts'] <= min_counts
    sample_df = raw_df.loc[~(is_excluded | is_low_count)].rename(columns={'Counts': name}).copy()

    # Normalise against the reads that are left after filtering.
    sample_df[name] = sample_df[name]/(sample_df[name].sum()/1000000)  # RPM

    return sample_df

Now we will plot a simple scatter plot comparing the early and late capsule samples on a read-by-read basis:

In [14]:
x = 'Early_Cap' # Early encapsulated reads
y = 'Late_Cap' # Late encapsulated reads

# Filter each sample and convert its counts to RPM.
x_df = into_RPM_theshold_remove_cap(df_dict, x)
y_df = into_RPM_theshold_remove_cap(df_dict, y)

# merge() with no `on=` performs an inner join on every column the two frames
# share, so only rows present in both samples survive.
merged_df = x_df.merge(y_df)

# Keep rows that are not 'error' and have a non-zero value in both samples
# (log10 is taken of both columns below). Built as one mask so the merged
# frame is never mutated in place.
is_plottable = (merged_df['Name'] != 'error') & (merged_df[x] != 0) & (merged_df[y] != 0)
plot_df = merged_df.loc[is_plottable].copy()
plot_df['Log_Dif'] = np.log10(plot_df[x]) - np.log10(plot_df[y])

p = figure(
    x_axis_type = "log", y_axis_type = "log",
    aspect_scale = 1, width = 385, height = 385,
    output_backend = "svg", tooltips = [("Gene", "@Name")]
)

# Configure title appearance:
p.title.text_font = "arial"
p.title.text_font_size = "20px"
p.title.align = "center"
p.title.text_font_style = 'normal'

# Axes.
# NOTE(review): the labels read Pre-/Post-Stress while the samples plotted are
# the early and late capsule -- confirm this wording is intended.
p.xaxis.axis_label = "Pre-Stress RNA Level (RPM)"
p.yaxis.axis_label = "Post-Stress RNA Level (RPM)"
p.axis.axis_label_text_font_style = 'normal'
p.axis.ticker = [10**0, 10**2, 10**4, 10**6]
p.axis.major_label_text_font = "arial"
p.axis.major_label_text_font_size = "20px"
p.axis.axis_label_text_font = "arial"
p.axis.axis_label_text_font_size = "20px"

# Background and grid.
p.background_fill_color = "whitesmoke"
p.grid.grid_line_color = "white"
p.grid.grid_line_width = 4

# Split the points by whether the two samples differ by less than one order of magnitude.
abs_log_dif = np.abs(plot_df['Log_Dif'])
within_order_mag = plot_df[abs_log_dif < 1]
greater_than_order_mag = plot_df[abs_log_dif >= 1]

# scatter() draws circle markers by default; circle(size=...) is deprecated in newer Bokeh releases.
p.scatter(x=x, y=y, source=within_order_mag, size=7, fill_alpha=0.8, line_alpha=0, color="orange", legend_label = "MTC")
p.scatter(x=x, y=y, source=greater_than_order_mag, size=7, fill_alpha=0.8, line_alpha=0, color="orangered")

# Reference lines: y = x, then y = x/10 and y = 10x (one order of magnitude either side).
p.line(x=[10**(1.), 10**5], y=[10**(1.), 10**5], color = "black", line_width = 3.5)
p.line(x=[10*10**(1.), 10**5], y=[10**(1.), 10**4], color = "black", line_width = 4)
p.line(x=[10**(1.), 10**4], y=[10**(2.), 10**5], color = "black", line_width = 4)

p.legend.label_text_font_size = '20pt'
p.legend.label_text_font_style = "normal"
p.legend.location = (10, 200)
p.legend.orientation = "horizontal"

show(p)
# The file name carries today's date, so each day's export goes to a new file.
export_svg(p, filename = f'./3C_{datetime.date.today()}.svg')

['./3C_2023-08-14.svg']

In [30]:
# Largest absolute log10 difference between the two samples, the same value
# expressed as a fold change, and the number of points that are at least one
# order of magnitude apart.
max_abs_log_dif = max(np.abs(plot_df['Log_Dif']))
print(max_abs_log_dif)
print(10**max_abs_log_dif)
print(len(greater_than_order_mag))

0.6999434247655174
5.011219485738238
0


In [31]:
# Ratio of the y sample to the x sample for tnaA.
# Select the row once, and use .item() rather than float() on a one-element
# Series (deprecated in recent pandas); .item() raises ValueError unless
# exactly one row matches.
tnaA_row = plot_df.loc[plot_df['Name'] == "tnaA"]
print(float((tnaA_row[y]/tnaA_row[x]).item()))

1.4747650197456919


In [32]:
# Pearson correlation between the log10-transformed values of the two samples.
log_x = np.log10(plot_df[x])
log_y = np.log10(plot_df[y])
corr_matrix = np.corrcoef(log_x, log_y)
print(corr_matrix[0,1])

0.9924583659353944
