### Figure 2B - Capsule vs Capsule Transcriptome:

Here we plot the data comparing the two capsule samples - the capsule sample purified from the E. coli sample alone, and the sample purified from the Mixed sample.  The generated plot is a scatterplot where each point is a gene, plotted at its RPM (Reads per Million) value in the E. coli sample and in the Mixed sample. 

In [1]:
import pandas as pd
import os
from bokeh.io import push_notebook, show, output_notebook, export_svg
output_notebook()
from bokeh.plotting import figure
from bokeh.models import LinearColorMapper
import datetime
import numpy as np
from scipy.stats import gaussian_kde

First we will read our dataframe files into a dictionary of data frames, keyed by sample name:

In [2]:
df_dir = '../../../Processed_Data_Files/dataframe_files/230117Li/'
prefix = '230117Li_Mixing_'
df_dict = dict()
names = []
# Fetch all the relevant dataframe files and put them in a useable format.
# os.listdir() returns entries in arbitrary order, so sort them to keep
# `names` (and the printed list) reproducible between runs.
for file in sorted(os.listdir(df_dir)):
    if file.endswith('_dataframe.txt') and file.startswith(prefix):
        # The sample name is whatever sits between the prefix and the first '_1'.
        name = file[len(prefix):]
        # partition() returns the whole string when '_1' is absent; slicing with
        # str.find() would use -1 there and silently chop the last character.
        name = name.partition('_1')[0]
        new_df = pd.read_csv(os.path.join(df_dir, file))
        # Feature length, computed from the Start/Stop columns of the file.
        new_df['Length'] = new_df['Stop'] - new_df['Start']
        df_dict[name] = new_df
        names.append(name)
print(f'\nHere is the list of unique samples for which there are dataframe files: \n\n{names}\n')

# a name dictionary for converting between the dataframe name and a form nicer for plots:
names_dict = {'Ecoli_Cap':'E. coli Capsule', 'Mix_Cap':'Mixture Capsule', 'Ecoli_Lysate':'E. coli Lysate', 
              'Mix_Lysate':'Mixture Lysate', 'Bsub_Lysate':'B. subtilis Lysate'}


Here is the list of unique samples for which there are dataframe files: 

['Bsub_Lysate', 'Ecoli_Cap', 'Ecoli_Lysate', 'Mix_Cap', 'Mix_Lysate']



A helper function to help us with the plotting:

In [9]:
def into_RPM_theshold_remove_cap(df_dict, name, min_counts=100,
                                 exclude_names=('error', 'Capsule', 'Capsule_rev', 'lacI')):
    '''Return a filtered copy of one sample's dataframe with its counts converted to RPM.

    Parameters
    ----------
    df_dict : dict
        Maps sample names to dataframes that have at least a 'Name' and a
        'Counts' column. The stored dataframe is not modified.
    name : str
        Key of the sample in df_dict; the 'Counts' column of the result is
        renamed to this value.
    min_counts : number, optional
        Rows are kept only when their count is strictly greater than this
        value (default 100). Rows with a missing count are removed as well.
    exclude_names : iterable of str, optional
        Values of the 'Name' column to remove before normalising. The default
        removes the capsule reads, lacI and rows named 'error'.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with the column called `name` holding reads per
        million (RPM) relative to the total of the rows that were kept.
    '''
    source_df = df_dict[name]
    # One boolean mask instead of repeated drop-by-index-label calls: dropping by
    # label would also remove unrelated rows if the index were not unique.
    wanted_name = ~source_df['Name'].isin(list(exclude_names))
    enough_reads = source_df['Counts'] > min_counts
    sample_df = source_df.loc[wanted_name & enough_reads].copy()
    sample_df = sample_df.rename(columns = {'Counts':name})
    # Normalise against the reads that survived the filters, not the raw total.
    total_reads = sample_df[name].sum()
    sample_df[name] = sample_df[name]/(total_reads/1000000) # RPM

    return sample_df

Let's get the data we need in order to look at the comparative abundance between the two capsule samples.  This notebook will output both an interactive plot below and an svg image of the plot.  In order for the svg export to work you need to have selenium and the firefox/geckodriver libraries installed via conda.  Feel free to comment these parts of the code out if all you want is the interactive plot. 

In [10]:
# Sample keys into df_dict (see names_dict for the display names).
x = 'Ecoli_Cap'     # x axis: E. coli Capsule
y = 'Ecoli_Lysate'  # y axis: E. coli Lysate

# Convert both samples to RPM. merge() without arguments is an inner join on
# every column the two frames share, so only rows present in both remain.
x_df = into_RPM_theshold_remove_cap(df_dict, x)
y_df = into_RPM_theshold_remove_cap(df_dict, y)
plot_df = x_df.merge(y_df)

# Discard rows at or below 1 RPM in either sample.
plot_df = plot_df[~((plot_df[x] <= 1) | (plot_df[y] <= 1))].copy()

# Log10 ratio between the two samples, signed and unsigned.
plot_df['Log_Dif'] = np.log10(plot_df[x]) - np.log10(plot_df[y])
plot_df['Abs_Log_Dif'] = plot_df['Log_Dif'].abs()

# Point density in log space, used only to colour the markers.
xy = np.log(np.vstack([plot_df[x], plot_df[y]]))
plot_df['Density'] = gaussian_kde(xy).evaluate(xy)
color_mapper = LinearColorMapper(
    palette='Viridis256',
    low = plot_df['Density'].min(),
    high = plot_df['Density'].max(),
)

p = figure(
    width = 450, height = 450, aspect_scale = 1,
    x_axis_type = "log", y_axis_type = "log",
    tooltips = [("Gene", "@Name")],
    output_backend = "svg",
)

# Title
p.title = "MTC and Lysate RNA Level Agreement"
p.title.align = "center"
p.title.text_font = "arial"
p.title.text_font_style = "normal"
p.title.text_font_size = "22px"

# Axes: labels, ticks and fonts
p.xaxis.axis_label = "MTC RNA Level (RPM)"
p.yaxis.axis_label = "Lysate RNA Level (RPM)"
p.axis.axis_label_text_font = "arial"
p.axis.axis_label_text_font_style = "normal"
p.axis.axis_label_text_font_size = "22px"
p.axis.major_label_text_font = "arial"
p.axis.major_label_text_font_size = "20px"
p.axis.ticker = [10**0, 10**2, 10**4, 10**6]

# Background and grid
p.background_fill_color = "whitesmoke"
p.grid.grid_line_color = "white"
p.grid.grid_line_width = 4

# One marker per row of plot_df, coloured by local density.
p.circle(
    x=x,
    y=y,
    source=plot_df,
    size=5,
    color={'field': 'Density', 'transform': color_mapper},
    fill_alpha=0.8,
    line_alpha=0,
)
# y = x reference line, added after the markers so it is drawn on top of them.
p.line(x=[10**(1.5), 10**4], y=[10**(1.5), 10**4], color = "black", width = 4)

# Interactive plot in the notebook, then a dated SVG copy next to it.
show(p)
export_svg(p, filename = f'./1C_{datetime.date.today()}.svg')

['./1C_2023-08-03.svg']

In [24]:
# Distribution of the per-gene log10 fold changes stored in plot_df['Log_Dif'].
# 30 evenly spaced edges give 29 bins between -0.5 and 0.5; np.histogram does
# not count values that fall outside that range.
bins = np.linspace(-.5, .5, 30)
plt_hist, plt_edges = np.histogram(plot_df['Log_Dif'], bins=bins, density=False)

p = figure(output_backend = "svg", width = 600, height = 200)

# One rectangle per bin, from the x axis up to the bin's count.
p.quad(
    left = plt_edges[:-1], right = plt_edges[1:],
    bottom = 0, top = plt_hist,
    fill_color="steelblue", fill_alpha = 1, line_color="white",
)

# Axes: labels, ticks and fonts
p.xaxis.axis_label = "Log 10 Fold Change (MTC/Lysate)"
p.yaxis.axis_label = "Count"
p.yaxis.ticker = [50, 150, 250]
p.axis.axis_label_text_font = "arial"
p.axis.axis_label_text_font_style = "normal"
p.axis.axis_label_text_font_size = "20px"
p.axis.major_label_text_font = "arial"
p.axis.major_label_text_font_size = "20px"

# Background and grid
p.background_fill_color = "whitesmoke"
p.grid.grid_line_color = "white"
p.grid.grid_line_width = 4

# Interactive plot in the notebook, then a dated SVG copy next to it.
show(p)
export_svg(p, filename = f'./3D_distribution_{datetime.date.today()}.svg')

['./3D_distribution_2023-08-03.svg']

In [11]:
# The ten rows with the largest absolute log10 difference between the samples.
top_changers = plot_df.nlargest(n=10, columns='Abs_Log_Dif')
# Largest difference overall, converted from log10 back to a fold change.
print(10 ** max(plot_df['Abs_Log_Dif']))

4.566497484283334


In [38]:
# `dif` is a ratio; its log10 magnitude defines a symmetric window around "no change".
# With dif = .5 the window is |log10 ratio| < log10(2), i.e. a change of less than 2 fold.
dif = .5
log_cutoff = abs(np.log10(dif))
# Express the window as a fold change >= 1 so the summary names the cutoff actually applied
# (the original message reported the raw `dif`, "0.5 fold", for a 2-fold window).
fold_cutoff = 10**log_cutoff
dif_less_than = plot_df[plot_df['Abs_Log_Dif'] < log_cutoff]
# The two ratios that bound the window.
print(10**(-np.log10(dif)))
print(1/10**(-np.log10(dif)))
print(f'{100*len(dif_less_than)/len(plot_df)}% have changed less than {fold_cutoff:g} fold ({len(dif_less_than)} out of {len(plot_df)})')

2.0
0.5
95.5925155925156% have changed less than 0.5 fold (2299 out of 2405)


In [14]:
# Correlation coefficient between the log10 RPM values of the two samples
# (off-diagonal entry of the 2x2 matrix returned by np.corrcoef).
log_x, log_y = np.log10(plot_df[x]), np.log10(plot_df[y])
print(np.corrcoef(log_x, log_y)[0, 1])

0.970552732083311
