### Figure 2E - Pre and Post stress gene-by-gene comparison of MTC-encapsulated transcripts and general lysate transcripts.

Here we look at a gene-by-gene comparison of transcripts collected either from purified MTCs or from the general lysate.

Note: this notebook uses bokeh's `export_svg` functionality to save an SVG of each figure for inclusion in Adobe Illustrator. Each figure is also rendered as an interactive preview in the Jupyter notebook, so if SVG export is a problem for you, simply skip the cells that save to SVG; the interactive previews will still work.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
import os
from bokeh.io import push_notebook, show, output_notebook, export_svg
from bokeh.plotting import figure
from bokeh.layouts import column
import datetime

# Configure bokeh so that subsequent show() calls render inline in this notebook.
output_notebook()

In [2]:
def normalization(gene_counts, sample_name=None):
    '''Filter a per-gene count table and convert its counts to reads per million (RPM).

    Rows whose 'Name' is 'error', 'Capsule', 'Capsule_rev' or 'lacI' are removed,
    as are rows with 'Counts' <= 100 (a gene needs more than 100 reads to be kept).
    The 'Counts' column is then renamed to the sample name and rescaled so that the
    remaining values sum to 1,000,000.

    Parameters
    ----------
    gene_counts : pandas.DataFrame
        Table with at least 'Name' and 'Counts' columns. It is not modified.
    sample_name : str, optional
        Label given to the RPM column. When omitted, the notebook-level variable
        `name` is used, which keeps existing `normalization(df)` calls working.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of `gene_counts` (original index labels kept) in which the
        'Counts' column has been replaced by an RPM column named `sample_name`.
    '''
    if sample_name is None:
        # Backward compatibility: older call sites rely on the loop variable
        # `name` being defined at notebook level before this function runs.
        sample_name = name

    excluded_names = ['error', 'Capsule', 'Capsule_rev', 'lacI']
    keep = ~gene_counts['Name'].isin(excluded_names) & (gene_counts['Counts'] > 100)

    # Boolean indexing + rename both return new objects, so the caller's frame is untouched.
    sample_df = gene_counts[keep].rename(columns = {'Counts': sample_name})
    sample_df[sample_name] = sample_df[sample_name] / (sample_df[sample_name].sum() / 1000000) # RPM

    return sample_df


df_dir = "../../Processed Sequencing Files/230712LiA_snapshot/"
df_dict = dict()
names = []
# Fetch all the relevant dataframe files and put them in a useable format.
# The listing is sorted so the sample order does not depend on the filesystem.
for file in sorted(os.listdir(df_dir)):
    if not file.endswith('_dataframe.txt'):
        continue
    # The sample name is the part of the filename before the first '_D'.
    marker = file.find('_D')
    if marker == -1:
        # str.find returns -1 when '_D' is absent; slicing with that would silently
        # drop the last character, so fall back to stripping the known suffix.
        name = file[:-len('_dataframe.txt')]
    else:
        name = file[:marker]
    new_df = pd.read_csv(os.path.join(df_dir, file))
    new_df['Length'] = new_df['Stop'] - new_df['Start']
    # normalization() labels its RPM column with the notebook-level `name` set just above.
    df_dict[name] = normalization(new_df)
    names.append(name)
print(f'\nHere is the list of unique samples for which there are dataframe files: \n\n{names}\n')


Here is the list of unique samples for which there are dataframe files: 

['Early_Cap', 'Early_Lys', 'Late_Cap', 'Late_Lys']



In [3]:
def changeAppearance(p):
    """Style a main scatter figure in place and hand the same figure back.

    Sets the legend text (20pt, normal weight, black) and its placement
    (horizontal, at location (5, 290)), removes the toolbar, fixes the axis
    ticks and label font, hides both axes, and paints the background and
    grid lines white.
    """
    # Legend text and placement.
    legend = p.legend
    legend.label_text_font_size = '20pt'
    legend.label_text_font_style = "normal"
    legend.label_text_color = '#000000'
    legend.location = (5, 290)
    legend.orientation = "horizontal"

    # No toolbar on the exported figure.
    p.toolbar_location = None

    # Tick positions and label font shared by both axes.
    all_axes = p.axis
    all_axes.ticker = [10**exponent for exponent in (0, 2, 4, 6)]
    all_axes.major_label_text_font = "arial"
    all_axes.major_label_text_font_size = "20px"

    # Both axes are hidden on the main panels.
    p.xaxis.visible = False
    p.yaxis.visible = False

    # White background with white grid lines.
    p.background_fill_color = "white"
    grid = p.grid
    grid.grid_line_color = "white"
    grid.grid_line_width = 4

    return p

def changeInsetAppearance(p):
    """Style an inset histogram figure in place and hand the same figure back.

    Removes the toolbar, sets the axis label font, shows only the x axis
    (ticks at -1, -0.5, 0, 0.5, 1, labelled "log-10 ratio"), hides the y axis,
    and paints the background and grid lines white.
    """
    p.toolbar_location = None

    # Settings applied to every axis first; the x axis ticker is overridden below.
    all_axes = p.axis
    all_axes.ticker = [10**exponent for exponent in (0, 2, 4, 6)]
    all_axes.major_label_text_font = "arial"
    all_axes.major_label_text_font_size = "15px"

    # Only the x axis is drawn on the inset.
    x_axis = p.xaxis
    x_axis.visible = True
    x_axis.ticker = [-1, -.5, 0, .5, 1]
    x_axis.axis_label = "log-10 ratio"
    p.yaxis.visible = False

    # White background with white grid lines.
    p.background_fill_color = "white"
    grid = p.grid
    grid.grid_line_color = "white"
    grid.grid_line_width = 4

    # Axis label in normal (non-italic) style.
    p.axis.axis_label_text_font_style = "normal"

    return p

#### Main Figure Pre versus Post (Early vs Late) Lysate Transcripts:

In [4]:
# Lysate panel: pre-stress sample on x, post-stress sample on y.
x2 = 'Early_Lys' # Pre-stress whole cell lysate reads
y2 = 'Late_Lys' # Post-stress whole cell lysate reads

x2_df = df_dict[x2]
y2_df = df_dict[y2]
# merge() without `on=` joins on the columns the two tables have in common.
plot_df2 = x2_df.merge(y2_df)
# Per-gene log10(pre) - log10(post).
plot_df2['Log_Dif'] = np.log10(plot_df2[x2]) - np.log10(plot_df2[y2])

p2 = figure(
    width = 385,
    height = 385,
    aspect_scale = 1,
    x_axis_type = "log",
    y_axis_type = "log",
    tooltips = [("Gene", "@Name")],
    output_backend = "svg",
)

print(f"n = {len(plot_df2)}")

# Split genes by whether they changed by less than, or at least, one order of magnitude.
within_order_mag2 = plot_df2.loc[plot_df2['Log_Dif'].abs() < 1]
greater_than_order_mag2 = plot_df2.loc[plot_df2['Log_Dif'].abs() >= 1]

# Genes within one order of magnitude: plain markers carrying the legend entry.
p2.circle(
    x=x2, y=y2, source=within_order_mag2,
    size=7, fill_alpha=0.8, line_alpha=0,
    color="darkcyan", legend_label = "Lysate",
)
# Genes that moved by an order of magnitude or more: same fill, orange-red outline.
p2.circle(
    x=x2, y=y2, source=greater_than_order_mag2,
    size=7, fill_alpha=0.8, line_alpha=1,
    color="darkcyan", line_color = "orangered", line_width = 2,
)

p2 = changeAppearance(p2)

show(p2)

n = 2477


In [5]:
# Save the lysate scatter (p2) to an SVG whose filename carries today's date.
# NOTE: bokeh's export functions need selenium and a browser webdriver installed;
# skip this cell if they are not available.
export_svg(p2, filename = f'3B_Lys_Main_{datetime.date.today()}.svg')

['3B_Lys_Main_2023-10-23.svg']

In [6]:
# Small inset figure showing the distribution of lysate log-ratios.
p2_insert = figure(
    width = 150,
    height = 125,
    output_backend = "svg",
)

# Histogram
# Ten evenly spaced edges on [-1, 1] give nine bins; np.histogram does not
# count values that fall outside the outermost edges.
bins2 = np.linspace(-1, 1, 10)
plt2_hist, plt2_edges = np.histogram(plot_df2['Log_Dif'], bins=bins2, density=False)
p2_insert.quad(
    left = plt2_edges[:-1],
    right = plt2_edges[1:],
    bottom = 0,
    top = plt2_hist,
    fill_color = "steelblue",
    fill_alpha = 1,
    line_color = "white",
)
p2_insert = changeInsetAppearance(p2_insert)

show(p2_insert)

# Pearson correlation between the log10 pre- and post-stress values
# (off-diagonal entry of the 2x2 correlation matrix).
r_lys = np.corrcoef(
    np.log10(plot_df2[x2]),
    np.log10(plot_df2[y2]),
)[0, 1]

print(f"Pearson Coefficient: {r_lys}")
print(f"Standard Deviation of Log Fold Change Distribution: {np.std(plot_df2['Log_Dif'])}")

Pearson Coefficient: 0.7440591509338925
Standard Deviation of Log Fold Change Distribution: 0.3968152051661924


In [8]:
# Save the lysate inset histogram (p2_insert) to a date-stamped SVG file.
export_svg(p2_insert, filename = f'3B_Lys_Density_Insert_{datetime.date.today()}.svg')

['3B_Lys_Density_Insert_2023-10-23.svg']

#### Main Figure Pre versus Post (Early vs Late) MTC Protected Transcripts:

In [9]:
# MTC panel: pre-stress encapsulated sample on x, post-stress on y.
# This cell reads plot_df2, so the lysate comparison cell must have been run first.
x = 'Early_Cap' # Pre-Stress encapsulated reads
y = 'Late_Cap' # Post-Stress encapsulated reads

x_df = df_dict[x]
y_df = df_dict[y]
# merge() without `on=` joins on the columns the two tables have in common.
plot_df = x_df.merge(y_df)
# Per-gene log10(pre) - log10(post).
plot_df['Log_Dif'] = np.log10(plot_df[x]) - np.log10(plot_df[y])

p = figure(
    y_axis_type = "log", x_axis_type = "log",
    aspect_scale = 1, width = 385, height = 385,
    output_backend = "svg", tooltips = [("Gene", "@Name")]
)
print(f"n = {len(plot_df[x])}")
within_order_mag = plot_df[np.abs(plot_df['Log_Dif']) < 1]

# Highlight, in the MTC data, the genes that changed by at least an order of
# magnitude in the LYSATE comparison. A dedicated variable is used so the
# `names` list of sample names built while loading the files is not overwritten.
lysate_outlier_names = plot_df2.loc[np.abs(plot_df2['Log_Dif']) >= 1, "Name"]
greater_than_df = plot_df[plot_df["Name"].isin(set(lysate_outlier_names))]

# NOTE(review): an MTC gene with |Log_Dif| >= 1 that is not a lysate outlier is in
# neither data source below and so is not drawn - confirm this is intended.
p.circle(x=x, y=y, source=within_order_mag, size=7, fill_alpha=0.8, line_alpha=0, color="orange", legend_label = "MTC")
p.circle(x=x, y=y, source=greater_than_df, size=7, fill_alpha=1, line_alpha=1, color="orange", line_color="orangered", line_width=2)

p = changeAppearance(p)

show(p)

n = 2543


In [10]:
# Save the MTC scatter (p) to a date-stamped SVG file in the current directory.
export_svg(p, filename = f'./3C_MTC_Main_{datetime.date.today()}.svg')

['./3C_MTC_Main_2023-10-23.svg']

Now we will make the figure for the log-fold change distribution. We will also print out the R (Pearson) coefficient:

In [11]:
# Small inset figure showing the distribution of MTC log-ratios.
p_insert = figure(
    width = 150,
    height = 125,
    output_backend = "svg",
)

# Histogram
# Ten evenly spaced edges on [-1, 1] give nine bins; np.histogram does not
# count values that fall outside the outermost edges.
bins = np.linspace(-1, 1, 10)
plt_hist, plt_edges = np.histogram(plot_df['Log_Dif'], bins=bins, density=False)
p_insert.quad(
    left = plt_edges[:-1],
    right = plt_edges[1:],
    bottom = 0,
    top = plt_hist,
    fill_color = "steelblue",
    fill_alpha = 1,
    line_color = "white",
)
p_insert = changeInsetAppearance(p_insert)

show(p_insert)

# Pearson correlation between the log10 pre- and post-stress values
# (off-diagonal entry of the 2x2 correlation matrix).
r_mtc = np.corrcoef(
    np.log10(plot_df[x]),
    np.log10(plot_df[y]),
)[0, 1]
print(f"Pearson Coefficient: {r_mtc}")
print(f"Standard Deviation of Log Fold Change Distribution: {np.std(plot_df['Log_Dif'])}")

Pearson Coefficient: 0.9913704752824712
Standard Deviation of Log Fold Change Distribution: 0.0767996753038378


In [12]:
# Save the MTC inset histogram (p_insert) to a date-stamped SVG file.
export_svg(p_insert, filename = f'3C_MTC_Density_Insert_{datetime.date.today()}.svg')

['3C_MTC_Density_Insert_2023-10-23.svg']