### Figure 2B - Capsule vs Capsule Transcriptome:

Here we plot the data comparing the two capsule samples: the capsule sample purified from the E. coli sample alone, and the capsule sample purified from the Mixed sample. The resulting scatterplot has one point per gene, placed at that gene's RPKM (Reads Per Kilobase per Million mapped reads) value in the E. coli sample (x-axis) and in the Mixed sample (y-axis).

In [3]:
import pandas as pd
import os
from bokeh.io import push_notebook, show, output_notebook

First we will read our data files into data frames:

In [5]:
# Load every per-sample dataframe file of this sequencing run into df_dict,
# keyed by the short sample name embedded in the file name.
df_dir = '../../../Processed Data Files/dataframe files/230117Li/'
prefix = '230117Li_Mixing_'
df_dict = dict()
names = []
# os.listdir() returns entries in arbitrary order; sort so the printed sample
# list is reproducible between runs and machines.
for file in sorted(os.listdir(df_dir)):
    if file.endswith('_dataframe.txt') and file.startswith(prefix):
        # The sample name sits between the run prefix and the first '_1'.
        # partition() leaves the name intact when '_1' is absent, whereas
        # slicing with find() == -1 would silently drop the last character.
        name = file[len(prefix):].partition('_1')[0]
        new_df = pd.read_csv(os.path.join(df_dir, file))
        # Feature length, later used by the RPKM normalisation.
        new_df['Length'] = new_df['Stop'] - new_df['Start']
        df_dict[name] = new_df
        names.append(name)
print(f'\nHere is the list of unique samples for which there are dataframe files: \n\n{names}\n')
# Display names used in plot titles, axis labels and legends.
names_dict = {'Ecoli_Cap':'E. coli Capsule', 'Mix_Cap':'Mixture Capsule', 'Ecoli_Lysate':'E. coli Lysate',
              'Mix_Lysate':'Mixture Lysate', 'Bsub_Lysate':'B. subtilis Lysate'}


Here is the list of unique samples for which there are dataframe files: 

['Bsub_Lysate', 'Ecoli_Cap', 'Ecoli_Lysate', 'Mix_Cap', 'Mix_Lysate', 'ot_Early_Cap', 'ot_Early_Lysate', 'ot_Late_Cap', 'ot_Late_Lysate']



A helper function to help us with the plotting:

In [11]:
def into_RPKM(df_dict, name):
    """Return a copy of one sample's dataframe with its counts converted to RPKM.

    Args:
        df_dict: Mapping of sample name to a dataframe that has at least the
            columns 'Counts' and 'Length'.
        name: Key of the sample in ``df_dict``; also used as the name of the
            resulting RPKM column.

    Returns:
        A new dataframe in which the 'Counts' column is renamed to ``name``
        and holds RPKM values. The dataframe stored in ``df_dict`` is left
        untouched.

    Raises:
        KeyError: If ``name`` is not in ``df_dict`` or a required column is
            missing.
    """
    sample_df = df_dict[name].copy().rename(columns={'Counts': name})

    # Reads per million: scale by the sample's total count (Series.sum skips NaN).
    total_reads = sample_df[name].sum()
    rpm = sample_df[name] / (total_reads / 1000000)

    # Per *kilobase*: 'Length' is assumed to be in bases, so convert it to kb
    # first. Dividing by the raw base count would give values 1000x too small.
    sample_df[name] = rpm.div(sample_df['Length'] / 1000)

    return sample_df

Comparative abundance plots:

In [12]:
# Samples to compare: x-axis is the E. coli capsule sample, y-axis the
# capsule sample from the mixture.
x, y = 'Ecoli_Cap', 'Mix_Cap'

x_df = into_RPKM(df_dict, x)
y_df = into_RPKM(df_dict, y)

# Join the two samples on every column they share, then discard the 'error'
# rows and any row whose value is zero in either sample (zeros cannot be
# log-transformed or shown on log axes).
plot_df = x_df.merge(y_df)
is_error = plot_df['Name'] == 'error'
is_zero = (plot_df[x] == 0) | (plot_df[y] == 0)
plot_df = plot_df.drop(index=plot_df.index[is_error | is_zero])

# Separate copy for plotting, with raw genome identifiers replaced by display
# names; plot_df itself keeps the raw identifiers.
genome_map = {'bsubtilis': 'B. subtilis', 'capsule': 'E. coli', 'ecoli': 'E. coli'}
cap_ecoli_same_df = plot_df.copy()
cap_ecoli_same_df['Genome'] = cap_ecoli_same_df['Genome'].map(genome_map)
dot_source = ColumnDataSource(cap_ecoli_same_df)

# Theil-Sen regression of log(y) on log(x).
x_vals = np.asarray(plot_df[x])
y_vals = np.asarray(plot_df[y])
log_x = np.log(x_vals).reshape(-1, 1)
theil_sen_regressor = TheilSenRegressor()
theil_sen_regressor.fit(log_x, np.log(y_vals))

# Fitted curve evaluated at the sorted x values, mapped back to linear space.
fit_x_vals = np.sort(x_vals)
fit_y_vals = np.exp(theil_sen_regressor.predict(np.log(fit_x_vals).reshape(-1, 1)))

# NOTE: R^2 is scored on the back-transformed (linear-space) values, although
# the model itself was fitted in log space.
fit_r2 = r2_score(plot_df[y], np.exp(theil_sen_regressor.predict(log_x)))

fit_label = f'Fit, R^2 = {fit_r2:.3f}'
line_source = ColumnDataSource({
    "x_line": fit_x_vals,
    "y_line": fit_y_vals,
    "Name": [fit_label] * len(fit_x_vals),
})

Let's check which genomes we have to work with:

In [13]:
# Show the distinct genome labels present after mapping to display names.
print(cap_ecoli_same_df['Genome'].unique().tolist())

['B. subtilis', 'E. coli']


Now let's make the plot:

In [14]:
# Log-log scatter plot of the two capsule samples, one point per row of
# dot_source, coloured by genome, written out as a standalone HTML file.
name_1 = names_dict[x].replace('\n', '')
name_2 = names_dict[y].replace('\n', '')
title_str = f"{name_1} vs. {name_2} RPKM"
font = "Arial"
hover = HoverTool(tooltips = [('Name', '@Name')])
mapper = CategoricalColorMapper(palette=["skyblue","sandybrown"], factors=["B. subtilis", "E. coli"])
# width/height are used instead of plot_width/plot_height: the latter were
# removed in Bokeh 3, and the bar plot further down already uses width/height.
p = figure(title = title_str, y_axis_type = 'log',
           x_axis_type = 'log', tools=[hover, 'wheel_zoom', 'box_zoom', 'save', 'reset'],
           x_axis_label = f'{names_dict[x]} Sample RPKM',
           y_axis_label = f'{names_dict[y]} Sample RPKM',
           width = 1200, height = 1200)
data = p.circle(x, y, source = dot_source, fill_alpha=0.35, size=15,
                line_alpha = 0.05, color={'field': 'Genome', 'transform': mapper},
                legend_field = 'Genome')

# Disabled: overlay of the Theil-Sen fit computed from line_source.
#fit = p.line("x_line", "y_line", source = line_source, line_width=6, line_alpha = 0.5, line_color = "orange")

# Title and axis typography.
p.title.text_font_size = '35pt'
p.title.align = "center"
p.title.text_font = font
p.xaxis.axis_label_text_font_size = "30pt"
p.xaxis.axis_label_text_font = font
p.xaxis.axis_label_text_font_style = "bold"
p.yaxis.axis_label_text_font_size = "30pt"
p.yaxis.axis_label_text_font = font
p.yaxis.axis_label_text_font_style = "bold"

p.xaxis.major_label_text_font_size = "25pt"
p.yaxis.major_label_text_font_size = "25pt"
p.xgrid.grid_line_alpha = 0.5
p.ygrid.grid_line_alpha = 0.5

# Disabled: manual legend that also reports the fit's R^2 (needs `fit` above).
#legend = Legend(items=[
#    (f'Observed RPKM of a given gene',   [data]),
#    (f'Linear Fit R^2: {fit_r2:.3f}', [fit]),
#], location=(380, 200), label_text_font_size = "25px",
#   border_line_alpha = 1)
#p.add_layout(legend)

# Styling of the automatic per-genome legend created by legend_field.
p.legend.label_text_font_size = '35pt'
p.legend.glyph_height = 50
p.legend.glyph_width = 90
p.legend.label_height = 50
p.legend.label_width = 50
p.legend.background_fill_color = 'ghostwhite'
p.legend.location = (120, 700)
p.legend.border_line_width = 2

html = file_html(p, CDN, f"{x}_vs_{y}")
# Explicit UTF-8 so the export does not depend on the platform's default encoding.
with open(f"./{x}_vs_{y}.html", 'w', encoding='utf-8') as f:
    f.write(html)

We can plot the relative abundances of each genome. 

In [17]:
# Stacked bar chart: for every sample, the fraction of the total RPKM signal
# that falls on each genome. Written out as a standalone HTML file.
samples = ['Mix_Cap', 'Mix_Lysate', 'Ecoli_Cap', 'Ecoli_Lysate', 'Bsub_Lysate']
genome_map = {'bsubtilis':'B. subtilis','capsule':'E. coli', 'ecoli':'E. coli'}
# Colours are looked up per genome, so each genome keeps the same colour as in
# the scatter plot regardless of the order in which genomes appear in the data.
genome_colors = {"B. subtilis": "skyblue", "E. coli": "sandybrown"}

genomes = list(cap_ecoli_same_df.Genome.unique())
real_names = [names_dict[n] for n in samples]
bar_plot_data = {g: [] for g in genomes}

for n in samples:
    # into_RPKM already returns a fresh copy, so it can be modified directly.
    data_df = into_RPKM(df_dict, n)
    data_df['Genome'] = data_df['Genome'].map(genome_map)
    sample_total = data_df[n].sum()
    for g in genomes:
        genome_total = data_df.loc[data_df.Genome == g, n].sum()
        bar_plot_data[g].append(genome_total / sample_total)
bar_plot_data['samples'] = real_names

# NOTE: mapper and hover are not attached to this figure; they are kept only
# because they are notebook-level names.
mapper = CategoricalColorMapper(palette=["skyblue", "sandybrown"], factors=["B. subtilis", "E. coli"])
hover = HoverTool(tooltips = [('Name', '@Name')])
p = figure(y_range = [0, 1.25], x_range = real_names, height = 1000, width = 1400,
           title="Percent Reads Mapping to Each Genome")
p.vbar_stack(genomes, x = 'samples', width=0.9, source = bar_plot_data,
             color = [genome_colors[g] for g in genomes],
             legend_label = genomes)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = (360, 675)
p.legend.label_text_font_size = '35pt'
p.legend.orientation = "horizontal"

p.title.text_font_size = "40pt"
p.title.align = 'center'

p.xaxis.major_label_text_font_size = "35pt"

p.yaxis.axis_label="Percentage Mapped"
p.yaxis.axis_label_text_font_size = "35pt"
p.yaxis.major_label_text_font_size = "35pt"
p.xaxis.axis_label_text_font_style = "bold"
p.yaxis.axis_label_text_font_style = "bold"

p.legend.glyph_height = 50
p.legend.glyph_width = 90
p.legend.label_height = 50
p.legend.label_width = 50

# The same string is used as the HTML document title and as the file name.
out_name = f"Percent Genome Plot {'_'.join(samples)}.html"
html = file_html(p, CDN, out_name)
# Explicit UTF-8 so the export does not depend on the platform's default encoding.
with open(out_name, 'w', encoding='utf-8') as f:
    f.write(html)

We can also create a 2d plot comparing the relative abundances of each genome:

In [22]:
# 2D scatter: one point per sample, positioned by the fraction of its total
# RPKM signal that falls on B. subtilis (x) and on E. coli (y).
df_2d_plot = {"sample_ids": ['Mix_Cap', 'Mix_Lysate', 'Ecoli_Cap', 'Ecoli_Lysate', 'Bsub_Lysate'], "B. subtilis":[], "E. coli":[], "real_names": [],
             "colors": ['lightseagreen', 'lightsalmon', 'lightpink', 'khaki', 'darkturquoise']}
genome_map = {'bsubtilis':'B. subtilis','capsule':'E. coli', 'ecoli':'E. coli'}
# Fixed list of the genome columns to fill. Iterating over each sample's own
# unique genomes could skip a genome that is absent from a sample (leaving the
# columns with different lengths) or hit a label that has no column at all.
genome_labels = ["B. subtilis", "E. coli"]

for n in df_2d_plot["sample_ids"]:
    df_2d_plot["real_names"].append(names_dict[n])
    # into_RPKM already returns a fresh copy, so it can be modified directly.
    data_df = into_RPKM(df_dict, n)
    data_df['Genome'] = data_df['Genome'].map(genome_map)
    sample_total = data_df[n].sum()
    for g in genome_labels:
        genome_total = data_df.loc[data_df.Genome == g, n].sum()
        df_2d_plot[g].append(genome_total / sample_total)

# NOTE: mapper is not used by this figure; it is kept only because it is a
# notebook-level name.
mapper = CategoricalColorMapper(palette=["skyblue", "sandybrown"], factors=["B. subtilis", "E. coli"])
hover = HoverTool(tooltips = [('Sample', '@real_names')])
# width/height are used instead of plot_width/plot_height: the latter were
# removed in Bokeh 3, and the bar plot above already uses width/height.
p = figure(title = "Percent Reads Mapped to Each Genome", y_axis_type = 'log',
           x_axis_type = 'log', tools=[hover, 'wheel_zoom', 'box_zoom', 'save', 'reset'],
           x_axis_label = "Percent Mapping to B. Subtilis Genome",
           y_axis_label = "Percent Mapping to E. coli Genome",
           width = 1200, height = 1200, y_range = [0.00001, 2.5], x_range = [0.00001, 2.5],)
data = p.circle("B. subtilis", "E. coli", source = ColumnDataSource(df_2d_plot), size=60,
                legend_field = "real_names", fill_color = "colors", line_color = "colors")

p.legend.label_text_font_size = '35pt'

p.title.text_font_size = "30pt"
p.title.align = 'center'

p.xaxis.axis_label_text_font_size = "30pt"
p.xaxis.major_label_text_font_size = "30pt"

p.yaxis.axis_label_text_font_size = "30pt"
p.yaxis.major_label_text_font_size = "30pt"
p.xaxis.axis_label_text_font_style = "normal"
p.yaxis.axis_label_text_font_style = "normal"


p.legend.glyph_height = 100
p.legend.glyph_width = 90
p.legend.label_height = 100
p.legend.label_width = 100
p.legend.location = (200, 200)

html = file_html(p, CDN, "2d_Genome_plot_v2.html")
# Explicit UTF-8 so the export does not depend on the platform's default encoding.
with open("2d_Genome_plot_v2.html", 'w', encoding='utf-8') as f:
    f.write(html)