In [1]:
# Generates basic stats for ccs-pacbio runs from CCS subreads

In [None]:
# standard library
import math
import os
import time

# third-party
import yaml
import numpy as np
import pandas as pd

# this is taking a while to import alignparse.targets
import alignparse
import alignparse.ccs
import alignparse.targets
import alignparse.minimap2
from alignparse.constants import CBPALETTE

import plotnine as p9
from plotnine import *
import dms_variants.plotnine_themes
import dms_variants.utils
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling: seaborn "talk" context, 300 dpi for both on-screen
# rendering and saved figures.
sns.set_context("talk")
plt.rcParams.update({'figure.dpi': 300, 'savefig.dpi': 300})

In [None]:
# Output locations: alignparse results and the images generated from them.
save_dir = "../../results/alignparse"
img_dir = "../../results/alignparse/img"

# Create both directories up front; existing directories are left untouched.
for out_dir in (save_dir, img_dir):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Location of the raw PacBio data.
data_dir = '../../data/pacbio'

# Parse the YAML configuration stored in the alignparse results directory.
config_file = os.path.join(save_dir, 'pkr_config.yaml')
with open(config_file) as config_handle:
    config = yaml.safe_load(config_handle)

In [None]:
# setup pacbio run info for summaries
# adding a fastq column in the .csv
# The run summary CSV is read as strings; the `subreads` column is dropped and a
# `name` column is built as "<library>.<run>".
run_file = os.path.join(save_dir, 'pacbio_runs_summary.csv')
# The assignment/opening parenthesis was missing, which made this cell a syntax
# error and left `pacbio_runs` undefined for every later cell.
pacbio_runs = (
    pd.read_csv(run_file, dtype=str)
    .drop(columns=['subreads'])
    .assign(name=lambda x: x['library'] + '.' + x['run'])
    )
pacbio_runs

In [None]:
# Build the ZMW (subreads) stats table used for the figure in the next cell:
# one stats frame per CCS report, tagged with a short run label in `name`.
lib = ['Run 1', 'Run 2']
df_list = [
    alignparse.ccs.report_to_stats(report).assign(name=run_label)
    for report, run_label in zip(pacbio_runs.report, lib)
]
df = pd.concat(df_list)

# ZMW outcome categories kept for plotting.
status_list = [
    'Success -- CCS generated',
    'Failed -- Lacking full passes',
    'Failed -- CCS below minimum RQ',
]

In [None]:
# Stacked bar chart of ZMW outcomes per run, saved to the image directory.
# TODO: could be nice to use better colors
# TODO: also use sci-notation for y-axis

# Only the outcome categories listed in `status_list` are plotted.
temp_df = df[df.status.isin(status_list)]
p = (p9.ggplot(temp_df, p9.aes(x='name', y='number', fill='status')) +
     p9.geom_col(position=p9.position_stack(reverse=True), width=0.8) +
     p9.theme(axis_text_x=p9.element_text(angle=90,
                                          vjust=1,
                                          hjust=0.5),
              figure_size=(0.4 * len(df['name'].unique()), 2.5)
              ) +
     p9.ylab('# Sub-Reads (ZMWs)') +
     p9.xlab('')
     )

# Decide on the colour-blind palette from the statuses that are actually
# plotted (the filtered frame), not from every status present in the report.
if temp_df['status'].nunique() < len(CBPALETTE):
    p = p + p9.scale_fill_manual(CBPALETTE[1:])

plot_name = os.path.join(img_dir, 'sub-read_summary.png')
p.save(plot_name)

# A bare expression only renders when it is the last statement of the cell.
p

In [None]:
# Load the per-read CCS statistics of both runs into a single frame, labelling
# each read with the run it came from (column `Library`).
file_1 = os.path.join(data_dir, 'BCZ0017_1/CCS_1592/demultiplex.bc1009_BAK8A_OA--bc1009_BAK8A_OA.ccs_statistics.csv')
file_2 = os.path.join(data_dir, 'BCZ0017_1/CCS_1608/demultiplex.bc1009_BAK8A_OA--bc1009_BAK8A_OA.Q20.stats.csv')

ccs_file_list = [file_1, file_2]
lib = ['Run 1', 'Run 2']
df_list = [
    pd.read_csv(stats_path).assign(library=run_label)
    for stats_path, run_label in zip(ccs_file_list, lib)
]
df = (
    pd.concat(df_list)
    .reset_index(drop=True)
    .rename(columns={'library': 'Library'})
)
df.head()

In [None]:
# Quick-look histogram of read length across both runs combined.
df['length'].plot(kind='hist', bins=50)

In [None]:
# Read-length distribution per library, saved as ccs_read-length.png.
sns.set(rc={'figure.figsize': (4.5, 4)})
ax = sns.histplot(data=df, x='length', hue='Library', bins=50, alpha=1)
ax.set_xlabel('Read Length')
ax.set_xlim(0, 8000)
fig = ax.get_figure()
fig.tight_layout()
# Applied after drawing, so it only influences figures created later.
sns.set_context("talk")

plot_name = os.path.join(img_dir, 'ccs_read-length.png')
fig.savefig(plot_name)

plt.show()

In [None]:
# Read-accuracy (readscore) distribution per library, saved as
# ccs_read-accuracy.png.
sns.set(rc={'figure.figsize': (4.5, 4)})
ax = sns.histplot(data=df, x='readscore', hue='Library', bins=50, alpha=1)
ax.set_xlabel('Read Accuracy')
# Plain (non-scientific) tick labels on the count axis.
ax.ticklabel_format(style='plain', axis='y')
ax.set_xlim(.995, 1)
fig = ax.get_figure()
fig.tight_layout()
# Applied after drawing, so it only influences figures created later.
sns.set_context("talk")

plot_name = os.path.join(img_dir, 'ccs_read-accuracy.png')
fig.savefig(plot_name)

plt.show()

In [None]:
# Number-of-passes distribution per library, saved as ccs_read-passes.png.
sns.set(rc={'figure.figsize': (4.1, 4)})
ax = sns.histplot(data=df, x='num_passes', hue='Library', bins=100, alpha=1)
ax.set_xlabel('Read Passes')
ax.set_xlim(0, 75)
fig = ax.get_figure()
fig.tight_layout()
# Applied after drawing, so it only influences figures created later.
sns.set_context("talk")

plot_name = os.path.join(img_dir, 'ccs_read-passes.png')
fig.savefig(plot_name)

plt.show()

In [None]:
# Donut plot input: total reads that align to PKR, K3L, and junk.
# Load the minimap2 summary and list its read counts.
df = pd.read_csv('../minimap2_summary.csv')
df['reads'].tolist()

In [None]:
# Donut chart of reads per alignment target: a pie chart with a white disc
# drawn over its centre.
names = ['PKR', 'K3L', 'Off-Target']
# NOTE(review): hard-coded counts, presumably copied from the
# `df.reads.tolist()` output of the previous cell -- confirm they are current.
size = [440228, 164648, 1739593]

# White disc that turns the pie into a donut. Named for what it is (the
# original called this Circle `ax` and rebound the global `p` to the Figure).
centre_circle = plt.Circle((0, 0), 0.7, color='white')

# Label color (note: this changes the global rcParams for later figures too)
plt.rcParams['text.color'] = 'black'
plt.pie(size, labels=names, colors=['#6ec290', '#786bac', 'gray'])
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
#plt.title()

# Make sure the relative output directory exists; only `img_dir` is created
# earlier in the notebook, so saving to 'img/' could otherwise fail.
out_path = 'img/read_alignment.png'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.savefig(out_path)
plt.show()
plt.close()

In [None]:
# try making stacked barplot
# import libraries
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [None]:
# Load seaborn's "tips" example dataset; it is only used to prototype the
# stacked bar plot in the cells below.
tips = sns.load_dataset("tips")

In [None]:
# Preview the first rows of the example dataset.
tips.head()

In [None]:
# Stacked-bar prototype on the example data. Two bar layers are overlaid on the
# same axes: the per-day total first, then the smoker-only total on top of it,
# so the dark part left visible corresponds to non-smokers.

# Per-day total bill over all rows (smoker = Yes and No).
total = tips.groupby('day')['total_bill'].sum().reset_index()
# Rows for smokers only; summed per day by the barplot estimator below.
smoker = tips.loc[tips['smoker'] == 'Yes']

# Layer 1: full-height bars (total).
bar1 = sns.barplot(data=total, x="day", y="total_bill", color='darkblue')
# Layer 2: smoker-only bars drawn over the totals.
bar2 = sns.barplot(data=smoker, x="day", y="total_bill",
                   estimator=sum, ci=None, color='lightblue')

# Manual legend entries, one patch per layer colour.
top_bar = mpatches.Patch(color='darkblue', label='smoker = No')
bottom_bar = mpatches.Patch(color='lightblue', label='smoker = Yes')
plt.legend(handles=[top_bar, bottom_bar])

# show the graph
plt.show()

In [None]:
# Load the second minimap2 summary table; the plotting cell below reads its
# `Run`, `Target` and `Reads` columns. Note this rebinds `df` yet again.
df = pd.read_csv('minimap2_summary-2.csv')
df

In [None]:
# Stacked bar chart of reads per run, built from three overlaid bar layers:
# total (gray) -> PKR + K3L (green) -> K3L (purple). The gray part left
# visible is the off-target fraction, the green part is PKR.
plt.figure(figsize=(4, 6))

# Layer 1: total reads per run.
reads = df.groupby('Run')['Reads'].sum().reset_index()
# Use one explicit x order for every layer so the overlaid bars always line up
# on the same run, whatever the row order of each input frame.
run_order = reads['Run'].tolist()
bar1 = sns.barplot(x="Run", y="Reads", data=reads, order=run_order, color='gray')

# Layer 2: PKR + K3L reads per run. Summing by `Run` replaces the original
# positional addition of two re-indexed slices, which relied on both targets
# being listed in the same run order and assigned into a slice of `df`
# (SettingWithCopyWarning).
pkr = (
    df[df.Target.isin(['PKR', 'K3L'])]
    .groupby('Run')['Reads']
    .sum()
    .reset_index()
)
bar2 = sns.barplot(x="Run", y="Reads", data=pkr, order=run_order,
                   estimator=sum, ci=None, color='#6ec290')

# Layer 3: K3L reads alone.
k3l = df[df.Target=='K3L']
bar3 = sns.barplot(x="Run", y="Reads", data=k3l, order=run_order,
                   estimator=sum, ci=None, color='#786bac')

# Plain (non-scientific) tick labels on the read-count axis.
plt.ticklabel_format(style='plain', axis='y')

# Legend handles; the legend itself is currently disabled.
top_bar = mpatches.Patch(color='gray', label='Off-Target')
middle_bar = mpatches.Patch(color='#6ec290', label='PKR')
bottom_bar = mpatches.Patch(color='#786bac', label='K3L')
#plt.legend(loc='upper left', handles=[top_bar, middle_bar, bottom_bar])
plt.tight_layout()
fig = plt.gcf()

# Make sure the relative output directory exists; only `img_dir` is created
# earlier in the notebook, so saving to 'img/' could otherwise fail.
out_path = 'img/ccs_total-reads-bar.png'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.savefig(out_path)

In [None]:
# plots like Bloom lab
# By this point `df` holds the minimap2 summary (Run / Target / Reads), so the
# 'number', 'name' and 'status' columns used here are not available on a clean
# top-to-bottom run. Rebuild the ZMW stats from the CCS reports under a
# dedicated name instead of relying on leftover kernel state.
zmw_stats = pd.concat(
    [
        alignparse.ccs.report_to_stats(report).assign(name=run_label)
        for report, run_label in zip(pacbio_runs.report, ['Run 1', 'Run 2'])
    ]
)

# Keep the three largest ZMW categories.
temp_df = zmw_stats.sort_values('number', ascending=False)[:3]

p = (p9.ggplot(temp_df, p9.aes(x='name', y='number', fill='status')) +
     p9.geom_col(position=p9.position_stack(reverse=True), width=0.8) +
     p9.theme(axis_text_x=p9.element_text(angle=90,
                                          vjust=1,
                                          hjust=0.5),
              figure_size=(0.4 * len(zmw_stats['name'].unique()), 2.5)
              ) +
     p9.ylab('number of ZMWs') +
     p9.xlab('')
     )

# Use the colour-blind palette when it has enough colours for the statuses
# that are actually plotted.
if temp_df['status'].nunique() < len(CBPALETTE):
    p = p + p9.scale_fill_manual(CBPALETTE[1:])

p = p + p9.theme(panel_grid_major_x=p9.element_blank())  # no vertical grid lines
_ = p.draw()

In [None]:
# Re-apply the "no vertical grid lines" theme to the current plot and redraw it.
no_vertical_grid = p9.theme(panel_grid_major_x=p9.element_blank())
p = p + no_vertical_grid
_ = p.draw()

In [None]:
# ZMW stats plot produced by alignparse's own summary helper.
# `ccs_summaries` was never created anywhere in this notebook, so this cell
# raised NameError on a clean run; build it from the run table first.
# NOTE(review): assumes `pacbio_runs` carries the `name`, `fastq` and `report`
# columns that `alignparse.ccs.Summaries` looks for by default -- confirm
# against the CSV and the installed alignparse version.
ccs_summaries = alignparse.ccs.Summaries(pacbio_runs)
p = ccs_summaries.plot_zmw_stats()
p = p + p9.theme(panel_grid_major_x=p9.element_blank())  # no vertical grid lines
_ = p.draw()

