# Plot Guide Counts and Generate Fold Change Table

In [None]:
# Imports and global plotting configuration
import os
import math
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import auc

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply seaborn's 'ticks' style and 'talk' context to every figure drawn below.
sns.set_style('ticks')
sns.set_context('talk')

## Read in raw read counts per sample

The input table is generated by the `combine_read_counts.py` and `read_counts_from_bam.py` scripts.

In [None]:
# Load the per-guide read counts table passed in as the first Snakemake input.
read_counts = pd.read_csv(snakemake.input[0])

# Total Coverage per sample, used as the denominator for read fractions.
samp2mapped = read_counts.groupby(['sample_name'])['Coverage'].sum().to_dict()

# Fail early with a readable message: a sample whose total is zero would
# otherwise only surface as a division problem further down.
empty_samples = [name for name, total in samp2mapped.items() if total == 0]
if empty_samples:
    raise ValueError(
        "No mapped reads for sample(s): {}".format(
            ', '.join(str(name) for name in empty_samples)))

# Fraction of each sample's reads assigned to each row. Mapping the totals
# onto the sample_name column and dividing column-wise avoids a slow
# Python-level apply over every row.
read_counts['fraction_reads'] = (
    read_counts['Coverage'] / read_counts['sample_name'].map(samp2mapped))
read_counts.head()

## Determine T0 Samples from sample sheet

In [None]:
# Display the distinct sample names present in the read counts table.
read_counts['sample_name'].unique()

In [None]:
# Load the tab-separated sample sheet named in the workflow config and order
# it by timepoint; the resulting sample order is reused for plots and tables.
sample_df = pd.read_csv(snakemake.config['samples'], sep='\t')
sample_df = sample_df.sort_values(by=['timepoint'])
sample_order = sample_df['sample_name'].values
sample_df.head()

In [None]:
# T0 is the smallest timepoint in the sample sheet; collect every sample
# recorded at that timepoint.
t0 = sample_df['timepoint'].min()
is_t0 = sample_df['timepoint'] == t0
t0_samples = sample_df.loc[is_t0, 'sample_name'].values
t0_samples

## Plot Lorenz Curve for T0 Samples

The closer the AUC is to 0.5, the more evenly the reads are distributed across guides (closer to 0.5 is better).

In [None]:
# One panel per T0 sample: guides are ranked by descending read fraction and
# the cumulative fraction is plotted against the ranked-guide fraction. The
# dashed grey line joins the origin to the curve's last point for reference,
# and the area under the curve is printed on each panel.
pal = sns.color_palette('colorblind', len(t0_samples))
# squeeze=False always returns a 2-D array of Axes. Without it, a single T0
# sample yields a bare Axes object that cannot be iterated by zip().
fig, axes = plt.subplots(figsize=(13, 5), ncols=len(t0_samples), squeeze=False)
for idx, (smpl, ax, color) in enumerate(zip(t0_samples, axes.ravel(), pal)):
    y = read_counts[read_counts.sample_name == smpl].fraction_reads.sort_values(
        ascending=False).values
    x = np.arange(0, len(y)) / len(y)
    y = np.cumsum(y)
    ax.plot(x, y, c=color)
    ax.plot([0, x[-1]], [0, y[-1]], '--', c='grey')
    lbl = auc(x, y)
    ax.text(0, 0.9, "AUC = {:.3f}".format(lbl))
    ax.set_title(smpl)
    ax.set_xlabel("Ranked Guides")
    # Only the left-most panel carries the y-axis label.
    if idx == 0:
        ax.set_ylabel("Fraction Reads")

# Save to the Snakemake-declared output path, plus a PNG copy with the same stem.
fig.savefig(snakemake.output[0])
prefix, _ = os.path.splitext(snakemake.output[0])
fig.savefig(prefix + '.png')

## Plot Read Distributions

In [None]:
# Histogram of per-guide Coverage for the T0 samples: one row and one hue per
# sample, with independent x and y axes for each panel.
t0_facet_options = dict(row='sample_name',
                        row_order=t0_samples,
                        hue='sample_name',
                        hue_order=t0_samples,
                        sharex=False,
                        sharey=False,
                        height=6,
                        aspect=1.5)
g = sns.FacetGrid(read_counts, **t0_facet_options)
g.map_dataframe(sns.histplot, 'Coverage')

# Save to the Snakemake-declared output path, plus a PNG copy with the same stem.
plt.savefig(snakemake.output[1])
prefix = os.path.splitext(snakemake.output[1])[0]
plt.savefig(prefix + '.png')

In [None]:
# Histogram of per-guide Coverage for every sample, in sample-sheet order:
# one row and one hue per sample, with independent x and y axes for each panel.
all_facet_options = dict(row='sample_name',
                         row_order=sample_order,
                         hue='sample_name',
                         hue_order=sample_order,
                         sharex=False,
                         sharey=False,
                         height=6,
                         aspect=1.5)
g = sns.FacetGrid(read_counts, **all_facet_options)
g.map_dataframe(sns.histplot, 'Coverage')

# Save to the Snakemake-declared output path, plus a PNG copy with the same stem.
plt.savefig(snakemake.output[2])
prefix = os.path.splitext(snakemake.output[2])[0]
plt.savefig(prefix + '.png')

## Calculate Normalized Read Counts

In [None]:
# Scale read fractions by 1e7 and replace exact zeros with 0.1 so that ratios
# and log2 values computed from these counts stay finite.
# The result is assigned back explicitly: calling replace(..., inplace=True)
# on `read_counts.normalized_count` is chained assignment, which does not
# update the DataFrame under pandas Copy-on-Write.
read_counts['normalized_count'] = (read_counts.fraction_reads * 1e7).replace(0, 0.1)

# Wide table: one row per Target, one column per sample. Passing `values`
# restricts the aggregation to the normalized counts instead of aggregating
# every remaining column and selecting one afterwards.
pivot = read_counts.pivot_table(index='Target',
                                columns='sample_name',
                                values='normalized_count').reset_index()
pivot.index.name = 'index'

# Gene is the fourth underscore-delimited field of the Target identifier.
pivot['Gene'] = pivot.Target.apply(lambda x: x.split('_')[3])

# Identifier columns first, then samples in sample-sheet order.
col_order = ['Target', 'Gene'] + list(sample_order)
pivot = pivot[col_order]
pivot.to_csv(snakemake.output[3], sep='\t', index=False)
pivot.head()

## Calculate Fold Change vs T0 for each Genotype

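The resulting tab-delimited file contains the log2 fold change of each non-T0 sample relative to the T0 sample of the same genotype, and can be used with BAGEL.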

In [None]:
# Build the log2 fold-change table: for every genotype, each non-T0 sample is
# compared against that genotype's single T0 sample.
fold_change_df = pivot[['Target', 'Gene']].copy()
for gt in sample_df.genotype.unique():
    genotype_rows = sample_df[sample_df.genotype == gt]
    baseline = genotype_rows.loc[genotype_rows.timepoint == t0, 'sample_name'].values
    # Each genotype must have exactly one T0 sample to compare against.
    if len(baseline) != 1:
        raise ValueError("Expected exactly 1 T0 sample for genotype {}".format(gt))
    t0_name = baseline[0]
    later_samples = genotype_rows.loc[
        genotype_rows.sample_name != t0_name, 'sample_name'].values
    for smpl in later_samples:
        print("Comparing {} vs {}".format(smpl, t0_name))
        ratio = pivot[smpl] / pivot[t0_name]
        # math.log2 is applied element-wise so a non-positive ratio raises
        # instead of silently producing -inf.
        fold_change_df[smpl] = ratio.apply(math.log2)
fold_change_df.to_csv(snakemake.output[4], sep='\t', index=False)
fold_change_df.head()