In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sequencing_tools.viz_tools import color_encoder, okabeito_palette
import re
# Apply the same label size (15) to the axis labels and to the tick labels
# of both axes in a single rc call.
plt.rc(('axes', 'xtick', 'ytick'), labelsize=15)

def label_sample(x):
    """Return the treatment/prep label for a sample name.

    The name is checked against an ordered list of rules and the label of the
    first rule that matches is returned, so earlier rules take precedence
    over later ones. Names matching no rule are labelled 'Unfragmented'.

    Parameters
    ----------
    x : str
        Sample name to classify.

    Returns
    -------
    str
        One of the prep labels listed in the rule table below, or
        'Unfragmented' when nothing matches.
    """
    # (predicate, label) pairs; order matters because the first hit wins.
    rules = (
        (lambda name: 'HS' in name, 'High salt'),
        (lambda name: 'Frag' in name, 'Fragmented'),
        (lambda name: re.search('N[aA]', name), 'Alkaline hydrolysis'),
        (lambda name: re.search('L[12]', name), 'PolyA-selected'),
        (lambda name: re.search('Exo|ED|DE', name), 'Exonuclease I'),
        (lambda name: 'All' in name, 'Untreated'),
        (lambda name: re.search('IGG|S9', name), 'Pull down'),
    )
    for is_match, label in rules:
        if is_match(x):
            return label
    return 'Unfragmented'

In [2]:
# Load the clipping table and derive the per-sample percentages and prep label.
tablename = '/stor/work/Lambowitz/cdw2854/cell_Free_nucleotides/tgirt_map/picard_qc/clip_table.tsv'
df = (
    pd.read_table(tablename)
    # clipped bases as a percentage of all bases
    .assign(base_fraction=lambda tbl: tbl['clipped_bases'] / tbl['bases'] * 100)
    # soft-clipped alignments as a percentage of all alignments
    .assign(clip_aln=lambda tbl: tbl['softclip_count'] / tbl['aln_count'] * 100)
    # prep label derived from the sample name
    .assign(prep=lambda tbl: tbl['samplename'].map(label_sample))
)

# Fit the color encoder on the prep labels with the Okabe-Ito palette; the
# plotting code later looks colors up through `prep_encoder.encoder`.
prep_encoder = color_encoder()
prep_encoder.fit(df.prep, okabeito_palette())
df.head()

FileNotFoundError: File b'/stor/work/Lambowitz/cdw2854/cell_Free_nucleotides/tgirt_map/picard_qc/clip_table.tsv' does not exist

In [None]:
# Keep only the columns whose names match samplename/prep/base_fraction and
# list the rows ordered by sample name.
df.filter(regex='samplename|prep|base_fraction').sort_values('samplename')

In [None]:
def plot_fraction(df, col, ax):
    """Draw a swarm plot of ``col`` for each ``prep`` group on ``ax``.

    Rows are sorted by ``col`` first, and the x-axis groups follow the order
    in which each prep first appears in that sorted frame. Colors are looked
    up per prep in the module-level ``prep_encoder.encoder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``prep`` column and the column named by ``col``.
    col : str
        Name of the column plotted on the y-axis.
    ax : matplotlib Axes
        Axes to draw on; its x tick labels are rotated by 90 degrees.
    """
    ranked = df.sort_values(col)
    prep_order = ranked.prep.unique()
    prep_colors = [prep_encoder.encoder[prep] for prep in prep_order]
    sns.swarmplot(
        x='prep',
        y=col,
        data=ranked,
        order=prep_order,
        palette=prep_colors,
        ax=ax,
    )
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

In [None]:
# Two side-by-side swarm plots of the clipping percentages per prep.
fig = plt.figure(figsize=(8, 5))

# (column, y-axis label, y-limits or None) for each panel, left to right.
panel_specs = [
    ('base_fraction', '% of soft-clipped bases', None),
    ('clip_aln', '% of alignments\n w/ soft-clipped bases', (0, 100)),
]
for position, (column, y_label, y_limits) in enumerate(panel_specs, start=1):
    ax = fig.add_subplot(1, 2, position)
    plot_fraction(df, column, ax)
    if y_limits is not None:
        ax.set_ylim(*y_limits)
    ax.set_ylabel(y_label)

fig.tight_layout()
sns.despine()

In [None]:
# Load the mapping statistics and derive two rates plus the prep label.
# NOTE: this rebinds `df`, replacing the clipping table loaded earlier.
stat_table = '/stor/work/Lambowitz/cdw2854/cell_Free_nucleotides/tgirt_map/map_stat.csv'
df = (
    pd.read_csv(stat_table)
    # all mapped pairs (HISAT + BOWTIE2 + tRNA/rRNA/YRNA) over trimmed pairs
    .assign(
        total_mapping_rate=lambda stats: (
            stats['HISAT mapped pairs']
            + stats['BOWTIE2 mapped pairs']
            + stats['tRNA/rRNA/YRNA pairs']
        ) / stats['Trimmed pairs']
    )
    # HISAT + BOWTIE2 mapped pairs over the non tRNA/rRNA/YRNA pairs
    .assign(
        mapping_rate=lambda stats: (
            stats['HISAT mapped pairs'] + stats['BOWTIE2 mapped pairs']
        ) / stats['non tRNA/rRNA/YRNA pairs']
    )
    # prep label derived from the sample name
    .assign(label=lambda stats: stats['Sample name'].map(label_sample))
)
df

In [None]:
def plot_mapping(df, variable, ax):
    """Draw a swarm plot of ``variable`` for each ``label`` group on ``ax``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``label`` column and the column named by ``variable``.
    variable : str
        Name of the column plotted on the y-axis.
    ax : matplotlib Axes
        Axes to draw on; its x tick labels are rotated by 90 degrees.
    """
    swarm_options = dict(x='label', y=variable, data=df, ax=ax)
    sns.swarmplot(**swarm_options)
    current_labels = ax.get_xticklabels()
    ax.set_xticklabels(current_labels, rotation=90)



# One panel per rate, side by side: mapping_rate on the left,
# total_mapping_rate on the right.
for position, rate_column in enumerate(('mapping_rate', 'total_mapping_rate'), start=1):
    ax = plt.subplot(1, 2, position)
    plot_mapping(df, rate_column, ax)
plt.tight_layout()

In [None]:
# Compare the total mapping rate of the 'High salt' and 'Unfragmented' preps
# (the only labels matched by the 'High|Unf' pattern).
ax = plt.subplot(111)
subdf = df.pipe(lambda d: d[d['label'].str.contains('High|Unf')])
plot_mapping(subdf, 'total_mapping_rate', ax)

# Rename the ticks from the labels that were actually drawn rather than from
# a fixed list: the group order follows the data, so a hard-coded
# ['Low salt', 'High salt'] would be swapped whenever 'High salt' is drawn
# first. 'Unfragmented' is shown as 'Low salt'; any other text is kept as is.
display_names = {'Unfragmented': 'Low salt', 'High salt': 'High salt'}
ax.set_xticklabels([
    display_names.get(tick.get_text(), tick.get_text())
    for tick in ax.get_xticklabels()
])
ax.set_ylabel('Mapping rate')