Quality Control for Each Sequencing Run

In [185]:
# importing python modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy.api as sc
from anndata import read_h5ad
import anndata
from plotnine import *
from plotnine.data import mtcars

%matplotlib inline

In [228]:
# Load the gene-by-cell table of each sequencing run.
# Every CSV is indexed by its 'gene' column and its final 5 rows are discarded.
# NOTE(review): the trailing rows are presumably non-gene summary rows — confirm.
def _read_run_table(csv_path):
    """Read one run's CSV indexed by 'gene' and drop its last 5 rows."""
    table = pd.read_csv(csv_path, index_col='gene')
    return table.iloc[:-5]


run1 = _read_run_table("190627_A00111_0335_BHLMG5DSXX_corrected.csv")
run2 = _read_run_table("190724_A00111_0345_BHMC5GDSXX_gene_cell_table.csv")

In [229]:
# Dimensions (rows, columns) of each run's table, shown as the cell output.
tuple(table.shape for table in (run1, run2))

((26577, 7278), (26577, 736))

In [230]:
# Put the two runs side by side: run2's columns are appended after run1's,
# with rows matched on the index.
run = pd.concat(objs=(run1, run2), axis='columns', sort=False)

In [231]:
# Display the (rows, columns) dimensions of the combined table.
run.shape

(26577, 8014)

In [232]:
# Build an AnnData object holding only the count matrix X, then flip it so
# that the columns of `run` become observations and its rows become variables.
adata = sc.AnnData(run).transpose()

# making Anndata object with metadata

In [233]:
# Observation names are split on '_': the first piece is stored as the plate,
# the second as the well, and cell_id is rebuilt as "<well>_<plate>".
name_parts = [cell_name.split('_') for cell_name in adata.obs.index]
adata.obs['plate'] = [parts[0] for parts in name_parts]
adata.obs['well'] = [parts[1] for parts in name_parts]
adata.obs['cell_id'] = [(parts[1] + "_" + parts[0]) for parts in name_parts]

# Distinct plates present in this dataset (order is not meaningful).
myList = list(set(adata.obs['plate']))

In [234]:
# Plate-level metadata table; its 'cDNAPlate' column becomes the row index so
# rows can be looked up by plate name.
# NOTE(review): the path is relative, so it resolves against the kernel's
# working directory.
metadata_file_path = 'metadata_TS_Pilot_Plate Info_071019.csv'
meta_data = pd.read_csv(metadata_file_path,index_col='cDNAPlate')

In [235]:
# Attach plate-level metadata ('Tissue' -> obs['tissue'], 'PrimarySort' ->
# obs['PrimarySort']) to every cell.
#
# `meta_data` is indexed by plate. A plate may have two metadata rows; for such
# a plate, wells whose numeric part (the well name without its first character)
# is <= 12 take the first row and all other wells take the second row. A plate
# with a single row uses that row for every well.
#
# The selection is done column-wise instead of testing `len(value) == 2` per
# cell: that test is also true for any 2-character string and raises TypeError
# on missing values, which the KeyError handler would not catch.
try:
    # Raises KeyError when a plate (or one of the columns) is missing.
    plate_meta = meta_data.loc[myList, ['Tissue', 'PrimarySort']]
except KeyError:
    print('sorry, no metadata available for run ')
else:
    # One row per plate: the first and the last metadata row of each plate.
    # For single-row plates both selections are the same row.
    # NOTE(review): plates with more than two rows are not expected; only
    # their first and last rows would be used.
    first_rows = plate_meta[~plate_meta.index.duplicated(keep='first')]
    last_rows = plate_meta[~plate_meta.index.duplicated(keep='last')]

    plates = adata.obs['plate'].astype(str)
    well_number = pd.to_numeric(
        adata.obs['well'].astype(str).str[1:], errors='coerce')

    # A well that cannot be parsed only matters on plates with several rows,
    # where it decides which row applies; fail loudly rather than guess.
    multi_row_plates = plate_meta.index[plate_meta.index.duplicated()].unique()
    unparsable = plates.isin(multi_row_plates) & well_number.isna()
    if unparsable.any():
        raise ValueError(
            'cannot parse well number for cells: '
            + ', '.join(map(str, adata.obs.index[unparsable.values][:5])))

    use_first_row = (well_number <= 12).values
    for meta_column, obs_column in (('Tissue', 'tissue'),
                                    ('PrimarySort', 'PrimarySort')):
        from_first = plates.map(first_rows[meta_column])
        from_last = plates.map(last_rows[meta_column])
        # Positional selection avoids any index alignment on cell names.
        adata.obs[obs_column] = np.where(
            use_first_row, from_first.values, from_last.values)

    print('import complete for run ')

import complete for run 


In [236]:
# Calculating number of reads and number of genes for each cell.
# The thresholds are 0, so these calls serve to record the per-cell totals
# that later cells read from obs['n_counts'] and obs['n_genes'].
for annotation in ({'min_counts': 0}, {'min_genes': 0}):
    sc.pp.filter_cells(adata, **annotation)

In [237]:
# Calculating Percent ERCC
# NOTE: despite its name, obs['percent_ERCC'] holds a fraction (counts in
# genes whose name starts with 'ERCC-' divided by the cell's total counts);
# it is not multiplied by 100.
ERCC_genes = adata.var_names.str.startswith('ERCC-')

# np.asarray(...).ravel() yields a flat per-cell vector whether X is a dense
# array or a sparse matrix (whose row sums come back two-dimensional).
ercc_counts = np.asarray(adata[:, ERCC_genes].X.sum(axis=1)).ravel()
total_counts = np.asarray(adata.X.sum(axis=1)).ravel()

# Cells with zero total counts produce 0/0. The resulting NaN is kept, but
# the RuntimeWarning is silenced so it does not clutter the cell output.
with np.errstate(divide='ignore', invalid='ignore'):
    adata.obs['percent_ERCC'] = ercc_counts / total_counts

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [238]:
# Keep only cells with at least 500 detected genes and at least 5000 reads.
passes_qc = (adata.obs['n_genes'] >= 500) & (adata.obs['n_counts'] >= 5000)
filtered = adata[passes_qc, :]

In [239]:
# Fraction of each plate's cells that passed the QC filter.
#
# Produces one row per plate that still has cells in `filtered`, with columns:
#   plate    - plate name
#   fraction - passed / total cells for that plate, rounded to 2 decimals and
#              kept numeric so it can be drawn on a continuous axis
#   tissue   - tissue label of the plate's passing cells; when a plate has
#              several, the alphabetically first one is used so the choice is
#              reproducible between runs
cells_per_plate = adata.obs['plate'].astype(str).value_counts()
passed_obs = filtered.obs

fraction_counting = []
for plate, plate_obs in passed_obs.groupby(passed_obs['plate'].astype(str)):
    fraction = round(len(plate_obs) / cells_per_plate[plate], 2)
    tissue = sorted(str(label) for label in plate_obs['tissue'].unique())[0]
    fraction_counting.append((plate, fraction, tissue))

fraction_df = pd.DataFrame(fraction_counting, columns =['plate', 'fraction','tissue'])

In [240]:
# Making all QC plots

### Number of Reads versus Number of Genes
def _reads_vs_genes_scatter(obs, title):
    """Scatter of n_counts (log10 y axis) against n_genes, colored by plate.

    A horizontal line at 5000 and a vertical line at 500 are drawn on top.
    """
    return (
        ggplot(obs, aes(x='n_genes', y='n_counts', color='plate'))
        + geom_point()
        + scale_y_log10()
        + geom_hline(yintercept=5000)
        + geom_vline(xintercept=500)
        + theme(axis_text_x=element_text(rotation=90, hjust=1))
        + labs(title=title, x="Number of Genes", y="Number of Reads")
    )


g1_unfiltered = _reads_vs_genes_scatter(
    adata.obs, 'Number of Reads versus Number of Genes (Unfiltered)')
g1_filtered = _reads_vs_genes_scatter(
    filtered.obs, 'Number of Reads versus Number of Genes (Filtered)')

### Percent ERCCs versus Number of Genes
def _ercc_vs_genes_scatter(obs, title):
    """Scatter of percent_ERCC (log10 y axis) against n_genes, colored by plate."""
    return (
        ggplot(obs, aes(x='n_genes', y='percent_ERCC', color='plate'))
        + geom_point()
        + scale_y_log10()
        + theme(axis_text_x=element_text(rotation=90, hjust=1))
        + labs(title=title, x="Number of Genes", y="Percent ERCCs")
    )


g2_unfiltered = _ercc_vs_genes_scatter(
    adata.obs, 'Percent ERCCs versus Number of Genes (Unfiltered)')
g2_filtered = _ercc_vs_genes_scatter(
    filtered.obs, 'Percent ERCCs versus Number of Genes (Filtered)')

### Number of Reads for Each cDNA Plate
# Same box plot recipe (n_counts per plate, log10 y axis) applied to the
# unfiltered and the filtered observations.
g3_unfiltered, g3_filtered = [
    ggplot(obs, aes(x='plate', y='n_counts', fill='plate'))
    + geom_boxplot()
    + scale_y_log10()
    + theme(axis_text_x=element_text(rotation=90, hjust=1))
    + labs(title=title, x="cDNA plate", y="Number of Reads")
    for obs, title in (
        (adata.obs, "Number of Reads for Each cDNA Plate (Unfiltered)"),
        (filtered.obs, "Number of Reads for Each cDNA Plate (Filtered)"),
    )
]

### Number of Genes for Each cDNA Plate
# Same box plot recipe (n_genes per plate, log10 y axis) applied to the
# unfiltered and the filtered observations.
g4_unfiltered, g4_filtered = [
    ggplot(obs, aes(x='plate', y='n_genes', fill='plate'))
    + geom_boxplot()
    + scale_y_log10()
    + theme(axis_text_x=element_text(rotation=90, hjust=1))
    + labs(title=title, x="cDNA plate", y="Number of Genes")
    for obs, title in (
        (adata.obs, "Number of Genes for Each cDNA Plate (Unfiltered)"),
        (filtered.obs, "Number of Genes for Each cDNA Plate (Filtered)"),
    )
]

### Percent ERCCs for Each cDNA Plate
# Box plot of percent_ERCC per plate (linear y axis), built once for the
# unfiltered and once for the filtered observations.
g5_unfiltered, g5_filtered = [
    ggplot(obs, aes(x='plate', y='percent_ERCC', fill='plate'))
    + geom_boxplot()
    + theme(axis_text_x=element_text(rotation=90, hjust=1))
    + labs(title=title, x="cDNA plate", y="Percent ERCCs")
    for obs, title in (
        (adata.obs, "Percent ERCCs for Each cDNA Plate (Unfiltered)"),
        (filtered.obs, "Percent ERCCs for Each cDNA Plate (Filtered)"),
    )
]

### Number of Cells for Each cDNA Plate
def _cells_per_plate_bars(obs, title):
    """Bar chart of the number of cells per plate, with the count written above each bar."""
    count_labels = geom_text(
        aes(label='stat(count)'),
        stat='count',
        nudge_y=0.125,
        va='bottom',
        size=6,
    )
    return (
        ggplot(obs, aes('plate', fill='plate'))
        + geom_bar()
        + count_labels
        + theme(axis_text_x=element_text(rotation=90, hjust=1))
        + labs(title=title, x="cDNA plate")
    )


g6_unfiltered = _cells_per_plate_bars(
    adata.obs, "Number of Cells for Each cDNA Plate (Unfiltered)")
g6_filtered = _cells_per_plate_bars(
    filtered.obs, "Number of Cells for Each cDNA Plate (Filtered)")

### Number of Cells for Each Tissue
def _cells_per_tissue_bars(obs, title):
    """Dodged bar chart of cell counts per tissue, one bar per plate, with count labels."""
    count_labels = geom_text(
        aes(label='stat(count)'),
        stat='count',
        va='bottom',
        size=6,
        position=position_dodge(0.9),
    )
    return (
        ggplot(obs, aes('tissue', fill='plate'))
        + geom_bar(position=position_dodge())
        + count_labels
        + theme(axis_text_x=element_text(rotation=90, hjust=1))
        + labs(title=title, x="Tissue")
    )


g7_unfiltered = _cells_per_tissue_bars(
    adata.obs, "Number of Cells for Each Tissue (Unfiltered)")
g7_filtered = _cells_per_tissue_bars(
    filtered.obs, "Number of Cells for Each Tissue (Filtered)")

### Fraction of Cells
# Dodged bars of fraction_df['fraction'] per tissue (one bar per plate), each
# labelled with its own fraction value.
g8 = (
    ggplot(fraction_df, aes(x='tissue', y='fraction', fill='plate'))
    + geom_bar(stat="identity", position=position_dodge())
    + geom_text(
        aes(label='fraction'),
        va='bottom',
        size=6,
        position=position_dodge(0.9),
    )
    + theme(axis_text_x=element_text(rotation=90, hjust=1))
    + labs(
        title="Fraction of Cells Passed QC in Each Tissue for Each Plate",
        x="Tissue",
    )
)

In [241]:
# Writing all plots into a PDF file.
# Page order: each unfiltered plot is followed by its filtered counterpart,
# and the pass-fraction plot comes last.
pdf_file = 'sapien.pdf'

plot_pairs = [
    (g1_unfiltered, g1_filtered),
    (g2_unfiltered, g2_filtered),
    (g3_unfiltered, g3_filtered),
    (g4_unfiltered, g4_filtered),
    (g5_unfiltered, g5_filtered),
    (g6_unfiltered, g6_filtered),
    (g7_unfiltered, g7_filtered),
]
plots = [plot for pair in plot_pairs for plot in pair] + [g8]

save_as_pdf_pages(plots, pdf_file)

  warn('Filename: {}'.format(filename))
  return self.trans.transform(x)
  return self.trans.transform(x)
  self.data = self.geom.handle_na(self.data)
  return self.trans.transform(x)
  return self.trans.transform(x)
  data = self.stat.compute_layer(data, params, layout)
  return self.trans.transform(x)
  data = self.stat.compute_layer(data, params, layout)
  data = self.stat.compute_layer(data, params, layout)
