Quality Control for Each Sequencing Run

In [1]:
# importing python modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy.api as sc
from MulticoreTSNE import MulticoreTSNE as TSNE #faster TSNE alternative
from anndata import read_h5ad
import anndata
from plotnine import *
from plotnine.data import mtcars

%matplotlib inline

In [2]:
# Load the per-run gene-count table (genes as rows, one column per cell).

## change your input file in gc_file_path
gc_file_path = "190116_A00111_0256_BHH237DSXX_mm10.csv"

### Read with the 'gene' column as the row index and discard the final
### five rows in the same step.
# NOTE(review): the last five rows are presumably non-gene summary lines
# appended by the counting tool -- confirm against the CSV.
maca_run1 = pd.read_csv(gc_file_path, index_col='gene').iloc[:-5]

In [3]:
# Build the AnnData object from the count matrix alone (no annotations yet),
# then flip it: maca_run1 is genes x cells, the rest of the notebook works on
# cells (obs) x genes (var).
adata = sc.AnnData(maca_run1).transpose()

In [5]:
# Build the per-cell metadata table that is later attached to the AnnData
# object as `adata.obs`.
#
# Cell ids (the column names of maca_run1) are '_'-separated; the first three
# fields are read as well, plate and library. The plate field is the key into
# the processing-metadata sheet, which supplies mouse_id / tissue / subtissue
# / FACS for every cell on that plate.
#
# DataFrame.set_value / get_value were removed from pandas, so the table is
# assembled column-wise instead of one scalar at a time.
total_cells = list(maca_run1.columns)

## change your input metadata file in ori_metadata_file_path
ori_metadata_file_path = 'MACA_Processing_Metadata - MACA_processing_metadata.csv'

ori_metadata = pd.read_csv(ori_metadata_file_path,index_col='plate.id')

## writing new metadata
# Split every cell id once; an id with fewer than three fields raises
# IndexError here, as it did in the original per-cell loop.
id_fields = [cell.split('_') for cell in total_cells]
meta_data = pd.DataFrame(
    {
        'well': [fields[0] for fields in id_fields],
        'plate': [fields[1] for fields in id_fields],
        'lib': [fields[2] for fields in id_fields],
    },
    index=pd.Index(total_cells, name='cell_id'),
)

# Fail loudly (KeyError, like the scalar lookup used to) when a plate from the
# count table has no row in the metadata sheet.
unknown_plates = sorted(set(meta_data['plate']) - set(ori_metadata.index))
if unknown_plates:
    raise KeyError('plate ids missing from {}: {}'.format(ori_metadata_file_path, unknown_plates))

# One metadata row per cell, in the same order as total_cells.
plate_info = ori_metadata.loc[list(meta_data['plate']), ['mouse_id', 'tissue', 'subtissue', 'FACS']]
if len(plate_info) != len(meta_data):
    # Only possible when a plate id used by this run appears more than once
    # in the metadata sheet, which would make the lookup ambiguous.
    raise ValueError('duplicated plate ids in {}'.format(ori_metadata_file_path))

# .values drops the plate index so the columns are assigned positionally.
meta_data['mouse_id'] = plate_info['mouse_id'].values
meta_data['tissue'] = plate_info['tissue'].values
meta_data['subtissue'] = plate_info['subtissue'].values
meta_data['FACS.selection'] = plate_info['FACS'].values
# Placeholder; filled in once per-cell read totals are available.
meta_data['%ERCC'] = np.nan

# Brain plates are relabelled by their FACS gate: 'Neurons' plates become
# 'Brain_Non-Myeloid' and 'Microglia' plates become 'Brain_Myeloid'. Any other
# gate keeps the plain 'Brain' label.
is_brain = plate_info['tissue'].astype(str).values == 'Brain'
facs_gate = plate_info['FACS'].astype(str).values
meta_data.loc[is_brain & (facs_gate == 'Neurons'), 'tissue'] = 'Brain_Non-Myeloid'
meta_data.loc[is_brain & (facs_gate == 'Microglia'), 'tissue'] = 'Brain_Myeloid'

  
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [6]:
# Attach the new metadata to the AnnData object as its observations (per-cell)
# dataframe. meta_data is indexed by cell id, one row per column of maca_run1.
adata.obs = meta_data

In [7]:
# Calculating number of reads and number of genes for each cell
# Both thresholds are 0, so these calls are used for their side effect rather
# than for filtering: later cells read the per-cell 'n_counts' and 'n_genes'
# columns from adata.obs.
# NOTE(review): presumably no cell is dropped at a threshold of 0 -- confirm.
sc.pp.filter_cells(adata, min_counts=0)
sc.pp.filter_cells(adata, min_genes=0)

In [8]:
# Calculating the ERCC share for each cell (total reads of ERCCs / total reads).
#
# The previous version looked up every (gene, cell) pair with
# DataFrame.get_value, which has been removed from pandas, and re-scanned all
# gene names once per cell. Summing the ERCC rows column-wise gives the same
# per-cell totals in a single pass.

# Spike-in rows are identified by the 'ERCC-' prefix of the gene name.
is_ercc_gene = maca_run1.index.astype(str).str.startswith('ERCC-')

# Per-cell ERCC read total, put in the same cell order as adata.obs. With no
# ERCC rows at all the sum is 0 for every cell.
ercc_reads = maca_run1.loc[is_ercc_gene].sum(axis=0).reindex(adata.obs.index)

# Stored as a fraction of total reads (it is not multiplied by 100 despite the
# column name); cast to float so the plotting code gets a numeric column.
adata.obs['%ERCC'] = (ercc_reads / adata.obs['n_counts']).astype(float)

# Keep meta_data pointing at the current observations table.
meta_data = adata.obs

  
  import sys


In [9]:
# Keep only cells with at least 500 detected genes AND at least 5000 reads;
# everything below either cutoff is dropped from `filtered`.
passes_qc = (adata.obs['n_genes'] >= 500) & (adata.obs['n_counts'] >= 5000)
filtered = adata[passes_qc, :]

In [10]:
# Making all QC plots
#
# Each plot exists in two flavours built from the same recipe: "Unfiltered"
# uses every cell (adata.obs) and "Filtered" uses only the cells kept in
# `filtered`. The helpers below hold the recipes; the g*_ variables are the
# finished plot objects.

def _rotated_x_labels():
    """Theme tweak shared by every plot: x tick labels rotated by 90 degrees."""
    return theme(axis_text_x=element_text(rotation=90, hjust=1))


def _versus_genes_scatter(obs, y_column, y_label, tag, show_cutoffs):
    """Scatter of `y_column` (log10 y axis) against n_genes, coloured by plate.

    With `show_cutoffs` the 5000-read and 500-gene filter thresholds are drawn
    as a horizontal and a vertical line.
    """
    plot = ggplot(obs, aes(x='n_genes', y=y_column, color='plate')) + geom_point() + scale_y_log10()
    if show_cutoffs:
        plot = plot + geom_hline(yintercept = 5000) + geom_vline(xintercept = 500)
    title = '{} versus Number of Genes ({})'.format(y_label, tag)
    return plot + _rotated_x_labels() + labs(title=title, x="Number of Genes", y=y_label)


def _per_plate_boxplot(obs, y_column, y_label, tag, log_y):
    """Boxplot of `y_column` for each cDNA plate, optionally on a log10 y axis."""
    plot = ggplot(obs, aes(x = 'plate', y = y_column, fill='plate')) + geom_boxplot()
    if log_y:
        plot = plot + scale_y_log10()
    title = "{} for Each cDNA Plate ({})".format(y_label, tag)
    return plot + _rotated_x_labels() + labs(title=title, x="cDNA plate", y=y_label)


def _cell_count_bars(obs, x_column, group_name, x_label, tag, show_counts):
    """Bar chart of the number of cells per `x_column` value, filled by plate.

    With `show_counts` the bar height is printed just above each bar.
    """
    plot = ggplot(obs, aes(x_column, fill='plate')) + geom_bar()
    if show_counts:
        plot = plot + geom_text(aes(label='stat(count)'),stat='count',nudge_y=0.125,va='bottom')
    title = "Number of Cells for Each {} ({})".format(group_name, tag)
    return plot + _rotated_x_labels() + labs(title=title, x=x_label)


### Number of Reads versus Number of Genes
g1_unfiltered = _versus_genes_scatter(adata.obs, 'n_counts', 'Number of Reads', 'Unfiltered', show_cutoffs=True)
g1_filtered = _versus_genes_scatter(filtered.obs, 'n_counts', 'Number of Reads', 'Filtered', show_cutoffs=True)

### Percent ERCCs versus Number of Genes
g2_unfiltered = _versus_genes_scatter(adata.obs, '%ERCC', 'Percent ERCCs', 'Unfiltered', show_cutoffs=False)
g2_filtered = _versus_genes_scatter(filtered.obs, '%ERCC', 'Percent ERCCs', 'Filtered', show_cutoffs=False)

### Number of Reads for Each cDNA Plate
g3_unfiltered = _per_plate_boxplot(adata.obs, 'n_counts', 'Number of Reads', 'Unfiltered', log_y=True)
g3_filtered = _per_plate_boxplot(filtered.obs, 'n_counts', 'Number of Reads', 'Filtered', log_y=True)

### Number of Genes for Each cDNA Plate
g4_unfiltered = _per_plate_boxplot(adata.obs, 'n_genes', 'Number of Genes', 'Unfiltered', log_y=True)
g4_filtered = _per_plate_boxplot(filtered.obs, 'n_genes', 'Number of Genes', 'Filtered', log_y=True)

### Percent ERCCs for Each cDNA Plate (linear y axis)
g5_unfiltered = _per_plate_boxplot(adata.obs, '%ERCC', 'Percent ERCCs', 'Unfiltered', log_y=False)
g5_filtered = _per_plate_boxplot(filtered.obs, '%ERCC', 'Percent ERCCs', 'Filtered', log_y=False)

### Number of Cells for Each cDNA Plate (count printed above every bar)
g6_unfiltered = _cell_count_bars(adata.obs, 'plate', 'cDNA Plate', 'cDNA plate', 'Unfiltered', show_counts=True)
g6_filtered = _cell_count_bars(filtered.obs, 'plate', 'cDNA Plate', 'cDNA plate', 'Filtered', show_counts=True)

### Number of Cells for Each Tissue (bars stacked by plate)
g7_unfiltered = _cell_count_bars(adata.obs, 'tissue', 'Tissue', 'Tissue', 'Unfiltered', show_counts=False)
g7_filtered = _cell_count_bars(filtered.obs, 'tissue', 'Tissue', 'Tissue', 'Filtered', show_counts=False)


The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if cbook.iterable(self.breaks) and cbook.iterable(self.labels):
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if cbook.iterable(val) and not is_string(val):
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if cbook.iterable(val) and not is_string(val):
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if cbook.iterable(self.breaks) and cbook.iterable(self.labels):
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if cbook.iterable(val) and not is_string(val):
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if cbook.iterable(val) and not is_string(val):
The iterable function wa

In [12]:
# Collect every QC plot into a single multi-page PDF, one plot per page.
# Page order: unfiltered then filtered version of each plot, g1 through g7.
plots = [
    g1_unfiltered, g1_filtered,
    g2_unfiltered, g2_filtered,
    g3_unfiltered, g3_filtered,
    g4_unfiltered, g4_filtered,
    g5_unfiltered, g5_filtered,
    g6_unfiltered, g6_filtered,
    g7_unfiltered, g7_filtered,
]

# Output file name; written relative to the notebook's working directory.
pdf_file = '18M_Run1_QC.pdf'

save_as_pdf_pages(plots, pdf_file)

  warn('Filename: {}'.format(filename))
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if cbook.iterable(self.breaks) and cbook.iterable(self.labels):
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if cbook.iterable(self.breaks) and cbook.iterable(self.labels):
  return self.trans.transform(x)
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  return cbook.iterable(var) and not is_string(var)
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  return cbook.iterable(var) and not is_string(var)
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  return cbook.iterable(var) and not is_string(var)
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable 