In this notebook, we will assemble a list of contaminants from ~200 negative control samples from instruments and reagents used during tissue transport, digestion, and sequencing. These are from 5 10X lanes devoted to deeply sequencing negative controls.

### Libraries

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Resolution for on-screen and saved figures, plus font type 42 for PDF/PS
# export (per the matplotlib docs, type 42 embeds TrueType fonts).
plt.rcParams.update({
    'figure.dpi': 150,
    'savefig.dpi': 200,
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Compact figure defaults: every text element uses the 'small' size and the
# default canvas is 3 x 2.
params = dict.fromkeys(
    ['legend.fontsize',
     'axes.labelsize',
     'axes.titlesize',
     'xtick.labelsize',
     'ytick.labelsize'],
    'small',
)
params['figure.figsize'] = (3, 2)
plt.rcParams.update(params)
import os
import glob
import re
import itertools
import collections
from collections import Counter
import math
import random
from random import randrange
import string
import subprocess
import numpy as np
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import seaborn as sns
import phylopandas as ph
cmap = sns.cm.rocket_r
sns.set_style("white")
import anndata
from anndata import read_h5ad
from anndata import AnnData
import phylopandas as ph
import wget


### Directories

In [2]:
# Shared root of the post-review benchmarking data; the bacterial test
# folders are nested beneath it.
mainDir2 = '/oak/stanford/groups/quake/gita/raw/tab3-14_20210420/benchmarking_post_reviews/'
mainDir = mainDir2 + 'bacterial_test/'
mainDir1 = mainDir + 'micoNT_blastn/'

# Folder containing the NCBI lineage table read below.
dbDir = '/oak/stanford/groups/quake/gita/raw/database/taxonomyNCBI20200125/'

# Folder that receives the CSV tables written at the end of the notebook.
outputdir = '/oak/stanford/groups/quake/gita/raw/nb/microbe/paper/forGitHub/human_tissue_microbiome_atlas/post_review/tables/'

### NCBI taxonomy

In [3]:
# Load the NCBI lineage table.
tax = pd.read_csv(dbDir + 'ncbi_lineages_2021-01-26.csv')

# Keep the identifier and the rank columns listed here, and expose the
# identifier under the name `taxid`.
lineage_cols = ['tax_id', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
tax_short = tax.loc[:, lineage_cols].rename(columns={'tax_id': 'taxid'})


  interactivity=interactivity, compiler=compiler, result=result)


### Negative control data 
We will create a contamination dataframe from the negative-control BLAST results.

In [4]:
# Root folder of the negative-control results.
control = '/oak/stanford/groups/quake/gita/raw/tab3-14_20210420/benchmarking_post_reviews/controls/'

# Viral hits: every deduplicated hit table matched by the glob.
# sorted() makes the row order reproducible (glob order is filesystem-dependent).
vir_pattern = control + 'virNTblastn/*_deduplicated.csv'
vir_files = sorted(glob.glob(vir_pattern))
if not vir_files:
    # Fail with a clear message instead of a KeyError on 'staxids' further down.
    raise FileNotFoundError(f'no negative-control tables match {vir_pattern}')

frames = []
for path in vir_files:
    # "<...>scmicrobecontamination_<name>_deduplicated.csv" -> "<name>";
    # the batch label is the part of <name> before the first underscore.
    filename = path.split('/')[-1].split('_deduplicated.csv')[0].split('scmicrobecontamination_')[1]
    batch = filename.split('_')[0]
    # Read staxids as text: if it were inferred per file, a single missing value
    # would turn the concatenated column into floats ("9606.0") and the taxonomy
    # join below would silently find no matches.
    bl = pd.read_csv(path, dtype={'staxids': str})
    bl['filename'] = filename
    bl['batch'] = batch
    frames.append(bl)

# Concatenate once; growing a DataFrame inside the loop re-copies all rows each time.
results = pd.concat(frames)

# Sometimes two taxids are provided (";"-separated); keep only the first one.
results['staxids'] = results['staxids'].astype(str).str.split(';').str[0]

# String-keyed copy of the lineage table so it can be joined on `staxids`.
tax_shortcopy = tax_short.rename(columns={"taxid": "staxids"})
tax_shortcopy['staxids'] = tax_shortcopy['staxids'].astype('str')

# Left join: hits without a lineage entry are kept, with empty rank columns.
vir = results.merge(tax_shortcopy, on='staxids', how='left')


In [5]:
# Bacterial hits: every deduplicated hit table matched by the glob.
# NOTE(review): these are the same loading steps as the viral and fungal cells;
# a shared helper function would remove the repetition.
# sorted() makes the row order reproducible (glob order is filesystem-dependent).
bac_pattern = control + 'micoNT_blastn/*_deduplicated.csv'
bac_files = sorted(glob.glob(bac_pattern))
if not bac_files:
    # Fail with a clear message instead of a KeyError on 'staxids' further down.
    raise FileNotFoundError(f'no negative-control tables match {bac_pattern}')

frames = []
for path in bac_files:
    # "<...>scmicrobecontamination_<name>_deduplicated.csv" -> "<name>";
    # the batch label is the part of <name> before the first underscore.
    filename = path.split('/')[-1].split('_deduplicated.csv')[0].split('scmicrobecontamination_')[1]
    batch = filename.split('_')[0]
    # Read staxids as text: if it were inferred per file, a single missing value
    # would turn the concatenated column into floats ("9606.0") and the taxonomy
    # join below would silently find no matches.
    bl = pd.read_csv(path, dtype={'staxids': str})
    bl['filename'] = filename
    bl['batch'] = batch
    frames.append(bl)

# Concatenate once; growing a DataFrame inside the loop re-copies all rows each time.
results = pd.concat(frames)

# Sometimes two taxids are provided (";"-separated); keep only the first one.
results['staxids'] = results['staxids'].astype(str).str.split(';').str[0]

# Left join: hits without a lineage entry are kept, with empty rank columns.
bac = results.merge(tax_shortcopy, on='staxids', how='left')


In [6]:
# Fungal hits: every deduplicated hit table matched by the glob.
# NOTE(review): these are the same loading steps as the viral and bacterial cells;
# a shared helper function would remove the repetition.
# sorted() makes the row order reproducible (glob order is filesystem-dependent).
fungi_pattern = control + 'fungi_NT_blastn/*_deduplicated.csv'
fungi_files = sorted(glob.glob(fungi_pattern))
if not fungi_files:
    # Fail with a clear message instead of a KeyError on 'staxids' further down.
    raise FileNotFoundError(f'no negative-control tables match {fungi_pattern}')

frames = []
for path in fungi_files:
    # "<...>scmicrobecontamination_<name>_deduplicated.csv" -> "<name>";
    # the batch label is the part of <name> before the first underscore.
    filename = path.split('/')[-1].split('_deduplicated.csv')[0].split('scmicrobecontamination_')[1]
    batch = filename.split('_')[0]
    # Read staxids as text: if it were inferred per file, a single missing value
    # would turn the concatenated column into floats ("9606.0") and the taxonomy
    # join below would silently find no matches.
    bl = pd.read_csv(path, dtype={'staxids': str})
    bl['filename'] = filename
    bl['batch'] = batch
    frames.append(bl)

# Concatenate once; growing a DataFrame inside the loop re-copies all rows each time.
results = pd.concat(frames)

# Sometimes two taxids are provided (";"-separated); keep only the first one.
results['staxids'] = results['staxids'].astype(str).str.split(';').str[0]

# Left join: hits without a lineage entry are kept, with empty rank columns.
fungi = results.merge(tax_shortcopy, on='staxids', how='left')


In [7]:
# Stack the viral, bacterial and fungal hit tables row-wise into one frame
# (defaults spelled out; each table keeps its own index labels).
contam = pd.concat(objs=[vir, bac, fungi], axis=0, join='outer')

Information about each batch: \
**batch 1:** Bladder, Salivary gland, Trachea and Blood \
**batch 2:** Fat, Uterus, Skin, Spleen and Lymph node \
**batch 3:** Vasculature, Bone Marrow, Heart, Intestine and Liver \
**batch 4:** Kidney, Thymus, Lung, Exocrine Pancreas and Bulk reagents \
**batch 5:** Eye, muscle and UW 

Next, we save the controls dataframe.

In [9]:
# Reduce the possibility of false positives by keeping only high-quality
# alignments: length >= 90 and pident >= 90.
high_quality = (contam['length'] >= 90) & (contam['pident'] >= 90)
contam_fil = contam[high_quality]

# Write the filtered table first, then the unfiltered one, both without the index.
contam_fil.to_csv(outputdir + 'negative_control.csv', index=False)
contam.to_csv(outputdir + 'negative_control_raw.csv', index=False)