In [None]:
import os
import pandas as pd
import numpy as np 

from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from scipy.stats import norm

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, Normalize,LinearSegmentedColormap, ListedColormap
from matplotlib.patches import Patch
import matplotlib.cm as cm

import math
import re
from multiprocessing import Pool

from IPython.display import display

In [None]:
# Input/output locations, all expressed relative to the working directory.
data_dir=os.path.join("..","data")
analysis_dir=os.path.join(data_dir,'qc_plots')
# exist_ok=True creates the plot directory only when it is missing, without a
# separate exists() check that could race with another process.
os.makedirs(analysis_dir,exist_ok=True)
results_dir=os.path.join(data_dir,"pipeline_run","results")
metadata_file= os.path.join(data_dir,"complete_metadata.csv")
# Name of the boolean column that records whether a sample passed QC.
pass_qc_field='QC:PASS'

In [None]:
# Column to discard on load; named here so the drop below does not rely on a
# variable that is never defined in this notebook.
quality_field='QC:QUALITY'

# Load the sample metadata table, indexed by biosample identifier.
metadata=pd.read_csv(metadata_file,index_col='DEMUX:BIOSAMPLE')
if quality_field in metadata.columns:
    metadata = metadata.drop(quality_field,axis=1)

In [None]:
# Show the column index of the loaded metadata table.
metadata.columns

## QC

In [None]:
# Remove any pass/fail column already stored in the table; it is recomputed
# from the thresholds further down.
has_previous_qc_flag = pass_qc_field in metadata.columns
if has_previous_qc_flag:
    metadata = metadata.drop(columns=pass_qc_field)

In [None]:
# Minimum accepted value per QC metric; filled in by a later cell.
thresholds = {}

# Metrics shown in the QC plots.
qc_vars = [
    'RUN:FRIP',
    'RUN:ORACLE_FRIP',
    'QC:PROMOTER_FRIP',
    'RUN:TSS_ENRICHMENT',
    'RUN:FILTERED_PEAKS',
]

# Column holding the total number of pass-filter reads.
tot_reads_columns = 'RUN:FASTQC_TOTAL_PASS_FILTER_READS'
# Column holding the cell type / tissue of each sample.
cell_type_field = 'SAMPLE:TISSUE'

# Optional read-count threshold, currently disabled:
# thresholds[tot_reads_columns]=1E5

In [None]:
# Samples scoring below either of these minimums are flagged as failing QC.
thresholds.update({
    'RUN:TSS_ENRICHMENT': 4.5,
    'RUN:FILTERED_PEAKS': 5000,
})

In [None]:
# Every sample starts as passing; it is switched to failing as soon as one
# thresholded metric falls below its minimum.
# NOTE: a missing (NaN) metric compares False against the threshold, so such
# samples are not failed by this step.
metadata[pass_qc_field] = True
for qc_metric, minimum in thresholds.items():
    below_minimum = metadata[qc_metric] < minimum
    metadata.loc[below_minimum, pass_qc_field] = False

In [None]:
# List every sample currently flagged as failing, without row/column truncation.
failed_samples = metadata[~metadata[pass_qc_field]]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(failed_samples)

In [None]:
# Neutrophil samples are marked as failing regardless of their metrics.
# The tissue column is looked up through cell_type_field, as elsewhere in this
# notebook, instead of repeating the column name.
metadata.loc[metadata[cell_type_field]=='neutrophil',pass_qc_field]=False

In [None]:
# Scatter every QC metric against the filtered-peak count, one panel per
# metric, coloured by cell type; samples failing QC are drawn as 'X'.
x='RUN:FILTERED_PEAKS'
hue=cell_type_field

# Panels are derived from the metrics that are actually plotted, so the axis
# index stays in range wherever (or whether) `x` appears in qc_vars.
y_vars = [qc_var for qc_var in qc_vars if qc_var != x]

n_rows = 1
n_cols = len(y_vars)

# squeeze=False always yields a 2-D array of axes, even for a single panel.
fig, ax = plt.subplots(n_rows,n_cols,figsize=(n_cols*7,n_rows*10),squeeze=False)
for i, y in enumerate(y_vars):
    # x/y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
    current_ax = sns.scatterplot(
        x=x,
        y=y,
        hue=hue,
        data=metadata[metadata[x]!=0],  # rows with a zero x value are not drawn
        ax=ax[0][i],
        alpha=0.75,
        style=pass_qc_field,
        markers={True:'o',False:'X'}
    )

    # The total-read-count column is shown on a log axis.
    if (x==tot_reads_columns):
        current_ax.set(xscale='log')
    if (y==tot_reads_columns):
        current_ax.set(yscale='log')

    # Dashed guide lines mark the QC thresholds that apply to either axis.
    if x in thresholds.keys():
        _ = current_ax.axvline(thresholds[x],color='k',linestyle='--')

    if y in thresholds.keys():
        _ = current_ax.axhline(thresholds[y],color='k',linestyle='--')

fig.savefig(os.path.join(analysis_dir,"QC_plots.svg"))
        

In [None]:
# Variables for the per-cell-type distributions: read count and unique
# alignment percentage first, followed by every QC metric.
all_vars = [tot_reads_columns, 'RUN:UNIQUE_ALIGNED_PERC', *qc_vars]

In [None]:
# Violin plots of each variable per visit: one row per cell type, one column
# per variable, restricted to samples that passed QC.
cell_types = ['PBMC','cd8t','monocyte','nkcell']

# The passing samples of a cell type do not depend on the plotted variable,
# so each subset is selected once up front.
passing_by_cell_type = {
    cell_type: metadata[(metadata[cell_type_field]==cell_type) & metadata[pass_qc_field]]
    for cell_type in cell_types
}

# NOTE(review): width is scaled by the number of rows and height by the number
# of columns; confirm this figure size is the intended one.
fig, ax = plt.subplots(
    len(cell_types),
    len(all_vars),
    figsize=(len(cell_types)*7.5,len(all_vars)*3.5),
    sharey='col',
)
for col, variable in enumerate(all_vars):
    for row, cell_type in enumerate(cell_types):
        axes = sns.violinplot(
            x='SAMPLE:VISIT',
            y=variable,
            data=passing_by_cell_type[cell_type],
            cut = 0,
            ax=ax[row][col])

        # Row labels (cell type) only on the first column, column titles
        # (variable name) only on the first row.
        axes.set_ylabel(cell_type if col==0 else '')
        if row==0:
            axes.set_title(variable)

        # Dashed line at the QC threshold, where one is defined.
        if variable in thresholds:
            _ = axes.axhline(thresholds[variable],color='k',linestyle='--')

fig.savefig(os.path.join(analysis_dir,"QC_distributions.svg"))


In [None]:
# Count samples for each combination of cell type, visit and QC outcome.
metadata.groupby([cell_type_field,'SAMPLE:VISIT',pass_qc_field]).size()

In [None]:
# Metrics of the samples that failed QC, leaving out neutrophils (those are
# failed unconditionally rather than on their metrics).
failed_mask = ~metadata[pass_qc_field]
not_neutrophil = metadata[cell_type_field] != 'neutrophil'
fail_qc = metadata.loc[failed_mask & not_neutrophil, all_vars]

In [None]:
# Display the metrics of the non-neutrophil samples that failed QC.
fail_qc

In [None]:
# Display the samples that passed QC. The column is referenced through
# pass_qc_field so this view follows the configured column name.
metadata[metadata[pass_qc_field]]

In [None]:
# Record the thresholds that were applied in this run.
print(thresholds)

In [None]:
# Write the table, now including the pass/fail column, back to metadata_file.
# NOTE: this is the same path the table was read from, so the input CSV is
# overwritten in place.
metadata.to_csv(metadata_file)