In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad
from scar import model

import warnings
# Suppress every Python warning raised from this point on (this also hides
# deprecation notices coming from the libraries imported above).
warnings.simplefilter("ignore")

In [3]:
from matplotlib import pylab

# Apply the seaborn palette/style FIRST. sns.set_style() writes its own
# rcParams (including font.family), so calling it after the matplotlib
# settings below would silently discard the serif font requested here.
sns.set_palette("muted");
sns.set_style("ticks");

# Figure-wide matplotlib defaults used by every plot in this notebook.
params = {'legend.fontsize': 8,
          'figure.facecolor':"w",
          'figure.figsize': (6, 4.5),
         'axes.labelsize': 10,
         'axes.titlesize':10,
         'axes.linewidth': 0.5,
         'xtick.labelsize':8,
         'ytick.labelsize':8,
         'axes.grid':False,}
# Serif text at 10 pt, with Palatino as the preferred serif face.
pylab.rc('font',**{'family':'serif','serif':['Palatino'],'size':10})
pylab.rcParams.update(params);

In [171]:
# Sample identifier; later cells splice it into the input and output file paths.
i = "brain5_str"

In [None]:
# File locations for sample `i`: the raw 10x matrix, the barcode UMI-count
# table, and the CSV that the assignment step writes later.
trex_root = '/media/chang/HDD-8/chang/cloneseq/mouse_fastqs/trex/'
sample_dir = trex_root + i
rna = sample_dir + "/" + i + '/outs/raw_feature_bc_matrix.h5'
sticr = sample_dir + '/trex_raw/umi_count_matrix.csv'
out = sample_dir + '/assign.csv'

# Raw 10x HDF5 matrix; swap this reader if the scRNA-seq data is stored in a
# different format.
gw15_rep1 = sc.read_10x_h5(filename=rna, gex_only=True)
gw15_rep1.var_names_make_unique()

# Barcode UMI counts. '-1' is appended to every column label
# (presumably so the labels match the 10x barcode names -- verify).
gw15_rep1_sticr_raw = pd.read_csv(sticr, index_col=0).add_suffix('-1')
# Both names refer to the same DataFrame object (no copy is made).
gw15_rep1_sticr = gw15_rep1_sticr_raw

In [None]:
# Total UMI count per droplet, summed over 'Gene Expression' features only.
is_gex = gw15_rep1.var['feature_types'] == 'Gene Expression'
umi_totals = gw15_rep1[:, is_gex].X.sum(axis=1)
rank_UMIs = pd.DataFrame(umi_totals, index=gw15_rep1.obs_names, columns=['total_counts'])

# Label each droplet by its total count. Thresholds are applied in ascending
# order so that the highest threshold a droplet exceeds decides its label.
rank_UMIs['droplets'] = 'cell-free droplets'
for min_counts, label in [(50, 'droplet II'), (100, 'droplet I'), (500, 'cells')]:
    rank_UMIs.loc[rank_UMIs['total_counts'] > min_counts, 'droplets'] = label

# Sort by total count (largest first) and expose the resulting position as a
# 'rank' column, keeping the original barcodes in the 'index' column.
rank_UMIs = rank_UMIs.sort_values(by='total_counts', ascending=False)
rank_UMIs = rank_UMIs.reset_index().rename_axis("rank").reset_index()

# Drop droplets with no counts, then index the table by barcode again.
rank_UMIs = rank_UMIs[rank_UMIs['total_counts'] > 0]
rank_UMIs = rank_UMIs.set_index('index').rename_axis('cells')

In [None]:
plt.figure(figsize=(4, 3), dpi=150)

# Knee plot: total UMI count against droplet rank, with one coloured line
# segment per droplet class.
ax = sns.lineplot(data = rank_UMIs,
                  x='rank',
                  y='total_counts',
                  hue='droplets',
                  hue_order=['cells', 'droplet I', 'droplet II', 'cell-free droplets'],
                  palette=sns.color_palette()[-4:],
                  markers=False,
                  lw=2)

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('sorted droplets');
ax.legend(loc='lower left', ncol=1, title=None, frameon=False)
# Title the figure with the sample that was actually loaded (`i`) instead of
# a hard-coded name, so the label cannot drift from the data.
ax.set_title(f'kneeplot: {i}');

# Palette and style are configured once in the setup cell; re-applying them
# here, after the axes are drawn, would not change this figure.
sns.despine(offset=10, trim=False);

In [None]:
# --- Split droplets into cells vs. the rest ---------------------------------
# Barcodes labelled 'cells' in rank_UMIs (intended to match the filtered
# population that cellranger outputs).
cell_barcodes = rank_UMIs[rank_UMIs['droplets'] == 'cells'].index
gw15_rep1_filtered = gw15_rep1[gw15_rep1.obs_names.isin(cell_barcodes)]

# Barcode counts for those cells, transposed to droplets x features (pandas.DataFrame).
in_cells = gw15_rep1_sticr.columns.isin(gw15_rep1_filtered.obs_names)
gw15_rep1_sticr_filtered = gw15_rep1_sticr.loc[:, in_cells].T

# Every remaining droplet in the barcode table is treated as empty.
empty_barcodes = gw15_rep1_sticr.columns.difference(gw15_rep1_sticr_filtered.index)
gw15_rep1_sticr_filtered_empty_droplets = gw15_rep1_sticr.loc[:, empty_barcodes].T

# --- Ambient profile --------------------------------------------------------
# Per-feature share of all counts observed in the empty droplets (pandas.Series).
empty_feature_totals = gw15_rep1_sticr_filtered_empty_droplets.sum()
gw15_rep1_sticr_filtered_empty_profile = empty_feature_totals / empty_feature_totals.sum()

# --- Train scAR -------------------------------------------------------------
# With ambient_profile left at its default (None), the profile would instead
# be calculated by averaging pooled cells.
scarObj = model(
    raw_count=gw15_rep1_sticr_filtered,
    ambient_profile=gw15_rep1_sticr_filtered_empty_profile.values,
    feature_type='sgRNAs',
)
scarObj.train(epochs=400)

In [None]:
# Run scAR inference (cutoff=3 is passed straight through -- see the scAR
# documentation for its exact meaning), save the per-cell assignment table,
# and show how many cells fall under each n_sgRNAs value.
scarObj.inference(cutoff=3)
assignment = scarObj.feature_assignment
assignment.to_csv(out)
assignment.value_counts("n_sgRNAs")

In [None]:
# Tally cells by their n_sgRNAs value and reshape into a tidy two-column frame.
sgRNA_tally = scarObj.feature_assignment.value_counts("n_sgRNAs")
cell_num = sgRNA_tally.to_frame("cell numbers").reset_index()

# Bar chart of that tally on an explicitly created axes.
bar_fig, bar_ax = plt.subplots(figsize=(4, 3))
sns.barplot(data=cell_num, x='n_sgRNAs', y='cell numbers', ax=bar_ax);
bar_ax.set_xlabel("number of distinct sgRNAs");
