# Functions to move to general_utils module

In [None]:
# create a list of letters
def char_range(c1, c2):
    """Lazily yield every character from `c1` through `c2`, both ends included.

    Characters are enumerated by code point, so ``char_range('A', 'C')``
    produces ``'A'``, ``'B'``, ``'C'``. Nothing is yielded when `c1` comes
    after `c2`.
    """
    first, last = ord(c1), ord(c2)
    yield from map(chr, range(first, last + 1))

# create list of wells in range
def well_range(c1,n1,c2,n2):
    """Build well labels, column by column, for a rectangular block of a plate.

    Row letters run from `c1` through `c2` (inclusive, via `char_range`);
    column numbers run from `n1` up to but NOT including `n2`. All rows of one
    column are listed before moving on to the next column, e.g.
    ``well_range('A', 1, 'B', 3)`` gives ``['A1', 'B1', 'A2', 'B2']``.
    """
    wells = []
    for col in range(n1, n2):
        for row in char_range(c1, c2):
            wells.append(f"{row}{col:1}")
    return wells

In [None]:
# convert ensembl to gene symbol
def convert_ensembl_symbol(adata,species='human',idcol = 'ensembls'):
    """Re-index `adata.var` by gene symbol, looked up through MyGene.info.

    adata
        AnnData object; `adata.var[idcol]` holds the IDs to look up.
    species
        Species forwarded to `mygene.MyGeneInfo.querymany`.
    idcol
        Column of `adata.var` with the query IDs (it is used both for the
        query and for joining the answers back onto `adata.var`).

    Returns `adata` itself; its `var` table is replaced by the original
    columns plus the MyGene result columns, indexed by ``symbol``.

    The row order of `adata.var` is preserved, so the annotation stays
    aligned with the columns of `adata.X`. When a query has several hits the
    one with the highest ``_score`` is kept. IDs that MyGene cannot resolve
    (or that come back without a symbol) keep their query ID as symbol.
    Symbols are not guaranteed to be unique.

    Needs the `mygene` package and network access.
    """
    import mygene  # imported lazily: only this helper needs it
    mg = mygene.MyGeneInfo()

    hits = mg.querymany(list(adata.var[idcol]), scopes='all', species=species, as_dataframe=True)
    # The 'notfound' column only exists when at least one query failed.
    if 'notfound' in hits.columns:
        unresolved = (hits['notfound'] == True).to_numpy()
        hits.loc[unresolved, 'symbol'] = hits.index[unresolved]

    # Remember the position of every var row: the best-hit selection below
    # must not change the order, otherwise var no longer matches adata.X.
    var = adata.var.copy()
    var['_var_order'] = range(len(var))
    annotated = (
        var.merge(hits.reset_index(), left_on=idcol, right_on='query')
        .sort_values(by=['_var_order', '_score'], ascending=[True, False])
        .drop_duplicates('_var_order')
    )
    # A hit without a 'symbol' field would otherwise leave a NaN in the index.
    annotated['symbol'] = annotated['symbol'].fillna(annotated[idcol])

    adata.var = annotated.drop(columns='_var_order').set_index('symbol')

    return adata

In [None]:
import logging as logg
def downsample_to_smallest_category(
        adata,
        column="sample_short",
        random_state=None,
        min_cells=15,
        keep_small_categories=False
) -> "sc.AnnData":
    """
    Returns a copy of `adata` in which all categories in `column` have
    the same size (the size of the smallest category that has at least
    `min_cells` cells).

    adata
        AnnData object; `adata.obs[column]` holds the categories.
    column
        column with the categories to downsample
    random_state
        Forwarded to `pandas.DataFrame.sample` for every category; pass an
        int to make the selection reproducible.
    min_cells
        Minimum number of cells to downsample.
        Categories having less than `min_cells` are discarded unless
        keep_small_categories is True
    keep_small_categories
        By default categories with less than min_cells are discarded.
        Set to true to keep them (unchanged, not down-sampled).

    Raises ValueError when no category has at least `min_cells` cells.
    """
    counts = adata.obs[column].value_counts(sort=False)
    too_small = counts[counts < min_cells]
    if len(too_small) > 0 and keep_small_categories is False:
        logg.warning(
            "The following categories have less than {} cells and will be "
            "ignored: {}".format(min_cells, dict(too_small))
        )
    eligible = counts[counts >= min_cells]
    if len(eligible) == 0:
        raise ValueError(
            "No category in {!r} has at least {} cells".format(column, min_cells)
        )
    min_size = min(eligible)
    sample_selection = None
    for sample, num_cells in counts.items():
        members = adata.obs[adata.obs[column] == sample]
        # Strictly "<": a category with exactly `min_cells` cells is eligible
        # (it was counted for `min_size` above) and must not be thrown away.
        if num_cells < min_cells:
            if not keep_small_categories:
                continue
            chosen = members.index
        else:
            chosen = members.sample(min_size, random_state=random_state).index
        sel = adata.obs.index.isin(chosen)
        if sample_selection is None:
            sample_selection = sel
        else:
            sample_selection |= sel
    logg.info(
        "The cells in category {!r} had been down-sampled to have each {} cells. "
        "The original counts where {}".format(column, min_size, dict(counts))
    )
    return adata[sample_selection].copy()

# Packages needed

In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns

sc.settings.verbosity = 3   

  from pandas.core.index import RangeIndex


# Load gene count table batch 1

In [3]:
# Read the batch-1 count table and transpose it so that cells end up in obs
# and features in var.
adata = sc.read_csv(
    '/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/tsp2_ss2_batch1.csv'
).transpose()
adata

AnnData object with n_obs × n_vars = 7656 × 58967 

Remove the last 5 entries, which are htseq-count summary counters (`__no_feature`, `__ambiguous`, ...) rather than genes

In [4]:
# The count table ends with htseq-count summary counters whose names start
# with "__" (e.g. "__no_feature"). Select them by name instead of slicing off
# the last five columns, so that re-running this cell cannot remove real features.
display(adata.var_names[-10:])
is_feature = np.array([not name.startswith('__') for name in adata.var_names])
adata = adata[:, is_feature].copy()
display(adata.var_names[-10:])
adata

Index(['ERCC-00164', 'ERCC-00165', 'ERCC-00168', 'ERCC-00170', 'ERCC-00171',
       '__no_feature', '__ambiguous', '__too_low_aQual', '__not_aligned',
       '__alignment_not_unique'],
      dtype='object')

Index(['ERCC-00157', 'ERCC-00158', 'ERCC-00160', 'ERCC-00162', 'ERCC-00163',
       'ERCC-00164', 'ERCC-00165', 'ERCC-00168', 'ERCC-00170', 'ERCC-00171'],
      dtype='object')

AnnData object with n_obs × n_vars = 7656 × 58962 

In [5]:
adata.obs.head()

TSP2_Bladder_NA_SS2_B113692_B104865_Empty_A15_S15.homo.gencode.v30.ERCC.chrM
TSP2_Bladder_NA_SS2_B113692_B104865_Empty_A16_S16.homo.gencode.v30.ERCC.chrM
TSP2_Bladder_NA_SS2_B113692_B104865_Empty_A17_S17.homo.gencode.v30.ERCC.chrM
TSP2_Bladder_NA_SS2_B113692_B104865_Empty_A18_S18.homo.gencode.v30.ERCC.chrM
TSP2_Bladder_NA_SS2_B113692_B104865_Empty_A19_S19.homo.gencode.v30.ERCC.chrM


In [6]:
# The first eight "_"-separated tokens of every cell name become obs columns,
# in this order.
name_fields = ['pilot', 'Tissue', 'AnatomicalPosition', 'method',
               'cDNAPlate', 'LibraryPlateID', 'population', 'well_seq']
name_tokens = [c.split('_') for c in adata.obs.index]
for position, field in enumerate(name_fields):
    adata.obs[field] = [tokens[position] for tokens in name_tokens]
# cell_id is the cell name up to the first "." (drops the
# ".homo.gencode.v30.ERCC.chrM" suffix).
adata.obs['cell_id'] = [c.split('.')[0] for c in adata.obs.index]

Convert Ensembl IDs to gene symbols

In [7]:
adata.var_names

Index(['ENSG00000000003.14', 'ENSG00000000005.6', 'ENSG00000000419.12',
       'ENSG00000000457.14', 'ENSG00000000460.17', 'ENSG00000000938.13',
       'ENSG00000000971.15', 'ENSG00000001036.13', 'ENSG00000001084.12',
       'ENSG00000001167.14',
       ...
       'ERCC-00157', 'ERCC-00158', 'ERCC-00160', 'ERCC-00162', 'ERCC-00163',
       'ERCC-00164', 'ERCC-00165', 'ERCC-00168', 'ERCC-00170', 'ERCC-00171'],
      dtype='object', length=58962)

In [None]:
# adata.var['ensembl_id'] = adata.var_names
# adata.var['ensembls'] = [g.split(".")[0] for g in adata.var['ensembl_id']]
# adata.var

In [None]:
# adata = convert_ensembl_symbol(adata);

In [8]:
adata.write_h5ad('/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/TSP2_SS2_no_official_metadata_unfiltered_raw_batch1.h5ad')
adata

... storing 'pilot' as categorical
... storing 'Tissue' as categorical
... storing 'AnatomicalPosition' as categorical
... storing 'method' as categorical
... storing 'cDNAPlate' as categorical
... storing 'LibraryPlateID' as categorical
... storing 'population' as categorical
... storing 'well_seq' as categorical


AnnData object with n_obs × n_vars = 7656 × 58962 
    obs: 'pilot', 'Tissue', 'AnatomicalPosition', 'method', 'cDNAPlate', 'LibraryPlateID', 'population', 'well_seq', 'cell_id'

# QC batch 1

In [None]:
adata = sc.read_h5ad(
    '/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/TSP2_SS2_no_official_metadata_unfiltered_raw_batch1.h5ad')
adata

In [9]:
adata.obs.groupby('Tissue').count()

Unnamed: 0_level_0,pilot,AnatomicalPosition,method,cDNAPlate,LibraryPlateID,population,well_seq,cell_id
Tissue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bladder,756,756,756,756,756,756,756,756
Kidney,768,768,768,768,768,768,768,768
Lung,763,763,763,763,763,763,763,763
Muscle,1532,1532,1532,1532,1532,1532,1532,1532
SI,1535,1535,1535,1535,1535,1535,1535,1535
Skin,1536,1536,1536,1536,1536,1536,1536,1536
Spleen,766,766,766,766,766,766,766,766


In [10]:
# Cell-level QC, applied in place and in this order: first require at least
# 200 detected genes per cell, then at least 5000 counts per cell.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_cells(adata, min_counts=5000)
# min_cells=0 removes no genes (n_vars is unchanged in the outputs below);
# the call is only here to add the per-gene 'n_cells' column to adata.var.
sc.pp.filter_genes(adata, min_cells=0)

filtered out 3556 cells that have less than 200 genes expressed
filtered out 202 cells that have less than 5000 counts


In [11]:
adata.obs.groupby('Tissue').count()

Unnamed: 0_level_0,pilot,AnatomicalPosition,method,cDNAPlate,LibraryPlateID,population,well_seq,cell_id,n_genes,n_counts
Tissue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bladder,265,265,265,265,265,265,265,265,265,265
Kidney,225,225,225,225,225,225,225,225,225,225
Lung,486,486,486,486,486,486,486,486,486,486
Muscle,1077,1077,1077,1077,1077,1077,1077,1077,1077,1077
SI,472,472,472,472,472,472,472,472,472,472
Skin,870,870,870,870,870,870,870,870,870,870
Spleen,503,503,503,503,503,503,503,503,503,503


In [12]:
adata

AnnData object with n_obs × n_vars = 3898 × 58962 
    obs: 'pilot', 'Tissue', 'AnatomicalPosition', 'method', 'cDNAPlate', 'LibraryPlateID', 'population', 'well_seq', 'cell_id', 'n_genes', 'n_counts'
    var: 'n_cells'

In [13]:
np.round(3898/7656*100)

51.0

In [15]:
adata.write(
    '/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/TSP2_SS2_no_official_metadata_filtered_raw_batch1.h5ad')
adata

AnnData object with n_obs × n_vars = 3898 × 58962 
    obs: 'pilot', 'Tissue', 'AnatomicalPosition', 'method', 'cDNAPlate', 'LibraryPlateID', 'population', 'well_seq', 'cell_id', 'n_genes', 'n_counts'
    var: 'n_cells'

In [14]:
np.min(adata.obs["n_counts"]),np.min(adata.obs["n_genes"])

(5058.0, 200)

### Other QC metrics and exploratory analysis (batch 1)

In [None]:
# Fraction of each cell's counts that comes from genes whose name starts with "MT-".
# NOTE(review): the Ensembl-to-symbol conversion above is commented out; if
# var_names are still Ensembl IDs no gene matches and mt_frac is 0 everywhere.
mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])
# np.asarray(...).ravel() gives a flat vector for dense and sparse X alike
# (a sparse matrix returns an (n, 1) matrix from .sum).
mt_counts = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
adata.obs['mt_frac'] = mt_counts / adata.obs['n_counts']

In [None]:
# Fraction of each cell's counts that comes from features whose name starts with "ERCC".
ercc_gene_mask = np.array([gene.startswith('ERCC') for gene in adata.var_names])
# np.asarray(...).ravel() gives a flat vector for dense and sparse X alike
# (a sparse matrix returns an (n, 1) matrix from .sum).
ercc_counts = np.asarray(adata.X[:, ercc_gene_mask].sum(axis=1)).ravel()
adata.obs['ercc_frac'] = ercc_counts / adata.obs['n_counts']

In [None]:
# Per-tissue distributions of total counts (log scale) and mitochondrial fraction.
sc.pl.violin(adata, 'n_counts', groupby='Tissue', size=2, log=True, cut=0)
sc.pl.violin(adata, 'mt_frac', groupby='Tissue')

# Counts vs. detected genes per cell, coloured by mitochondrial fraction.
sc.pl.scatter(adata, 'n_counts', 'n_genes', color='mt_frac')

# Histograms of counts and of detected genes per cell.
# NOTE(review): no new figure is created between the two calls, so both
# histograms presumably land on the same axes — confirm this is intended.
sns.distplot(adata.obs['n_counts'], kde=False)
sns.distplot(adata.obs['n_genes'], kde=False, bins=60)

In [None]:
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [None]:
# Normalise every cell to 1e6 counts and apply log1p, then compute a 50-component
# PCA, the neighbour graph and a UMAP embedding. All calls modify adata in
# place; the raw counts were saved to adata.layers["counts"] beforehand.
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6)
sc.pp.log1p(adata)
sc.pp.pca(adata, n_comps=50)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
# Clustering is currently disabled.
# sc.tl.louvain(adata, key_added='groups', resolution=0.5)

In [None]:
adata.obs.head()

In [None]:
# UMAP coloured by sample annotations, by two marker genes and by QC metrics.
sc.pl.umap(adata,color = ['Tissue','AnatomicalPosition','population'],ncols=1)
# NOTE(review): 'PECAM1' and 'PTPRC' are gene symbols; this presumably needs
# the (currently commented-out) Ensembl-to-symbol conversion — confirm.
sc.pl.umap(adata,color = ['PECAM1','PTPRC'], cmap='Oranges')
sc.pl.umap(adata,color = ['n_genes','n_counts','mt_frac','ercc_frac'],cmap='Oranges',ncols=2)

In [None]:
sc.pl.umap(adata[adata.obs['Tissue']=='Skin'],color=['KRT10','B2M','KRT14','PTPRC','KRT1','CD14','CTNNB1'],cmap='Oranges')

In [None]:
adata.write_h5ad(
    '/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/TSP2_SS2_no_official_metadata_counts_layer_processed_X.h5ad')
adata

In [None]:
adata1 = adata.copy()
adata1

In [None]:
adata2 = adata[adata.obs['Tissue']=='Bladder'].copy()
adata2

In [None]:
adata2.shape[0]

In [None]:
# The function's default column ('sample_short') is not present in adata.obs;
# balance by tissue instead.
downsample_to_smallest_category(adata1, 'Tissue')

In [None]:
sc.pp.subsample(adata1, n_obs=adata2.shape[0]) 
adata1

In [None]:
downsample_to_smallest_category(adata1, 'Tissue', min_cells=100, keep_small_categories=True)

In [None]:
# Balance the tissues; tissues with fewer than 100 cells are kept untouched.
# A fixed random_state makes the selected cells reproducible across runs.
adata1 = downsample_to_smallest_category(
    adata1, 'Tissue', random_state=0, min_cells=100, keep_small_categories=True)

In [None]:
adata.obs.columns

In [None]:
# The second grouping key was left empty ('' is not an obs column and raises
# a KeyError); group by tissue only until a second column is chosen.
adata.obs.groupby(['Tissue']).count()

In [None]:
import logging as logg
def downsample_to_smallest_category(
        adata,
        column="sample_short",
        random_state=None,
        min_cells=15,
        keep_small_categories=False
) -> "sc.AnnData":
    """
    Returns a copy of `adata` in which all categories in `column` have
    the same size (the size of the smallest category that has at least
    `min_cells` cells).

    adata
        AnnData object; `adata.obs[column]` holds the categories.
    column
        column with the categories to downsample
    random_state
        Forwarded to `pandas.DataFrame.sample` for every category; pass an
        int to make the selection reproducible.
    min_cells
        Minimum number of cells to downsample.
        Categories having less than `min_cells` are discarded unless
        keep_small_categories is True
    keep_small_categories
        By default categories with less than min_cells are discarded.
        Set to true to keep them (unchanged, not down-sampled).

    Raises ValueError when no category has at least `min_cells` cells.
    """
    counts = adata.obs[column].value_counts(sort=False)
    too_small = counts[counts < min_cells]
    if len(too_small) > 0 and keep_small_categories is False:
        logg.warning(
            "The following categories have less than {} cells and will be "
            "ignored: {}".format(min_cells, dict(too_small))
        )
    eligible = counts[counts >= min_cells]
    if len(eligible) == 0:
        raise ValueError(
            "No category in {!r} has at least {} cells".format(column, min_cells)
        )
    min_size = min(eligible)
    sample_selection = None
    for sample, num_cells in counts.items():
        members = adata.obs[adata.obs[column] == sample]
        # Strictly "<": a category with exactly `min_cells` cells is eligible
        # (it was counted for `min_size` above) and must not be thrown away.
        if num_cells < min_cells:
            if not keep_small_categories:
                continue
            chosen = members.index
        else:
            chosen = members.sample(min_size, random_state=random_state).index
        sel = adata.obs.index.isin(chosen)
        if sample_selection is None:
            sample_selection = sel
        else:
            sample_selection |= sel
    logg.info(
        "The cells in category {!r} had been down-sampled to have each {} cells. "
        "The original counts where {}".format(column, min_size, dict(counts))
    )
    return adata[sample_selection].copy()

In [None]:
# Two-group comparison: Bladder cells form group 'A', every other tissue
# group 'B'; both groups are then balanced to the same number of cells.
adata1 = adata.copy()
display(adata1)
display(adata1.obs.groupby('Tissue').count())
adata1.obs['classification_group'] = np.where(adata1.obs['Tissue'] == 'Bladder', 'A', 'B')
# A fixed random_state makes the selected cells reproducible across runs.
adata1 = downsample_to_smallest_category(
    adata1, 'classification_group', random_state=0, keep_small_categories=True)
display(adata1)
display(adata1.obs.groupby('Tissue').count())

# Load gene count table batch 2

In [16]:
# Read the batch-2 count table and transpose it so that cells end up in obs
# and features in var.
adata = sc.read_csv(
    '/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/tsp2_ss2_batch2.csv'
).transpose()
adata

AnnData object with n_obs × n_vars = 7661 × 58967 

Remove the last 5 entries, which are htseq-count summary counters (`__no_feature`, `__ambiguous`, ...) rather than genes

In [17]:
# The count table ends with htseq-count summary counters whose names start
# with "__" (e.g. "__no_feature"). Select them by name instead of slicing off
# the last five columns, so that re-running this cell cannot remove real features.
display(adata.var_names[-10:])
is_feature = np.array([not name.startswith('__') for name in adata.var_names])
adata = adata[:, is_feature].copy()
display(adata.var_names[-10:])
adata

Index(['ERCC-00164', 'ERCC-00165', 'ERCC-00168', 'ERCC-00170', 'ERCC-00171',
       '__no_feature', '__ambiguous', '__too_low_aQual', '__not_aligned',
       '__alignment_not_unique'],
      dtype='object')

Index(['ERCC-00157', 'ERCC-00158', 'ERCC-00160', 'ERCC-00162', 'ERCC-00163',
       'ERCC-00164', 'ERCC-00165', 'ERCC-00168', 'ERCC-00170', 'ERCC-00171'],
      dtype='object')

AnnData object with n_obs × n_vars = 7661 × 58962 

In [18]:
adata.obs.head()

TSP2_BM_vertebralbody_SS2_B113700_B133089_LinNegMarrow_A10_S154.homo.gencode.v30.ERCC.chrM
TSP2_BM_vertebralbody_SS2_B113700_B133089_LinNegMarrow_A11_S155.homo.gencode.v30.ERCC.chrM
TSP2_BM_vertebralbody_SS2_B113700_B133089_LinNegMarrow_A12_S156.homo.gencode.v30.ERCC.chrM
TSP2_BM_vertebralbody_SS2_B113700_B133089_LinNegMarrow_A13_S157.homo.gencode.v30.ERCC.chrM
TSP2_BM_vertebralbody_SS2_B113700_B133089_LinNegMarrow_A14_S158.homo.gencode.v30.ERCC.chrM


In [19]:
# The first eight "_"-separated tokens of every cell name become obs columns,
# in this order.
name_fields = ['pilot', 'Tissue', 'AnatomicalPosition', 'method',
               'cDNAPlate', 'LibraryPlateID', 'population', 'well_seq']
name_tokens = [c.split('_') for c in adata.obs.index]
for position, field in enumerate(name_fields):
    adata.obs[field] = [tokens[position] for tokens in name_tokens]
# cell_id is the cell name up to the first "." (drops the
# ".homo.gencode.v30.ERCC.chrM" suffix).
adata.obs['cell_id'] = [c.split('.')[0] for c in adata.obs.index]

In [20]:
adata.obs.head()

Unnamed: 0,pilot,Tissue,AnatomicalPosition,method,cDNAPlate,LibraryPlateID,population,well_seq,cell_id
TSP2_BM_vertebralbody_SS2_B113700_B133089_LinNegMarrow_A10_S154.homo.gencode.v30.ERCC.chrM,TSP2,BM,vertebralbody,SS2,B113700,B133089,LinNegMarrow,A10,TSP2_BM_vertebralbody_SS2_B113700_B133089_LinN...
TSP2_BM_vertebralbody_SS2_B113700_B133089_LinNegMarrow_A11_S155.homo.gencode.v30.ERCC.chrM,TSP2,BM,vertebralbody,SS2,B113700,B133089,LinNegMarrow,A11,TSP2_BM_vertebralbody_SS2_B113700_B133089_LinN...
TSP2_BM_vertebralbody_SS2_B113700_B133089_LinNegMarrow_A12_S156.homo.gencode.v30.ERCC.chrM,TSP2,BM,vertebralbody,SS2,B113700,B133089,LinNegMarrow,A12,TSP2_BM_vertebralbody_SS2_B113700_B133089_LinN...
TSP2_BM_vertebralbody_SS2_B113700_B133089_LinNegMarrow_A13_S157.homo.gencode.v30.ERCC.chrM,TSP2,BM,vertebralbody,SS2,B113700,B133089,LinNegMarrow,A13,TSP2_BM_vertebralbody_SS2_B113700_B133089_LinN...
TSP2_BM_vertebralbody_SS2_B113700_B133089_LinNegMarrow_A14_S158.homo.gencode.v30.ERCC.chrM,TSP2,BM,vertebralbody,SS2,B113700,B133089,LinNegMarrow,A14,TSP2_BM_vertebralbody_SS2_B113700_B133089_LinN...


Convert Ensembl IDs to gene symbols

In [21]:
adata.var_names

Index(['ENSG00000000003.14', 'ENSG00000000005.6', 'ENSG00000000419.12',
       'ENSG00000000457.14', 'ENSG00000000460.17', 'ENSG00000000938.13',
       'ENSG00000000971.15', 'ENSG00000001036.13', 'ENSG00000001084.12',
       'ENSG00000001167.14',
       ...
       'ERCC-00157', 'ERCC-00158', 'ERCC-00160', 'ERCC-00162', 'ERCC-00163',
       'ERCC-00164', 'ERCC-00165', 'ERCC-00168', 'ERCC-00170', 'ERCC-00171'],
      dtype='object', length=58962)

In [None]:
# adata.var['ensembl_id'] = adata.var_names
# adata.var['ensembls'] = [g.split(".")[0] for g in adata.var['ensembl_id']]
# adata.var

In [None]:
# adata = convert_ensembl_symbol(adata);

In [22]:
adata.write_h5ad('/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/TSP2_SS2_no_official_metadata_unfiltered_raw_batch2.h5ad')
adata

... storing 'pilot' as categorical
... storing 'Tissue' as categorical
... storing 'AnatomicalPosition' as categorical
... storing 'method' as categorical
... storing 'cDNAPlate' as categorical
... storing 'LibraryPlateID' as categorical
... storing 'population' as categorical
... storing 'well_seq' as categorical


AnnData object with n_obs × n_vars = 7661 × 58962 
    obs: 'pilot', 'Tissue', 'AnatomicalPosition', 'method', 'cDNAPlate', 'LibraryPlateID', 'population', 'well_seq', 'cell_id'

# QC batch 2

In [None]:
adata = sc.read_h5ad(
    '/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/TSP2_SS2_no_official_metadata_unfiltered_raw_batch2.h5ad')
adata

In [23]:
# Cell-level QC, applied in place and in this order: first require at least
# 200 detected genes per cell, then at least 5000 counts per cell.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_cells(adata, min_counts=5000)
# min_cells=0 removes no genes (n_vars is unchanged in the output below);
# the call is only here to add the per-gene 'n_cells' column to adata.var.
sc.pp.filter_genes(adata, min_cells=0)

adata

filtered out 2874 cells that have less than 200 genes expressed
filtered out 293 cells that have less than 5000 counts


AnnData object with n_obs × n_vars = 4494 × 58962 
    obs: 'pilot', 'Tissue', 'AnatomicalPosition', 'method', 'cDNAPlate', 'LibraryPlateID', 'population', 'well_seq', 'cell_id', 'n_genes', 'n_counts'
    var: 'n_cells'

In [24]:
adata.obs.groupby('Tissue').count()

Unnamed: 0_level_0,pilot,AnatomicalPosition,method,cDNAPlate,LibraryPlateID,population,well_seq,cell_id,n_genes,n_counts
Tissue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BM,329,329,329,329,329,329,329,329,329,329
Blood,560,560,560,560,560,560,560,560,560,560
Kidney,163,163,163,163,163,163,163,163,163,163
LI,394,394,394,394,394,394,394,394,394,394
LungNeuron,298,298,298,298,298,298,298,298,298,298
LymphNode,953,953,953,953,953,953,953,953,953,953
Muscle,238,238,238,238,238,238,238,238,238,238
SI,161,161,161,161,161,161,161,161,161,161
Spleen,588,588,588,588,588,588,588,588,588,588
Thymus,413,413,413,413,413,413,413,413,413,413


In [26]:
# Percentage of batch-2 cells that survive QC. 7661 is the unfiltered batch-2
# cell count (7656 was the batch-1 count); the numerator is read from adata
# so it cannot go stale.
np.round(adata.n_obs / 7661 * 100)

59.0

In [27]:
np.min(adata.obs["n_counts"]),np.min(adata.obs["n_genes"])

(5029.0, 200)

In [28]:
adata.write(
    '/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/TSP2_SS2_no_official_metadata_filtered_raw_batch2.h5ad')
adata

AnnData object with n_obs × n_vars = 4494 × 58962 
    obs: 'pilot', 'Tissue', 'AnatomicalPosition', 'method', 'cDNAPlate', 'LibraryPlateID', 'population', 'well_seq', 'cell_id', 'n_genes', 'n_counts'
    var: 'n_cells'

In [None]:
adata = sc.read_h5ad(
    '/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/TSP2_SS2_no_official_metadata_filtered_raw_batch2.h5ad')
adata

In [None]:
adata.obs.groupby('Tissue').count()

In [None]:
np.min(adata.obs["n_counts"]),np.min(adata.obs["n_genes"])

In [None]:
# Percentage of batch-2 cells that survive QC (7661 cells before filtering).
# The hard-coded 4639 did not match the filtered object (4494 cells), so the
# numerator is read from adata instead.
adata.n_obs / 7661 * 100

In [None]:
# Fraction of each cell's counts that comes from genes whose name starts with "MT-".
# NOTE(review): the Ensembl-to-symbol conversion above is commented out; if
# var_names are still Ensembl IDs no gene matches and mt_frac is 0 everywhere.
mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])
# np.asarray(...).ravel() gives a flat vector for dense and sparse X alike
# (a sparse matrix returns an (n, 1) matrix from .sum).
mt_counts = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
adata.obs['mt_frac'] = mt_counts / adata.obs['n_counts']

In [None]:
# Fraction of each cell's counts that comes from features whose name starts with "ERCC".
ercc_gene_mask = np.array([gene.startswith('ERCC') for gene in adata.var_names])
# np.asarray(...).ravel() gives a flat vector for dense and sparse X alike
# (a sparse matrix returns an (n, 1) matrix from .sum).
ercc_counts = np.asarray(adata.X[:, ercc_gene_mask].sum(axis=1)).ravel()
adata.obs['ercc_frac'] = ercc_counts / adata.obs['n_counts']

In [None]:
# Per-tissue distributions of total counts (log scale) and mitochondrial fraction.
sc.pl.violin(adata, 'n_counts', groupby='Tissue', size=2, log=True, cut=0)
sc.pl.violin(adata, 'mt_frac', groupby='Tissue')

# Counts vs. detected genes per cell, coloured by mitochondrial fraction.
sc.pl.scatter(adata, 'n_counts', 'n_genes', color='mt_frac')

# Histograms of counts and of detected genes per cell.
# NOTE(review): no new figure is created between the two calls, so both
# histograms presumably land on the same axes — confirm this is intended.
sns.distplot(adata.obs['n_counts'], kde=False)
sns.distplot(adata.obs['n_genes'], kde=False, bins=60)

In [None]:
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [None]:
# Normalise every cell to 1e6 counts and apply log1p, then compute a 50-component
# PCA, the neighbour graph and a UMAP embedding. All calls modify adata in
# place; the raw counts were saved to adata.layers["counts"] beforehand.
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6)
sc.pp.log1p(adata)
sc.pp.pca(adata, n_comps=50)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
# Clustering is currently disabled.
# sc.tl.louvain(adata, key_added='groups', resolution=0.5)

In [None]:
adata.obs.head()

In [None]:
# UMAP coloured by sample annotations, by two marker genes and by QC metrics.
sc.pl.umap(adata,color = ['Tissue','AnatomicalPosition','population'],ncols=1)
# NOTE(review): 'PECAM1' and 'PTPRC' are gene symbols; this presumably needs
# the (currently commented-out) Ensembl-to-symbol conversion — confirm.
sc.pl.umap(adata,color = ['PECAM1','PTPRC'], cmap='Oranges')
sc.pl.umap(adata,color = ['n_genes','n_counts','mt_frac','ercc_frac'],cmap='Oranges',ncols=2)

In [None]:
pd.DataFrame(adata.obs.groupby(['Tissue'])['cell_id'].count())

In [None]:
adata.write_h5ad(
    '/mnt/ibm_lg/angela/sapiens/TSP2/smartseq2/TSP2_SS2_no_official_metadata_counts_layer_processed_X_batch2.h5ad')
adata