# Loading in the Data

In [1]:
#the package imports we need
import hisepy
import scanpy as sc
import pandas as pd

  .. math:: Q = \\frac{1}{m} \\sum_{ij} \\left(A_{ij} - \\frac{k_i^\mathrm{out} k_j^\mathrm{in}}{m} \\right)\\delta(\\sigma_i, \\sigma_j),
  .. math:: Q = \\sum_{ij} \\left(A_{ij} - \\gamma \\frac{k_i^\mathrm{out} k_j^\mathrm{in}}{m} \\right)\\delta(\\sigma_i, \\sigma_j),
  implementation therefore does not guarantee subpartition :math:`\gamma`-density.
  .. math:: Q = \sum_k \\lambda_k Q_k.


In [2]:
#caching the fileset locally; the call gives back the paths of the cached .h5ad files
FILESET_ID = 'f48bf688-1a5f-456c-b0b0-c6ec870ec003'
h5ad_files = hisepy.cache_fileset(FILESET_ID)

In [3]:
#previewing the first five cached file paths
h5ad_files[:5]

['/home/workspace/input/1784782472/fileset/f48bf688-1a5f-456c-b0b0-c6ec870ec003/rhodium-niobium-silver/Afatinib_BIBW2992_counts_filtered_labeled_sampled.h5ad',
 '/home/workspace/input/1784782472/fileset/f48bf688-1a5f-456c-b0b0-c6ec870ec003/rhodium-niobium-silver/Afatinib_dimaleate_counts_filtered_labeled_sampled.h5ad',
 '/home/workspace/input/1784782472/fileset/f48bf688-1a5f-456c-b0b0-c6ec870ec003/rhodium-niobium-silver/Baricitinib_LY3009104_INCB028050_counts_filtered_labeled_sampled.h5ad',
 '/home/workspace/input/1784782472/fileset/f48bf688-1a5f-456c-b0b0-c6ec870ec003/rhodium-niobium-silver/Baricitinib_phosphate_counts_filtered_labeled_sampled.h5ad',
 '/home/workspace/input/1784782472/fileset/f48bf688-1a5f-456c-b0b0-c6ec870ec003/rhodium-niobium-silver/Canertinib_CI-1033_counts_filtered_labeled_sampled.h5ad']

In [4]:
#reading every .h5ad file into an AnnData object with scanpy,
#collecting the objects in a list that follows the order of h5ad_files
adata_list = [sc.read_h5ad(h5ad_path) for h5ad_path in h5ad_files]

In [5]:
#previewing the first two AnnData objects that were loaded
adata_list[:2]

[AnnData object with n_obs × n_vars = 25000 × 1916
     obs: 'original_barcodes', 'batch_id', 'pool_id', 'chip_id', 'well_id', 'n_umis', 'n_genes', 'plate_location', 'cyto_treatment', 'drug_treatment', 'drug_name', 'drug_cas_number', 'drug_mw', 'drug_solvent', 'drug_pathway', 'drug_target', 'drug_description', 'drug_chembl_name', 'drug_chembl_id', 'AIFI_L1', 'AIFI_L2', 'leiden_2'
     uns: 'drug_name_colors', 'leiden_2', 'neighbors', 'pca', 'umap'
     obsm: 'X_pca', 'X_umap'
     obsp: 'connectivities', 'distances',
 AnnData object with n_obs × n_vars = 25000 × 1916
     obs: 'original_barcodes', 'batch_id', 'pool_id', 'chip_id', 'well_id', 'n_umis', 'n_genes', 'plate_location', 'cyto_treatment', 'drug_treatment', 'drug_name', 'drug_cas_number', 'drug_mw', 'drug_solvent', 'drug_pathway', 'drug_target', 'drug_description', 'drug_chembl_name', 'drug_chembl_id', 'AIFI_L1', 'AIFI_L2', 'leiden_2'
     uns: 'drug_name_colors', 'leiden_2', 'neighbors', 'pca', 'umap'
     obsm: 'X_pca', 'X_um

**Each sample has 25,000 observations (cells) and 1,916 variables (genes).**

In [6]:
#counting how many AnnData objects are in the list
#(it's 18: 8 drugs with 2 formulas each, plus 2 controls)
len(adata_list)

18

In [7]:
#concatenating/combining all of the AnnData objects into one, stacking the cells from every file
#note: the combined object displayed below keeps obs and obsm, but the per-file uns and obsp entries are not carried over
adata = sc.concat(adata_list)

  concat_annot = pd.concat(


In [8]:
#displaying the combined AnnData object to check its dimensions and annotations
adata

AnnData object with n_obs × n_vars = 450000 × 1916
    obs: 'original_barcodes', 'batch_id', 'pool_id', 'chip_id', 'well_id', 'n_umis', 'n_genes', 'plate_location', 'cyto_treatment', 'drug_treatment', 'drug_name', 'drug_cas_number', 'drug_mw', 'drug_solvent', 'drug_pathway', 'drug_target', 'drug_description', 'drug_chembl_name', 'drug_chembl_id', 'AIFI_L1', 'AIFI_L2', 'leiden_2'
    obsm: 'X_pca', 'X_umap'

**After concatenating, there are 450,000 observations (25,000 * 18) and 1,916 variables (genes).**

In [9]:
#copying the observation table (adata.obs, one row per cell)
#so any manipulations done on obs won't affect the table stored in adata
obs = adata.obs.copy()

In [10]:
#counting how many cells carry each AIFI_L2 label across the whole combined dataset
#NOTE(review): despite the variable name, the counts are not split by treatment;
#only the AIFI_L2 column is selected here - confirm whether a per-treatment breakdown was intended
count_types_per_treatment = obs[['AIFI_L2']].value_counts()
count_types_per_treatment

AIFI_L2            
CD4 Naive              118058
CD4 Central Memory     107272
CD8 Naive               78553
CD4 Effector Memory     49677
CD8 Effector Memory     39593
CD8 Central Memory      25655
Treg                    22905
MAIT                     8262
CD8aa                      25
Name: count, dtype: int64

# Start of Scanpy Preprocessing and Clustering

In [11]:
#the three calls below overwrite adata.X in place, so running this cell twice
#would normalize/log/scale data that has already been transformed.
#to keep the cell safe to re-run, the matrix as loaded is saved in a layer the first time
#and restored from that layer on every later run.
#NOTE(review): assumes adata.X holds raw counts when this cell first runs - confirm against the input files
if 'counts' in adata.layers:
    adata.X = adata.layers['counts'].copy()
    #remove the marker left by a previous log1p call so the restored matrix
    #is not reported as already log-transformed
    adata.uns.pop('log1p', None)
else:
    adata.layers['counts'] = adata.X.copy()

#rescaling each cell to the same total count (10,000)
sc.pp.normalize_total(adata, target_sum = 1e4)
#applying a log(1 + x) transform to the data
sc.pp.log1p(adata)
#centering each gene at zero and scaling it to unit variance
sc.pp.scale(adata)

  return dispatch(args[0].__class__)(*args, **kw)


The three steps above (normalization, log-transformation, and scaling) ensure that all of the genes are on a similar scale. Having all the genes on a similar scale prevents highly expressed genes from dominating or skewing downstream analysis.

In [16]:
#listing the variable (gene) names of the combined AnnData object
adata.var_names

Index(['LAP3', 'CD99', 'CASP10', 'CD38', 'CREBBP', 'ITGAL', 'ITGA3', 'LAMP2',
       'ITGA2B', 'CX3CL1',
       ...
       'PLEKHM1', 'PRAF2', 'CEBPA', 'TWF2', 'HOXA10', 'CHMP4A', 'POLG2',
       'RBM15B', 'DUSP14', 'PIP4K2B'],
      dtype='object', length=1916)