### CMAP

**NOT UP TO DATE**

This notebook prepares the Connectivity Map (CMap) LINCS 2020 overexpression dataset (downloaded from [clue.io](https://clue.io/data/CMap2020#LINCS2020)). It combines the Level 3 control profiles with the Level 3 overexpression (`trt_oe`) profiles, keeps only instances that pass CMap's QC, and keeps only the landmark genes. Controls are the instances whose `pert_type` is `ctl_vector`, `ctl_vehicle`, `ctl_untrt`, or `ctl_x`. Each cell line is then examined and batch-corrected separately.

Here we tidy the dataset and carry out a simple exploration in scanpy. (It's not single cell data but scanpy is still useful for data exploration.)

In [None]:
import warnings
warnings.filterwarnings('ignore')
import regex as re
import os
import shutil
import sys
import importlib
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import seaborn as sns
import celloracle as co
from scipy.stats import spearmanr as spearmanr
from IPython.display import display, HTML

# local
import importlib
import sys
sys.path.append("setup")
import ingestion
importlib.reload(ingestion)

import os, sys, time
import itertools as it
from scipy.stats import spearmanr, pearsonr, rankdata, f_oneway
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mutual_info_score
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn import linear_model

#      visualization settings
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.rcParams['figure.figsize'] = [5, 3]
plt.rcParams["savefig.dpi"] = 200

# Specify the working directory explicitly.
# Every relative path defined below is resolved against this directory.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine where this directory exists.
os.chdir("/home/gary/cahan_rotation/perturbation_data/")

# Universal
# NOTE(review): geneAnnotationPath and humanEpiPath are not referenced in the
# visible part of this notebook -- confirm they are still needed.
geneAnnotationPath = "setup/gencode.v35.annotation.gtf.gz"       # Downloaded from https://www.gencodegenes.org/human/release_35.html
humanTFPath = "setup/tfList.csv"                                 # Downloaded from http://humantfs.ccbr.utoronto.ca/download.php
humanEpiPath = "setup/epiList.csv"                               # Downloaded from https://epifactors.autosome.org/description 

# cmap Specific
# Control and treatment expression matrices plus the instance-level and
# gene-level metadata tables, all located under datadirRoot.
datadirRoot = "not_ready/cmap/new"                                                   
controlPath = os.path.join(datadirRoot, "level3_beta_ctl_n188708x12328.gctx")        # From https://clue.io/data/CMap2020#LINCS2020 Nov. 29th, 2022
treatmtPath = os.path.join(datadirRoot, "level3_beta_trt_oe_n131668x12328.gctx")     # From https://clue.io/data/CMap2020#LINCS2020 Nov. 29th, 2022
instPath    = os.path.join(datadirRoot, "instinfo_beta.txt")                         # From https://clue.io/data/CMap2020#LINCS2020 Nov. 29th, 2022
genePath    = os.path.join(datadirRoot, "geneinfo_beta.txt")                         # From https://clue.io/data/CMap2020#LINCS2020 Nov. 29th, 2022

### Reshape the data & filter out observations that failed CMAP's QC test

In [None]:
# Load the control matrix and then the treatment matrix with the same reader;
# both use the same gene-level and instance-level metadata tables.
_control, _treatmt = (
    ingestion.read_cmap(
        expression_file=matrixPath,
        gene_metadata=genePath,
        instance_metadata=instPath,
    )
    for matrixPath in (controlPath, treatmtPath)
)

In [None]:
# Both ENSG00000150527 and ENSG00000150526 map to MIA2, so MIA2 occurs twice in
# the gene index. Keep only the first occurrence of any duplicated gene name.
_control, _treatmt = (
    profiles[:, ~profiles.var.index.duplicated()].copy()
    for profiles in (_control, _treatmt)
)

In [None]:
_control.shape, _treatmt.shape

In [None]:
# The two matrices are stacked row-wise under a single `var` table (taken from
# the controls), so their genes must be identical and in the same order.
# Otherwise the treatment columns would be silently mislabeled.
if not _control.var_names.equals(_treatmt.var_names):
    raise ValueError(
        "Control and treatment matrices do not list the same genes in the same order."
    )

expression_quantified = sc.AnnData(
    # np.vstack and pd.concat already return newly allocated objects, so no
    # extra .copy() is needed (it would only double peak memory).
    X=np.vstack([_control.X, _treatmt.X]),
    var=_control.var.copy(),
    obs=pd.concat([_control.obs, _treatmt.obs]),
)
print(expression_quantified)

# Free the per-file objects; only the combined object is used from here on.
del _control
del _treatmt

In [None]:
# Snapshot of the per-instance metadata taken before the filtering cells below.
# visualizeWellPositionSpecificOccupancy reads this global to compute the
# per-well counts it divides by.
overallOccupancy = expression_quantified.obs.copy()
overallOccupancy.head()

In [None]:
""" Filter out observations that fail cmap standard QC """
# Keep only the instances whose qc_pass flag equals 1.
expression_quantified = expression_quantified[
    expression_quantified.obs["qc_pass"].eq(1)
].copy()
print(expression_quantified)

In [None]:
# Restrict the gene axis to the features annotated as "landmark".
expression_quantified = expression_quantified[
    :, expression_quantified.var["feature_space"].eq("landmark")
].copy()

In [None]:
expression_quantified_orig = expression_quantified.copy()

In [None]:
# Keep only the metadata columns used downstream (identifiers, plate/well
# position, QC statistics, perturbation and cell-line annotations).
expression_quantified.obs = expression_quantified.obs.loc[
    :,
    [
        'sample_id', 'bead_batch', 'cell_mfc_name', 'pert_mfc_id', 'det_plate',
        'det_well', 'rna_plate', 'rna_well', 'count_mean', 'count_cv',
        'qc_f_logp', 'qc_iqr', 'qc_slope', 'pert_id', 'pert_type', 'cell_iname',
        'qc_pass', 'dyn_range', 'inv_level_10',
        'project_code', 'cmap_name',
    ],
]

In [None]:
# Split the well label (e.g. "A01") into its row letter and its column digits.
# The vectorised .str accessor replaces a per-element lambda and propagates
# missing well labels as NaN instead of raising.
expression_quantified.obs["det_row"] = expression_quantified.obs["det_well"].str[0]
expression_quantified.obs["det_col"] = expression_quantified.obs["det_well"].str[1:]

### Add `is_control` and `perturbation`

In [None]:
display(expression_quantified.obs["pert_type"].value_counts())

In [None]:
# pert_type values that mark an instance as a control.
controls = ["ctl_vector", "ctl_vehicle", "ctl_untrt", "ctl_x"]
expression_quantified.obs["is_control"] = expression_quantified.obs["pert_type"].isin(controls)
# Integer copy of the same flag (used for continuous colouring on the UMAP).
expression_quantified.obs["is_control_int"] = expression_quantified.obs["is_control"].astype(int)

In [None]:
""" Heterogeneity of controls """
tmp = expression_quantified[expression_quantified.obs.is_control].obs[['pert_type', 'cmap_name']]
# Iterate in sorted order so the sections print in the same order on every run
# (the iteration order of a raw set of strings changes between sessions).
for t in sorted(set(tmp.pert_type)):
    print(f"====={t}=====")
    # Two most frequent control names within this control type
    display(tmp[tmp.pert_type == t].cmap_name.value_counts().head(2))
del tmp

In [None]:
expression_quantified.obs["perturbation"] = expression_quantified.obs["cmap_name"]

In [None]:
""" Count up different perturbations """ 
# Table of human genes; keep only the rows flagged as transcription factors.
human_TFs = pd.read_csv(humanTFPath)
human_TFs = human_TFs[human_TFs["Is TF?"] == "Yes"]

# Flag instances whose perturbation name is one of those TF symbols.
expression_quantified.obs["is_tf"] = expression_quantified.obs["perturbation"].isin(
    human_TFs["HGNC symbol"]
)
print("perturbation frequencies")
print(expression_quantified.obs["perturbation"].value_counts())
print("\n\nperturbation frequencies (just TF's)")
print(
    expression_quantified.obs.loc[
        expression_quantified.obs["is_tf"], "perturbation"
    ].value_counts()
)

In [None]:
""" Perturbagen by count """
# Count how many non-control instances exist per perturbation.
# NOTE(review): this copies the full expression matrix of all treated
# instances only to read one obs column -- confirm the copy is needed.
treatment = expression_quantified[~expression_quantified.obs.is_control].copy()
print(treatment.obs.perturbation.value_counts())
del treatment

In [None]:
""" # of LANDMARK genes perturbed and measured """
# Names of the measured landmark genes.
landmarks = expression_quantified[:, expression_quantified.var.feature_space == "landmark"].var_names.tolist()
# Vectorised membership test: replaces a Python-level `x in list` per
# observation, which scanned the whole gene list for every row.
expression_quantified.obs['is_landmark'] = expression_quantified.obs.perturbation.isin(landmarks)
# Treated instances whose perturbed gene is itself a measured landmark gene.
treatment = expression_quantified[(~expression_quantified.obs.is_control) & (expression_quantified.obs.is_landmark)].copy()
print(treatment.obs.perturbation.value_counts())
del treatment

### Focus analysis only on plates w/ at least 1 ctl and at least 1 trt & cell types w/ more than 50 cells.

In [None]:
# Drop every instance measured in the HEK293 or HELA cell lines.
expression_quantified = expression_quantified[
    ~expression_quantified.obs.cell_iname.isin(["HEK293", "HELA"])
].copy()

In [None]:
batch = "det_plate"
plates = expression_quantified.obs.groupby(batch)

# plate name -> [number of control instances, number of treatment instances]
numControl = {
    plateName: [sum(ctlFlags), sum(~ctlFlags)]
    for plateName, ctlFlags in (
        (name, expression_quantified.obs.iloc[plates.indices[name], :].is_control)
        for name, _ in plates
        if len(plates.indices[name]) > 0
    )
}

# Summary of how controls and treatments are distributed over plates.
print(f"# of plates: {len(numControl)}")
print(f"1.# of plates w/ only control: {sum(nTrt == 0 for nCtl, nTrt in numControl.values())}")
print(f"2.# of plates w/ only treatmt: {sum(nCtl == 0 for nCtl, nTrt in numControl.values())}")
print(f"3.# of plates w/ both        : {sum(nCtl != 0 and nTrt != 0 for nCtl, nTrt in numControl.values())}")
print(f"# of plates w/ both ctl>15 : {sum(nCtl > 15 for nCtl, nTrt in numControl.values() if nCtl != 0 and nTrt != 0)}")
print(f"# of plates w/ both ctl>20 : {sum(nCtl > 20 for nCtl, nTrt in numControl.values() if nCtl != 0 and nTrt != 0)}")
print(f"# of plates w/ both ctl>5 trt>5 : {sum(nCtl > 5 and nTrt > 5 for nCtl, nTrt in numControl.values())}")

In [None]:
# Plates that contain at least one control and at least one treatment instance.
bothExist = {
    plateName: counts
    for plateName, counts in numControl.items()
    if counts[0] > 0 and counts[1] > 0
}
# Row 0: control counts per plate; row 1: treatment counts per plate.
bothExistCount = np.array(list(bothExist.values())).T

# Stacked bars: controls at the bottom, treatments on top, one bar per plate.
plt.figure(figsize=(35, 3))
plt.bar(np.array(list(bothExist)), bothExistCount[0], label="control")
plt.bar(
    np.array(list(bothExist)),
    bothExistCount[1],
    bottom=bothExistCount[0],
    label="treatment",
)
plt.axhline(y=20, xmin=0, xmax=1, color='r', label="y=20")
plt.legend()
plt.xticks(rotation=90, fontsize=4)
plt.ylim(0, 500)
plt.margins(x=0)
plt.show()

In [None]:
# Instance metadata for ALL perturbation types (not only the ones loaded above).
allInstMeta = pd.read_csv(instPath, delimiter="\t")
display(allInstMeta.shape)
groups = allInstMeta.groupby("det_plate")

# Sorted (instead of raw set order) so the heatmap rows appear in the same
# order on every run.
allPertType = sorted(allInstMeta.pert_type.dropna().unique())
allPlateName = list(bothExist.keys())
pertTypeRow = {pertType: rowIdx for rowIdx, pertType in enumerate(allPertType)}
initGroupCounter = np.zeros((len(allPertType), len(allPlateName)), dtype=float)

# Count the instances of every perturbation type on every retained plate.
for plateCol, plateName in enumerate(allPlateName):
    currPlate = allInstMeta.iloc[groups.indices[plateName]]
    for pertType, v in currPlate.pert_type.value_counts().items():
        initGroupCounter[pertTypeRow[pertType], plateCol] = v

# log10 of the counts. A perturbation type absent from a plate becomes NaN
# (left blank by seaborn) rather than log10(0) = -inf, which would be drawn
# with the same colour as a count of 1 once clipped at vmin=0.
initGroupCounter = np.where(
    initGroupCounter > 0,
    np.log10(np.maximum(initGroupCounter, 1)),
    np.nan,
)

sns.heatmap(initGroupCounter, yticklabels=allPertType, vmin=0, vmax=3)
plt.title("Pert Type on Plates \nw/ Min Count of Ctl and Trt")
plt.show()

In [None]:
""" Keep only plates with a minimum # of treatment & control """
print(f"{expression_quantified.n_obs} of observations prior to filtering.")
# One boolean per observation: is its plate among the retained plates?
rowsToKeep = expression_quantified.obs[batch].isin(list(bothExist)).tolist()
expression_quantified = expression_quantified[rowsToKeep, :].copy()
print(f"{expression_quantified.n_obs} of observations after filtering.")

### EDA For Data Quality Control

#### Visualization Code

In [None]:
def _medianFoldChangeCorrelation(profilePairs, reference, keepProbability=None):
    """
    Median Spearman correlation between the log2 fold changes of paired profiles.

    Parameters
    ----------
    profilePairs : iterable of (x1, x2)
        Pairs of expression vectors.
    reference : array
        Vector every profile is divided by before taking log2.
    keepProbability : float or None
        When given, each pair is kept with this probability (drawn from
        ``np.random``) and skipped otherwise; ``None`` keeps every pair.

    Returns
    -------
    float
        Median correlation over the kept pairs, or NaN when no pair was kept.
    """
    correlations = []
    for x1, x2 in profilePairs:
        if keepProbability is not None and np.random.random() > keepProbability:
            continue
        fc1 = np.log2(x1 / reference)
        fc2 = np.log2(x2 / reference)
        # Only genes with a finite fold change in both profiles are compared
        good = np.isfinite(fc1) & np.isfinite(fc2)
        correlations.append(spearmanr(fc1[good], fc2[good])[0])
    return np.median(correlations) if correlations else np.nan


def visualizeControlCorrelationsWithinAndBetweenBatch(adata, batch="det_plate"):
    """
    Compare how well control samples agree within a batch versus between batches.

    Every control profile is expressed as log2(profile / median control
    profile). For each batch, the median pairwise Spearman correlation among
    its controls is computed ("IntraPlateCorr"). The same is done for randomly
    drawn pairs of batches ("InterPlateCorr"), and both distributions are
    shown as violin plots.

    Parameters
    ----------
    adata : AnnData
        Must provide ``obs.is_control`` and ``obs[batch]``. ``adata.X`` is
        iterated row by row (assumes a dense array -- TODO confirm).
    batch : str
        Name of the ``obs`` column that defines the batches.

    Returns
    -------
    None
        The result is a plot. If there is exactly one batch, its within-batch
        correlation is printed and nothing is plotted.

    Notes
    -----
    Pairs are subsampled with the unseeded global ``np.random`` generator, so
    the figure is not identical from run to run.
    """
    isControl = adata.obs.is_control
    overallControl = np.median(adata[isControl].X, axis=0)
    plateNames = sorted(set(adata.obs[batch]))

    # --- Control correlations WITHIN a batch ---
    intraPlateCorr = []
    for idx, plate in enumerate(plateNames):
        plateControls = adata[(adata.obs[batch] == plate) & isControl]
        nPairs = plateControls.n_obs * (plateControls.n_obs - 1) // 2
        # With more than 10 controls, keep about 50 randomly chosen pairs
        keepProbability = 50 / nPairs if plateControls.n_obs > 10 else None
        intraPlateCorr.append(
            _medianFoldChangeCorrelation(
                it.combinations(plateControls.X, r=2),
                overallControl,
                keepProbability,
            )
        )

        if (idx + 1) % 20 == 0:
            print(f"{idx+1}..", end=" ")

    if len(intraPlateCorr) == 1:
        print(f"There is only 1 plate...ctl corr is {intraPlateCorr}.")
        return

    # --- Control correlations BETWEEN a pair of batches ---
    # Draw (with replacement) as many batch pairs as there are batches.
    plateCombo = list(it.combinations(plateNames, 2))
    chosenPairs = np.random.choice(len(plateCombo), len(intraPlateCorr) * 1)

    interPlateCorr = []
    for idx, comboIdx in enumerate(chosenPairs):
        plate1, plate2 = plateCombo[comboIdx]
        ctl1 = adata[(adata.obs[batch] == plate1) & isControl]
        ctl2 = adata[(adata.obs[batch] == plate2) & isControl]
        nPairs = ctl1.n_obs * ctl2.n_obs
        # Subsample only when both batches have more than 10 controls
        keepProbability = 50 / nPairs if ctl1.n_obs > 10 and ctl2.n_obs > 10 else None
        interPlateCorr.append(
            _medianFoldChangeCorrelation(
                it.product(ctl1.X, ctl2.X),
                overallControl,
                keepProbability,
            )
        )

        if (idx + 1) % 20 == 0:
            print(f"{idx+1}..", end=" ")

    # --- Format results into a dataframe for seaborn visualization ---
    corrResult = pd.DataFrame({
        "Value": np.asarray(interPlateCorr + intraPlateCorr, dtype=np.float64),
        "Class": (["InterPlateCorr"] * len(interPlateCorr)
                  + ["IntraPlateCorr"] * len(intraPlateCorr)),
    })

    plt.figure(figsize=(4,2))
    ax = sns.violinplot(corrResult, x="Value", y="Class", cut=0)
    plt.xlabel("Spearman Correlation at FC scale")
    plt.xlim([-0.4, 1])
    plt.show()

In [None]:
def visualizeWellPositionSpecificOccupancy(adata, batch="det_well"):
    """
    Plot how many samples of `adata` sit in each well position of a 16 x 24 plate.

    Two heatmaps are drawn side by side: the raw number of samples per well,
    and that number divided by the per-well count found in the global
    `overallOccupancy` table, restricted to the plates listed in the global
    `bothExist`.

    Parameters
    ----------
    adata : AnnData
        Samples to count; grouped by ``obs[batch]``.
    batch : str
        ``obs`` column holding well labels such as "A01".

    Notes
    -----
    Reads the notebook globals `overallOccupancy` and `bothExist`, which must
    exist before the call. The plate count in the titles ("394") is hard-coded.
    The top-left 2 x 2 block of wells is annotated with "0".
    """
    # Samples per well in the data that was passed in
    wellCount = {well: len(entries) for well, entries in adata.obs.groupby(batch)}

    # Samples per well in the reference table, on the retained plates only
    onKeptPlates = overallOccupancy.det_plate.isin(list(bothExist))
    overallWellCount = {
        wellName: len(wellEntries)
        for wellName, wellEntries in overallOccupancy[onKeptPlates].groupby("det_well")
    }

    nRows, nCols = 16, 24
    absPlateCount = np.zeros((nRows, nCols))
    fracPlateCount = np.zeros((nRows, nCols))
    for i in range(nRows):
        for j in range(nCols):
            wellName = f"{chr(65+i)}{1+j:02}"
            if wellName not in wellCount:
                continue
            absPlateCount[i][j] = wellCount[wellName]
            fracPlateCount[i][j] = wellCount[wellName] / overallWellCount[wellName]

    fig, axes = plt.subplots(1, 2, figsize=(13, 4))
    annotation = np.full((nRows, nCols), "", dtype=object)
    annotation[:2, :2] = "0"

    panels = [
        (absPlateCount,
         "Well Occupancy Count across 394 Plates\n(Ignoring non-OE treatment)"),
        (fracPlateCount,
         "Fraction Well Occupancy across 394 Plates\n(Count / pre-QC Count)"),
    ]
    for ax, (matrix, title) in zip(axes, panels):
        sns.heatmap(matrix, robust=True, cmap="coolwarm", annot=annotation, fmt="s",
                    square=True, yticklabels=[chr(i+65) for i in range(nRows)],
                    xticklabels=[i+1 for i in range(nCols)], ax=ax)
        ax.xaxis.tick_top()
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='center')
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0, horizontalalignment='center')
        ax.set_title(title)
    plt.show()

In [None]:
def visualizeWellPositionSpecificExpression(adata, 
                                            batch="det_well", 
                                            gene1="GAPDH", 
                                            gene2="RPS6"):
    """
    Plot the mean expression of two genes per well position of a 16 x 24 plate.

    Parameters
    ----------
    adata : AnnData
        Samples to average; grouped by ``obs[batch]``.
    batch : str
        ``obs`` column holding well labels such as "A01".
    gene1, gene2 : str
        Entries of ``adata.var_names`` to display (left and right panel).

    Notes
    -----
    Well positions with no sample are drawn with the value 0. The plate count
    in the titles ("394") is hard-coded. The top-left 2 x 2 block of wells is
    annotated with "0".
    """
    # well label -> integer row positions of the samples in that well
    wellIndices = adata.obs.groupby(batch).indices
    X = adata.X

    def expression(geneName):
        # Look the gene column up once instead of once per well, and slice the
        # matrix directly instead of building an AnnData view for every well.
        geneCols = np.where(adata.var_names == geneName)[0]
        return {
            well: np.mean(X[rows][:, geneCols])
            for well, rows in wellIndices.items()
        }

    meanExprByGene = [expression(gene1), expression(gene2)]

    mat = [np.zeros((16, 24)), np.zeros((16, 24))]
    for i, j in it.product(range(16), range(24)):
        wellName = f"{chr(65+i)}{1+j:02}"
        for grid, meanExpr in zip(mat, meanExprByGene):
            grid[i][j] = meanExpr.get(wellName, 0)

    fig, axes = plt.subplots(1, 2, figsize=(13, 4))
    annotation = np.full((16,24), "", dtype=object)
    annotation[:2, :2] = "0"

    for ax, grid, gene in zip(axes, mat, (gene1, gene2)):
        sns.heatmap(grid, robust=True, cmap="coolwarm", annot=annotation, fmt="s",
                    square=True, yticklabels=[chr(i+65) for i in range(16)], 
                    xticklabels=[i+1 for i in range(24)], ax=ax)
        ax.xaxis.tick_top()
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='center')
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0, horizontalalignment='center')
        ax.set_title(f"Mean {gene} Expression Across 394 Plates")
    plt.show()

In [None]:
def visualizeUMAPColoredByFeatureOfInterest(adata):
    """
    Run PCA, neighbours, UMAP and Leiden clustering, then plot the UMAP
    coloured by technical covariates.

    Parameters
    ----------
    adata : AnnData
        Needs ``var["feature_space"]``, the ``obs`` columns ``det_row``,
        ``det_col``, ``bead_batch``, ``is_control_int`` and ``rna_plate``, and
        a gene named "GAPDH".

    Returns
    -------
    AnnData
        A copy of the input carrying the PCA, neighbour graph, UMAP embedding,
        ``obs["leiden"]``, ``obs["oddRow"]`` and ``obs["fake_cluster"]``. The
        input object itself is not modified.
    """

    def basicUnsupervisedAnalysis(adata):
        adata = adata.copy()
        # This is just to estimate the mean and dispersion, but we'll use all 978 landmark genes.
        sc.pp.highly_variable_genes(adata, min_mean=0.2, max_mean=4, min_disp=0.2, n_bins=50)
        adata.var["highly_variable"] = adata.var["feature_space"]=="landmark"
        """ Commented out to reduce the # of figures """
        # sc.pl.highly_variable_genes(adata)

        # The number of principal components must stay below both the number
        # of samples and the number of genes used, otherwise the solver
        # raises. Cap the usual 100 components for small cell lines.
        nGenesUsed = int(adata.var["highly_variable"].sum())
        nComps = min(100, adata.n_obs - 1, nGenesUsed - 1)
        # As written, no filter is installed inside this context manager; it
        # only restores the warning-filter state afterwards.
        with warnings.catch_warnings():
            sc.tl.pca(adata, n_comps=nComps)
        sc.pp.neighbors(adata)
        sc.tl.umap(adata)
        sc.tl.leiden(adata, resolution=0.5)
        return adata

    def visualizeBasicAnalysis(adata):
        adata = adata.copy()
        # 1 for plate rows A, C, E, ... (every other row starting at A), else 0
        adata.obs["oddRow"] = adata.obs.det_row.apply(lambda x: 1 
                                                      if x in ['A','C','E','G',
                                                               'I','K','M','O'] 
                                                      else 0)
        sc.pl.umap(adata, 
                   color=["GAPDH", "bead_batch", 
                          "det_row", "oddRow", 
                          "det_col", "leiden", 
                          "is_control_int", "rna_plate"], 
                   ncols=2, 
                   legend_fontsize="xx-small", 
                   color_map="coolwarm", 
                   size=4)

        # Will ask CO to use only one cluster.
        # Requires setting certain other undocumented aspects of object state. :(
        adata.obs["fake_cluster"]="all_one_cluster"
        adata.obs.fake_cluster = adata.obs.fake_cluster.astype("category")
        adata.uns["fake_cluster_colors"] = ['#1f77b4']
        return adata

    adata = basicUnsupervisedAnalysis(adata)
    adata = visualizeBasicAnalysis(adata)

    return adata

#### Regression Code

In [None]:
def customOneHotEncoding(adata, SVA=True):
    """
    Append technical covariates to ``adata.obs`` as numeric columns.

    The added columns are, in this order: one indicator column per RNA plate,
    per RNA-well row letter, per RNA-well column label and per bead batch;
    one numeric column ``distToPlateCenter``; one indicator column per
    project code; and, when `SVA` is true, one ``leiden<k>`` indicator column
    per Leiden cluster.

    Parameters
    ----------
    adata : AnnData
        Must provide ``obs`` columns ``rna_plate``, ``rna_well``,
        ``bead_batch``, ``project_code`` and ``leiden``.
    SVA : bool
        Whether to add the Leiden cluster indicators.

    Returns
    -------
    AnnData
        A copy of `adata` whose ``obs`` has the covariate columns joined on.

    Raises
    ------
    ValueError
        If ``obs`` has no ``leiden`` column (checked even when `SVA` is false).

    Notes
    -----
    Each categorical variable is encoded independently. The distance is
    written to the ``distToPlateCenter`` column itself; it used to be written
    to the last column of the table, which is a project-code or Leiden
    indicator. Column names are the raw level values, so levels that share a
    name across variables still produce duplicate column names.
    """

    def distToCenter(well):
        # Euclidean distance from the centre of a 16 x 24 plate:
        # rows 'A'..'P' have character codes 65..80 (centre 72.5),
        # columns run 1..24 (centre 12.5).
        row = ord(well[0 ])
        col = int(well[1:])
        return np.sqrt((row - 72.5) ** 2 + 
                       (col - 12.5) ** 2)

    def oneHot(labels, levels):
        # (n_obs x n_levels) indicator matrix for one categorical variable
        position = {level: j for j, level in enumerate(levels)}
        block = np.zeros((len(labels), len(levels)))
        hits = np.array([position[label] for label in labels], dtype=int)
        block[np.arange(len(labels)), hits] = 1
        return block

    if 'leiden' not in adata.obs.columns:
        raise ValueError("Missing leiden cluster assignment!")

    obs = adata.obs
    wells = list(obs.rna_well)
    columns, blocks = [], []

    def addCategorical(labels, nameOf=lambda level: level):
        levels = sorted(set(labels))
        columns.extend(nameOf(level) for level in levels)
        blocks.append(oneHot(labels, levels))

    addCategorical(list(obs.rna_plate))
    addCategorical([well[0 ] for well in wells])
    addCategorical([well[1:] for well in wells])
    addCategorical(list(obs.bead_batch))

    columns.append("distToPlateCenter")
    blocks.append(
        np.array([distToCenter(well) for well in wells], dtype=float).reshape(-1, 1)
    )

    addCategorical(list(obs.project_code))

    if SVA:
        addCategorical(list(obs.leiden), nameOf=lambda level: f'leiden{level}')

    print(f"One-hot encoding size {len(columns)}")

    feature = pd.DataFrame(np.hstack(blocks), columns=columns, index=obs.index)

    output = adata.copy()
    output.obs = output.obs.join(feature)

    return output

In [None]:
def checkingRegressionOutcomes(adata, cellType):
    """
    Regress technical covariates out of one cell line and show before/after diagnostics.

    Steps: subset `adata` to `cellType`, log2-transform with a pseudocount,
    draw the diagnostic plots, regress out the columns produced by
    customOneHotEncoding (without Leiden indicators), draw the diagnostics
    again, then undo the log transform and shift every gene back to its
    original mean.

    Parameters
    ----------
    adata : AnnData
        Expression data with the ``obs`` columns required by the
        visualisation helpers and by customOneHotEncoding.
    cellType : str
        Value of ``obs.cell_iname`` to process.

    Returns
    -------
    AnnData
        The regressed data on the original (non-log) scale, carrying the
        embedding and Leiden clusters computed after regression.
    """

    print("=" * 100 + "\n", f"Will Start Processing {cellType}...\n", "=" * 100 + "\n")
    
    curr = adata[adata.obs.cell_iname == cellType].copy()
    # Per-gene mean on the original scale; added back at the very end
    meanExpr    = np.mean(curr.X, axis=0)
    # Half of the smallest non-zero value, so that log2 is defined at zero
    pseudocount = np.min (curr.X[curr.X != 0]) / 2 
    curr.X = np.log2(curr.X + pseudocount)
    
    print(f"\nBefore Regression {cellType}!\n")
    # NOTE(review): curr.X is already log2-transformed here, and the control
    # correlation helper takes log2 of ratios of X again -- confirm intended.
    visualizeControlCorrelationsWithinAndBetweenBatch(curr)  
    visualizeWellPositionSpecificOccupancy(curr)
    visualizeWellPositionSpecificExpression(curr)
    # Returns a copy that additionally carries obs["leiden"], which
    # customOneHotEncoding requires.
    curr = visualizeUMAPColoredByFeatureOfInterest(curr)

    print(f"\nRegressing {cellType}!\n")
    regressOutput = customOneHotEncoding(curr, SVA=False)
    # The covariates are exactly the obs columns appended after curr's own.
    sc.pp.regress_out(regressOutput, 
                      keys=regressOutput.obs.columns[len(curr.obs.columns):].tolist(), 
                      n_jobs=10)
    
    print(f"\nAfter Round 1 of Regression {cellType}!\n")
    visualizeControlCorrelationsWithinAndBetweenBatch(regressOutput)  
    visualizeWellPositionSpecificExpression(regressOutput)    
    regressOutput = visualizeUMAPColoredByFeatureOfInterest(regressOutput)

    # Back to the original scale: undo log2, then replace each gene's mean by
    # the mean it had before regression.
    regressOutput.X = np.power(2, regressOutput.X) - pseudocount
    regressOutput.X -= np.mean(regressOutput.X, axis=0)
    regressOutput.X += meanExpr
    
    return regressOutput

#### Cell Type Specific Before and After

In [None]:
expression_quantified.obs.cell_iname.value_counts()

##### A549

In [None]:
currStr = "A549"
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
# Drop the genes excluded for this cell line before regression.
currAdata = currAdata[
    :, ~currAdata.var.gene_symbol.isin(['RNF167', 'STX1A', 'CLIC4', 'MAP7'])
].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# Flag samples in Leiden clusters 18 and 22, then inspect them on the UMAP and PCs.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin(['18', '22']).astype(int)
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(
    currAdata,
    color='outlier',
    dimensions=[(0,1), (0,2), (0,3), (1,2), (1, 4), (3, 4)],
    ncols=3,
    color_map="coolwarm",
)
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
print(f"Number of {currStr} before: {currAdata.n_obs}")
# Remove the samples of the flagged Leiden clusters.
currAdata = currAdata[~currAdata.obs.leiden.isin(["18", "22"])].copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
A549 = currAdata.copy()
sc.pl.umap(A549, color="leiden")

##### PC3

In [None]:
currStr = "PC3"
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# Flag samples in Leiden cluster 11, then inspect them on the UMAP and PCs.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin(['11']).astype(int)
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(
    currAdata,
    color='outlier',
    dimensions=[(0,1), (0,2), (0,3), (1,2), (1, 4), (3, 4)],
    ncols=3,
    color_map="coolwarm",
)
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
print(f"Number of {currStr} before: {currAdata.n_obs}")
# Remove the samples of the flagged Leiden cluster.
currAdata = currAdata[~currAdata.obs.leiden.isin(["11"])].copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
PC3 = currAdata.copy()
sc.pl.umap(PC3, color="leiden")

##### A375

In [None]:
currStr = "A375"
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
# Drop the genes excluded for this cell line before regression.
currAdata = currAdata[
    :, ~currAdata.var.gene_symbol.isin(['MAP7', 'CLIC4', 'STX1A', 'RNF167'])
].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# Flag samples in Leiden cluster 17, then inspect them on the UMAP and PCs.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin(['17']).astype(int)
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(
    currAdata,
    color='outlier',
    dimensions=[(2,0), (2,1), (2,3), (2,4), (2,5), (2,6), (2,7), (2,8)],
    ncols=3,
    color_map="coolwarm",
)
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
print(f"Number of {currStr} before: {currAdata.n_obs}")
# Remove the samples of the flagged Leiden cluster.
currAdata = currAdata[~currAdata.obs.leiden.isin(["17"])].copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
A375 = currAdata.copy()
sc.pl.umap(A375, color="leiden")

##### HEK293T

In [None]:
currStr = "HEK293T"
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# Flag samples in Leiden cluster 14, then inspect them on the UMAP and PCs.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin(['14']).astype(int)
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(
    currAdata,
    color='outlier',
    dimensions=[(0,1), (0,2), (0,3), (0,4), (1,2), (1,3), (1,4), (2,3), (2,4), (3,4)],
    ncols=3,
    color_map="coolwarm",
)
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
""" No Removal """
print(f"Number of {currStr} after : {currAdata.n_obs}")
HEK293T = currAdata.copy()

##### HA1E

In [None]:
currStr = "HA1E"
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
# Drop the genes excluded for this cell line before regression.
currAdata = currAdata[
    :,
    ~currAdata.var.gene_symbol.isin([
        'MAP7', 'CLIC4', 'STX1A', 'RNF167',
        'CEBPA', 'SATB1', 'CXCL2',
        'FAS',
        'ZNF586', 'PSMD2',
        'PAN2', 'PROS1',
        'GNAS', 'LAMA3',
    ]),
].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# Flag samples in Leiden clusters 9-16, then inspect them on the UMAP and PCs.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin(
    ['9', '10', '11', '12', '13', '14', '15', '16']
).astype(int)
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(
    currAdata,
    color='outlier',
    dimensions=[(0,1), (0,2), (0,3), (0,4), (1,2), (1,3), (1,4), (2,3), (2,4), (3,4)],
    ncols=3,
    color_map="coolwarm",
)
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
print(f"Number of {currStr} before: {currAdata.n_obs}")
# Remove the samples of the flagged Leiden clusters.
currAdata = currAdata[
    ~currAdata.obs.leiden.isin(['9', '10', '11', '12', '13', '14', '15', '16'])
].copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
HA1E = currAdata.copy()
sc.pl.umap(HA1E, color="leiden")

##### MCF7

In [None]:
currStr = "MCF7"
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
# Drop the genes excluded for this cell line before regression.
currAdata = currAdata[
    :, ~currAdata.var.gene_symbol.isin(['MAP7', 'CLIC4', 'STX1A', 'RNF167', 'CXCL2'])
].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# No Leiden cluster is flagged for this cell line, so every sample gets 0.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin([]).astype(int)
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(
    currAdata,
    color='outlier',
    dimensions=[(0,1), (0,2), (0,3), (0,4), (1,2), (1,3), (1,4), (2,3), (2,4), (3,4)],
    ncols=3,
    color_map="coolwarm",
)
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
print(f"Number of {currStr} after : {currAdata.n_obs}")
MCF7 = currAdata.copy()
sc.pl.umap(MCF7, color="leiden")

##### HT29

In [None]:
currStr = "HT29"
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
# Drop the genes excluded for this cell line before regression.
currAdata = currAdata[
    :,
    ~currAdata.var.gene_symbol.isin([
        'MAP7', 'CLIC4', 'STX1A', 'RNF167',
        'EPHA3', 'ALDOA', 'DNAJB1', 'CSRP1', 'BHLHE40', 'GAA',
    ]),
].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# No Leiden cluster is flagged for this cell line, so every sample gets 0.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin([]).astype(int)
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(
    currAdata,
    color='outlier',
    dimensions=[(0,1), (0,2), (0,3), (0,4), (1,2), (1,3), (1,4), (2,3), (2,4), (3,4)],
    ncols=3,
    color_map="coolwarm",
)
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
print(f"Number of {currStr} after : {currAdata.n_obs}")
HT29 = currAdata.copy()
sc.pl.umap(HT29, color="leiden")

##### VCAP

In [None]:
currStr = "VCAP"
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# No Leiden cluster is flagged for this cell line, so every sample gets 0.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin([]).astype(int)
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(
    currAdata,
    color='outlier',
    dimensions=[(0,1), (0,2), (0,3), (0,4), (1,2), (1,3), (1,4), (2,3), (2,4), (3,4)],
    ncols=3,
    color_map="coolwarm",
)
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
print(f"Number of {currStr} after : {currAdata.n_obs}")
VCAP = currAdata.copy()
sc.pl.umap(VCAP, color="leiden")

##### HCC515

In [None]:
currStr = "HCC515"
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
# Drop the genes excluded for this cell line before regression.
currAdata = currAdata[
    :,
    ~currAdata.var.gene_symbol.isin([
        'MAP7', 'CLIC4', 'STX1A', 'RNF167', 'CEBPA',
        'SATB1', 'FOS', 'RAC2',
        'CSRP1', 'DNAJB1',
        'TLR4', 'CAST',
        'GNAS', 'SYNGR3',
        'IKZF1', 'TCFL5', 'MAMLD1', 'ZDHHC6',
    ]),
].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# Flag the Leiden clusters treated as outliers for this cell line (cluster '7').
# `.isin` always yields a plain 0/1 integer column; a lambda mapped over a categorical
# column can itself come back categorical, on which `.sum()` fails.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin(['7']).astype("int64")
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")

# Diagnostics: flagged profiles on the UMAP, variance explained per PC,
# every pairing of the first five PCs, and the gene loadings of those PCs.
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(currAdata, color='outlier', dimensions=list(it.combinations(range(5), 2)), ncols=3, color_map="coolwarm")
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
# Drop the profiles in the flagged Leiden cluster ('7') and keep the cleaned HCC515 subset.
print(f"Number of {currStr} before: {currAdata.n_obs}")
# Vectorized membership test: always a plain boolean mask, whatever the dtype of `leiden`.
currAdata = currAdata[~currAdata.obs.leiden.isin(['7'])].copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
HCC515 = currAdata.copy()
sc.pl.umap(HCC515, color="leiden")

##### H1299

In [None]:
currStr = "H1299"
# Genes dropped for H1299 before the per-cell-line check.
# NOTE(review): the reason for excluding these genes is not recorded in this cell.
excludedGenes = ['MMP1', 'SPP1']
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
currAdata = currAdata[:, ~currAdata.var.gene_symbol.isin(excludedGenes)].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# Flag the Leiden clusters treated as outliers for this cell line (cluster '5').
# `.isin` always yields a plain 0/1 integer column; a lambda mapped over a categorical
# column can itself come back categorical, on which `.sum()` fails.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin(['5']).astype("int64")
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")

# Diagnostics: flagged profiles on the UMAP, variance explained per PC,
# every pairing of the first five PCs, and the gene loadings of those PCs.
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(currAdata, color='outlier', dimensions=list(it.combinations(range(5), 2)), ncols=3, color_map="coolwarm")
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
# Store the H1299 subset; this cell keeps every profile.
# NOTE(review): the previous cell flags Leiden cluster '5' as an outlier, yet nothing is
# removed here (unlike HCC515 / AALE / HEPG2) — confirm that keeping it is intended.
H1299 = currAdata.copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
sc.pl.umap(H1299, color="leiden")

##### AALE

In [None]:
# Select the AALE profiles from the full dataset and run the shared per-cell-line check.
currStr = "AALE"
isCurrCellLine = expression_quantified.obs["cell_iname"] == currStr
currAdata = checkingRegressionOutcomes(expression_quantified[isCurrCellLine].copy(), currStr)

In [None]:
# Flag the Leiden clusters treated as outliers for this cell line (clusters '6' and '8').
# `.isin` always yields a plain 0/1 integer column; a lambda mapped over a categorical
# column can itself come back categorical, on which `.sum()` fails.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin(['6', '8']).astype("int64")
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")

# Diagnostics: flagged profiles on the UMAP, variance explained per PC,
# every pairing of the first five PCs, and the gene loadings of those PCs.
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(currAdata, color='outlier', dimensions=list(it.combinations(range(5), 2)), ncols=3, color_map="coolwarm")
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
# Drop the profiles in the flagged Leiden clusters ('6', '8') and keep the cleaned AALE subset.
print(f"Number of {currStr} before: {currAdata.n_obs}")
# Vectorized membership test: always a plain boolean mask, whatever the dtype of `leiden`.
currAdata = currAdata[~currAdata.obs.leiden.isin(['6', '8'])].copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
AALE = currAdata.copy()
sc.pl.umap(AALE, color="leiden")

##### SALE

In [None]:
# Select the SALE profiles from the full dataset and run the shared per-cell-line check.
currStr = "SALE"
isCurrCellLine = expression_quantified.obs["cell_iname"] == currStr
currAdata = checkingRegressionOutcomes(expression_quantified[isCurrCellLine].copy(), currStr)

In [None]:
# Flag the Leiden clusters treated as outliers for this cell line (cluster '6').
# `.isin` always yields a plain 0/1 integer column; a lambda mapped over a categorical
# column can itself come back categorical, on which `.sum()` fails.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin(['6']).astype("int64")
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")

# Diagnostics: flagged profiles on the UMAP, variance explained per PC,
# every pairing of the first five PCs, and the gene loadings of those PCs.
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(currAdata, color='outlier', dimensions=list(it.combinations(range(5), 2)), ncols=3, color_map="coolwarm")
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
# Store the SALE subset; this cell keeps every profile.
# NOTE(review): the previous cell flags Leiden cluster '6' as an outlier, yet nothing is
# removed here (unlike HCC515 / AALE / HEPG2) — confirm that keeping it is intended.
SALE = currAdata.copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
sc.pl.umap(SALE, color="leiden")

##### HEPG2

In [None]:
currStr = "HEPG2"
# Genes dropped for HEPG2 before the per-cell-line check.
# NOTE(review): the reason for excluding these genes is not recorded in this cell.
excludedGenes = [
    'MAP7', 'CLIC4', 'STX1A', 'RNF167',
    'SATB1', 'CXCL2',
    'RAC2', 'CBR3', 'RPL39L',
    'HK1', 'PAK6', 'GNAS',
    'GHR', 'HLA-DMA',
    'CAST', 'PSMD2', 'ZNF586', 'BHLHE40',
]
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
currAdata = currAdata[:, ~currAdata.var.gene_symbol.isin(excludedGenes)].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# Flag the Leiden clusters treated as outliers for this cell line (clusters '3', '4', '5').
# `.isin` always yields a plain 0/1 integer column; a lambda mapped over a categorical
# column can itself come back categorical, on which `.sum()` fails.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin(['3', '4', '5']).astype("int64")
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")

# Diagnostics: flagged profiles on the UMAP, variance explained per PC,
# every pairing of the first five PCs, and the gene loadings of those PCs.
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(currAdata, color='outlier', dimensions=list(it.combinations(range(5), 2)), ncols=3, color_map="coolwarm")
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
# Drop the profiles in the flagged Leiden clusters ('3', '4', '5') and keep the cleaned HEPG2 subset.
print(f"Number of {currStr} before: {currAdata.n_obs}")
# Vectorized membership test: always a plain boolean mask, whatever the dtype of `leiden`.
currAdata = currAdata[~currAdata.obs.leiden.isin(['3', '4', '5'])].copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
HEPG2 = currAdata.copy()
sc.pl.umap(HEPG2, color="leiden")

##### U2OS

In [None]:
# Select the U2OS profiles from the full dataset and run the shared per-cell-line check.
currStr = "U2OS"
isCurrCellLine = expression_quantified.obs["cell_iname"] == currStr
currAdata = checkingRegressionOutcomes(expression_quantified[isCurrCellLine].copy(), currStr)

In [None]:
# Flag the Leiden clusters treated as outliers for this cell line (none for U2OS).
# `.isin` always yields a plain 0/1 integer column; a lambda mapped over a categorical
# column can itself come back categorical, on which `.sum()` fails.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin([]).astype("int64")
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")

# Diagnostics: flagged profiles on the UMAP, variance explained per PC,
# every pairing of the first five PCs, and the gene loadings of those PCs.
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(currAdata, color='outlier', dimensions=list(it.combinations(range(5), 2)), ncols=3, color_map="coolwarm")
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
""" No Removal """
# No cluster was flagged for U2OS, so every profile is kept.
U2OS = currAdata.copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
sc.pl.umap(U2OS, color="leiden")

##### HUH7

In [None]:
# Select the HUH7 profiles from the full dataset and run the shared per-cell-line check.
currStr = "HUH7"
isCurrCellLine = expression_quantified.obs["cell_iname"] == currStr
currAdata = checkingRegressionOutcomes(expression_quantified[isCurrCellLine].copy(), currStr)

In [None]:
# Flag the Leiden clusters treated as outliers for this cell line (none for HUH7).
# `.isin` always yields a plain 0/1 integer column; a lambda mapped over a categorical
# column can itself come back categorical, on which `.sum()` fails.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin([]).astype("int64")
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")

# Diagnostics: flagged profiles on the UMAP, variance explained per PC,
# every pairing of the first five PCs, and the gene loadings of those PCs.
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(currAdata, color='outlier', dimensions=list(it.combinations(range(5), 2)), ncols=3, color_map="coolwarm")
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
# No cluster was flagged for HUH7, so every profile is kept.
HUH7 = currAdata.copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
sc.pl.umap(HUH7, color="leiden")

##### RWPE1

In [None]:
# Select the RWPE1 profiles from the full dataset and run the shared per-cell-line check.
currStr = "RWPE1"
isCurrCellLine = expression_quantified.obs["cell_iname"] == currStr
currAdata = checkingRegressionOutcomes(expression_quantified[isCurrCellLine].copy(), currStr)

In [None]:
# Flag the Leiden clusters treated as outliers for this cell line (none for RWPE1).
# `.isin` always yields a plain 0/1 integer column; a lambda mapped over a categorical
# column can itself come back categorical, on which `.sum()` fails.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin([]).astype("int64")
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")

# Diagnostics: flagged profiles on the UMAP, variance explained per PC,
# every pairing of the first five PCs, and the gene loadings of those PCs.
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(currAdata, color='outlier', dimensions=list(it.combinations(range(5), 2)), ncols=3, color_map="coolwarm")
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
# No cluster was flagged for RWPE1, so every profile is kept.
RWPE1 = currAdata.copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
sc.pl.umap(RWPE1, color="leiden")

##### LHSAR

In [None]:
currStr = "LHSAR"
# Gene dropped for LHSAR before the per-cell-line check.
# NOTE(review): the reason for excluding this gene is not recorded in this cell.
excludedGenes = ['GNAS']
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
currAdata = currAdata[:, ~currAdata.var.gene_symbol.isin(excludedGenes)].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# Flag the Leiden clusters treated as outliers for this cell line (none for LHSAR).
# `.isin` always yields a plain 0/1 integer column; a lambda mapped over a categorical
# column can itself come back categorical, on which `.sum()` fails.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin([]).astype("int64")
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")

# Diagnostics: flagged profiles on the UMAP, variance explained per PC,
# every pairing of the first five PCs, and the gene loadings of those PCs.
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(currAdata, color='outlier', dimensions=list(it.combinations(range(5), 2)), ncols=3, color_map="coolwarm")
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
# No cluster was flagged for LHSAR, so every profile is kept.
LHSAR = currAdata.copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
sc.pl.umap(LHSAR, color="leiden")

##### LNCAP

In [None]:
currStr = "LNCAP"
# Gene dropped for LNCAP before the per-cell-line check.
# NOTE(review): the reason for excluding this gene is not recorded in this cell.
excludedGenes = ['GNAS']
currAdata = expression_quantified[expression_quantified.obs.cell_iname == currStr].copy()
currAdata = currAdata[:, ~currAdata.var.gene_symbol.isin(excludedGenes)].copy()
currAdata = checkingRegressionOutcomes(currAdata, currStr)

In [None]:
# Flag the Leiden clusters treated as outliers for this cell line (none for LNCAP).
# `.isin` always yields a plain 0/1 integer column; a lambda mapped over a categorical
# column can itself come back categorical, on which `.sum()` fails.
currAdata.obs['outlier'] = currAdata.obs.leiden.isin([]).astype("int64")
print(f"Num of instances defined as outliers: {currAdata.obs['outlier'].sum()}")

# Diagnostics: flagged profiles on the UMAP, variance explained per PC,
# every pairing of the first five PCs, and the gene loadings of those PCs.
sc.pl.umap(currAdata, color='outlier', color_map="coolwarm")
sc.pl.pca_variance_ratio(currAdata, n_pcs=100)
sc.pl.pca(currAdata, color='outlier', dimensions=list(it.combinations(range(5), 2)), ncols=3, color_map="coolwarm")
sc.pl.pca_loadings(currAdata, components='1,2,3,4,5')

In [None]:
# No cluster was flagged for LNCAP, so every profile is kept.
LNCAP = currAdata.copy()
print(f"Number of {currStr} after : {currAdata.n_obs}")
sc.pl.umap(LNCAP, color="leiden")

#### Concatenate All AnnData

In [None]:
# All per-cell-line AnnData objects, in the order they will be stacked.
allAnndata = [
    A549, PC3, A375, HEK293T, HA1E, MCF7,
    HT29, VCAP, HCC515, H1299, AALE, SALE,
    HEPG2, U2OS, HUH7, RWPE1, LHSAR, LNCAP,
]
# Each cell line may have dropped different genes, so keep only genes present in all of them.
geneNames = [set(adata.var_names) for adata in allAnndata]
geneIntersect = geneNames[0].intersection(*geneNames[1:])
print(f"{len(geneIntersect)} genes remaining..")

In [None]:
# Stack every cell line into one AnnData restricted to the shared genes.
#
# A set has no defined order, so indexing `.var` with `geneIntersect` directly would return
# rows in an arbitrary order that need not match the columns of X (and pandas >= 2.0 rejects
# set indexers). One ordered gene list is used for X and for .var so they stay aligned.
sharedGenes = [gene for gene in allAnndata[0].var_names if gene in geneIntersect]

newAnndata = sc.AnnData(
    # Selecting columns by name puts every cell line's matrix in the same gene order.
    X=np.vstack([adata[:, sharedGenes].X for adata in allAnndata]),

    var=allAnndata[0].var.loc[sharedGenes].copy(),

    # Keep only the leading obs columns, as many as the full dataset has, dropping
    # whatever the per-cell-line cells appended after them (e.g. 'outlier').
    obs=pd.concat(
        [
            adata.obs.iloc[:, :expression_quantified.obs.shape[1]]
            for adata
            in allAnndata
        ]
    ).copy()
)
print(newAnndata)

In [None]:
# Per-profile result columns; "NA" / -999 stay in place for cell lines that get skipped.
newAnndata.obs["consistentW/Perturbation"] = np.full(newAnndata.n_obs, "NA", dtype=object)
# Float sentinel so the column can hold real-valued log fold changes without a dtype change.
newAnndata.obs["logFC"] = np.full(newAnndata.n_obs, -999.0)

# Sorted so the per-cell-line log is printed in a reproducible order.
for cellType in sorted(set(newAnndata.obs.cell_iname)):
    currCellTypeIdx = (newAnndata.obs.cell_iname == cellType)
    print(f"========== {cellType} Begin ==========")
    try:
        status, logFC = ingestion.checkConsistency(newAnndata[currCellTypeIdx].copy(),
                                                   perturbationType="overexpression", 
                                                   group="rna_plate", 
                                                   verbose=False)
        # Write through .loc: obs[col][mask] = ... is chained indexing and may update a
        # temporary copy instead of the table itself.
        # NOTE(review): assumes both results hold one value per profile of this cell line,
        # in row order — confirm against ingestion.checkConsistency.
        newAnndata.obs.loc[currCellTypeIdx, "consistentW/Perturbation"] = np.asarray(status, dtype=object)
        newAnndata.obs.loc[currCellTypeIdx, "logFC"] = np.asarray(logFC, dtype=float)
        display(Counter(status))
    except Exception as err:
        # Best effort: a failing cell line keeps its sentinel values, but the reason is
        # reported instead of being silently swallowed.
        print(f"Skip {cellType}")
        print(f"  reason: {type(err).__name__}: {err}")
    finally:
        print(f"========== {cellType} End ==========\n\n")

In [None]:
# Per-profile correlation columns; -999 stays in place for cell lines that get skipped.
# Float sentinels so the columns can hold correlation values without a dtype change.
newAnndata.obs["spearmanCorr"] = np.full(newAnndata.n_obs, -999.0)
newAnndata.obs[ "pearsonCorr"] = np.full(newAnndata.n_obs, -999.0)

for cellType in sorted(set(newAnndata.obs.cell_iname)):
    currCellTypeIdx = (newAnndata.obs.cell_iname == cellType)
    print(f"========== {cellType} Begin ==========")
    print(f"Total {len(set(newAnndata[currCellTypeIdx].obs.perturbation))} perturbagens...")
    try:
        correlations = ingestion.computeCorrelation(newAnndata[currCellTypeIdx], 
                                                    verbose=True, group="rna_plate")
        # Write through .loc: obs[col][mask] = ... is chained indexing and may update a
        # temporary copy instead of the table itself.
        # NOTE(review): assumes correlations[0] / correlations[1] are the Spearman / Pearson
        # values, one per profile of this cell line in row order — confirm.
        newAnndata.obs.loc[currCellTypeIdx, "spearmanCorr"] = np.asarray(correlations[0], dtype=float)
        newAnndata.obs.loc[currCellTypeIdx,  "pearsonCorr"] = np.asarray(correlations[1], dtype=float)
    except Exception as err:
        # Best effort: a failing cell line keeps its sentinel values, but the reason is
        # reported instead of being silently swallowed.
        print(f"Skip {cellType}")
        print(f"  reason: {type(err).__name__}: {err}")
        plt.show()
    finally:
        print(f"========== {cellType} End ==========\n\n")

### Output Data

In [None]:
# Rebind `expression_quantified` to the filtered, concatenated data for the output steps below.
# NOTE(review): this reuses the name that the per-cell-line cells subset from, so re-running
# those earlier cells after this one would start from the already-filtered data.
expression_quantified = newAnndata.copy()

In [None]:
# Log-transform, then compute PCA / neighbors / UMAP / Leiden on the combined data.
# NOTE: sc.pp.log1p modifies the object in place, so re-running this cell transforms
# the values a second time.
sc.pp.log1p(expression_quantified)
# This is just to estimate the mean and dispersion, but we'll use all 978 landmark genes.
sc.pp.highly_variable_genes(expression_quantified, min_mean=0.2, max_mean=4, min_disp=0.2, n_bins=50)
expression_quantified.var["highly_variable"] = expression_quantified.var["feature_space"]=="landmark"

with warnings.catch_warnings():
    # catch_warnings() alone only restores the filter state on exit; the filter set here
    # is what actually silences warnings raised during PCA.
    warnings.simplefilter("ignore")
    sc.tl.pca(expression_quantified, n_comps=100)
sc.pp.neighbors(expression_quantified)
sc.tl.umap(expression_quantified)
sc.tl.leiden(expression_quantified, resolution=1)

In [None]:
# Overview UMAPs of the combined data: one panel per annotation / gene listed here.
umapColorKeys = [
    "cell_iname", "GAPDH",
    "bead_batch", "is_control_int",
    "det_row", "det_col",
    "leiden",
]
sc.pl.umap(
    expression_quantified,
    color=umapColorKeys,
    ncols=2,
    legend_fontsize="xx-small",
    color_map="coolwarm",
    size=4,
    wspace=0.3,
)

In [None]:
# CO (presumably CellOracle, imported as `co`) will be asked to use a single cluster, so
# every profile gets the same categorical label. The matching `<name>_colors` entry in
# .uns is part of the otherwise undocumented object state this requires. :(
expression_quantified.obs["fake_cluster"] = pd.Categorical(
    ["all_one_cluster"] * expression_quantified.n_obs
)
expression_quantified.uns["fake_cluster_colors"] = ['#1f77b4']

In [None]:
# Split the perturbed genes (controls excluded) by whether they are among the measured genes.
measuredGenes = set(expression_quantified.var.index)
perturbed_genes = set(expression_quantified.obs['perturbation'].unique()).difference(controls)
perturbed_and_measured_genes = {gene for gene in perturbed_genes if gene in measuredGenes}
perturbed_but_not_measured_genes = {gene for gene in perturbed_genes if gene not in measuredGenes}

# Genes marked highly variable plus every perturbed gene that was measured.
# (Computed for reference only; the last message below says all landmark genes are kept.)
genes_keep = set(
    expression_quantified.var.index[expression_quantified.var['highly_variable']]
).union(perturbed_and_measured_genes)

# Snapshot taken before the final annotation steps; a later cell stores it as `.raw`.
expression_quantified_orig = expression_quantified.copy()

print("These genes were perturbed and measured:")
print(len(perturbed_and_measured_genes))
print("These genes were perturbed but not measured:")
print(len(perturbed_but_not_measured_genes))
print("Actually, nevermind. Keeping only the 978 landmark genes (some are filtered out)")

In [None]:
# Final form, ready to save: record both gene lists in .uns, then let the ingestion
# helper annotate the object for an overexpression experiment.
for unsKey, geneSet in (
    ("perturbed_and_measured_genes", perturbed_and_measured_genes),
    ("perturbed_but_not_measured_genes", perturbed_but_not_measured_genes),
):
    expression_quantified.uns[unsKey] = list(geneSet)
expression_quantified = ingestion.describe_perturbation_effect(expression_quantified, "overexpression")

In [None]:
# Display the AnnData summary as this cell's output.
expression_quantified

In [None]:
# Store the per-profile statistics as float64, then attach the earlier snapshot as `.raw`.
for statColumn in ("logFC", "spearmanCorr", "pearsonCorr"):
    expression_quantified.obs[statColumn] = expression_quantified.obs[statColumn].astype(np.float64)
expression_quantified.raw = expression_quantified_orig

In [None]:
# Write the finished dataset, creating the output folder on first run.
# NOTE(review): the obs column "consistentW/Perturbation" contains a forward slash; newer
# anndata releases may reject such keys when writing HDF5 — confirm before upgrading.
outputDir = "perturbations/cmap"
os.makedirs(outputDir, exist_ok=True)
expression_quantified.write_h5ad("perturbations/cmap/test.h5ad")