An initial exploration of the CMAP overexpression data

In [None]:
import warnings
warnings.filterwarnings('ignore')
import regex as re
import os
import shutil
import sys
import importlib
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import seaborn as sns
import celloracle as co
from scipy.stats import spearmanr as spearmanr
from IPython.display import display, HTML
# local
import importlib
import sys
sys.path.append("setup")
import ingestion
importlib.reload(ingestion)

#      visualization settings
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

# Specify the working directory explicitly.
os.chdir("/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbations/")

### Reshape the data

In [None]:
# Read the control ("ctl") and overexpression ("trt_oe") expression matrices
# with the same gene and instance metadata, then stack them into one AnnData.
cmap_gene_metadata = "not_ready/cmap/GSE70138_Broad_LINCS_gene_info_2017-03-06.txt.gz"
cmap_instance_metadata = "not_ready/cmap/instinfo_beta.txt"
expression_quantified = ad.concat([
    ingestion.read_cmap(
        expression_file = expression_file,
        gene_metadata = cmap_gene_metadata,
        instance_metadata = cmap_instance_metadata,
    )
    for expression_file in [
        'not_ready/cmap/level3_beta_ctl_n188708x12328.gctx',
        'not_ready/cmap/level3_beta_trt_oe_n131668x12328.gctx',
    ]
])
# Store a copy of the freshly loaded object in .raw before later cells modify it.
expression_quantified.raw = expression_quantified.copy()
# Show the object summary; a bare expression is only displayed when it is the
# last statement of the cell.
expression_quantified

In [None]:
# Per the original author's note, ad.concat drops the gene metadata (even with
# "outer"), so .var is rebuilt here: the current .var index is left-joined
# against the "pr_gene_symbol" column of the gene-info table.
expression_quantified.var = expression_quantified.var.merge(
    pd.read_csv("not_ready/cmap/GSE70138_Broad_LINCS_gene_info_2017-03-06.txt.gz", sep = "\t"),
    left_index = True,
    right_on = "pr_gene_symbol",
    how = "left",
)
# Use the gene symbol column as the variable names again after the merge.
expression_quantified.var_names = expression_quantified.var["pr_gene_symbol"]

In [None]:
# Find names of genes perturbed
perturbation_metadata = pd.read_csv("not_ready/cmap/GSE92742_Broad_LINCS_pert_info.txt.gz", sep = "\t")
print(perturbation_metadata.head())
# Look the name up row by row instead of merging the two tables. A
# column-on-column pd.merge returns a fresh RangeIndex and, with the default
# inner join, only the matched rows; either would break the one-to-one
# alignment between .obs and the rows of the expression matrix. The lookup
# keeps the .obs index and length untouched and adds no duplicate columns.
pert_id_to_name = (
    perturbation_metadata
    .drop_duplicates(subset = "pert_id")  # Series.map needs a unique lookup index
    .set_index("pert_id")["pert_iname"]
)
# astype(object) so the fallback below also works if pert_id is categorical.
pert_ids = expression_quantified.obs["pert_id"].astype(object)
# Perturbagens missing from the metadata table keep their pert_id as label,
# so the column never contains NaN.
expression_quantified.obs["perturbation"] = pert_ids.map(pert_id_to_name).fillna(pert_ids)

### Count up different perturbations

In [None]:
# Load the human TF table and keep only the rows marked "Yes" in "Is TF?".
human_TFs = pd.read_csv("../accessory_data/humanTFs.csv")
human_TFs = human_TFs[human_TFs["Is TF?"].eq("Yes")]

perturbation_metadata = pd.read_csv("not_ready/cmap/GSE92742_Broad_LINCS_pert_info.txt.gz", sep = "\t")
print("\n\nperturbation metadata")
display(perturbation_metadata)

# Left join on pert_id: every profile is kept, annotated with the perturbagen
# metadata where a match exists.
meta_with_pert_meta = expression_quantified.obs.merge(
    perturbation_metadata,
    how = "left",
    on = "pert_id",
)
# Flag perturbagens whose name appears among the TF symbols.
meta_with_pert_meta["is_tf"] = meta_with_pert_meta["pert_iname"].isin(human_TFs["HGNC symbol"])

print("\n\nperturbation frequencies")
print(meta_with_pert_meta["pert_iname"].value_counts())
print("\n\nperturbation frequencies (just TF's)")
tf_rows = meta_with_pert_meta[meta_with_pert_meta["is_tf"]]
print(tf_rows["pert_iname"].value_counts())

### Add `is_control` and `perturbation`

In [None]:
print(expression_quantified.obs.columns)
print(expression_quantified.obs["pert_type"].value_counts())
# pert_type values that identify control profiles.
controls = ["ctl_vector"]
# Do not clobber perturbation labels that an earlier cell already set: the
# data-reduction step compares these labels against the gene symbols in
# .var, which raw pert_id values would not match. Fall back to pert_id only
# when no label exists yet.
if "perturbation" not in expression_quantified.obs.columns:
    expression_quantified.obs['perturbation'] = expression_quantified.obs["pert_id"]
# 1 for control profiles, 0 otherwise.
expression_quantified.obs["is_control"] = expression_quantified.obs['pert_type'].isin(controls).astype(int)

### Basic unsupervised analysis

In [None]:
expression_quantified.X
# Only the per-gene mean and dispersion estimates matter here; the gene
# selection itself is replaced afterwards (all 978 landmark genes are used).
hvg_settings = dict(min_mean=0.2, max_mean=4, min_disp=0.2, n_bins=50)
sc.pp.highly_variable_genes(expression_quantified, **hvg_settings)

In [None]:
# Overwrite the "highly_variable" flag: a gene is flagged exactly when its
# pr_is_lm value equals 1.
is_landmark = expression_quantified.var["pr_is_lm"].eq(1)
expression_quantified.var["highly_variable"] = is_landmark
expression_quantified.var

In [None]:
# Plot the mean/dispersion estimates, then run PCA -> neighbor graph -> UMAP
# -> Leiden clustering; each step relies on the results of the previous one.
sc.pl.highly_variable_genes(expression_quantified)
with warnings.catch_warnings():
    # catch_warnings() only saves and restores the filter state; a filter has
    # to be installed inside the block for it to silence anything.
    warnings.simplefilter("ignore")
    sc.tl.pca(expression_quantified, n_comps=100)
sc.pp.neighbors(expression_quantified)
sc.tl.umap(expression_quantified)
clusterResolutions = []
sc.tl.leiden(expression_quantified)

In [None]:
# Replace .obs with the output of the project helper applied to the
# "cell_iname" column. NOTE(review): presumably this adds the
# "cell_iname_simplified" column used afterwards -- confirm in ingestion.
expression_quantified.obs = ingestion.simplify_categorical(
    expression_quantified.obs,
    column = "cell_iname",
)

In [None]:
# Number of profiles for each value of the simplified cell-line label.
expression_quantified.obs.loc[:, "cell_iname_simplified"].value_counts()

In [None]:
print(expression_quantified.obs.columns)
# One UMAP per key: a gene, the clustering, and assorted metadata / QC columns.
umap_color_keys = [
    "GAPDH", "leiden", "is_control",
    'bead_batch', 'nearest_dose', 'pert_dose',
    'count_mean', 'count_cv',
    'qc_f_logp', 'qc_iqr', 'qc_slope', 'pert_type', 'cell_iname_simplified',
    'qc_pass', 'dyn_range', 'inv_level_10', 'build_name', 'failure_mode',
    'project_code',
]
for color_key in umap_color_keys:
    try:
        # No print() around the call: the figure is drawn by the call itself,
        # and printing its result only emitted a stray "None" after each plot.
        sc.pl.umap(expression_quantified, color = color_key)
    except Exception as e:
        # Best effort: report a key that cannot be plotted and carry on.
        print("Error on input\n" + color_key + ":\n" + repr(e))
# Will ask CO to use only one cluster.
# Requires setting certain other undocumented aspects of object state. :(
expression_quantified.obs["fake_cluster"] = "all_one_cluster"
expression_quantified.obs["fake_cluster"] = expression_quantified.obs["fake_cluster"].astype("category")
expression_quantified.uns["fake_cluster_colors"] = ['#1f77b4']

### Data reduction

With only 64GB of RAM, I have been unable to make whole-transcriptome predictions with CellOracle, so a data reduction step is necessary in which only highly variable genes are included. We also keep all genes that are perturbed, whether or not they appear highly variable -- unless they were not measured in the first place. (Note: the code below ultimately overrides this selection and keeps just the 978 landmark genes.)

In [None]:
# Collect perturbation labels from non-control profiles only. `controls`
# holds pert_type values, so subtracting it from the labels alone does not
# remove control profiles; the is_control flag does.
is_treated = expression_quantified.obs["is_control"] == 0
perturbed_genes = set(
    expression_quantified.obs.loc[is_treated, 'perturbation'].unique()
).difference(controls)
# Split the labels by whether they match a measured gene (an entry of .var.index).
perturbed_and_measured_genes = perturbed_genes.intersection(expression_quantified.var.index)
perturbed_but_not_measured_genes = perturbed_genes.difference(expression_quantified.var.index)
genes_keep = expression_quantified.var.index[expression_quantified.var['highly_variable']]
genes_keep = set(genes_keep).union(perturbed_and_measured_genes)
expression_quantified_orig = expression_quantified.copy()
print("These genes were perturbed and measured:")
print(perturbed_and_measured_genes)
print("These genes were perturbed but not measured:")
print(sorted(perturbed_but_not_measured_genes))
print("This many variable genes will be kept and used by CO:")
print(len(genes_keep))
print("Actually, nevermind. Let us keep just the 978 landmark genes.")
# Compare against 1 to get a boolean mask. Indexing with the raw 0/1 column
# would be treated as positions and repeatedly pick the first two genes.
genes_keep = expression_quantified.var.index[expression_quantified.var['pr_is_lm'] == 1]
expression_quantified = ingestion.describe_perturbation_effect(expression_quantified, "overexpression")


In [None]:
# final form, ready to save
# Subset the current object, i.e. the one returned by
# ingestion.describe_perturbation_effect. expression_quantified_orig was
# copied before that call, so subsetting it would silently drop the result.
# .copy() turns the view produced by the subsetting into a standalone object
# before .uns is modified.
expression_quantified = expression_quantified[:,list(genes_keep)].copy()
expression_quantified.uns["perturbed_and_measured_genes"]     = list(perturbed_and_measured_genes)
expression_quantified.uns["perturbed_but_not_measured_genes"] = list(perturbed_but_not_measured_genes)

In [None]:
# Make sure the output folder exists, then write the final AnnData to disk.
# Both paths are relative to the current working directory.
output_dir = "perturbations/cmap"
output_file = "perturbations/cmap/test.h5ad"
os.makedirs(output_dir, exist_ok = True)
expression_quantified.write_h5ad(output_file)