# SOPHIE vs traditional DE

The goal of this experiment is to determine how often SOPHIE ranks specific genes above generic genes, compared to a traditional differential expression (DE) analysis.

In [1]:
%load_ext autoreload
%autoreload 2
import os
import pickle
import numpy as np
import pandas as pd
import random
import seaborn as sns
from ponyo import utils

# Seed both random number generators used in this notebook: NumPy draws the
# expression counts, while Python's `random` module picks the generic and
# specific genes, the per-gene success probabilities and the per-experiment r.
np.random.seed(2)
random.seed(2)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


## Create simulated data

1. Each perturbation experiment has 8 samples (4 perturbed vs 4 control) with 1000 genes. Create initial expression profiles for all 8 samples by drawing counts for each gene from a negative binomial distribution with a gene-specific success probability.
2. 100 of the 1000 genes are designated as generic genes. The generic genes have the same scalar value added to them in the “perturbed” samples.
3. For each perturbation experiment, 10 “specific” genes are randomly selected from the remaining 900 non-generic genes. The specific genes then have the same scalar value added to them in the “perturbed” samples.
4. Repeat this process 90 times to get 90 experiments.

In [3]:
# User params
# Dimensions of the simulated data set
num_experiments = 90  # experiments stacked into the compendium
num_samples = 8  # samples (rows) generated per experiment
num_genes = 1_000  # genes (columns) in every experiment
num_generic_genes = 100  # genes perturbed in every experiment
num_specific_genes = 10  # extra genes perturbed in a single experiment

In [6]:
# Read in config variables
# Absolute path of the directory one level above the current working directory
# NOTE(review): base_dir is not referenced in the cells visible here -- confirm
# whether it is still needed.
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

# Name of the config file for this experiment (given without a directory)
config_filename = "config_sophie_vs_trad.tsv"

# Parse the config with ponyo's reader; the result is indexed by parameter name below
params = utils.read_config(config_filename)

In [7]:
# Load config params

# Local directory to store intermediate files
# NOTE(review): not referenced again in the cells visible here; the template
# experiments are written to a hard-coded directory instead -- confirm which
# location is intended.
local_dir = params["local_dir"]

# Un-normalized compendium filename
# (the simulated compendium is written to this path at the end of the notebook)
raw_compendium_filename = params["raw_compendium_filename"]

In [8]:
# Pickle files to save
# File that will hold the pickled list of generic gene ids. This is a bare
# filename with no directory component.
generic_gene_ids_filename = "generic_gene_ids.pickle"

In [9]:
# Setup variables

# Sample ids: the first half of the samples are controls and the rest are
# perturbed (4 controls, 4 perturbed for the default num_samples = 8).
# The labels are derived from num_samples so that they always match the number
# of rows generated per experiment. The "perturb" suffix is what
# make_experiments matches on to decide which samples receive the added signal.
num_control_samples = num_samples // 2
sample_ids = [f"{i}_control" for i in range(num_control_samples)] + [
    f"{i}_perturb" for i in range(num_control_samples, num_samples)
]

# Make range of numbers into string to use as gene ids
gene_ids = [f"G_{i}" for i in range(num_genes)]

# Randomly select generic gene ids (sampled without replacement)
generic_gene_ids = random.sample(gene_ids, num_generic_genes)
print(generic_gene_ids)

['G_101', 'G_723', 'G_668', 'G_883', 'G_802', 'G_302', 'G_838', 'G_387', 'G_335', 'G_942', 'G_463', 'G_341', 'G_437', 'G_774', 'G_862', 'G_915', 'G_896', 'G_948', 'G_492', 'G_731', 'G_331', 'G_193', 'G_904', 'G_956', 'G_579', 'G_224', 'G_255', 'G_155', 'G_848', 'G_467', 'G_320', 'G_644', 'G_959', 'G_436', 'G_787', 'G_804', 'G_797', 'G_381', 'G_894', 'G_116', 'G_292', 'G_642', 'G_747', 'G_415', 'G_854', 'G_835', 'G_743', 'G_484', 'G_394', 'G_508', 'G_782', 'G_144', 'G_167', 'G_366', 'G_562', 'G_339', 'G_926', 'G_171', 'G_870', 'G_924', 'G_441', 'G_827', 'G_826', 'G_306', 'G_755', 'G_614', 'G_534', 'G_771', 'G_606', 'G_326', 'G_742', 'G_95', 'G_118', 'G_834', 'G_136', 'G_205', 'G_53', 'G_740', 'G_950', 'G_772', 'G_958', 'G_225', 'G_433', 'G_657', 'G_714', 'G_480', 'G_477', 'G_631', 'G_300', 'G_615', 'G_868', 'G_764', 'G_885', 'G_143', 'G_308', 'G_32', 'G_162', 'G_12', 'G_531', 'G_395']


## Supporting functions

In [10]:
# Function to make an individual experiment
def run_make_experiment(
    num_samples,
    num_genes,
    sample_ids,
    all_gene_ids,
    generic_gene_ids,
    specific_gene_ids,
    r,
):
    """Simulate the unperturbed expression matrix for one experiment.

    Every gene gets its own success probability p, drawn uniformly between
    0 and 1, and its counts for all samples are drawn from NB(r, p).

    Parameters
    ----------
    num_samples : int
        Number of samples (rows) to simulate.
    num_genes : int
        Number of genes (columns) to simulate.
    sample_ids : list of str
        Row labels; must contain `num_samples` entries.
    all_gene_ids : list of str or None
        Column labels; must contain `num_genes` entries. If None, the labels
        "G_0" ... "G_<num_genes - 1>" are generated.
    generic_gene_ids, specific_gene_ids : list of str
        Not used here -- the perturbation is applied by the caller. Kept so
        that existing calls keep working.
    r : int
        Negative binomial "number of successes" parameter shared by all genes
        of this experiment.

    Returns
    -------
    pandas.DataFrame
        Samples x genes matrix of simulated counts, indexed by `sample_ids`
        with `all_gene_ids` as column labels.

    Raises
    ------
    ValueError
        If `all_gene_ids` does not contain exactly `num_genes` ids.
    """
    if all_gene_ids is None:
        all_gene_ids = [f"G_{gene}" for gene in range(num_genes)]
    elif len(all_gene_ids) != num_genes:
        raise ValueError(
            f"Expected {num_genes} gene ids but got {len(all_gene_ids)}"
        )

    experiment_data = {}
    # Label the columns with the ids that were passed in, so that the caller
    # can later select generic/specific genes by those same ids.
    for gene_id in all_gene_ids:

        # Simulate expression by sampling from a Negative Binomial distribution
        # NB(r, p), using NumPy's parameterization:
        # r: number of successes until we stop trials
        # p: success rate, the probability that each trial is a success
        # The value drawn is the number of failures seen before the r-th success.
        #
        # Randomly select a different success probability for each gene so that
        # each gene has a different rate at which it is expressed.
        #
        # mean = r * (1 - p) / p and variance = r * (1 - p) / p^2, so both grow
        # linearly with r (only the spread relative to the mean shrinks).
        # Using different r values therefore gives both different means and
        # different variances (so do different values of p, but the scaling
        # is much simpler for r).
        p = random.uniform(0.0, 1.0)
        experiment_data[gene_id] = np.random.negative_binomial(r, p, num_samples)

    # Create experiment dataframe with one row per sample
    experiment_df = pd.DataFrame(experiment_data, index=sample_ids)

    return experiment_df

In [11]:
# Make multiple experiments
def make_experiments(
    num_experiments,
    num_samples,
    num_genes,
    sample_ids,
    all_gene_ids,
    generic_gene_ids,
    num_specific_genes,
):
    """Simulate several perturbation experiments and stack them row-wise.

    For every experiment a fresh set of specific genes is sampled from the
    non-generic genes, a baseline expression matrix is simulated with
    `run_make_experiment`, and a scaler is added to the generic and specific
    genes of the samples whose id contains "perturb".

    Parameters
    ----------
    num_experiments : int
        Number of experiments to simulate.
    num_samples : int
        Number of samples per experiment.
    num_genes : int
        Number of genes per experiment.
    sample_ids : list of str
        Sample labels of one experiment; ids containing "perturb" mark the
        samples that receive the added signal.
    all_gene_ids : list of str
        Ids of all genes.
    generic_gene_ids : list of str
        Ids of the genes perturbed in every experiment.
    num_specific_genes : int
        Number of specific genes to sample per experiment.

    Returns
    -------
    expression_df : pandas.DataFrame
        All experiments concatenated along the rows. With a single experiment
        the index is `sample_ids`; with more than one it is reset to 0..n-1.
        Empty if `num_experiments` is 0.
    specific_gene_id_lst : list of list of str
        The specific gene ids of each experiment, in generation order.
    """
    # Pool of (900) remaining genes that the specific genes are drawn from.
    # It is built once, in the order of `all_gene_ids`: iterating over a set of
    # strings has no stable order between Python sessions, so sampling from
    # list(set(...)) is not reproducible even when `random` is seeded.
    generic_gene_id_set = set(generic_gene_ids)
    remaining_gene_ids = [
        gene_id
        for gene_id in dict.fromkeys(all_gene_ids)
        if gene_id not in generic_gene_id_set
    ]

    experiment_dfs = []
    specific_gene_id_lst = []

    for _ in range(num_experiments):

        # Randomly select specific genes from the pool of remaining genes
        # Select without replacement
        specific_gene_ids = random.sample(remaining_gene_ids, num_specific_genes)

        # Save specific gene ids for reference later
        specific_gene_id_lst.append(specific_gene_ids)

        # Set a different r per experiment to try to add more
        # variance to the compendium
        # Issue: the VAE is compressing the DE signal in the template
        # experiment too much so the simulated experiments don't retain this
        # DE signal at all.
        # Hypothesis: Generating more variance in the compendium experiments
        # may reduce the compression
        r = random.randint(5, 50)

        experiment_df = run_make_experiment(
            num_samples,
            num_genes,
            sample_ids,
            all_gene_ids,
            generic_gene_ids,
            specific_gene_ids,
            r,
        )

        # Use the 98th percentile of the per-gene median expression as the
        # scaler so that the added signal is strong relative to most genes.
        # The generic and specific genes currently share the same scaler.
        generic_scaler = experiment_df.median().quantile(0.98)
        specific_scaler = generic_scaler
        print("generic scaler", generic_scaler)
        print("specific scaler", specific_scaler)

        # Only add the scalers to the perturbed samples
        is_perturbed = experiment_df.index.str.contains("perturb")
        experiment_df.loc[is_perturbed, generic_gene_ids] += generic_scaler
        experiment_df.loc[is_perturbed, specific_gene_ids] += specific_scaler

        experiment_dfs.append(experiment_df)

    # Nothing to concatenate: return an empty frame, as before
    if not experiment_dfs:
        return pd.DataFrame(), specific_gene_id_lst

    # Concatenate all experiments in a single pass instead of growing the
    # frame inside the loop
    expression_df = pd.concat(experiment_dfs)

    # The sample ids repeat across experiments, so replace them with a running
    # index when there is more than one experiment.
    # NOTE: VAE don't train when sample indices are identical, not sure why
    if num_experiments > 1:
        expression_df = expression_df.reset_index(drop=True)

    return expression_df, specific_gene_id_lst

## Make a template experiment

In [12]:
# Simulate 10 independent template experiments. Each one is saved together
# with the specific genes that were perturbed in it, under an index-numbered
# filename.
# NOTE(review): the output directory is a hard-coded absolute path on one
# machine -- confirm whether it should come from the config instead.
for i in range(10):
    # Output locations for this template
    raw_template_filename = f"/home/alexandra/Documents/Data/Generic_expression_patterns/reviewer_experiment/raw_template_{i}.tsv"
    template_specific_gene_ids_filename = f"/home/alexandra/Documents/Data/Generic_expression_patterns/reviewer_experiment/template_specific_gene_ids_{i}.pickle"

    # A template is generated as a "compendium" made of a single experiment
    template_experiment, template_specific_gene_ids = make_experiments(
        num_experiments=1,
        num_samples=num_samples,
        num_genes=num_genes,
        sample_ids=sample_ids,
        all_gene_ids=gene_ids,
        generic_gene_ids=generic_gene_ids,
        num_specific_genes=num_specific_genes,
    )

    # Save template experiment as a tab-separated file
    template_experiment.to_csv(raw_template_filename, sep="\t")

    # Pickle specific gene ids: one list is returned per experiment, so the
    # single template's ids are the first (and only) entry
    with open(template_specific_gene_ids_filename, "wb") as pkl_fh:
        pickle.dump(template_specific_gene_ids[0], pkl_fh, protocol=3)

generic scaler 1228.5299999999936
specific scaler 1228.5299999999936
generic scaler 1210.6999999999985
specific scaler 1210.6999999999985
generic scaler 336.4399999999987
specific scaler 336.4399999999987
generic scaler 1709.4599999999987
specific scaler 1709.4599999999987
generic scaler 688.01
specific scaler 688.01
generic scaler 1133.8199999999997
specific scaler 1133.8199999999997
generic scaler 1412.1299999999985
specific scaler 1412.1299999999985
generic scaler 829.8199999999997
specific scaler 829.8199999999997
generic scaler 2719.2999999999993
specific scaler 2719.2999999999993
generic scaler 613.689999999998
specific scaler 613.689999999998


## Make compendium

In [13]:
# Simulate the full compendium: num_experiments experiments stacked row-wise,
# plus the specific gene ids that were perturbed in each of them
compendium, compendium_specific_ids = make_experiments(
    num_experiments=num_experiments,
    num_samples=num_samples,
    num_genes=num_genes,
    sample_ids=sample_ids,
    all_gene_ids=gene_ids,
    generic_gene_ids=generic_gene_ids,
    num_specific_genes=num_specific_genes,
)

generic scaler 216.58999999999992
specific scaler 216.58999999999992
generic scaler 1666.7399999999998
specific scaler 1666.7399999999998
generic scaler 1849.6799999999967
specific scaler 1849.6799999999967
generic scaler 1441.09
specific scaler 1441.09
generic scaler 1762.2799999999997
specific scaler 1762.2799999999997
generic scaler 1336.4799999999987
specific scaler 1336.4799999999987
generic scaler 3692.119999999999
specific scaler 3692.119999999999
generic scaler 1882.189999999999
specific scaler 1882.189999999999
generic scaler 2327.9599999999928
specific scaler 2327.9599999999928
generic scaler 1640.0899999999945
specific scaler 1640.0899999999945
generic scaler 829.0999999999999
specific scaler 829.0999999999999
generic scaler 383.4199999999996
specific scaler 383.4199999999996
generic scaler 1491.4499999999987
specific scaler 1491.4499999999987
generic scaler 1768.9999999999977
specific scaler 1768.9999999999977
generic scaler 2681.279999999998
specific scaler 2681.2799999999

In [14]:
# Sanity check: expect num_experiments * num_samples rows and num_genes columns
print(compendium.shape)
# Preview the first few rows
compendium.head()

(720, 1000)


Unnamed: 0,G_0,G_1,G_2,G_3,G_4,G_5,G_6,G_7,G_8,G_9,...,G_990,G_991,G_992,G_993,G_994,G_995,G_996,G_997,G_998,G_999
0,7,2.0,4.0,12,15.0,0.0,0.0,0.0,1.0,0.0,...,4.0,92,32.0,9.0,48.0,4.0,2,12,5,6.0
1,5,8.0,1.0,15,14.0,1.0,0.0,0.0,3.0,0.0,...,1.0,102,27.0,55.0,61.0,5.0,0,15,23,6.0
2,5,3.0,2.0,4,21.0,1.0,0.0,0.0,3.0,0.0,...,0.0,41,16.0,25.0,44.0,6.0,0,11,22,9.0
3,6,1.0,9.0,17,15.0,2.0,0.0,0.0,7.0,0.0,...,1.0,79,34.0,18.0,66.0,2.0,1,16,63,2.0
4,5,8.0,8.0,17,10.0,1.0,0.0,0.0,3.0,0.0,...,1.0,68,16.0,14.0,29.0,5.0,0,19,13,5.0


In [15]:
# One list of specific gene ids per compendium experiment, in generation order
compendium_specific_ids

[['G_533',
  'G_160',
  'G_507',
  'G_371',
  'G_430',
  'G_332',
  'G_229',
  'G_917',
  'G_849',
  'G_818'],
 ['G_922',
  'G_93',
  'G_902',
  'G_179',
  'G_188',
  'G_7',
  'G_151',
  'G_941',
  'G_456',
  'G_86'],
 ['G_426',
  'G_271',
  'G_563',
  'G_472',
  'G_610',
  'G_182',
  'G_419',
  'G_679',
  'G_453',
  'G_666'],
 ['G_988',
  'G_489',
  'G_923',
  'G_26',
  'G_636',
  'G_340',
  'G_995',
  'G_446',
  'G_344',
  'G_505'],
 ['G_262',
  'G_612',
  'G_72',
  'G_157',
  'G_325',
  'G_80',
  'G_121',
  'G_246',
  'G_148',
  'G_536'],
 ['G_212',
  'G_705',
  'G_546',
  'G_176',
  'G_733',
  'G_221',
  'G_544',
  'G_692',
  'G_832',
  'G_729'],
 ['G_567',
  'G_831',
  'G_547',
  'G_295',
  'G_692',
  'G_884',
  'G_323',
  'G_342',
  'G_232',
  'G_673'],
 ['G_935',
  'G_893',
  'G_634',
  'G_513',
  'G_549',
  'G_737',
  'G_647',
  'G_488',
  'G_472',
  'G_720'],
 ['G_964',
  'G_636',
  'G_625',
  'G_30',
  'G_449',
  'G_648',
  'G_407',
  'G_515',
  'G_336',
  'G_953'],
 ['G_79',

In [16]:
# Write the raw (un-normalized) compendium as a tab-separated file
compendium.to_csv(raw_compendium_filename, sep="\t")

# Persist the generic gene ids alongside it so they can be reloaded later
with open(generic_gene_ids_filename, "wb") as generic_ids_fh:
    pickle.dump(generic_gene_ids, generic_ids_fh, protocol=3)