# SOPHIE vs traditional DE

The goal of this experiment is to determine how often SOPHIE ranks specific genes over generic genes compared to using traditional DE analysis

In [1]:
%load_ext autoreload
%autoreload 2
import os
import pickle
import numpy as np
import pandas as pd
import random
from ponyo import utils

np.random.seed(1)

## Create simulated data

1. Each perturbation experiment has 4 samples (2 perturbed vs 2 control) with 1000 genes. Create initial expression profiles for 4 samples by drawing from a gaussian with a mean/sd for each gene.
2. Say that there are 100 generic genes. The generic genes will have the same scalar value added for the “perturbed” samples
3. Each perturbation experiment will have 10 “specific” genes perturbed in addition to the “generic” ones. Select specific genes randomly . Then the specific genes will have the same scalar value added for the “perturbed” samples
4. Repeat this process 90 times to get 90 experiments

In [2]:
# User params
num_genes = 1000
num_samples = 4
num_generic_genes = 100
num_specific_genes = 10
num_experiments = 90

# Range of counts
min_expression = 0
max_expression = 1000

# params for distribution for scaler used to shift generic and specific genes
mean_scaler = 0
var_scaler = 10

In [3]:
# Read in config variables
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

config_filename = "config_sophie_vs_trad.tsv"

params = utils.read_config(config_filename)

In [4]:
# Load config params

# Local directory to store intermediate files
local_dir = params["local_dir"]

# Un-normalized compendium filename
raw_compendium_filename = params["raw_compendium_filename"]

In [5]:
# Pickle files to save
generic_gene_ids_filename = "generic_gene_ids.pickle"

In [6]:
# Setup variables

# Sample ids (2 controls, 2 perturbed)
sample_ids = ["0_control", "1_control", "2_perturb", "3_perturb"]

# Make range of numbers into string to use as gene ids
gene_ids = [f"G_{i}" for i in range(num_genes)]

# Randomly select generic gene ids
generic_gene_ids = random.sample(gene_ids, num_generic_genes)
print(generic_gene_ids)

['G_368', 'G_809', 'G_961', 'G_664', 'G_750', 'G_785', 'G_554', 'G_585', 'G_287', 'G_746', 'G_636', 'G_592', 'G_494', 'G_916', 'G_830', 'G_169', 'G_463', 'G_787', 'G_466', 'G_967', 'G_956', 'G_61', 'G_469', 'G_210', 'G_983', 'G_426', 'G_189', 'G_706', 'G_662', 'G_23', 'G_28', 'G_565', 'G_145', 'G_613', 'G_525', 'G_450', 'G_493', 'G_306', 'G_964', 'G_826', 'G_404', 'G_214', 'G_332', 'G_843', 'G_285', 'G_300', 'G_786', 'G_239', 'G_54', 'G_314', 'G_749', 'G_420', 'G_773', 'G_324', 'G_628', 'G_890', 'G_308', 'G_487', 'G_685', 'G_911', 'G_293', 'G_619', 'G_202', 'G_150', 'G_34', 'G_460', 'G_871', 'G_100', 'G_637', 'G_174', 'G_667', 'G_721', 'G_910', 'G_874', 'G_252', 'G_179', 'G_259', 'G_717', 'G_515', 'G_461', 'G_121', 'G_251', 'G_812', 'G_999', 'G_219', 'G_363', 'G_425', 'G_875', 'G_489', 'G_539', 'G_938', 'G_253', 'G_583', 'G_770', 'G_280', 'G_864', 'G_724', 'G_949', 'G_274', 'G_352']


## Supporting functions

In [7]:
# Function to make an individual experiment
def run_make_experiment(
    num_samples,
    num_genes,
    min_expression,
    max_expression,
    sample_ids,
    all_gene_ids,
    generic_gene_ids,
    specific_gene_ids,
    generic_scaler,
    specific_scaler,
):
    experiment_data = {}
    for sample in range(num_samples):

        # sample_profile = np.zeros(num_genes)
        # for j in range(num_genes):
        #    sample_profile[j] = random.randint(min_expression, max_expression)

        # Randomly select a different probability for each sample
        p = random.uniform(0.0, 1.0)
        sample_profile = np.random.negative_binomial(100, p, num_genes)

        # Create dictionary to define dataframe
        experiment_data[sample_ids[sample]] = sample_profile

    # Create experiment dataframe
    experiment_df = pd.DataFrame(experiment_data).T

    # Set column header
    experiment_df.columns = all_gene_ids

    # Perturb generic genes by scaler
    # Randomly select a scaler
    # Only add scaler to perturbed samples
    experiment_df.loc[
        experiment_df.index.str.contains("perturb"), generic_gene_ids
    ] += generic_scaler

    # Perturb specific genes by a different scaler
    experiment_df.loc[
        experiment_df.index.str.contains("perturb"), specific_gene_ids
    ] += specific_scaler

    return experiment_df

In [8]:
# Make multiple experiments
def make_experiments(
    num_experiments,
    num_samples,
    num_genes,
    mean_expression,
    var_expression,
    sample_ids,
    all_gene_ids,
    generic_gene_ids,
    num_specific_genes,
):

    expression_df = pd.DataFrame()
    specific_gene_id_lst = []

    for i in range(num_experiments):

        # Randomly select specific genes from the pool of (900)remaining genes
        # Select without replacement
        remaining_gene_ids = list(set(all_gene_ids).difference(generic_gene_ids))
        specific_gene_ids = random.sample(remaining_gene_ids, num_specific_genes)

        # Save specific gene ids for reference later
        specific_gene_id_lst.append(specific_gene_ids)

        # Randomly select scaler for generic and specific genes from the same distribution
        scale_factors = np.random.randint(mean_scaler, var_scaler, size=2)
        generic_scaler = scale_factors[0]
        specific_scaler = scale_factors[1]

        experiment_df = run_make_experiment(
            num_samples,
            num_genes,
            mean_expression,
            var_expression,
            sample_ids,
            all_gene_ids,
            generic_gene_ids,
            specific_gene_ids,
            generic_scaler,
            specific_scaler,
        )

        # Concatenate experiments
        expression_df = pd.concat([expression_df, experiment_df])

    # Try to reset index to see if this makes a difference
    # NOTE: VAE don't train when sample indices are identical, not sure why
    if num_experiments > 1:
        expression_df = expression_df.reset_index(drop=True)

    return expression_df, specific_gene_id_lst

## Make a template experiment

In [9]:
for i in range(10):
    template_experiment, template_specific_gene_ids = make_experiments(
        1,
        num_samples,
        num_genes,
        min_expression,
        max_expression,
        sample_ids,
        gene_ids,
        generic_gene_ids,
        num_specific_genes,
    )

    # Save template experiment
    raw_template_filename = f"/home/alexandra/Documents/Data/Generic_expression_patterns/reviewer_experiment/raw_template_{i}.tsv"
    template_experiment.to_csv(raw_template_filename, sep="\t")

    # Pickle specific gene ids
    template_specific_gene_ids_filename = f"/home/alexandra/Documents/Data/Generic_expression_patterns/reviewer_experiment/template_specific_gene_ids_{i}.pickle"
    with open(template_specific_gene_ids_filename, "wb") as pkl_fh:
        pickle.dump(template_specific_gene_ids[0], pkl_fh, protocol=3)

In [10]:
print(template_experiment.shape)
template_experiment.head()

(4, 1000)


Unnamed: 0,G_0,G_1,G_2,G_3,G_4,G_5,G_6,G_7,G_8,G_9,...,G_990,G_991,G_992,G_993,G_994,G_995,G_996,G_997,G_998,G_999
0_control,12,14,14,10,13,15,21,16,23,10,...,11,20,19,19,15,14,9,20,15,22
1_control,20,21,35,33,31,15,23,20,24,19,...,20,23,21,25,23,17,30,20,30,23
2_perturb,159,189,151,161,147,153,193,158,150,151,...,233,185,202,175,140,107,156,157,173,159
3_perturb,59,23,52,47,51,43,49,54,50,52,...,56,54,50,53,45,38,34,50,38,40


In [11]:
template_experiment[generic_gene_ids]

Unnamed: 0,G_368,G_809,G_961,G_664,G_750,G_785,G_554,G_585,G_287,G_746,...,G_938,G_253,G_583,G_770,G_280,G_864,G_724,G_949,G_274,G_352
0_control,16,15,18,10,13,18,19,16,25,19,...,13,12,12,16,11,14,22,15,11,18
1_control,25,27,13,22,29,20,20,27,19,32,...,21,15,26,20,26,24,14,24,26,24
2_perturb,163,138,152,152,169,161,196,152,161,164,...,141,196,143,173,171,186,188,173,188,139
3_perturb,39,63,61,50,49,46,46,49,56,58,...,40,53,38,69,42,59,45,51,42,50


In [12]:
template_specific_gene_ids[0]

['G_261',
 'G_341',
 'G_886',
 'G_777',
 'G_40',
 'G_267',
 'G_509',
 'G_39',
 'G_502',
 'G_123']

## Make compendium

In [13]:
compendium, compendium_specific_ids = make_experiments(
    num_experiments,
    num_samples,
    num_genes,
    min_expression,
    max_expression,
    sample_ids,
    gene_ids,
    generic_gene_ids,
    num_specific_genes,
)

In [14]:
print(compendium.shape)
compendium.head()

(360, 1000)


Unnamed: 0,G_0,G_1,G_2,G_3,G_4,G_5,G_6,G_7,G_8,G_9,...,G_990,G_991,G_992,G_993,G_994,G_995,G_996,G_997,G_998,G_999
0,8,2,2,3,4,3,5,3,3,2,...,1,2,10,5,5,4,8,5,1,5
1,4,8,1,5,5,5,4,4,6,3,...,1,6,6,2,4,5,2,3,0,2
2,1,0,2,0,0,0,2,0,1,0,...,0,1,1,0,0,1,1,1,0,7
3,96,90,142,135,96,109,97,140,137,104,...,130,120,145,111,94,99,115,156,112,105
4,39,47,50,37,41,41,31,48,46,45,...,40,38,40,51,48,42,44,33,55,31


In [15]:
compendium_specific_ids

[['G_950',
  'G_497',
  'G_16',
  'G_109',
  'G_886',
  'G_977',
  'G_440',
  'G_609',
  'G_676',
  'G_360'],
 ['G_822',
  'G_578',
  'G_292',
  'G_397',
  'G_927',
  'G_632',
  'G_1',
  'G_968',
  'G_320',
  'G_114'],
 ['G_257',
  'G_12',
  'G_821',
  'G_551',
  'G_552',
  'G_988',
  'G_791',
  'G_496',
  'G_226',
  'G_96'],
 ['G_581',
  'G_57',
  'G_679',
  'G_227',
  'G_609',
  'G_124',
  'G_614',
  'G_207',
  'G_828',
  'G_84'],
 ['G_861',
  'G_621',
  'G_976',
  'G_109',
  'G_743',
  'G_541',
  'G_1',
  'G_893',
  'G_188',
  'G_909'],
 ['G_536',
  'G_95',
  'G_620',
  'G_925',
  'G_129',
  'G_883',
  'G_535',
  'G_131',
  'G_132',
  'G_896'],
 ['G_675',
  'G_700',
  'G_535',
  'G_542',
  'G_783',
  'G_599',
  'G_359',
  'G_689',
  'G_485',
  'G_193'],
 ['G_688',
  'G_953',
  'G_85',
  'G_509',
  'G_799',
  'G_447',
  'G_877',
  'G_963',
  'G_94',
  'G_994'],
 ['G_66',
  'G_53',
  'G_165',
  'G_472',
  'G_354',
  'G_932',
  'G_464',
  'G_241',
  'G_264',
  'G_516'],
 ['G_102',
  'G

In [16]:
# Save
compendium.to_csv(raw_compendium_filename, sep="\t")

# Save generic genes
# Pickle `scaler` as `scaler_filename` on disk
with open(generic_gene_ids_filename, "wb") as pkl_fh:
    pickle.dump(generic_gene_ids, pkl_fh, protocol=3)