# SOPHIE vs traditional DE

The goal of this experiment is to determine how often SOPHIE ranks specific genes above generic genes, compared to traditional differential expression (DE) analysis.

In [1]:
%load_ext autoreload
%autoreload 2
import os
import pickle
import numpy as np
import pandas as pd
import random
from ponyo import utils

# Seed both random number generators used in this notebook so that reruns
# are reproducible: numpy's global generator (negative binomial draws) and
# Python's `random` module (`random.sample` / `random.uniform`)
np.random.seed(1)
random.seed(1)

## Create simulated data

1. Each perturbation experiment has 8 samples (4 perturbed vs 4 control) with 1000 genes. Create initial expression profiles for all 8 samples by drawing counts from a negative binomial distribution, using a different randomly chosen success probability for each gene.
2. Say that there are 100 generic genes. The generic genes will have the same scalar value added for the “perturbed” samples
3. Each perturbation experiment will have 10 “specific” genes perturbed in addition to the “generic” ones. Select the specific genes randomly from the genes that are not generic. The specific genes will then have their own scalar value (the same for all specific genes within the experiment) added for the “perturbed” samples
4. Repeat this process 90 times to get 90 experiments

In [2]:
# User params
# Number of genes (columns) simulated per experiment
num_genes = 1000
# Number of samples (rows) simulated per experiment; must equal len(sample_ids)
num_samples = 8
# Number of "generic" genes, i.e. genes perturbed in every experiment
num_generic_genes = 100
# Number of "specific" genes additionally perturbed within a single experiment
num_specific_genes = 10
# Number of experiments stacked together to form the compendium
num_experiments = 90

In [3]:
# Read in config variables
# NOTE(review): `base_dir` is not referenced anywhere else in the visible part
# of this notebook -- confirm whether it is still needed
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

# Relative filename; presumably resolved against the current working
# directory by `read_config` -- TODO confirm
config_filename = "config_sophie_vs_trad.tsv"

# `params` is indexed by string keys below; see ponyo.utils.read_config for
# the exact return type
params = utils.read_config(config_filename)

In [4]:
# Load config params

# Local directory to store intermediate files
# NOTE(review): `local_dir` is not used again in the visible part of this
# notebook; the template files are written to a hard-coded path instead
local_dir = params["local_dir"]

# Un-normalized compendium filename (the simulated compendium is saved here)
raw_compendium_filename = params["raw_compendium_filename"]

In [5]:
# Pickle files to save
# Relative filename that the list of generic gene ids is pickled to
generic_gene_ids_filename = "generic_gene_ids.pickle"

In [6]:
# Setup variables

# Sample ids: the first half of the samples are controls, the second half are
# perturbed. Derived from `num_samples` so that the ids always match the
# number of rows simulated per experiment. For num_samples = 8 this yields
# "0_control" ... "3_control", "4_perturb" ... "7_perturb".
num_control_samples = num_samples // 2
sample_ids = [
    f"{i}_control" if i < num_control_samples else f"{i}_perturb"
    for i in range(num_samples)
]

# Make range of numbers into string to use as gene ids
gene_ids = [f"G_{i}" for i in range(num_genes)]

# Randomly select generic gene ids (sampled without replacement)
generic_gene_ids = random.sample(gene_ids, num_generic_genes)
print(generic_gene_ids)

['G_716', 'G_146', 'G_351', 'G_886', 'G_318', 'G_336', 'G_389', 'G_472', 'G_371', 'G_315', 'G_288', 'G_113', 'G_109', 'G_858', 'G_542', 'G_70', 'G_903', 'G_645', 'G_649', 'G_20', 'G_313', 'G_918', 'G_270', 'G_671', 'G_489', 'G_259', 'G_338', 'G_269', 'G_200', 'G_209', 'G_881', 'G_520', 'G_719', 'G_853', 'G_239', 'G_106', 'G_445', 'G_243', 'G_111', 'G_741', 'G_588', 'G_107', 'G_818', 'G_902', 'G_231', 'G_707', 'G_167', 'G_760', 'G_841', 'G_573', 'G_194', 'G_667', 'G_488', 'G_642', 'G_864', 'G_39', 'G_138', 'G_55', 'G_44', 'G_119', 'G_459', 'G_171', 'G_84', 'G_391', 'G_793', 'G_266', 'G_799', 'G_279', 'G_653', 'G_705', 'G_878', 'G_127', 'G_937', 'G_214', 'G_357', 'G_419', 'G_34', 'G_108', 'G_383', 'G_780', 'G_228', 'G_815', 'G_901', 'G_506', 'G_102', 'G_905', 'G_992', 'G_890', 'G_666', 'G_792', 'G_813', 'G_860', 'G_317', 'G_539', 'G_644', 'G_996', 'G_414', 'G_942', 'G_965', 'G_359']


## Supporting functions

In [7]:
# Function to make an individual experiment
def run_make_experiment(
    num_samples,
    num_genes,
    sample_ids,
    all_gene_ids,
    generic_gene_ids,
    specific_gene_ids,
    generic_scaler,
    specific_scaler,
):
    """Simulate one perturbation experiment as a samples x genes dataframe.

    Each gene gets its own success probability `p`, drawn with
    `random.uniform(0.0, 1.0)`, and its counts for all samples are drawn from
    a negative binomial distribution NB(10, p). Rows whose sample id contains
    "perturb" then have `generic_scaler` added to the generic gene columns and
    `specific_scaler` added to the specific gene columns.

    Arguments
    ---------
    num_samples: int
        Number of samples (rows) to simulate
    num_genes: int
        Number of genes (columns) to simulate; columns are named
        "G_0", "G_1", ..., "G_{num_genes - 1}"
    sample_ids: list
        Row labels, one per sample
    all_gene_ids: list
        Not used by this function; column names are generated from `num_genes`
    generic_gene_ids: list
        Column names that receive `generic_scaler`
    specific_gene_ids: list
        Column names that receive `specific_scaler`
    generic_scaler: int
        Value added to the generic genes of the perturbed samples
    specific_scaler: int
        Value added to the specific genes of the perturbed samples

    Returns
    -------
    pandas.DataFrame indexed by `sample_ids` with one column per gene
    """
    # One expression profile per gene. The success probability differs per
    # gene so that each gene is expressed at a different rate.
    simulated_counts = {
        f"G_{gene_idx}": np.random.negative_binomial(
            10, random.uniform(0.0, 1.0), num_samples
        )
        for gene_idx in range(num_genes)
    }

    experiment_df = pd.DataFrame(simulated_counts)
    experiment_df.index = sample_ids

    # Only the perturbed samples are shifted; controls keep their raw counts
    is_perturbed = experiment_df.index.str.contains("perturb")
    experiment_df.loc[is_perturbed, generic_gene_ids] += generic_scaler
    experiment_df.loc[is_perturbed, specific_gene_ids] += specific_scaler

    return experiment_df

In [8]:
# Make multiple experiments
def make_experiments(
    num_experiments,
    num_samples,
    num_genes,
    sample_ids,
    all_gene_ids,
    generic_gene_ids,
    num_specific_genes,
):
    """Simulate `num_experiments` experiments and stack them row-wise.

    For every experiment a fresh set of specific genes is sampled from the
    genes that are not generic, one scaler for the generic genes and one for
    the specific genes are drawn, and `run_make_experiment` builds the
    experiment dataframe.

    Arguments
    ---------
    num_experiments: int
        Number of experiments to simulate
    num_samples: int
        Number of samples per experiment
    num_genes: int
        Number of genes per experiment
    sample_ids: list
        Row labels used for each experiment
    all_gene_ids: list
        Ids of all genes
    generic_gene_ids: list
        Ids of the genes perturbed in every experiment
    num_specific_genes: int
        Number of specific genes to sample per experiment

    Returns
    -------
    (expression_df, specific_gene_id_lst)
        expression_df: all experiments concatenated; the index is reset to
            0..n-1 when more than one experiment was generated. An empty
            dataframe is returned when `num_experiments` is 0.
        specific_gene_id_lst: one list of specific gene ids per experiment
    """
    # Pool of candidate specific genes = all genes that are not generic.
    # It does not change between experiments, so it is built once. It keeps
    # the order of `all_gene_ids`: iterating a set of strings gives a
    # different order in each Python process (hash randomization), which
    # would make `random.sample` irreproducible even with a fixed seed.
    generic_gene_set = set(generic_gene_ids)
    remaining_gene_ids = list(
        dict.fromkeys(
            gene_id for gene_id in all_gene_ids if gene_id not in generic_gene_set
        )
    )

    experiment_dfs = []
    specific_gene_id_lst = []

    for _ in range(num_experiments):

        # Randomly select specific genes from the pool of remaining genes
        # Select without replacement
        specific_gene_ids = random.sample(remaining_gene_ids, num_specific_genes)

        # Save specific gene ids for reference later
        specific_gene_id_lst.append(specific_gene_ids)

        # Randomly select scaler for generic and specific genes from the same distribution
        p = random.uniform(0.0, 1.0)
        generic_scaler = np.random.negative_binomial(10, p)
        specific_scaler = np.random.negative_binomial(10, p)

        experiment_dfs.append(
            run_make_experiment(
                num_samples,
                num_genes,
                sample_ids,
                all_gene_ids,
                generic_gene_ids,
                specific_gene_ids,
                generic_scaler,
                specific_scaler,
            )
        )

    # `pd.concat` raises on an empty list, so keep returning an empty
    # dataframe when no experiment was requested
    if not experiment_dfs:
        return pd.DataFrame(), specific_gene_id_lst

    # Concatenate all experiments at once instead of growing the dataframe
    # inside the loop
    expression_df = pd.concat(experiment_dfs)

    # Reset index so that sample indices are unique across experiments
    # NOTE: VAE don't train when sample indices are identical, not sure why
    if num_experiments > 1:
        expression_df = expression_df.reset_index(drop=True)

    return expression_df, specific_gene_id_lst

## Make a template experiment

In [9]:
# Generate 10 template experiments (a single experiment each) and write each
# one, together with its specific gene ids, to disk
# NOTE(review): the output paths are hard-coded to a user-specific absolute
# directory rather than taken from the config -- confirm before running on
# another machine
for i in range(10):
    template_experiment, template_specific_gene_ids = make_experiments(
        1,
        num_samples,
        num_genes,
        sample_ids,
        gene_ids,
        generic_gene_ids,
        num_specific_genes,
    )

    # Save template experiment
    raw_template_filename = f"/home/alexandra/Documents/Data/Generic_expression_patterns/reviewer_experiment/raw_template_{i}.tsv"
    template_experiment.to_csv(raw_template_filename, sep="\t")

    # Pickle specific gene ids
    # `make_experiments` returns one list per experiment; only one experiment
    # was generated here, so the first (only) entry is saved
    template_specific_gene_ids_filename = f"/home/alexandra/Documents/Data/Generic_expression_patterns/reviewer_experiment/template_specific_gene_ids_{i}.pickle"
    with open(template_specific_gene_ids_filename, "wb") as pkl_fh:
        pickle.dump(template_specific_gene_ids[0], pkl_fh, protocol=3)

In [10]:
# Inspect the template experiment left over from the last loop iteration (i = 9)
print(template_experiment.shape)
template_experiment.head(8)

(8, 1000)


Unnamed: 0,G_0,G_1,G_2,G_3,G_4,G_5,G_6,G_7,G_8,G_9,...,G_990,G_991,G_992,G_993,G_994,G_995,G_996,G_997,G_998,G_999
0_control,2,17,3,13,29,0,8,10,17,73,...,10,32,72,23,2,4,20,1,7,0
1_control,1,11,2,8,33,0,8,7,12,52,...,9,60,91,12,0,6,15,7,4,1
2_control,1,9,2,7,58,3,12,11,20,62,...,5,68,72,22,3,3,16,1,17,3
3_control,1,10,0,9,43,3,9,10,42,56,...,10,60,78,17,2,1,41,10,8,4
4_perturb,3,10,0,14,63,0,27,15,30,59,...,15,74,82,18,1,2,28,1,7,0
5_perturb,1,9,1,13,28,1,21,11,32,44,...,13,36,103,15,2,1,48,8,19,2
6_perturb,5,9,1,5,14,1,27,17,40,13,...,6,58,139,21,2,6,41,13,7,1
7_perturb,0,16,0,4,53,2,28,2,37,64,...,7,54,55,15,4,0,41,3,4,4


In [11]:
# Show only the generic gene columns of the template experiment
template_experiment[generic_gene_ids]

Unnamed: 0,G_716,G_146,G_351,G_886,G_318,G_336,G_389,G_472,G_371,G_315,...,G_813,G_860,G_317,G_539,G_644,G_996,G_414,G_942,G_965,G_359
0_control,3,3,15,16,98,1,15,4,3,6,...,109,2,6,63,2,20,7,1,12,963
1_control,7,5,10,12,146,0,20,2,7,6,...,108,4,18,110,1,15,4,5,4,687
2_control,10,5,13,7,46,0,6,3,2,13,...,102,4,17,110,0,16,1,6,4,392
3_control,4,4,15,9,89,1,13,6,5,10,...,170,2,5,64,1,41,4,7,5,681
4_perturb,22,18,21,30,96,17,24,21,17,19,...,89,18,33,79,18,28,18,25,20,606
5_perturb,20,16,27,18,94,17,35,16,19,25,...,110,21,29,110,15,48,20,26,17,565
6_perturb,23,17,30,30,166,20,25,18,17,23,...,168,22,30,82,18,41,19,27,17,742
7_perturb,22,17,24,23,94,19,29,17,17,22,...,79,21,30,55,15,41,19,26,19,444


In [12]:
# Specific gene ids of the single experiment that makes up the template
template_specific_gene_ids[0]

['G_487',
 'G_453',
 'G_236',
 'G_833',
 'G_6',
 'G_755',
 'G_936',
 'G_914',
 'G_582',
 'G_186']

## Make compendium

In [13]:
# Simulate the full compendium: `num_experiments` experiments stacked row-wise.
# `compendium_specific_ids` holds one list of specific gene ids per experiment.
compendium, compendium_specific_ids = make_experiments(
    num_experiments=num_experiments,
    num_samples=num_samples,
    num_genes=num_genes,
    sample_ids=sample_ids,
    all_gene_ids=gene_ids,
    generic_gene_ids=generic_gene_ids,
    num_specific_genes=num_specific_genes,
)

In [14]:
# Inspect the compendium: rows are samples from all experiments, columns are genes
print(compendium.shape)
compendium.head()

(720, 1000)


Unnamed: 0,G_0,G_1,G_2,G_3,G_4,G_5,G_6,G_7,G_8,G_9,...,G_990,G_991,G_992,G_993,G_994,G_995,G_996,G_997,G_998,G_999
0,7,3,16,9,0,26,1,5,44,4,...,3,7,26,1,24,160,7,66,23,31
1,4,2,16,0,1,88,3,5,45,6,...,1,9,30,2,21,152,17,96,10,32
2,4,1,5,9,1,46,0,4,28,6,...,2,6,29,3,34,79,2,79,6,30
3,5,3,14,9,2,32,1,8,31,8,...,3,4,14,6,12,249,9,149,22,48
4,4,3,15,9,1,20,1,6,30,10,...,5,6,40,2,60,126,7,93,11,39


In [15]:
# One list of specific gene ids per experiment in the compendium
compendium_specific_ids

[['G_47',
  'G_141',
  'G_2',
  'G_647',
  'G_569',
  'G_145',
  'G_248',
  'G_511',
  'G_562',
  'G_40'],
 ['G_130',
  'G_759',
  'G_399',
  'G_23',
  'G_875',
  'G_350',
  'G_326',
  'G_256',
  'G_891',
  'G_939'],
 ['G_508',
  'G_922',
  'G_669',
  'G_963',
  'G_608',
  'G_196',
  'G_648',
  'G_263',
  'G_643',
  'G_773'],
 ['G_970',
  'G_398',
  'G_564',
  'G_277',
  'G_33',
  'G_142',
  'G_180',
  'G_894',
  'G_861',
  'G_600'],
 ['G_640',
  'G_448',
  'G_537',
  'G_410',
  'G_712',
  'G_311',
  'G_561',
  'G_418',
  'G_12',
  'G_541'],
 ['G_599',
  'G_763',
  'G_640',
  'G_753',
  'G_378',
  'G_730',
  'G_592',
  'G_823',
  'G_840',
  'G_248'],
 ['G_158',
  'G_702',
  'G_451',
  'G_78',
  'G_697',
  'G_674',
  'G_772',
  'G_353',
  'G_742',
  'G_456'],
 ['G_99',
  'G_526',
  'G_212',
  'G_875',
  'G_548',
  'G_861',
  'G_2',
  'G_810',
  'G_198',
  'G_457'],
 ['G_422',
  'G_132',
  'G_888',
  'G_558',
  'G_794',
  'G_966',
  'G_561',
  'G_964',
  'G_578',
  'G_16'],
 ['G_22',
  '

In [16]:
# Save the simulated compendium as a tab-separated file
compendium.to_csv(raw_compendium_filename, sep="\t")

# Save generic genes
# Pickle `generic_gene_ids` as `generic_gene_ids_filename` on disk
with open(generic_gene_ids_filename, "wb") as pkl_fh:
    pickle.dump(generic_gene_ids, pkl_fh, protocol=3)