In [1]:
from generate_data import dNk_from_errors, generate_catastrophe_data, generate_independent_data
import numpy as np
import scipy.stats as st
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

This notebook generates synthetic data from the two models (the catastrophe model and the independent model).  Currently we're focusing on the "initial condition" distribution.

In [2]:
# this is not what I used but we could consider generating a lot of datasets...
# p_misseg = np.linspace(0.001, 0.1, 10)
# n_chrom = 92
# n_cells = 10000
# p_cat = np.linspace(0.005, 0.5, 10)
# C = np.arange(1,10)

Consider an extreme case for our test datasets (high_cat):
- Very low probability of missegregation for each chromosome for each division: ~0.3% (lower end of what's been seen in aneuploid tumor cell lines)
- High probability of cells having catastrophe: ~50% (this is very high)
- Large number of chromosomes bound to be missegregated in catastrophe cells: ~10

Consider a less extreme case (low_cat):
- High probability of missegregation for each chromosome for each division: ~1% (higher end of what's been seen in aneuploid tumor cell lines)
- Low probability of cells having catastrophe: ~10%
- Low number of chromosomes bound to be missegregated in catastrophe cells: ~3

For the corresponding independent-model datasets, we only vary the probability of missegregation, from the lower bound (low_ind) to the upper bound (high_ind).

I will generate test datasets of 10000 cells each.  We can then sample from these datasets to simulate experiments with finite numbers of cells.



In [3]:
# Seed NumPy's global RNG so that re-running this cell reproduces the same
# datasets (assumes the generate_* helpers draw from np.random -- TODO confirm).
np.random.seed(75)

# Model parameters. Index 0 is the "high_cat" / "low_ind" scenario and index 1
# is the "low_cat" / "high_ind" scenario described in the markdown above.
p_misseg = [0.003, 0.01]  # per-chromosome, per-division missegregation probability
n_cells = 10000           # cells per generated dataset
n_chrom = 92              # chromosomes per cell
p_cat = [0.5, 0.1]        # probability that a cell undergoes catastrophe
C = [10, 3]               # chromosomes missegregated in a catastrophe cell

# Output file names (each gets a 'data_' prefix) and header labels.
cat_names = ['high_cat.txt', 'low_cat.txt']
ind_names = ['low_ind.txt', 'high_ind.txt']
param_names = ['p_misseg', 'n_cells', 'n_chrom', 'p_cat', 'C']


def _write_dataset(path, model_label, names, values, df):
    """Write one dataset file: a '#'-commented header followed by the table.

    The header has one line naming the model and then one ``# name: value``
    line per parameter. ``names`` and ``values`` are paired positionally;
    surplus names are ignored, so the full ``param_names`` list can be passed
    together with a shorter parameter tuple.
    """
    with open(path, 'w') as f:
        # Every header line must end in '\n'; otherwise the first parameter
        # line is glued onto the model-name line.
        f.write('# ' + model_label + '\n')
        for name, value in zip(names, values):
            f.write('# ' + name + ': ' + str(value) + '\n')
        f.write(df.to_string(index=False))


# Generate and store data: one catastrophe-model and one independent-model
# dataset per scenario. The generation order (catastrophe, then independent)
# is kept fixed so that the seeded random stream is consumed identically.
for i in range(2):
    params = p_misseg[i], n_cells, n_chrom, p_cat[i], C[i]

    df_cat = generate_catastrophe_data(params)
    _write_dataset('data_' + cat_names[i], 'catastrophe model',
                   param_names, params, df_cat)

    # The independent model only takes (p_misseg, n_cells, n_chrom).
    df_ind = generate_independent_data(params[:3])
    _write_dataset('data_' + ind_names[i], 'independent model',
                   param_names, params[:3], df_ind)