In [34]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as p
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.spatial import distance
from scipy.stats.mstats import gmean
%matplotlib inline
from itertools import combinations
from itertools import chain
import sys
import os
import copy
sns.set_style('white')
sns.set_style('ticks')
sns.set_color_codes()

fgm_simulation_path = '/Users/grantkinsler/Documents/Stanford/Research/StarryNight/Git/starry-night/Simulations/FGM_simulation_callable.py'
sys.path.append(os.path.dirname(os.path.expanduser(fgm_simulation_path)))
from FGM_simulation_callable import simulation, nball_pull, gaussian_fitness

tools_path = '../code/tools.py'
sys.path.append(os.path.dirname(os.path.expanduser(tools_path)))
import tools

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
np.random.seed(953527608) # for exact figure reproducibility use this seed

In [36]:
# fitness_data = p.read_csv('../data/DoubleBC_Merged_Fitness_Atish_Default_AllConditions.csv')
# fitness_data = p.read_csv('../data/DoubleBC_Merged_Fitness_Atish_Weighted_Default_AllConditions_IncludingOld_swapsremoved.csv')
fitness_data = p.read_csv('../data/fitness_weighted_allconditions_swapsremoved.csv')

In [37]:
fitness_data = fitness_data.replace([np.inf, -np.inf], np.nan).dropna()

In [38]:
gene_type_combos = np.unique([(g,t) for g,t in zip(fitness_data['gene'].values,fitness_data['type'].values)],axis=0)

We can construct a training and test set of mutants by, for each mutant type (combination of gene and mutation type), dividing half of those mutant types into each set. "Half" is calculated by rounding down to be conservative, so mutant types with only one representative are only present in the "test" set.

Because "other" (mutation not in RAS/PKA or TOR pathway and/or Diploid) and "Not Sequenced" mutants have genotypes that are not easily classified as being similar to something else (or unknown), they are not included in these training and test sets.

Because neutral lineages should (in theory) give no information on the space, they are included only in the test set.

In [39]:
bc_list = []

for (g,t) in gene_type_combos:
    if not (('other' in g) or ('NotSequenced' in g)):
        this_gt = fitness_data[(fitness_data['gene'].isin([g]) & fitness_data['type'].isin([t]))]
        n_samples = int(np.floor(len(this_gt.index)/2))
        print(g,t,len(this_gt.index),n_samples)
        
        bc_list = bc_list + list(np.random.choice(this_gt['barcode'].values,n_samples))

# exp_neutral =  fitness_data[fitness_data['class'].isin(['ExpNeutral'])]
# n_samples = int(np.floor(len(exp_neutral.index)/2))
# bc_list = bc_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples)) 


fitness_data[fitness_data['barcode'].isin(bc_list)].to_csv('../data/mutant_train_set.csv',index=False)

typical_test_list = []

number_per = 10

for (g,t) in gene_type_combos:
    if not (('other' in g) or ('NotSequenced' in g)):
        this_gt = fitness_data[(fitness_data['gene'].isin([g]) & fitness_data['type'].isin([t]))]
        n_samples = min([int(np.floor(len(this_gt.index))),number_per])
        print(g,t,len(this_gt.index),n_samples)
        
        options = [bc for bc in this_gt['barcode'].values if bc not in bc_list]
        
        typical_test_list = typical_test_list + list(np.random.choice(options,n_samples))

exp_neutral =  fitness_data[fitness_data['class'].isin(['ExpNeutral'])]
n_samples =  min([int(np.floor(len(this_gt.index))),number_per])
# typical_test_list = typical_test_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples)) 
typical_test_list = typical_test_list + list(exp_neutral['barcode'].values) 

typical_test_list = typical_test_list + list(fitness_data[fitness_data['gene']=='other']['barcode'].values) + list(fitness_data[fitness_data['gene']=='NotSequenced']['barcode'].values)

typical_test_list = [bc for bc in typical_test_list if bc not in bc_list]

fitness_data[fitness_data['barcode'].isin(typical_test_list)].to_csv('../data/mutant_test_set.csv',index=False) 

CYR1 missense_variant 3 1
Diploid Diploid 200 100
Diploid + Chr11Amp Diploid + Chr11Amp 3 1
Diploid + Chr12Amp Diploid + Chr12Amp 1 0
Diploid + IRA1 missense_variant 1 0
Diploid + IRA2 frameshift_variant 1 0
Diploid + IRA2 missense_variant 1 0
Diploid + IRA2 stop_gained 1 0
GPB1 frameshift_variant 1 0
GPB1 missense_variant 1 0
GPB1 stop_gained 2 1
GPB2 frameshift_variant 5 2
GPB2 missense_variant 1 0
GPB2 stop_gained 8 4
IRA1 frameshift_variant 11 5
IRA1 missense_variant 9 4
IRA1 stop_gained 10 5
IRA1 upstream_point_variant 1 0
IRA2 frameshift_variant 1 0
IRA2 missense_variant 8 4
KOG1 missense_variant 1 0
PDE2 frameshift_variant 6 3
PDE2 missense_variant 2 1
PDE2 stop_gained 3 1
RAS2 missense_variant 1 0
SCH9 missense_variant 1 0
TFS1 missense_variant 1 0
TOR1 missense_variant 1 0
CYR1 missense_variant 3 3
Diploid Diploid 200 10
Diploid + Chr11Amp Diploid + Chr11Amp 3 3
Diploid + Chr12Amp Diploid + Chr12Amp 1 1
Diploid + IRA1 missense_variant 1 1
Diploid + IRA2 frameshift_variant 1 1


 The "minimal" set below tries to account for imbalances in the number of mutants per type by instead taking 2 of every mutant type in the training set and (at most) 10 of each for the testing set.
 
Again, "other" mutants and "Not Sequenced" are excluded.

In [40]:
minimal_bc_list = []

number_per = 2
print('MINIMAL TRAINING SET')
for (g,t) in gene_type_combos:
    if not (('other' in g) or ('NotSequenced' in g)):
        this_gt = fitness_data[(fitness_data['gene'].isin([g]) & fitness_data['type'].isin([t]))]
        n_samples = min([int(np.floor(len(this_gt.index)/2)),number_per])
        print(g,t,len(this_gt.index),n_samples)
        
        minimal_bc_list = minimal_bc_list + list(np.random.choice(this_gt['barcode'].values,n_samples))
        

# exp_neutral =  fitness_data[fitness_data['class'].isin(['ExpNeutral'])]
# n_samples = int(np.floor(len(exp_neutral.index)/2))
# minimal_bc_list = minimal_bc_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples))  
## should this be balanced by mutation type (diploids dominate...)   

fitness_data[fitness_data['barcode'].isin(minimal_bc_list)].to_csv('../data/mutant_minimal_train_set.csv',index=False)

minimal_test_list = []

number_per = 10
print('MINIMAL TESTING SET')
for (g,t) in gene_type_combos:
    if not (('other' in g) or ('NotSequenced' in g)):
        this_gt = fitness_data[(fitness_data['gene'].isin([g]) & fitness_data['type'].isin([t]))]
        n_samples = min([int(np.floor(len(this_gt.index))),number_per])
        print(g,t,len(this_gt.index),n_samples)
        
        options = [bc for bc in this_gt['barcode'].values if bc not in minimal_bc_list]
        
        minimal_test_list = minimal_test_list + list(np.random.choice(options,n_samples))

exp_neutral =  fitness_data[fitness_data['class'].isin(['ExpNeutral'])]
n_samples =  min([int(np.floor(len(this_gt.index))),number_per])
# minimal_test_list = minimal_test_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples)) 
minimal_test_list = minimal_test_list + list(exp_neutral['barcode'].values) 


# print(list(np.random.choice(exp_neutral['barcode'].values,n_samples)) )
# minimal_test_list = minimal_test_list + list(fitness_data[fitness_data['gene']=='other']['barcode'].values) + list(fitness_data[fitness_data['gene']=='NotSequenced']['barcode'].values)

minimal_test_list = [bc for bc in minimal_test_list if bc not in minimal_bc_list]

fitness_data[fitness_data['barcode'].isin(minimal_test_list)].to_csv('../data/mutant_minimal_test_set.csv',index=False)        

MINIMAL TRAINING SET
CYR1 missense_variant 3 1
Diploid Diploid 200 2
Diploid + Chr11Amp Diploid + Chr11Amp 3 1
Diploid + Chr12Amp Diploid + Chr12Amp 1 0
Diploid + IRA1 missense_variant 1 0
Diploid + IRA2 frameshift_variant 1 0
Diploid + IRA2 missense_variant 1 0
Diploid + IRA2 stop_gained 1 0
GPB1 frameshift_variant 1 0
GPB1 missense_variant 1 0
GPB1 stop_gained 2 1
GPB2 frameshift_variant 5 2
GPB2 missense_variant 1 0
GPB2 stop_gained 8 2
IRA1 frameshift_variant 11 2
IRA1 missense_variant 9 2
IRA1 stop_gained 10 2
IRA1 upstream_point_variant 1 0
IRA2 frameshift_variant 1 0
IRA2 missense_variant 8 2
KOG1 missense_variant 1 0
PDE2 frameshift_variant 6 2
PDE2 missense_variant 2 1
PDE2 stop_gained 3 1
RAS2 missense_variant 1 0
SCH9 missense_variant 1 0
TFS1 missense_variant 1 0
TOR1 missense_variant 1 0
MINIMAL TESTING SET
CYR1 missense_variant 3 3
Diploid Diploid 200 10
Diploid + Chr11Amp Diploid + Chr11Amp 3 3
Diploid + Chr12Amp Diploid + Chr12Amp 1 1
Diploid + IRA1 missense_variant 1 1

To account for the fact "other" mutants may in fact have interesting phenotypic behavior that isn't capture by the commonly hit mutants, we also include another training set where we divide these equally into the training and test set, but otherwise keep the "minimal" training and test sets from above. 

Note that though the "other" mutants have no obvious commonalities in genotype, it is possible there is over-representation of particular mutants in some of these sets.

To avoid the problem that some of the "Not Sequenced" mutants are actually likely to contain some similarities, particularly because we've not sequenced some because of similarities to others (i.e. likely to be Diploid/IRA1) and this may put our sets out of balance.

In [41]:
this_gt = fitness_data[(fitness_data['gene'].isin(['other']))]
n_samples = int(len(this_gt.index)/2)
print(n_samples,len(this_gt.index))
train_others = list(np.random.choice(this_gt['barcode'].values,n_samples))
test_others = [bc for bc in this_gt['barcode'].values if bc not in train_others]
        
training_w_others = minimal__list + train_others
fitness_data[fitness_data['barcode'].isin(training_w_others)].to_csv('../data/mutant_minimal+other_train_set.csv',index=False)


testing_w_others = minimal_test_list + test_others
fitness_data[fitness_data['barcode'].isin(training_w_others)].to_csv('../data/mutant_minimal+other_test_set.csv',index=False)


fitness_data[fitness_data['barcode'].isin(train_others)].to_csv('../data/mutant_justother_train_set.csv',index=False)

fitness_data[fitness_data['barcode'].isin(test_others)].to_csv('../data/mutant_justother_test_set.csv',index=False)
 

41 82


In [43]:
len(training_w_others)

41