In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as p
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.spatial import distance
from scipy.stats.mstats import gmean
%matplotlib inline
from itertools import combinations
from itertools import chain
import sys
import os
import copy
sns.set_style('white')
sns.set_style('ticks')
sns.set_color_codes()

fgm_simulation_path = '/Users/grantkinsler/Documents/Stanford/Research/StarryNight/Git/starry-night/Simulations/FGM_simulation_callable.py'
# sys.path.append(os.path.dirname(os.path.expanduser(fgm_simulation_path)))
# from FGM_simulation_callable import simulation, nball_pull, gaussian_fitness

tools_path = '../code/tools.py'
sys.path.append(os.path.dirname(os.path.expanduser(tools_path)))
import tools

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
np.random.seed(953527608) # for exact figure reproducibility use this seed

In [4]:
# fitness_data = p.read_csv('../data/DoubleBC_Merged_Fitness_Atish_Default_AllConditions.csv')
# fitness_data = p.read_csv('../data/DoubleBC_Merged_Fitness_Atish_Weighted_Default_AllConditions_IncludingOld_swapsremoved.csv')
fitness_data = p.read_csv('../data/fitness_weighted_allconditions_swapsremoved_neutral2xpass.csv')

In [5]:
fitness_data = fitness_data.replace([np.inf, -np.inf], np.nan).dropna()
# this_data = merged_fitness_data
# this_data = this_data.replace([np.inf, -np.inf], np.nan)
# # this_data = this_data.dropna('columns',how='all')
# this_data = this_data.dropna()

In [6]:
mutation_types =  np.unique(fitness_data['mutation_type'])

We can construct a training and test set of mutants by, for each mutant type (combination of gene and mutation type), dividing half of those mutant types into each set. "Half" is calculated by rounding down to be conservative, so mutant types with only one representative are only present in the "test" set.

Because "other" (mutation not in RAS/PKA or TOR pathway and/or Diploid) and "Not Sequenced" mutants have genotypes that are not easily classified as being similar to something else (or unknown), they are not included in these training and test sets.

Because neutral lineages should (in theory) give no information on the space, they are included only in the test set.

In [11]:
bc_list = []

for mut_type in mutation_types:
    if not (('other' in mut_type) or ('NotSequenced' in mut_type) or ('ExpNeutral' in mut_type)):
        this_mut_type = fitness_data[(fitness_data['mutation_type'].isin([mut_type]))]
        n_samples = int(np.floor(len(this_mut_type.index)/2))
        print(mut_type,len(this_mut_type.index),n_samples)
        
        bc_list = bc_list + list(np.random.choice(this_mut_type['barcode'].values,n_samples))

# exp_neutral =  fitness_data[fitness_data['class'].isin(['ExpNeutral'])]
# n_samples = int(np.floor(len(exp_neutral.index)/2))
# bc_list = bc_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples)) 


fitness_data[fitness_data['barcode'].isin(bc_list)].to_csv('../data/mutant_train_set_neutral2xpass.csv',index=False)

typical_test_list = []

number_per = 10

for mut_type in mutation_types:
    if not (('other' in mut_type) or ('NotSequenced' in mut_type)):
        this_mut_type = fitness_data[(fitness_data['mutation_type'].isin([mut_type]))]
        n_samples = min([int(np.floor(len(this_mut_type.index))),number_per])
        print(mut_type,len(this_mut_type.index),n_samples)
        
        options = [bc for bc in this_mut_type['barcode'].values if bc not in bc_list]
        n_samples = min([len(options),n_samples])
        
        typical_test_list = typical_test_list + list(np.random.choice(options,n_samples,replace=False))

exp_neutral =  fitness_data[fitness_data['mutation_type'].isin(['ExpNeutral'])]
n_samples =  min([int(np.floor(len(this_mut_type.index))),number_per])
# typical_test_list = typical_test_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples)) 
typical_test_list = typical_test_list + list(exp_neutral['barcode'].values) 

typical_test_list = typical_test_list + list(fitness_data[fitness_data['gene']=='other']['barcode'].values) + list(fitness_data[fitness_data['gene']=='NotSequenced']['barcode'].values)

typical_test_list = [bc for bc in typical_test_list if bc not in bc_list]

fitness_data[fitness_data['barcode'].isin(typical_test_list)].to_csv('../data/mutant_test_set_neutral2xpass.csv',index=False) 

CYR1 3 1
Diploid 188 94
Diploid + Chr11Amp 3 1
Diploid + Chr12Amp 1 0
Diploid + IRA1 1 0
Diploid + IRA2 3 1
Diploid_adaptive 11 5
GPB1 4 2
GPB2 14 7
IRA1_missense 9 4
IRA1_nonsense 20 10
IRA2 8 4
KOG1 1 0
PDE2 11 5
RAS2 1 0
SCH9 1 0
TFS1 1 0
TOR1 1 0
CYR1 3 3
Diploid 188 10
Diploid + Chr11Amp 3 3
Diploid + Chr12Amp 1 1
Diploid + IRA1 1 1
Diploid + IRA2 3 3
Diploid_adaptive 11 10
ExpNeutral 3 3
GPB1 4 4
GPB2 14 10
IRA1_missense 9 9
IRA1_nonsense 20 10
IRA2 8 8
KOG1 1 1
PDE2 11 10
RAS2 1 1
SCH9 1 1
TFS1 1 1
TOR1 1 1


 The "minimal" set below tries to account for imbalances in the number of mutants per type by instead taking 2 of every mutant type in the training set and (at most) 10 of each for the testing set.
 
Again, "other" mutants and "Not Sequenced" are excluded.

In [12]:
minimal_bc_list = []

number_per = 4
print('MINIMAL TRAINING SET')
for mut_type in mutation_types:
    if not (('other' in mut_type) or ('NotSequenced' in mut_type) or ('ExpNeutral' in mut_type)):
        this_mut_type = fitness_data[(fitness_data['mutation_type'].isin([mut_type]))]
        n_samples =  min([int(np.floor(len(this_mut_type.index)/2)),number_per])
        print(mut_type,len(this_mut_type.index),n_samples)
        
        minimal_bc_list = sorted(np.unique(minimal_bc_list + list(np.random.choice(this_mut_type['barcode'].values,n_samples,replace=False))))

fitness_data[fitness_data['barcode'].isin(minimal_bc_list)].to_csv('../data/mutant_minimal_train_set_neutral2xpass.csv',index=False)

minimal_test_list = []

number_per = 10
print('MINIMAL TESTING SET')
for mut_type in mutation_types:
    if not (('other' in mut_type) or ('NotSequenced' in mut_type)):
        this_mut_type = fitness_data[(fitness_data['mutation_type'].isin([mut_type]))]
        options = [bc for bc in this_mut_type['barcode'].values if bc not in minimal_bc_list]
        n_samples = min([int(np.floor(len(options))),number_per])
#         print(options)
        print(mut_type,len(options),n_samples)
        
        minimal_test_list = sorted(np.unique(minimal_test_list + list(np.random.choice(options,n_samples,replace=False))))

exp_neutral =  fitness_data[fitness_data['mutation_type'].isin(['ExpNeutral'])]
# n_samples =  min([int(np.floor(len(this_mut_type.index))),number_per])
# minimal_test_list = minimal_test_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples)) 
minimal_test_list = sorted(np.unique(minimal_test_list + [bc for bc in list(exp_neutral['barcode'].values) if bc not in minimal_test_list]))


# print(list(np.random.choice(exp_neutral['barcode'].values,n_samples)) )
# minimal_test_list = minimal_test_list + list(fitness_data[fitness_data['gene']=='other']['barcode'].values) + list(fitness_data[fitness_data['gene']=='NotSequenced']['barcode'].values)

minimal_test_list = [bc for bc in minimal_test_list if bc not in minimal_bc_list]

fitness_data[fitness_data['barcode'].isin(minimal_test_list)].to_csv('../data/mutant_minimal_test_set_neutral2xpass.csv',index=False)        

MINIMAL TRAINING SET
CYR1 3 1
Diploid 188 4
Diploid + Chr11Amp 3 1
Diploid + Chr12Amp 1 0
Diploid + IRA1 1 0
Diploid + IRA2 3 1
Diploid_adaptive 11 4
GPB1 4 2
GPB2 14 4
IRA1_missense 9 4
IRA1_nonsense 20 4
IRA2 8 4
KOG1 1 0
PDE2 11 4
RAS2 1 0
SCH9 1 0
TFS1 1 0
TOR1 1 0
MINIMAL TESTING SET
CYR1 2 2
Diploid 184 10
Diploid + Chr11Amp 2 2
Diploid + Chr12Amp 1 1
Diploid + IRA1 1 1
Diploid + IRA2 2 2
Diploid_adaptive 7 7
ExpNeutral 3 3
GPB1 2 2
GPB2 10 10
IRA1_missense 5 5
IRA1_nonsense 16 10
IRA2 4 4
KOG1 1 1
PDE2 7 7
RAS2 1 1
SCH9 1 1
TFS1 1 1
TOR1 1 1


To account for the fact "other" mutants may in fact have interesting phenotypic behavior that isn't capture by the commonly hit mutants, we also include another training set where we divide these equally into the training and test set, but otherwise keep the "minimal" training and test sets from above. 

Note that though the "other" mutants have no obvious commonalities in genotype, it is possible there is over-representation of particular mutants in some of these sets.

To avoid the problem that some of the "Not Sequenced" mutants are actually likely to contain some similarities, particularly because we've not sequenced some because of similarities to others (i.e. likely to be Diploid/IRA1) and this may put our sets out of balance.

In [13]:
# this_gt = fitness_data[(fitness_data['gene'].isin(['other']))]
this_mut = fitness_data[(fitness_data['mutation_type'].isin(['other_adaptive']))]
n_samples = int(len(this_mut.index)/2)
print(n_samples,len(this_mut.index))
train_others = list(np.random.choice(this_mut['barcode'].values,n_samples))
test_others = [bc for bc in this_mut['barcode'].values if bc not in train_others]
        
training_w_others = minimal_bc_list + train_others
fitness_data[fitness_data['barcode'].isin(training_w_others)].to_csv('../data/mutant_minimal+otheradaptive_train_set_neutral2xpass.csv',index=False)
print(len(training_w_others))

testing_w_others = minimal_test_list + test_others
fitness_data[fitness_data['barcode'].isin(testing_w_others)].to_csv('../data/mutant_minimal+otheradaptive_test_set_neutral2xpass.csv',index=False)
print(len(testing_w_others))

fitness_data[fitness_data['barcode'].isin(train_others)].to_csv('../data/mutant_justotheradaptive_train_set_neutral2xpass.csv',index=False)

fitness_data[fitness_data['barcode'].isin(test_others)].to_csv('../data/mutant_justotheradaptive_test_set_neutral2xpass.csv',index=False)
 

3 7
36
75


In [33]:
print(len(fitness_data['barcode'].values))

421


In [34]:
len(minimal_bc_list),len(minimal_test_list),len(training_w_others),len(testing_w_others),len(train_others),len(test_others)

(33, 71, 37, 76, 4, 5)

In [25]:
fitness_data[(fitness_data['mutation_type'].isin(['other']))]

Unnamed: 0,barcode,gene,type,ploidy,class,additional_muts,1.4%-R1_error,1.4%-R1_fitness,1.4%-R2_error,1.4%-R2_fitness,...,1BB_1%Raf_error,1BB_0.5%Raf_fitness,1BB_0.5%Raf_error,1BB_1%Gly_fitness,1BB_1%Gly_error,1BB_1%EtOH_fitness,1BB_1%EtOH_error,1BB_SucRaf_fitness,1BB_SucRaf_error,mutation_type
16,1641,other,other,other,other,nan-nan,0.104534,-0.243305,0.219505,-0.022012,...,0.057445,0.098662,0.031246,0.084957,0.043791,0.087348,0.038602,-0.058655,0.085009,other
17,1683,other,other,other,other,SDH3-upstream_point_variant; MFM1-missense_var...,0.118186,-0.075047,0.329653,-0.073542,...,0.051034,0.040171,0.028345,0.040905,0.038521,0.021993,0.032390,-0.003491,0.073554,other
53,7291,other,other,other,other,"IMP4-missense_variant; YER156C,COG3-upstream_i...",0.125247,0.315658,0.276059,-0.087561,...,0.050167,-0.024125,0.029735,-0.028549,0.041540,-0.011670,0.034547,0.033725,0.075502,other
68,8825,other,other,other,other,ATG32-upstream_indel_variant,0.104268,0.437782,0.299720,-0.081207,...,0.060646,-0.032871,0.034515,-0.041815,0.053218,-0.012515,0.047339,-0.055427,0.095369,other
77,10307,other,other,other,other,nan-nan,0.091558,0.104966,0.170830,0.188714,...,0.051112,0.053200,0.028117,0.080667,0.036555,0.103433,0.029084,0.044973,0.072446,other
97,13183,other,other,other,other,nan-nan,0.042835,0.907085,0.074215,0.968915,...,0.051578,-0.002650,0.029743,0.002012,0.041613,0.009745,0.035692,0.033459,0.078585,other
113,17499,other,other,other,other,LCP5-upstream_indel_variant; PTC3-synonymous_v...,0.173937,-0.079916,0.550927,0.302517,...,0.059122,0.049358,0.033103,0.005277,0.050004,0.086882,0.042090,0.070321,0.095877,other
133,19894,other,other,other,other,tQ(UUG)C-upstream_indel_variant,0.115310,0.133337,0.239455,0.054158,...,0.053132,0.038248,0.031482,-0.031191,0.046811,0.125594,0.037571,0.150494,0.088505,other
138,20632,other,other,other,other,"YAR023C,SUP56-upstream_indel_variant",0.112300,-0.079597,0.259916,-0.150241,...,0.053582,-0.013950,0.029929,-0.001907,0.042113,0.055911,0.034246,0.048401,0.077868,other
160,24151,other,other,other,other,nan-nan,0.269317,0.522029,0.507669,0.067997,...,0.054109,0.030936,0.030345,0.102488,0.043450,-0.000405,0.038413,0.085719,0.082346,other


In [None]:
fitness_data[(fitness_data['mutation_type'].isin(['Diploid_adaptive']))]

In [14]:
fitness_data[(fitness_data['mutation_type'].isin(['NotSequenced_adaptive']))]

Unnamed: 0,barcode,gene,type,ploidy,class,additional_muts,1.4%-R1_error,1.4%-R1_fitness,1.4%-R2_error,1.4%-R2_fitness,...,1BB_1%Raf_error,1BB_0.5%Raf_fitness,1BB_0.5%Raf_error,1BB_1%Gly_fitness,1BB_1%Gly_error,1BB_1%EtOH_fitness,1BB_1%EtOH_error,1BB_SucRaf_fitness,1BB_SucRaf_error,mutation_type
2,262,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.048088,0.362946,0.095466,0.26313,...,0.052701,0.288765,0.025901,0.346346,0.03105,0.493549,0.023583,0.350548,0.067009,NotSequenced_adaptive
4,323,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.03748,1.011827,0.04256,0.947841,...,0.043157,0.647284,0.018741,0.662209,0.024546,0.434464,0.020178,0.291819,0.055298,NotSequenced_adaptive
9,697,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.056581,0.446857,0.121933,0.535259,...,0.050038,0.216814,0.0265,0.299265,0.032168,0.491183,0.023601,0.369475,0.069865,NotSequenced_adaptive
12,1379,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.036929,1.197959,0.038409,1.259083,...,0.042748,0.868867,0.017642,0.983137,0.023345,0.44751,0.017755,0.416034,0.050919,NotSequenced_adaptive
13,1488,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.037882,0.914444,0.045158,1.020749,...,0.042775,0.940722,0.017762,1.033481,0.02369,0.43043,0.019887,0.561952,0.051527,NotSequenced_adaptive
28,3250,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.052595,0.545628,0.099959,0.454508,...,0.049603,0.257494,0.025152,0.354278,0.03056,0.496114,0.02256,0.376793,0.067355,NotSequenced_adaptive
36,3730,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.037535,0.937621,0.046086,0.769274,...,0.043168,0.730906,0.018503,0.98472,0.02393,0.08865,0.031906,-0.514917,0.081091,NotSequenced_adaptive
43,5654,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.054587,0.463124,0.10861,0.424426,...,0.048633,0.263721,0.024336,0.315625,0.02949,0.506449,0.021643,0.368297,0.063284,NotSequenced_adaptive
52,7269,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.041768,0.719641,0.066021,0.677798,...,0.043162,0.685227,0.018664,0.65548,0.024257,0.385693,0.019534,0.35158,0.054753,NotSequenced_adaptive
70,9518,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.068526,0.569159,0.150248,0.428477,...,0.05872,0.21814,0.029388,0.259976,0.037298,0.477029,0.028644,0.455859,0.079631,NotSequenced_adaptive


In [199]:
fitness_data[(fitness_data['mutation_type'].isin(['ExpNeutral']))]

Unnamed: 0,barcode,gene,type,ploidy,class,additional_muts,1.4%-R1_error,1.4%-R1_fitness,1.4%-R2_error,1.4%-R2_fitness,...,1BB_1%Raf_error,1BB_0.5%Raf_fitness,1BB_0.5%Raf_error,1BB_1%Gly_fitness,1BB_1%Gly_error,1BB_1%EtOH_fitness,1BB_1%EtOH_error,1BB_SucRaf_fitness,1BB_SucRaf_error,mutation_type
