In [3]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as p
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.spatial import distance
from scipy.stats.mstats import gmean
%matplotlib inline
from itertools import combinations
from itertools import chain
import sys
import os
import copy
sns.set_style('white')
sns.set_style('ticks')
sns.set_color_codes()

fgm_simulation_path = '/Users/grantkinsler/Documents/Stanford/Research/StarryNight/Git/starry-night/Simulations/FGM_simulation_callable.py'
sys.path.append(os.path.dirname(os.path.expanduser(fgm_simulation_path)))
from FGM_simulation_callable import simulation, nball_pull, gaussian_fitness

tools_path = '../code/tools.py'
sys.path.append(os.path.dirname(os.path.expanduser(tools_path)))
import tools

In [4]:
np.random.seed(953527608) # for exact figure reproducibility use this seed

In [5]:
fitness_data = p.read_csv('../data/DoubleBC_Merged_Fitness_Atish_Default_AllConditions.csv')
fitness_data = p.read_csv('../data/DoubleBC_Merged_Fitness_Atish_Weighted_Default_AllConditions_IncludingOld_swapsremoved.csv')
fitness_data = p.read_csv('../data/fitness_weighted_allconditions_swapsremoved.csv')

In [6]:
fitness_data = fitness_data.replace([np.inf, -np.inf], np.nan).dropna()

In [7]:
fitness_data

Unnamed: 0,barcode,gene,type,ploidy,class,additional_muts,1.4%-R1_error,1.4%-R1_fitness,1.4%-R2_error,1.4%-R2_fitness,...,1BB_1%Raf_fitness,1BB_1%Raf_error,1BB_0.5%Raf_fitness,1BB_0.5%Raf_error,1BB_1%Gly_fitness,1BB_1%Gly_error,1BB_1%EtOH_fitness,1BB_1%EtOH_error,1BB_SucRaf_fitness,1BB_SucRaf_error
0,53,Diploid,Diploid,Diploid,Diploid,TIP1-upstream_point_variant; YKR012C-upstream_...,0.059765,0.443607,0.122231,0.479701,...,0.097711,0.048250,0.267503,0.024743,0.347184,0.030058,0.537808,0.021834,0.381319,0.065192
1,151,IRA1,stop_gained,Haploid,PKA,SEH1-missense_variant; ZIP1-missense_variant; ...,0.036871,1.495026,0.037692,1.576258,...,1.037255,0.042743,1.004842,0.017600,1.036011,0.023298,0.716874,0.017019,0.595644,0.050561
2,262,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.048088,0.362946,0.095466,0.263130,...,0.020690,0.052701,0.288765,0.025901,0.346346,0.031050,0.493549,0.023583,0.350548,0.067009
3,273,IRA1,frameshift_variant,Haploid,PKA,,0.037258,0.885360,0.040752,0.926651,...,0.843095,0.042810,0.778585,0.017914,0.812738,0.023726,0.495805,0.018409,0.452001,0.051965
4,323,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.037480,1.011827,0.042560,0.947841,...,0.670262,0.043157,0.647284,0.018741,0.662209,0.024546,0.434464,0.020178,0.291819,0.055298
5,415,IRA1,frameshift_variant,Haploid,PKA,,0.041739,0.931319,0.068687,0.993013,...,1.072823,0.042779,0.889023,0.017789,0.979582,0.023703,0.396559,0.020041,0.414002,0.051737
8,689,IRA1,frameshift_variant,Haploid,PKA,RPL19A-upstream_point_variant,0.040399,0.845239,0.058980,0.981331,...,1.137304,0.042878,0.994024,0.018229,1.085905,0.024702,0.688806,0.022354,0.504585,0.054950
9,697,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.056581,0.446857,0.121933,0.535259,...,0.137100,0.050038,0.216814,0.026500,0.299265,0.032168,0.491183,0.023601,0.369475,0.069865
12,1379,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.036929,1.197959,0.038409,1.259083,...,0.975833,0.042748,0.868867,0.017642,0.983137,0.023345,0.447510,0.017755,0.416034,0.050919
13,1488,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.037882,0.914444,0.045158,1.020749,...,1.086990,0.042775,0.940722,0.017762,1.033481,0.023690,0.430430,0.019887,0.561952,0.051527


In [8]:
gene_type_combos = np.unique([(g,t) for g,t in zip(fitness_data['gene'].values,fitness_data['type'].values)],axis=0)

In [9]:
bc_list = []

for (g,t) in gene_type_combos:
    if not (('other' in g) or ('NotSequenced' in g)):
        this_gt = fitness_data[(fitness_data['gene'].isin([g]) & fitness_data['type'].isin([t]))]
        n_samples = int(np.floor(len(this_gt.index)/2))
        print(g,t,len(this_gt.index),n_samples)
        
        bc_list = bc_list + list(np.random.choice(this_gt['barcode'].values,n_samples))

exp_neutral =  fitness_data[fitness_data['class'].isin(['ExpNeutral'])]
n_samples = int(np.floor(len(exp_neutral.index)/2))
bc_list = bc_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples)) 


fitness_data[fitness_data['barcode'].isin(bc_list)].to_csv('../data/mutant_train_set.csv',index=False)
## should this be balanced by mutation type (diploids dominate...)          

CYR1 missense_variant 3 1
Diploid Diploid 200 100
Diploid + Chr11Amp Diploid + Chr11Amp 3 1
Diploid + Chr12Amp Diploid + Chr12Amp 1 0
Diploid + IRA1 missense_variant 1 0
Diploid + IRA2 frameshift_variant 1 0
Diploid + IRA2 missense_variant 1 0
Diploid + IRA2 stop_gained 1 0
GPB1 frameshift_variant 1 0
GPB1 missense_variant 1 0
GPB1 stop_gained 2 1
GPB2 frameshift_variant 5 2
GPB2 missense_variant 1 0
GPB2 stop_gained 8 4
IRA1 frameshift_variant 11 5
IRA1 missense_variant 9 4
IRA1 stop_gained 10 5
IRA1 upstream_point_variant 1 0
IRA2 frameshift_variant 1 0
IRA2 missense_variant 8 4
KOG1 missense_variant 1 0
PDE2 frameshift_variant 6 3
PDE2 missense_variant 2 1
PDE2 stop_gained 3 1
RAS2 missense_variant 1 0
SCH9 missense_variant 1 0
TFS1 missense_variant 1 0
TOR1 missense_variant 1 0


In [10]:
typical_test_list = []

number_per = 10

for (g,t) in gene_type_combos:
    if not (('other' in g) or ('NotSequenced' in g)):
        this_gt = fitness_data[(fitness_data['gene'].isin([g]) & fitness_data['type'].isin([t]))]
        n_samples = min([int(np.floor(len(this_gt.index))),number_per])
        print(g,t,len(this_gt.index),n_samples)
        
        typical_test_list = typical_test_list + list(np.random.choice(this_gt['barcode'].values,n_samples))

exp_neutral =  fitness_data[fitness_data['class'].isin(['ExpNeutral'])]
n_samples =  min([int(np.floor(len(this_gt.index))),number_per])
typical_test_list = typical_test_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples)) 

typical_test_list = typical_test_list + list(fitness_data[fitness_data['gene']=='other']['barcode'].values) + list(fitness_data[fitness_data['gene']=='NotSequenced']['barcode'].values)

typical_test_list = [bc for bc in typical_test_list if bc not in bc_list]

fitness_data[fitness_data['barcode'].isin(typical_test_list)].to_csv('../data/mutant_test_set.csv',index=False)        

CYR1 missense_variant 3 3
Diploid Diploid 200 10
Diploid + Chr11Amp Diploid + Chr11Amp 3 3
Diploid + Chr12Amp Diploid + Chr12Amp 1 1
Diploid + IRA1 missense_variant 1 1
Diploid + IRA2 frameshift_variant 1 1
Diploid + IRA2 missense_variant 1 1
Diploid + IRA2 stop_gained 1 1
GPB1 frameshift_variant 1 1
GPB1 missense_variant 1 1
GPB1 stop_gained 2 2
GPB2 frameshift_variant 5 5
GPB2 missense_variant 1 1
GPB2 stop_gained 8 8
IRA1 frameshift_variant 11 10
IRA1 missense_variant 9 9
IRA1 stop_gained 10 10
IRA1 upstream_point_variant 1 1
IRA2 frameshift_variant 1 1
IRA2 missense_variant 8 8
KOG1 missense_variant 1 1
PDE2 frameshift_variant 6 6
PDE2 missense_variant 2 2
PDE2 stop_gained 3 3
RAS2 missense_variant 1 1
SCH9 missense_variant 1 1
TFS1 missense_variant 1 1
TOR1 missense_variant 1 1


In [11]:
fitness_data[fitness_data['barcode'].isin(bc_list)].to_csv('../data/mutant_train_set.csv',index=False)

In [12]:
fitness_data[fitness_data['barcode'].isin(bc_list)]

Unnamed: 0,barcode,gene,type,ploidy,class,additional_muts,1.4%-R1_error,1.4%-R1_fitness,1.4%-R2_error,1.4%-R2_fitness,...,1BB_1%Raf_fitness,1BB_1%Raf_error,1BB_0.5%Raf_fitness,1BB_0.5%Raf_error,1BB_1%Gly_fitness,1BB_1%Gly_error,1BB_1%EtOH_fitness,1BB_1%EtOH_error,1BB_SucRaf_fitness,1BB_SucRaf_error
0,53,Diploid,Diploid,Diploid,Diploid,TIP1-upstream_point_variant; YKR012C-upstream_...,0.059765,0.443607,0.122231,0.479701,...,0.097711,0.048250,0.267503,0.024743,0.347184,0.030058,0.537808,0.021834,0.381319,0.065192
5,415,IRA1,frameshift_variant,Haploid,PKA,,0.041739,0.931319,0.068687,0.993013,...,1.072823,0.042779,0.889023,0.017789,0.979582,0.023703,0.396559,0.020041,0.414002,0.051737
8,689,IRA1,frameshift_variant,Haploid,PKA,RPL19A-upstream_point_variant,0.040399,0.845239,0.058980,0.981331,...,1.137304,0.042878,0.994024,0.018229,1.085905,0.024702,0.688806,0.022354,0.504585,0.054950
15,1617,PDE2,frameshift_variant,Haploid,PKA,RIF1-upstream_indel_variant,0.038957,0.686232,0.052047,0.669738,...,0.793095,0.042838,0.823836,0.017939,0.773629,0.023734,0.410301,0.018711,0.479477,0.051868
19,2037,Diploid,Diploid,Diploid,Diploid,,0.049899,0.424994,0.103343,0.335539,...,0.080717,0.046188,0.285390,0.022496,0.380652,0.027091,0.539617,0.019794,0.400161,0.058922
22,2468,IRA2,missense_variant,Haploid,PKA,SEC5-missense_variant,0.055558,0.731295,0.122064,0.769657,...,0.892890,0.043051,0.819757,0.018871,0.745122,0.025686,0.303036,0.026396,0.412359,0.054813
24,2768,GPB2,stop_gained,Haploid,PKA,DUS1-missense_variant,0.037595,0.946479,0.042714,0.955810,...,0.640614,0.043006,0.622411,0.018493,0.690822,0.024204,0.459759,0.019166,0.299712,0.054434
25,2776,Diploid,Diploid,Diploid,Diploid,SWH1-missense_variant,0.063066,0.573550,0.144568,0.406583,...,0.065011,0.058108,0.110425,0.029959,0.265332,0.038015,0.553909,0.027940,0.281149,0.083645
26,2808,Diploid,Diploid,Diploid,Diploid,CDC39-missense_variant,0.045011,0.467559,0.075474,0.529840,...,0.409573,0.046283,0.388362,0.022041,0.473013,0.026582,0.517859,0.021223,0.412059,0.063089
34,3585,Diploid,Diploid,Diploid,Diploid,,0.046697,0.447787,0.083693,0.429264,...,0.023291,0.048417,0.253642,0.024467,0.361170,0.029399,0.510176,0.021648,0.395201,0.064220


In [13]:
exp_neutral

Unnamed: 0,barcode,gene,type,ploidy,class,additional_muts,1.4%-R1_error,1.4%-R1_fitness,1.4%-R2_error,1.4%-R2_fitness,...,1BB_1%Raf_fitness,1BB_1%Raf_error,1BB_0.5%Raf_fitness,1BB_0.5%Raf_error,1BB_1%Gly_fitness,1BB_1%Gly_error,1BB_1%EtOH_fitness,1BB_1%EtOH_error,1BB_SucRaf_fitness,1BB_SucRaf_error
292,72939,NotSequenced,NotSequenced,NotSequenced,ExpNeutral,NotSequenced,0.108037,0.036133,0.216218,0.138363,...,0.026653,0.054947,0.003273,0.031732,-0.022174,0.047902,0.042905,0.041057,0.006673,0.086473
357,120600,other,other,other,ExpNeutral,nan-upstream_indel_variant,0.142245,0.090259,0.26892,0.281687,...,0.059623,0.051313,0.035441,0.029769,0.077561,0.040361,0.090905,0.033684,0.022998,0.07805
474,298344,other,other,other,ExpNeutral,FYV10-upstream_indel_variant; YKL177W-upstream...,0.179211,-0.03605,0.67918,-0.037955,...,0.03401,0.057644,0.054387,0.032119,0.022825,0.047486,0.048438,0.041095,0.019368,0.088239


In [17]:
minimal_bc_list = []

number_per = 2

for (g,t) in gene_type_combos:
    if not (('other' in g) or ('NotSequenced' in g)):
        this_gt = fitness_data[(fitness_data['gene'].isin([g]) & fitness_data['type'].isin([t]))]
        n_samples = min([int(np.floor(len(this_gt.index)/2)),number_per])
        print(g,t,len(this_gt.index),n_samples)
        
        minimal_bc_list = minimal_bc_list + list(np.random.choice(this_gt['barcode'].values,n_samples))

# exp_neutral =  fitness_data[fitness_data['class'].isin(['ExpNeutral'])]
# n_samples = int(np.floor(len(exp_neutral.index)/2))
# minimal_bc_list = minimal_bc_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples))  
## should this be balanced by mutation type (diploids dominate...)          

CYR1 missense_variant 3 1
Diploid Diploid 200 2
Diploid + Chr11Amp Diploid + Chr11Amp 3 1
Diploid + Chr12Amp Diploid + Chr12Amp 1 0
Diploid + IRA1 missense_variant 1 0
Diploid + IRA2 frameshift_variant 1 0
Diploid + IRA2 missense_variant 1 0
Diploid + IRA2 stop_gained 1 0
GPB1 frameshift_variant 1 0
GPB1 missense_variant 1 0
GPB1 stop_gained 2 1
GPB2 frameshift_variant 5 2
GPB2 missense_variant 1 0
GPB2 stop_gained 8 2
IRA1 frameshift_variant 11 2
IRA1 missense_variant 9 2
IRA1 stop_gained 10 2
IRA1 upstream_point_variant 1 0
IRA2 frameshift_variant 1 0
IRA2 missense_variant 8 2
KOG1 missense_variant 1 0
PDE2 frameshift_variant 6 2
PDE2 missense_variant 2 1
PDE2 stop_gained 3 1
RAS2 missense_variant 1 0
SCH9 missense_variant 1 0
TFS1 missense_variant 1 0
TOR1 missense_variant 1 0


In [18]:
fitness_data[fitness_data['barcode'].isin(minimal_bc_list)].to_csv('../data/mutant_minimal_train_set.csv',index=False)

In [21]:
minimal_test_list = []

number_per = 10

for (g,t) in gene_type_combos:
    if not (('other' in g) or ('NotSequenced' in g)):
        this_gt = fitness_data[(fitness_data['gene'].isin([g]) & fitness_data['type'].isin([t]))]
        n_samples = min([int(np.floor(len(this_gt.index))),number_per])
        print(g,t,len(this_gt.index),n_samples)
        
        minimal_test_list = minimal_test_list + list(np.random.choice(this_gt['barcode'].values,n_samples))

exp_neutral =  fitness_data[fitness_data['class'].isin(['ExpNeutral'])]
n_samples =  min([int(np.floor(len(this_gt.index))),number_per])
minimal_test_list = minimal_test_list + list(np.random.choice(exp_neutral['barcode'].values,n_samples)) 
print(list(np.random.choice(exp_neutral['barcode'].values,n_samples)) )
# minimal_test_list = minimal_test_list + list(fitness_data[fitness_data['gene']=='other']['barcode'].values) + list(fitness_data[fitness_data['gene']=='NotSequenced']['barcode'].values)

minimal_test_list = [bc for bc in minimal_test_list if bc not in minimal_bc_list]

fitness_data[fitness_data['barcode'].isin(minimal_test_list)].to_csv('../data/mutant_minimal_test_set.csv',index=False)        

CYR1 missense_variant 3 3
Diploid Diploid 200 10
Diploid + Chr11Amp Diploid + Chr11Amp 3 3
Diploid + Chr12Amp Diploid + Chr12Amp 1 1
Diploid + IRA1 missense_variant 1 1
Diploid + IRA2 frameshift_variant 1 1
Diploid + IRA2 missense_variant 1 1
Diploid + IRA2 stop_gained 1 1
GPB1 frameshift_variant 1 1
GPB1 missense_variant 1 1
GPB1 stop_gained 2 2
GPB2 frameshift_variant 5 5
GPB2 missense_variant 1 1
GPB2 stop_gained 8 8
IRA1 frameshift_variant 11 10
IRA1 missense_variant 9 9
IRA1 stop_gained 10 10
IRA1 upstream_point_variant 1 1
IRA2 frameshift_variant 1 1
IRA2 missense_variant 8 8
KOG1 missense_variant 1 1
PDE2 frameshift_variant 6 6
PDE2 missense_variant 2 2
PDE2 stop_gained 3 3
RAS2 missense_variant 1 1
SCH9 missense_variant 1 1
TFS1 missense_variant 1 1
TOR1 missense_variant 1 1
[120600]
