In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as p
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.spatial import distance
from scipy.stats.mstats import gmean
%matplotlib inline
from itertools import combinations
from itertools import chain
import sys
import os
import copy
# Plot styling for the whole notebook.
# NOTE(review): set_style is called twice; the later 'ticks' call presumably
# supersedes 'white' — confirm whether both calls are needed.
sns.set_style('white')
sns.set_style('ticks')
sns.set_color_codes()

# Make the FGM simulation module importable by appending its directory to sys.path.
# NOTE(review): this is an absolute path on one user's machine, so this cell will
# fail elsewhere unless the path is edited — consider a repo-relative or
# configurable location.
fgm_simulation_path = '/Users/grantkinsler/Documents/Stanford/Research/StarryNight/Git/starry-night/Simulations/FGM_simulation_callable.py'
sys.path.append(os.path.dirname(os.path.expanduser(fgm_simulation_path)))
from FGM_simulation_callable import simulation, nball_pull, gaussian_fitness

# Same approach for the project-local `tools` module: append its directory
# ('../code', relative to the working directory) to sys.path, then import it.
tools_path = '../code/tools.py'
sys.path.append(os.path.dirname(os.path.expanduser(tools_path)))
import tools

In [2]:
# Seed NumPy's global random state; for exact figure reproducibility use this seed.
FIGURE_SEED = 953527608
np.random.seed(FIGURE_SEED)

In [11]:
# Read the merged fitness table (one row per barcode, fitness/error columns per
# condition) from the data directory into a DataFrame.
fitness_csv = '../data/DoubleBC_Merged_Fitness_Atish_Default_AllConditions.csv'
fitness_data = p.read_csv(fitness_csv)

In [12]:
# Treat +/- infinity as missing, then discard every row that contains any
# missing value (in any column).
infinite_values = [np.inf, -np.inf]
fitness_data = fitness_data.replace(infinite_values, np.nan)
fitness_data = fitness_data.dropna()

In [13]:
# Display the fitness table as this cell's output.
fitness_data

Unnamed: 0,barcode,gene,type,ploidy,class,additional_muts,13.1_error,13.1_fitness,13.2_error,13.2_fitness,...,1BB_0.2MNaCl_fitness,1BB_0.2MNaCl_error,1BB_0.2MKCl_fitness,1BB_0.2MKCl_error,1BB_0.5MKCl_fitness,1BB_0.5MKCl_error,1BB_0.5%Raf_fitness,1BB_0.5%Raf_error,1BB_1%Gly_fitness,1BB_1%Gly_error
0,53,Diploid,Diploid,Diploid,Diploid,TIP1-upstream_point_variant; YKR012C-upstream_...,0.084158,0.319887,0.080264,0.284783,...,0.082724,0.060322,0.517133,0.054619,0.281774,0.059401,0.267503,0.024743,0.351641,0.031392
1,151,IRA1,stop_gained,Haploid,PKA,"MIM1,tS-upstream_indel_variant; SEH1-missense_...",0.045954,0.215119,0.045518,0.224971,...,0.990029,0.050504,0.252553,0.050905,-2.547008,0.103433,1.004842,0.017600,1.018480,0.024649
2,262,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.063284,0.133557,0.062654,0.058471,...,0.138540,0.063584,0.562643,0.055950,0.301131,0.061419,0.288765,0.025901,0.358404,0.032418
3,273,IRA1,frameshift_variant,Haploid,PKA,,0.046382,0.346726,0.046767,0.357501,...,0.733651,0.050832,0.425995,0.051479,-1.036554,0.081520,0.778585,0.017914,0.787411,0.025075
4,323,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.047905,0.456146,0.047768,0.455681,...,0.701008,0.051590,0.421169,0.052638,-0.430072,0.070879,0.647284,0.018741,0.664027,0.025911
5,415,IRA1,frameshift_variant,Haploid,PKA,,0.056073,0.967334,0.054489,0.918439,...,0.969724,0.050702,-0.290556,0.062332,-3.652083,0.218403,0.889023,0.017789,0.967308,0.025020
8,689,IRA1,frameshift_variant,Haploid,PKA,RPL19A-upstream_point_variant,0.053402,0.817965,0.052065,0.743315,...,0.987435,0.051392,-0.616392,0.096871,-3.315940,0.496535,0.994024,0.018229,1.068526,0.025928
9,697,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.075625,0.347717,0.075000,0.278913,...,0.091431,0.063976,0.468017,0.056099,0.301192,0.061008,0.216814,0.026500,0.306125,0.033305
12,1379,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.045215,0.142476,0.044811,0.187134,...,0.856778,0.050541,0.142506,0.051813,-2.072755,0.097259,0.868867,0.017642,0.957030,0.024690
13,1488,NotSequenced,NotSequenced,NotSequenced,NotSequenced,NotSequenced,0.049747,0.548279,0.048874,0.530546,...,0.960353,0.050722,-0.365364,0.066819,-3.708913,0.281476,0.940722,0.017762,1.009197,0.025021


In [6]:
# Collect the distinct (gene, type) pairs present in the table. With axis=0,
# np.unique treats each pair as one row and returns the unique rows.
genes = fitness_data['gene'].values
mutation_types = fitness_data['type'].values
gene_type_combos = np.unique(list(zip(genes, mutation_types)), axis=0)

In [7]:
# Build the list of training-set barcodes: for each (gene, type) group, randomly
# draw floor(group size / 2) barcodes, then do the same for the ExpNeutral class.
bc_list = []

for (g,t) in gene_type_combos:
    # Groups whose gene label contains 'other' or 'NotSequenced' are skipped here;
    # the ExpNeutral class is sampled separately after the loop.
    if ('other' in g) or ('NotSequenced' in g):
        continue

    this_gt = fitness_data[(fitness_data['gene'].isin([g]) & fitness_data['type'].isin([t]))]
    n_samples = len(this_gt.index) // 2
    print(g,t,len(this_gt.index),n_samples)

    # NOTE(review): np.random.choice draws WITH replacement by default, so the same
    # barcode can be picked more than once and the number of distinct training
    # barcodes can be smaller than n_samples. `replace=True` is spelled out (not
    # changed) so the seeded draws stay the same; use replace=False if distinct
    # barcodes are intended.
    bc_list.extend(np.random.choice(this_gt['barcode'].values, n_samples, replace=True))

exp_neutral = fitness_data[fitness_data['class'].isin(['ExpNeutral'])]
n_samples = len(exp_neutral.index) // 2
# Fix: the original evaluated `bc_list + list(...)` without assigning the result,
# so the ExpNeutral draws were shown as cell output but never stored in bc_list.
bc_list.extend(np.random.choice(exp_neutral['barcode'].values, n_samples, replace=True))
## should this be balanced by mutation type (diploids dominate...)

# Display the final barcode list as the cell output.
bc_list

CYR1 missense_variant 3 1
Diploid Diploid 195 97
Diploid + Chr11Amp Diploid + Chr11Amp 3 1
Diploid + Chr12Amp Diploid + Chr12Amp 1 0
Diploid + IRA1 missense_variant 1 0
Diploid + IRA2 frameshift_variant 1 0
Diploid + IRA2 missense_variant 1 0
Diploid + IRA2 stop_gained 1 0
GPB1 frameshift_variant 1 0
GPB1 missense_variant 1 0
GPB1 stop_gained 2 1
GPB2 frameshift_variant 5 2
GPB2 missense_variant 1 0
GPB2 stop_gained 8 4
IRA1 frameshift_variant 10 5
IRA1 missense_variant 9 4
IRA1 stop_gained 9 4
IRA1 upstream_point_variant 1 0
IRA2 missense_variant 7 3
KOG1 missense_variant 1 0
PDE2 frameshift_variant 6 3
PDE2 missense_variant 2 1
PDE2 stop_gained 3 1
RAS2 missense_variant 1 0
SCH9 missense_variant 1 0
TFS1 missense_variant 1 0
TOR1 missense_variant 1 0


[8297,
 62701,
 261861,
 207058,
 10316,
 101506,
 19795,
 40865,
 2808,
 120939,
 67711,
 58400,
 145113,
 132735,
 32685,
 152081,
 434397,
 89848,
 22011,
 375115,
 72591,
 176764,
 14160,
 85751,
 4791,
 65266,
 159633,
 75144,
 90003,
 67711,
 11586,
 317672,
 31607,
 232264,
 142144,
 10050,
 14915,
 178096,
 4691,
 27346,
 101051,
 71696,
 30769,
 73881,
 72591,
 170814,
 50232,
 24858,
 19795,
 14849,
 24858,
 50286,
 19297,
 215325,
 19219,
 112239,
 12752,
 40779,
 32685,
 419831,
 75144,
 55577,
 118509,
 3585,
 70812,
 244443,
 112239,
 62344,
 2037,
 8459,
 101051,
 111905,
 2776,
 60959,
 118509,
 144535,
 112239,
 112239,
 35898,
 83504,
 50232,
 53,
 19219,
 187691,
 52221,
 78573,
 53,
 41484,
 310209,
 70812,
 110443,
 338908,
 39270,
 19697,
 110443,
 120939,
 83504,
 50232,
 71751,
 75669,
 19407,
 19407,
 23688,
 10851,
 2768,
 23688,
 40409,
 415,
 689,
 40409,
 415,
 20873,
 7961,
 20873,
 20873,
 57117,
 31360,
 23868,
 25531,
 2468,
 21956,
 18200,
 309655,
 30

In [8]:
# Number of sampled barcodes in bc_list (repeated barcodes are counted each time).
len(bc_list)


127

In [10]:
# Save every row whose barcode was sampled as the training set; the DataFrame
# index is not written to the file.
in_train_set = fitness_data['barcode'].isin(bc_list)
fitness_data[in_train_set].to_csv('../data/mutant_train_set.csv',index=False)

In [9]:
# Show the rows selected for the training set (barcode present in bc_list).
fitness_data.loc[fitness_data['barcode'].isin(bc_list)]

Unnamed: 0,barcode,gene,type,ploidy,class,additional_muts,13.1_error,13.1_fitness,13.2_error,13.2_fitness,...,1BB_0.2MNaCl_fitness,1BB_0.2MNaCl_error,1BB_0.2MKCl_fitness,1BB_0.2MKCl_error,1BB_0.5MKCl_fitness,1BB_0.5MKCl_error,1BB_0.5%Raf_fitness,1BB_0.5%Raf_error,1BB_1%Gly_fitness,1BB_1%Gly_error
0,53,Diploid,Diploid,Diploid,Diploid,TIP1-upstream_point_variant; YKR012C-upstream_...,0.084158,0.319887,0.080264,0.284783,...,0.082724,0.060322,0.517133,0.054619,0.281774,0.059401,0.267503,0.024743,0.351641,0.031392
5,415,IRA1,frameshift_variant,Haploid,PKA,,0.056073,0.967334,0.054489,0.918439,...,0.969724,0.050702,-0.290556,0.062332,-3.652083,0.218403,0.889023,0.017789,0.967308,0.025020
8,689,IRA1,frameshift_variant,Haploid,PKA,RPL19A-upstream_point_variant,0.053402,0.817965,0.052065,0.743315,...,0.987435,0.051392,-0.616392,0.096871,-3.315940,0.496535,0.994024,0.018229,1.068526,0.025928
18,1866,PDE2,stop_gained,Haploid,PKA,,0.058140,0.776542,0.058033,0.793749,...,0.732511,0.051293,-0.101285,0.061852,-3.964795,0.265848,0.809809,0.018517,0.754810,0.026087
19,2037,Diploid,Diploid,Diploid,Diploid,,0.073412,0.317318,0.073696,0.262091,...,0.036624,0.057086,0.545633,0.052726,0.301266,0.055751,0.285390,0.022496,0.379003,0.028576
22,2468,IRA2,missense_variant,Haploid,PKA,SEC5-missense_variant,0.060810,0.795439,0.059670,0.750514,...,0.861046,0.051539,-1.073081,0.100056,-4.499636,0.509471,0.819757,0.018871,0.750540,0.026866
24,2768,GPB2,stop_gained,Haploid,PKA,DUS1-missense_variant,0.048828,0.462327,0.048954,0.483911,...,0.578845,0.051395,0.440994,0.052098,-0.374452,0.066600,0.622411,0.018493,0.673507,0.025561
25,2776,Diploid,Diploid,Diploid,Diploid,SWH1-missense_variant,0.083038,0.301899,0.081749,0.274819,...,0.058157,0.072253,0.482425,0.059436,0.269468,0.066591,0.110425,0.029959,0.270183,0.038436
26,2808,Diploid,Diploid,Diploid,Diploid,CDC39-missense_variant,0.065265,0.338412,0.062620,0.294174,...,0.451453,0.057235,0.695223,0.052227,0.531749,0.053800,0.388362,0.022041,0.485305,0.028107
34,3585,Diploid,Diploid,Diploid,Diploid,,0.069184,0.341606,0.067836,0.265338,...,-0.011998,0.060426,0.535459,0.054073,0.293449,0.058636,0.253642,0.024467,0.362025,0.030726


In [23]:
# Display the rows whose class is 'ExpNeutral' (the frame built in the sampling cell).
exp_neutral

Unnamed: 0,barcode,gene,type,ploidy,class,additional_muts,13.1_error,13.1_fitness,13.2_error,13.2_fitness,...,1BB_0.2MNaCl_fitness,1BB_0.2MNaCl_error,1BB_0.2MKCl_fitness,1BB_0.2MKCl_error,1BB_0.5MKCl_fitness,1BB_0.5MKCl_error,1BB_0.5%Raf_fitness,1BB_0.5%Raf_error,1BB_1%Gly_fitness,1BB_1%Gly_error
474,298344,other,other,other,ExpNeutral,FYV10-upstream_indel_variant; YKL177W-upstream...,0.113947,-0.084783,0.104373,0.044516,...,0.085047,0.070722,0.072095,0.081512,0.0282,0.080866,0.054387,0.032119,0.021965,0.04661
