# Split the FP homologs dataset

Last run top to bottom on Apr 30, 2019.

In [1]:
import sys
import os
import random

import numpy as np
import pandas as pd
from sklearn.manifold import MDS
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

sys.path.append('../common')
import data_io_utils
import paths
import utils
import constants

%reload_ext autoreload
%autoreload 2

## Seeds for reproducibility

In [2]:
# Seed NumPy's global RNG and the stdlib `random` module with one shared value
# so that any randomness further down the notebook repeats across
# top-to-bottom runs.
SEED = 1
np.random.seed(SEED)
random.seed(SEED)

## Load the FP homologs dataset. 

In [3]:
# Echo the path of the FP homologs CSV that the cells below verify and read.
paths.FP_HOMOLOGS_DATA_FILE

'/notebooks/analysis/common/../../data/s3/datasets/Exp9_all_ex_lasers_FL2_inferred_brightness_parents_decomposed_tts.csv'

In [4]:
# Short alias for the dataset path used throughout this cell.
homologs_csv = paths.FP_HOMOLOGS_DATA_FILE

# Fetch the file (judging by the helper's name, an S3 -> local sync), then
# check it against the expected checksum before reading it.
data_io_utils.sync_s3_path_to_local(homologs_csv, is_single_file=True)
data_io_utils.verify_file_md5_checksum(
    homologs_csv,
    '560f032f77ece074871f210f522d8955',  # MD5 from A049_common in mlpe-gfp-pilot repo
)

df = pd.read_csv(homologs_csv)
df.head()

Unnamed: 0,quantitative_function,seq,inferred_parents,num_effective_parents,frac_seq_explained,parent_contribution_mat,tts
0,1.43431,MSKGEALFSGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,"[['Topaz'], ['moxCerulean3', 'rsEGFP2', 'SBFP2...",3,[0.69327732 0.13445378 0.1302521 ],[[0 0 0 ... 0 0 0]\n [0 0 0 ... 1 1 1]\n [0 0 ...,train
1,1.269852,MSKGAELFTGIVPILIELNGDVNGHKFSVSGEGEGDADYGKLTLKF...,"[['hfriFP', 'Azurite'], ['aceGFP'], ['Enhanced...",3,[0.65546219 0.16806723 0.17647058],[[0 0 0 ... 0 0 0]\n [0 0 0 ... 0 0 0]\n [1 1 ...,train
2,0.726303,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,"[['BFPsol'], ['GFPxm191uv', 'OFPxm'], ['TagGFP...",4,[0.59663866 0.15126051 0.14285714 0.04201681],[[1 1 1 ... 1 1 1]\n [0 0 0 ... 0 0 0]\n [0 0 ...,train
3,0.792929,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATNGKLTLKF...,"[['EGFP'], ['CFP4', 'mEmerald'], ['TagGFP', 'm...",3,[0.42857144 0.34453781 0.18907562],[[1 1 1 ... 0 0 0]\n [0 0 0 ... 1 1 1]\n [1 1 ...,train
4,1.70917,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLEIKF...,"[['Sapphire', 'mEmerald'], ['EBFP1.2'], ['mT-S...",4,[0.44117648 0.23529412 0.21848739 0.06302521],[[1 1 1 ... 0 0 0]\n [1 1 1 ... 0 0 0]\n [0 0 ...,train


### Master splits to subdivide

In [5]:
# Partition on the pre-assigned 'tts' label.
is_train = df['tts'] == 'train'
is_test = df['tts'] == 'test'

# Shuffle each partition to break any inherent ordering; the subsplits taken
# later are contiguous row blocks, so this shuffle is what randomises them.
# No random_state is passed, so repeatability relies on the seeds set earlier
# in the notebook and on shuffling train before test.
df_train = df.loc[is_train].sample(frac=1).reset_index(drop=True)
df_test = df.loc[is_test].sample(frac=1).reset_index(drop=True)

# Shapes first, then a preview of each partition.
for master_df in (df_train, df_test):
    print(master_df.shape)

for master_df in (df_train, df_test):
    display(master_df.head())

(10532, 7)
(27050, 7)


Unnamed: 0,quantitative_function,seq,inferred_parents,num_effective_parents,frac_seq_explained,parent_contribution_mat,tts
0,0.553614,MSKGEEMFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,"[['Sapphire'], ['TagCFP'], ['mKalama1'], ['GFP...",4,[0.37815127 0.27310925 0.19327731 0.11764705],[[1 1 1 ... 0 0 0]\n [0 0 0 ... 1 1 1]\n [0 0 ...,train
1,1.70917,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATNGKLTLKF...,"[['D10'], ['Dreiklang'], ['mAmetrine']]",3,[0.60924371 0.22268907 0.16386554],[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,train
2,0.563526,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGECDATYGKLTLKF...,"[['PA-GFP'], ['EBFP2'], ['EBFP1.2', 'EBFP1.5']]",3,[0.52941177 0.28571428 0.10084033],[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,train
3,0.488306,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDADYGKLEIKF...,"[['SHardonnay'], ['mAmetrine'], ['Azurite'], [...",4,[0.38235295 0.21008404 0.19327731 0.11764705],[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,train
4,0.867455,MSKGAELFTGIVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,"[['mECFP'], ['mKalama1'], ['GFP-151pyTyrCu'], ...",4,[0.50000001 0.21008404 0.12605042 0.09663865],[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,train


Unnamed: 0,quantitative_function,seq,inferred_parents,num_effective_parents,frac_seq_explained,parent_contribution_mat,tts
0,0.514526,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKL...,"[['Clover', 'mClover3'], ['SBFP2'], ['GFPxm191...",3,[0.36974791 0.32773109 0.21848739],[[1 1 1 ... 0 0 0]\n [1 1 1 ... 0 0 0]\n [0 0 ...,test
1,1.580695,MSKGEELFTGVVPILVELDGDVNGRKFSVRGVGEGDADYGKLEIKF...,"[['Topaz', 'hfriFP', 'EGFP', 'J8VIQ3_9SPHN', '...",6,[0.51680673 0.14285715 0.11344538 0.06302521 0...,[[0 0 0 ... 1 1 1]\n [0 0 0 ... 1 1 1]\n [0 0 ...,test
2,0.538488,MSKGEELFTGVVPVLIELDGDVHGHKFSVRGEGEGDADYGKLEIKF...,"[['H9'], ['EBFP1.5'], ['SBFP2'], ['GFPxm191uv']]",4,[0.28991598 0.25210084 0.19747899 0.18487394],[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,test
3,1.98396,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDADYGKLEIKF...,"[['hfriFP'], ['moxCerulean3'], ['OFPxm'], ['Su...",6,[0.30252102 0.2394958 0.1302521 0.1092437 0...,[[0 0 0 ... 0 0 0]\n [0 0 0 ... 1 1 1]\n [0 0 ...,test
4,0.592952,MSNGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDADYGKLEIKF...,"[['EBFP'], ['OFPxm'], ['mVenus'], ['Topaz', 'D...",5,[0.42857144 0.18067227 0.15546218 0.13445378 0...,[[0 0 0 ... 0 0 0]\n [0 0 0 ... 0 0 0]\n [0 0 ...,test


In [6]:
def split_df_into_thirds(df):
    """Cut ``df`` into three non-overlapping row blocks.

    The held-out fold of each round of an unshuffled 3-fold ``KFold`` is used
    as one block, so the blocks are sequential runs of rows in ``df``'s
    current order. Callers that want random blocks must shuffle ``df``
    beforehand (the master splits in this notebook are shuffled before being
    passed in).

    Returns a list of three dataframes selected positionally with ``iloc``;
    each keeps the index labels it had in ``df``.
    """
    folds = KFold(n_splits=3)
    # Only the held-out positions of each round are needed; the complementary
    # "training" positions are discarded.
    return [df.iloc[held_out] for _, held_out in folds.split(df)]

### Subsplit training dataframes (data distributions)

In [7]:
# Divide the shuffled training partition into its three subsplits.
subsplit_train_dfs = split_df_into_thirds(df_train)

# Preview the first two rows and report the size of every subsplit.
for train_chunk in subsplit_train_dfs:
    display(train_chunk.head(2))
    print(train_chunk.shape)

Unnamed: 0,quantitative_function,seq,inferred_parents,num_effective_parents,frac_seq_explained,parent_contribution_mat,tts
0,0.553614,MSKGEEMFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,"[['Sapphire'], ['TagCFP'], ['mKalama1'], ['GFP...",4,[0.37815127 0.27310925 0.19327731 0.11764705],[[1 1 1 ... 0 0 0]\n [0 0 0 ... 1 1 1]\n [0 0 ...,train
1,1.70917,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATNGKLTLKF...,"[['D10'], ['Dreiklang'], ['mAmetrine']]",3,[0.60924371 0.22268907 0.16386554],[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,train


(3511, 7)


Unnamed: 0,quantitative_function,seq,inferred_parents,num_effective_parents,frac_seq_explained,parent_contribution_mat,tts
3511,0.650879,MSKGAELFTCVVPILVELDGDVNGHKFSVRGEGEGDATYGKLTLKF...,"[['hfriFP', 'EGFP', 'J8VIQ3_9SPHN', 'avGFP_int...",3,[0.5882353 0.20168067 0.16806722],[[0 0 0 ... 1 1 1]\n [0 0 0 ... 1 1 1]\n [0 0 ...,train
3512,0.42024,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,"[['Topaz'], ['EBFP2'], ['mCitrine']]",3,[0.42436976 0.35714285 0.20168066],[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,train


(3511, 7)


Unnamed: 0,quantitative_function,seq,inferred_parents,num_effective_parents,frac_seq_explained,parent_contribution_mat,tts
7022,1.377371,MSKGEELFTGVVPILVEMDGDVHGHKFSVSGEGEGDATNGKLTLKL...,"[['YFP3'], ['PA-GFP', 'rsEGFP2', 'mECFP', 'mEm...",3,[0.60084034 0.2605042 0.08823529],[[0 0 0 ... 0 0 0]\n [0 0 0 ... 1 1 1]\n [0 0 ...,train
7023,1.428155,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATNGKLTLKF...,"[['SHardonnay', 'YFP'], ['SBFP2']]",2,[0.74789916 0.25210084],[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,train


(3510, 7)


### Subsplit generalization sets

In [8]:
# Divide the shuffled test partition into three generalization subsplits.
subsplit_test_dfs = split_df_into_thirds(df_test)

# Preview the first two rows and report the size of every subsplit.
for test_chunk in subsplit_test_dfs:
    display(test_chunk.head(2))
    print(test_chunk.shape)

Unnamed: 0,quantitative_function,seq,inferred_parents,num_effective_parents,frac_seq_explained,parent_contribution_mat,tts
0,0.514526,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKL...,"[['Clover', 'mClover3'], ['SBFP2'], ['GFPxm191...",3,[0.36974791 0.32773109 0.21848739],[[1 1 1 ... 0 0 0]\n [1 1 1 ... 0 0 0]\n [0 0 ...,test
1,1.580695,MSKGEELFTGVVPILVELDGDVNGRKFSVRGVGEGDADYGKLEIKF...,"[['Topaz', 'hfriFP', 'EGFP', 'J8VIQ3_9SPHN', '...",6,[0.51680673 0.14285715 0.11344538 0.06302521 0...,[[0 0 0 ... 1 1 1]\n [0 0 0 ... 1 1 1]\n [0 0 ...,test


(9017, 7)


Unnamed: 0,quantitative_function,seq,inferred_parents,num_effective_parents,frac_seq_explained,parent_contribution_mat,tts
9017,0.619303,MSKGEELFTGVVPILVELDGDVNGHKFSVSGKGEGDATYGKLTLKF...,"[['Topaz'], ['GFPxm191uv', 'OFPxm'], ['TagGFP2...",5,[0.61344539 0.1302521 0.07983193 0.06302521 0...,[[1 1 1 ... 1 1 1]\n [0 0 0 ... 0 0 0]\n [0 0 ...,test
9018,1.98396,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,"[['J8VIQ3_9SPHN'], ['EBFP1.2', 'W7', 'W2'], ['...",4,[0.31512606 0.42436974 0.13865546 0.06722689],[[1 1 1 ... 0 0 0]\n [0 0 0 ... 0 0 0]\n [1 1 ...,test


(9017, 7)


Unnamed: 0,quantitative_function,seq,inferred_parents,num_effective_parents,frac_seq_explained,parent_contribution_mat,tts
18034,0.547384,MSKGEELFTGVVPILVELDGDVNDHKFSVSGEGEGDATYGKLTLKF...,"[['GFPhal'], ['TagGFP2'], ['TagCFP']]",3,[0.52100841 0.25210084 0.16806722],[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,test
18035,1.98396,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDADYGKLTLKF...,"[['mEmerald'], ['Ypet', 'mVenus'], ['muGFP', '...",5,[0.37815127 0.24369748 0.14285714 0.10924369 0...,[[1 1 1 ... 0 0 0]\n [1 1 1 ... 0 0 0]\n [1 1 ...,test


(9016, 7)


Some final checks. Make sure we're using all sequences, and assert that there is no overlap between any of the subsplits.

In [9]:
def _assert_disjoint_cover(parts, whole, column='seq'):
    """Check that ``parts`` are a clean partition of ``whole``.

    Asserts that no ``column`` value appears in more than one part, that the
    parts together contain every ``column`` value of ``whole``, and that their
    row counts add up to ``len(whole)``.
    """
    seen = set()
    for part in parts:
        values = set(part[column])
        # Being disjoint from everything seen so far is equivalent to being
        # disjoint from each earlier part, so one pass replaces the pairwise
        # comparison of every (i, j) combination.
        assert seen.isdisjoint(values), 'subsplits share %s values' % column
        seen |= values

    assert seen == set(whole[column]), 'subsplits do not cover every %s value' % column
    assert sum(len(part) for part in parts) == len(whole), 'subsplits drop or duplicate rows'


all_seqs = set(df['seq'])
train_seqs = set(df_train['seq'])
test_seqs = set(df_test['seq'])

# Every sequence in the full dataset lands in the train or test master split ...
assert all_seqs == train_seqs | test_seqs, 'master splits do not cover the dataset'
# ... and no sequence lands in both.
assert train_seqs.isdisjoint(test_seqs), 'train and test share sequences'

# Subsplits must not overlap each other and must use up their parent split.
_assert_disjoint_cover(subsplit_train_dfs, df_train)
_assert_disjoint_cover(subsplit_test_dfs, df_test)

## Export

In [10]:
# Filename stems for the exported CSVs; the export cells below append the
# subsplit index and '.csv' to these.
train_subsplit_prefix = 'fp_homologs_data_dist_split_'  # subsplits of the training partition
test_subsplit_prefix = 'fp_homologs_gen_split_'  # subsplits of the test (generalization) partition

In [11]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the destination exists before writing (no-op when it already does).
os.makedirs(paths.DATA_DISTRIBUTIONS_DIR, exist_ok=True)

for i, tdf in enumerate(subsplit_train_dfs):
    # Put in data distributions split dir, one CSV per subsplit.
    ofile = os.path.join(paths.DATA_DISTRIBUTIONS_DIR, train_subsplit_prefix + str(i) + '.csv')
    tdf.to_csv(ofile, index=False)

    # Report shape, destination and checksum so reruns can be compared.
    print(tdf.shape)
    print(ofile)
    print(data_io_utils.generate_md5_checksum(ofile))
    print()
    
    

(3511, 7)
/notebooks/analysis/common/../../data/s3/datasets/tts_splits/data_distributions/fp_homologs_data_dist_split_0.csv
1b06703fd9d50a5a1b4cf2f5d55af47d

(3511, 7)
/notebooks/analysis/common/../../data/s3/datasets/tts_splits/data_distributions/fp_homologs_data_dist_split_1.csv
7a672eab919bb9048d440312394aee38

(3510, 7)
/notebooks/analysis/common/../../data/s3/datasets/tts_splits/data_distributions/fp_homologs_data_dist_split_2.csv
0378896140440610dc8fc424654e03d4



In [12]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the destination exists before writing (no-op when it already does).
os.makedirs(paths.GEN_SETS_SPLITS_DIR, exist_ok=True)

for i, tdf in enumerate(subsplit_test_dfs):
    # Put in generalization sets split dir, one CSV per subsplit.
    ofile = os.path.join(paths.GEN_SETS_SPLITS_DIR, test_subsplit_prefix + str(i) + '.csv')
    tdf.to_csv(ofile, index=False)

    # Report shape, destination and checksum so reruns can be compared.
    print(tdf.shape)
    print(ofile)
    print(data_io_utils.generate_md5_checksum(ofile))
    print()
    
    

(9017, 7)
/notebooks/analysis/common/../../data/s3/datasets/tts_splits/generalization_sets/fp_homologs_gen_split_0.csv
2fdc06c47b8765760fe69bb31c1fb7a8

(9017, 7)
/notebooks/analysis/common/../../data/s3/datasets/tts_splits/generalization_sets/fp_homologs_gen_split_1.csv
f75ae4491416ba0b0cbd8d1be4480e89

(9016, 7)
/notebooks/analysis/common/../../data/s3/datasets/tts_splits/generalization_sets/fp_homologs_gen_split_2.csv
8c4a7776d62a2bb18871bdc35da676a2



Manually verified that these results are reproducible by running the notebook twice, top to bottom, and confirming that the MD5 checksums match.

## Sync back up to S3

In [13]:
# Post publication note: Disabling sync to read-only bucket.
#data_io_utils.sync_local_path_to_s3(paths.TTS_SPLITS_DIR)