# Summary


- There are many many duplicates produced by the COSMIC SQL query, not sure why...

- 16 GB RAM is _just_ enough to run this notebook...

-----

# Imports

In [3]:
%run imports.ipynb

2016-07-10 20:22:28.876320


In [4]:
NOTEBOOK_NAME = 'core_load_data'
os.environ['NOTEBOOK_NAME'] = NOTEBOOK_NAME

In [7]:
DATASETS = [
    'alascan_gpk',
    'curatedprotherm',
    'guerois',
    'kellogg',
    'potapov',
    'protherm',
    'taipale',
    'humsavar',
    'clinvar',
    'cosmic',
]

# Load data

## Construct SQL query

In [13]:
# db_remote
db_remote = datapkg.MySQL(
    connection_string=os.environ['DATAPKG_CONNECTION_STR'] + '/elaspic', 
    shared_folder=os.environ['NOTEBOOK_NAME'], 
    storage_host=None, 
    echo=False, 
    db_engine='MyISAM')

In [4]:
# Combine experimental mutation sets with ELASPIC features
sql_query_template = """\
SELECT *

FROM ({subquery}) p

JOIN {uniprot_sequence_subquery}
JOIN {elaspic_schema}.uniprot_domain d using (uniprot_id)
LEFT JOIN {elaspic_schema}.uniprot_domain_template t using (uniprot_domain_id)
LEFT JOIN {elaspic_schema}.uniprot_domain_model m using (uniprot_domain_id)
LEFT JOIN {elaspic_schema}.uniprot_domain_mutation mut using (uniprot_id, uniprot_domain_id, mutation)

-- These two lines control whether we include mutations which fall outside domain
WHERE (
    (model_domain_def is not NULL and elaspic.MUTATION_IN_DOMAIN(mutation, model_domain_def)) OR 
    (model_domain_def is NULL and model_errors is NULL and elaspic.MUTATION_IN_DOMAIN(mutation, domain_def))
) 
AND (mutation REGEXP '^[A-Za-z][0-9]+[A-Za-z]$') 
AND (SUBSTRING(mutation, 1, 1) != SUBSTRING(mutation, -1))
"""

In [5]:
# Load dataset queries
subqueries = dict()

subqueries['protein_folding_energy'] = """
SELECT
    uniprot_id uniprot_id,
    uniprot_mutation mutation,
    ddg_exp,
    null del_score_exp,
    null del_class_exp
FROM protein_folding_energy.{table_name}
WHERE ddg_exp is not null  -- remove mutations with no experimental ddG
-- AND (uniprot_mutation REGEXP '^[A-Za-z][0-9]+[A-Za-z]$')  -- remove weird mutations
-- AND (SUBSTRING(uniprot_mutation, 1, 1) != SUBSTRING(uniprot_mutation, -1))  -- remove synonymous mutations
"""


subqueries['humsavar'] = """
SELECT
    uniprot_id uniprot_id,
    uniprot_mutation mutation,
    null ddg_exp,
    null del_score_exp,
    CASE type_of_variant WHEN 'Polymorphism' THEN 0 WHEN 'Disease' then 1 ELSE NULL END del_class_exp
FROM uniprot.humsavar
WHERE type_of_variant = 'Polymorphism' OR type_of_variant = 'Disease'
"""


subqueries['clinvar'] = """
SELECT
    v.mutationassessor_uniprotid uniprot_name,
    v.mutationassessor_variant mutation,
    null ddg_exp,
    v.mutationassessor_score del_score_exp,
    0 del_class_exp
FROM clinvar_local.clinvar_benign c
JOIN dbnsfp.variant v ON (c.id = v.rs_dbsnp146)
WHERE v.mutationassessor_uniprotid IS NOT NULL AND v.mutationassessor_variant IS NOT NULL

    UNION ALL

SELECT 
    v.mutationassessor_uniprotid uniprot_name,
    v.mutationassessor_variant mutation,
    null ddg_exp,
    v.mutationassessor_score del_score_exp,
    1 del_class_exp
FROM clinvar_local.clinvar c
JOIN dbnsfp.variant v ON (c.id = v.rs_dbsnp146)
WHERE v.mutationassessor_uniprotid IS NOT NULL AND v.mutationassessor_variant IS NOT NULL
"""


subqueries['cosmic'] = """
SELECT 
    v.mutationassessor_uniprotid uniprot_name,
    v.mutationassessor_variant mutation,
    NULL ddg_exp,
    cme.fathmm_score del_score_exp,
    CASE cme.fathmm_prediction WHEN 'NEUTRAL' THEN 0 WHEN 'PATHOGENIC' THEN 1 ELSE NULL END del_class_exp
FROM cosmic_new.cosmic_coding_muts ccm 
JOIN cosmic_new.cosmic_mutant_export cme ON (cme.mutation_id = ccm.id)
JOIN dbnsfp.variant v ON (ccm.chrom = v.chr AND ccm.pos = v.pos_1based AND ccm.ref = v.ref AND ccm.alt = v.alt)
WHERE cme.fathmm_score IS NOT NULL AND cme.fathmm_prediction IS NOT NULL
"""


assert not any((';' in v) for v in subqueries.values())

In [6]:
# Load dataset functions
def load_dataset(table_name, elaspic_schema, engine=None):
    """
    """
    if engine is None:
        engine = sa.create_engine(os.environ['DATAPKG_CONNECTION_STR'] + '/elaspic')

    # Create sql query
    if table_name in ['humsavar', 'clinvar', 'cosmic']:
        subquery = subqueries[table_name]
    else:
        subquery = subqueries['protein_folding_energy'].format(table_name=table_name)

    if table_name in ['clinvar', 'cosmic']:
        # Do not join on a table with splice variants if using `uniprot_name`
        uniprot_sequence_subquery = """\
(SELECT uniprot_acc uniprot_id, uniprot_id uniprot_name, uniprot_sequence 
 FROM uniprot_kb_proteomes.UP000005640_9606_fasta) us USING (uniprot_name) \
"""
    else:
        uniprot_sequence_subquery = """\
uniprot_kb.uniprot_sequence us USING (uniprot_id) \
"""

    sql_query = sql_query_template.format(
            elaspic_schema=elaspic_schema,
            subquery=subquery, 
            uniprot_sequence_subquery=uniprot_sequence_subquery)

#     cache_file = op.join(NOTEBOOK_NAME, table_name + '.tsv.gz')
#     if op.isfile(cache_file):
#         logger.info("Reading cache file '{}'...".format(cache_file))
#         sql_query += ' LIMIT 0'
#         df_header = pd.read_sql_query(sql_query, engine)
#         df = pd.read_csv(cache_file, sep='\t', na_values=['\\N'], names=df_header.columns)
#     else:
    df = pd.read_sql_query(sql_query, engine)
    
    # Read sql query
    df = df.rename(columns={'mutation': 'uniprot_mutation'})
    # df = ascommon.df_tools.remove_duplicate_columns(df)
    print("{} [{}]: ({})".format(table_name, elaspic_schema, df.shape[0]))
    
    # === All these rules could be converted to SQL... ===
    # Remove rows with null mutations
    null_mutants_ = (df['uniprot_mutation'].isnull()) | (df['uniprot_mutation'] == '-')
    print("Removing {} null mutants...".format(null_mutants_.sum()))
    df = df[~null_mutants_]

    # Remove SIFTS errors
    sifts_errors_ = df['uniprot_mutation'] == '?'
    print("Removing {} sifts errors...".format(sifts_errors_.sum()))
    df = df[~sifts_errors_]

    # Remove rows with weird mutations
    multi_mutants_ = df['uniprot_mutation'].str.contains(',')
    print("Removing {} multi mutants...".format(multi_mutants_.sum()))
    df = df[~multi_mutants_]

    # Remove wild-type rows
    wild_ = (df['uniprot_mutation'].str.lower() == 'wild')
    print("Removing {} wild mutants...".format(wild_.sum()))
    df = df[~wild_]

    # Remove synonymous mutations
    synonymous_ = df['uniprot_mutation'].str[0] == df['uniprot_mutation'].str[-1]
    print("Removing {} synonymous mutants...".format(synonymous_.sum()))
    df = df[~synonymous_]

    # Remove sequence mismatch mutations
    mutation_matches_sequence_ = (
        df[['uniprot_mutation', 'uniprot_sequence']]
        .apply(lambda x: ascommon.sequence_tools.mutation_matches_sequence(*x), axis=1)
    )
    mutation_matches_sequence_ = mutation_matches_sequence_.fillna(False)
    print("Removing {} mutations not matching sequence...".format((~mutation_matches_sequence_).sum()))
    df = df[mutation_matches_sequence_]

    # Convert ELASPIC features to expected format
    shape_before = df.shape[0]
    df = elaspic.elaspic_predictor.format_mutation_features(df, 'core')
    df = elaspic.elaspic_predictor.convert_features_to_differences(df)
    shape_after = df.shape[0]
    assert shape_before == shape_after
    
    # DONE!
    print('-' * 80, flush=True)
    return df

## Run SQL query

In [7]:
os.makedirs(NOTEBOOK_NAME, exist_ok=True)

In [8]:
logger.setLevel(logging.DEBUG)

### DATA

In [10]:
# DATA
params = []
for table_name in DATASETS:
    # DATA[table_name] = load_dataset(table_name, 'elaspic', db.engine)
    params.append((table_name, table_name, 'elaspic'))
    if table_name not in ['humsavar', 'clinvar', 'cosmic']:
        # DATA[table_name + '_diffseqi'] = load_dataset(table_name, 'elaspic_training', db.engine)
        params.append((table_name + '_diffseqi', table_name, 'elaspic_training_core'))

# Get data from database
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(len(params)) as p:
    results = p.map(lambda x: load_dataset(*x[1:]), params)

# Combine into one dict
DATA = {p[0]: r for p, r in zip(params, results)}

alascan_gpk [elaspic]: (743)
Removing 0 null mutants...
kellogg [elaspic]: (1109)
guerois [elaspic]: (903)
Removing 0 null mutants...
Removing 0 null mutants...
Removing 0 sifts errors...
Removing 0 sifts errors...
Removing 0 sifts errors...
Removing 0 multi mutants...
Removing 0 multi mutants...
Removing 0 multi mutants...
Removing 0 wild mutants...
Removing 0 wild mutants...
Removing 0 wild mutants...
Removing 0 synonymous mutants...
Removing 0 synonymous mutants...
Removing 0 synonymous mutants...
Removing 1 mutations not matching sequence...
Removing 1 mutations not matching sequence...
Removing 3 mutations not matching sequence...
protherm [elaspic]: (5682)
kellogg [elaspic_training]: (2830)
taipale [elaspic]: (1920)
alascan_gpk [elaspic_training]: (2260)
Removing 0 null mutants...
Removing 0 null mutants...
Removing 0 null mutants...
Removing 0 null mutants...
Removing 0 sifts errors...
protherm [elaspic_training]: (17127)
Removing 0 sifts errors...
Removing 0 sifts errors...
tai

In [14]:
with open(op.join(NOTEBOOK_NAME, 'DATA.pkl'), 'wb') as ofh:
    pickle.dump(DATA, ofh, pickle.HIGHEST_PROTOCOL)

### DATA_DF

Combine data into a DataFrame.

All $\Delta \Delta G$ datasets go into a single "protherm" dataset.

In [5]:
with open(op.join(NOTEBOOK_NAME, 'DATA.pkl'), 'rb') as ifh:
    DATA = pickle.load(ifh)

In [8]:
# In case of duplicates, the dataset occuring FIRST has the priority
df_list = []
dataset_map = {
    # Best sequence identity
    'alascan_gpk': 'protherm',
    'curatedprotherm': 'protherm',
    'guerois': 'protherm',
    'kellogg': 'protherm',
    'potapov': 'protherm',
    'protherm': 'protherm',
    #
    'taipale': 'taipale',
    #
    'humsavar': 'humsavar',
    'clinvar': 'clinvar',
    'cosmic': 'cosmic',
}

def append_to_df_list(key, suffix):
    df = DATA[key + suffix]
    df['dataset'] = dataset_map[key] + suffix
    df_list.append(df)

for key in DATASETS:
    if key not in ['humsavar', 'clinvar', 'cosmic']:
        suffixes = ['', '_diffseqi']
    else:
        suffixes = ['']
    for suffix in suffixes:
        append_to_df_list(key, suffix)

DATA_DF = pd.concat(df_list, ignore_index=True)

assert DATA_DF.shape[0] == sum(df.shape[0] for df in DATA.values())

In [9]:
for c in ['ddg_exp', 'del_score_exp', 'del_class_exp']:
    DATA_DF[c] = DATA_DF[c].astype(float)

In [23]:
# Set 'max_seq_identity' where it is missing
DATA_DF.loc[DATA_DF['max_seq_identity'].isnull(), 'max_seq_identity'] = (
    DATA_DF[DATA_DF['max_seq_identity'].isnull()]
    ['alignment_identity']
    .apply(local.get_max_seq_identity)
)

In [10]:
display(DATA_DF.head())
print(DATA_DF.shape)

Unnamed: 0,alignment_coverage,alignment_def,alignment_filename,alignment_identity,alignment_score,alignment_subdefs,backbone_clash_change,backbone_clash_wt,backbone_hbond_change,backbone_hbond_wt,cath_id,chain,chain_modeller,cis_bond_change,cis_bond_wt,dataset,db,ddg,ddg_exp,del_class_exp,del_score_exp,dg_change,dg_wt,disulfide_change,disulfide_wt,domain_def,domain_end,domain_start,electrostatic_kon_change,electrostatic_kon_wt,electrostatics_change,electrostatics_wt,energy_ionisation_change,energy_ionisation_wt,entropy_complex_change,entropy_complex_wt,entropy_mainchain_change,entropy_mainchain_wt,entropy_sidechain_change,entropy_sidechain_wt,gene_name,helix_dipole_change,helix_dipole_wt,m_date_modified,matrix_score,max_seq_identity,mloop_entropy_change,mloop_entropy_wt,model_domain_def,model_errors,model_filename,model_filename_mut,model_filename_wt,mut_date_modified,mutation_errors,mutation_modeller,norm_dope,number_of_residues,organism_name,partial_covalent_bonds_change,partial_covalent_bonds_wt,path_to_data,pcv_hbond_change,pcv_hbond_self_change,pcv_hbond_self_wt,pcv_hbond_wt,pcv_salt_equal_change,pcv_salt_equal_self_change,pcv_salt_equal_self_wt,pcv_salt_equal_wt,pcv_salt_opposite_change,pcv_salt_opposite_self_change,pcv_salt_opposite_self_wt,pcv_salt_opposite_wt,pcv_vdw_change,pcv_vdw_self_change,pcv_vdw_self_wt,pcv_vdw_wt,pdbfam_idx,pdbfam_name,pfam_clan,pfam_names,protein_existence,protein_name,provean_score,sasa_score,secondary_structure_change,secondary_structure_wt,sequence_version,sidechain_hbond_change,sidechain_hbond_wt,sloop_entropy_change,sloop_entropy_wt,solvation_hydrophobic_change,solvation_hydrophobic_wt,solvation_polar_change,solvation_polar_wt,solvent_accessibility_change,solvent_accessibility_wt,t_date_modified,template_errors,torsional_clash_change,torsional_clash_wt,uniprot_domain_id,uniprot_domain_id_old,uniprot_id,uniprot_mutation,uniprot_name,uniprot_sequence,van_der_waals_change,van_der_waals_clashes_change,van_der_waals_clashes_wt,van_der_waals_wt,water_bridge_change,water_bridge_wt
0,100.0,67:156,P00648_1b27A.aln,100.0,1.0,67:156,-0.11,63.13,-0.02,-55.36,1b27A00,A,A,0.0,0.0,protherm,sp,0.8295,0.09,,,-0.58,41.52,0.0,0.0,48:157,157,48,0.0,0.0,0.0,-7.04,0.0,0.36,0.0,0.0,-0.28,151.88,-0.85,63.15,,0.0,-1.96,2014-12-22 21:05:37,-1.0,,0.0,0.0,48:157,,P00648.B99990001.pdb,P00648_Q151A/MUT_RepairPDB_P00648.B99990001_1.pdb,P00648_Q151A/WT_RepairPDB_P00648.B99990001_1.pdb,2014-08-07 23:06:32,,Q104A,-1.33777,110.0,Bacillus amyloliquefaciens,0.0,0.0,bacam/P00/64/P00648/Ribonuclease*67-156/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,1,Ribonuclease,Ribonuclease,Ribonuclease,1.0,Ribonuclease,-2.16,"120.104122279,87.543977591,49.8328050713,24.78...",0,6.0,2.0,0.78,-26.85,0.0,0.0,0.47,-158.13,-0.8,159.06,-18.0192,67.177,2015-09-01 05:12:36,,-0.01,13.7,24447509,,P00648,Q151A,RNBR_BACAM,MMKMEGIALKKRLSWISVCLLVLVSAAGMLFSTAAKTETSSHKAHT...,0.43,-0.28,23.57,-120.88,0.0,0.0
1,100.0,67:156,P00648_1b27A.aln,100.0,1.0,67:156,-0.4,63.28,0.08,-56.0,1b27A00,A,A,0.0,0.0,protherm,sp,0.244914,1.63,,,-3.04,41.46,0.0,0.0,48:157,157,48,0.0,0.0,0.03,-6.92,0.0,0.36,0.0,0.0,-0.23,151.98,-1.21,63.42,,0.0,-1.96,2014-12-22 21:05:37,-2.0,,0.0,0.0,48:157,,P00648.B99990001.pdb,P00648_I156A/MUT_RepairPDB_P00648.B99990001_1.pdb,P00648_I156A/WT_RepairPDB_P00648.B99990001_1.pdb,2014-08-07 23:03:33,,I109A,-1.33777,110.0,Bacillus amyloliquefaciens,0.0,0.0,bacam/P00/64/P00648/Ribonuclease*67-156/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,1,Ribonuclease,Ribonuclease,Ribonuclease,1.0,Ribonuclease,-2.917,"120.104122279,87.543977591,49.8328050713,24.78...",0,0.0,2.0,0.0,-27.66,0.0,0.0,3.03,-158.81,-1.25,159.87,5.9086,13.662,2015-09-01 05:12:36,,-0.55,13.69,24447509,,P00648,I156A,RNBR_BACAM,MMKMEGIALKKRLSWISVCLLVLVSAAGMLFSTAAKTETSSHKAHT...,1.73,-4.68,24.91,-121.42,0.0,0.0
2,100.0,67:156,P00648_1b27A.aln,100.0,1.0,67:156,-0.06,63.31,0.01,-55.95,1b27A00,A,A,0.0,0.0,protherm,sp,0.602505,3.47,,,1.03,42.11,0.0,0.0,48:157,157,48,0.0,0.0,0.0,-7.0,0.0,0.36,0.0,0.0,-0.22,151.97,-0.47,63.32,,0.0,-1.96,2014-12-22 21:05:37,0.0,,0.0,0.0,48:157,,P00648.B99990001.pdb,P00648_V57A/MUT_RepairPDB_P00648.B99990001_1.pdb,P00648_V57A/WT_RepairPDB_P00648.B99990001_1.pdb,2014-08-07 07:15:52,,V10A,-1.33777,110.0,Bacillus amyloliquefaciens,0.0,0.0,bacam/P00/64/P00648/Ribonuclease*67-156/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,1,Ribonuclease,Ribonuclease,Ribonuclease,1.0,Ribonuclease,-3.993,"120.104122279,87.543977591,49.8328050713,24.78...",0,4.0,2.0,0.02,-27.66,0.0,0.0,2.09,-158.3,-0.37,159.86,0.0,0.0,2015-09-01 05:12:36,,-0.81,13.66,24447509,,P00648,V57A,RNBR_BACAM,MMKMEGIALKKRLSWISVCLLVLVSAAGMLFSTAAKTETSSHKAHT...,1.04,-0.26,24.97,-121.16,0.0,0.0
3,100.0,67:156,P00648_1b27A.aln,100.0,1.0,67:156,-0.18,63.11,0.07,-56.37,1b27A00,A,A,0.0,0.0,protherm,sp,1.36021,0.39,,,0.02,41.92,0.0,0.0,48:157,157,48,0.0,0.0,0.46,-7.31,0.0,0.38,0.0,0.0,-0.24,152.13,-0.54,63.01,,0.19,-2.03,2014-12-22 21:05:37,-2.0,,0.0,0.0,48:157,,P00648.B99990001.pdb,P00648_D59A/MUT_RepairPDB_P00648.B99990001_1.pdb,P00648_D59A/WT_RepairPDB_P00648.B99990001_1.pdb,2014-08-07 06:15:26,,D12A,-1.33777,110.0,Bacillus amyloliquefaciens,0.0,0.0,bacam/P00/64/P00648/Ribonuclease*67-156/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.0,0.0,1,Ribonuclease,Ribonuclease,Ribonuclease,1.0,Ribonuclease,-4.217,"120.104122279,87.543977591,49.8328050713,24.78...",0,4.0,2.0,0.48,-27.11,0.0,0.0,0.09,-158.13,-0.73,160.0,-4.699,52.7104,2015-09-01 05:12:36,,-0.01,13.58,24447509,,P00648,D59A,RNBR_BACAM,MMKMEGIALKKRLSWISVCLLVLVSAAGMLFSTAAKTETSSHKAHT...,0.23,0.0,24.97,-121.18,0.0,0.0
4,100.0,67:156,P00648_1b27A.aln,100.0,1.0,67:156,-0.1,63.13,0.15,-55.89,1b27A00,A,A,0.0,0.0,protherm,sp,0.438801,3.0,,,1.71,41.49,0.0,0.0,48:157,157,48,0.0,0.0,-0.03,-6.98,0.0,0.36,0.0,0.0,-0.5,151.48,-1.27,62.78,,0.0,-1.98,2014-12-22 21:05:37,-2.0,,0.0,0.0,48:157,,P00648.B99990001.pdb,P00648_Y60A/MUT_RepairPDB_P00648.B99990001_1.pdb,P00648_Y60A/WT_RepairPDB_P00648.B99990001_1.pdb,2014-08-07 05:31:44,,Y13A,-1.33777,110.0,Bacillus amyloliquefaciens,0.0,0.0,bacam/P00/64/P00648/Ribonuclease*67-156/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,0.0,1,Ribonuclease,Ribonuclease,Ribonuclease,1.0,Ribonuclease,-8.45,"120.104122279,87.543977591,49.8328050713,24.78...",0,4.0,2.0,0.0,-27.55,0.0,0.0,2.38,-157.41,-0.35,158.39,-23.5427,35.8754,2015-09-01 05:12:36,,-0.01,13.64,24447509,,P00648,Y60A,RNBR_BACAM,MMKMEGIALKKRLSWISVCLLVLVSAAGMLFSTAAKTETSSHKAHT...,1.36,-0.02,24.98,-120.34,0.0,0.0


(947059, 115)


In [11]:
DATA_DF = DATA_DF.drop_duplicates(
    subset=['uniprot_id', 'uniprot_domain_id', 'max_seq_identity', 'uniprot_mutation', 'dataset'], 
    keep='first')
print(DATA_DF.shape)

(498708, 115)


In [12]:
pprint(set(DATA_DF['dataset']))

{'clinvar',
 'cosmic',
 'humsavar',
 'protherm',
 'protherm_diffseqi',
 'taipale',
 'taipale_diffseqi'}


## Add a train / test split

In [57]:
VALIDATION_DATASETS = [
    'taipale',
    'taipale_diffseqi',
    'humsavar',
    'clinvar',
    'cosmic',    
]

In [58]:
# Add a '_train' and '_test' columns
ls_all = []

for dataset in VALIDATION_DATASETS:
    ls_train = []
    ls_test = []

    df = DATA_DF[(DATA_DF['dataset'] == dataset)]
    
    if df['ddg_exp'].notnull().all():
        df = df.sort_values('ddg_exp')
        df1 = df[:df.shape[0] // 2]
        df2 = df[df.shape[0] // 2:]
        assert df.shape[0] == (df1.shape[0] + df2.shape[0])
    elif df['del_class_exp'].notnull().all():
        df1 = df[df['del_class_exp'] == 0]
        df2 = df[df['del_class_exp'] != 0]
        assert df.shape[0] == (df1.shape[0] + df2.shape[0])
    else:
        raise Exception
        
    df_train = pd.concat([df1[:-df1.shape[0] // 3], df2[:-df2.shape[0] // 3]], ignore_index=True)
    df_test = pd.concat([df1[-df1.shape[0] // 3:], df2[-df2.shape[0] // 3:]], ignore_index=True)
    
    df_train['dataset'] = dataset + '_train'
    df_test['dataset'] = dataset + '_test'

    assert DATA_DF[(DATA_DF['dataset'] == dataset)].shape[0] == (df_train.shape[0] + df_test.shape[0])
    
    ls_all.extend([df_train, df_test])

df_all = pd.concat(ls_all, ignore_index=True)

In [59]:
# Make sure that '_train' in one does not appear in '_test' in other
train_mutations = set()

for dataset in VALIDATION_DATASETS:
    
    train_mutations |= set(
        df_all[df_all['dataset'] == dataset + '_train']
        [['uniprot_id', 'uniprot_mutation']]
        .apply('.'.join, axis=1))

    df_all.loc[(
        (df_all['dataset'] == dataset + '_test') &
        (df_all[['uniprot_id', 'uniprot_mutation']].apply('.'.join, axis=1).isin(train_mutations))), 
        'dataset'] = dataset + '_train'

In [60]:
# Print stats
display(df_all['dataset'].drop_duplicates())

for dataset in VALIDATION_DATASETS:
    n_train = df_all[df_all['dataset'] == dataset + '_train'].shape[0]
    n_test = df_all[df_all['dataset'] == dataset + '_test'].shape[0]
    print('{:20s} {:10d} {:10d} {:10.4f}'.format(dataset, n_train, n_test, n_test / (n_train + n_test)))

0                  taipale_train
1280                taipale_test
1920      taipale_diffseqi_train
4980       taipale_diffseqi_test
6507              humsavar_train
28459              humsavar_test
39436              clinvar_train
85142               clinvar_test
107997              cosmic_train
358956               cosmic_test
Name: dataset, dtype: object

taipale                    1280        640     0.3333
taipale_diffseqi           3122       1465     0.3194
humsavar                  22186      10743     0.3262
clinvar                   48509      20052     0.2925
cosmic                   252758     123682     0.3286


In [61]:
DATA_DF_TT = pd.concat([DATA_DF, df_all], ignore_index=True)
assert DATA_DF_TT.shape[0] == (DATA_DF.shape[0] + df_all.shape[0])

## Save

In [62]:
DATA_DF.to_pickle(op.join(NOTEBOOK_NAME, 'DATA_DF.pkl'))

In [63]:
DATA_DF_TT.to_pickle(op.join(NOTEBOOK_NAME, 'DATA_DF_TT.pkl'))

In [64]:
ls $NOTEBOOK_NAME -alh

total 5.5G
drwxrwxr-x 2 strokach kimlab 4.0K Jul 10 21:44 [0m[01;34m.[0m/
drwxrwxr-x 9 strokach kimlab 4.0K Jul 10 22:01 [01;34m..[0m/
-rw------- 1 strokach kimlab  59M Jun 20 10:25 clinvar.tsv.gz
-rw------- 1 strokach kimlab 281M Jun 20 10:22 cosmic.tsv.gz
-rw------- 1 strokach kimlab 1.8G Jul 10 22:02 DATA_DF.pkl
-rw------- 1 strokach kimlab 2.2G Jul 10 22:02 DATA_DF_TT.pkl
-rw------- 1 strokach kimlab 3.4G Jun 29 17:53 DATA.pkl
