# Summary

Create the `elaspic_training_core` database and fill it with **core** training data.

-----

This is where we find structural templates for our training set, at different sequence identity cutoffs.

max_seq_identity | function
-----------------|--------------
100              | > 80
80               | > 60 & <= 80
60               | > 40 & <= 60
40               | <= 40

We create the `elaspic_training` database and fill it with training data.

We do this for the following training sets only:

  - protherm
  - taipale

----

# Imports

In [1]:
%run imports.ipynb

2016-07-20 16:41:47.430650


In [2]:
NOTEBOOK_NAME = 'elaspic_training_core'
os.makedirs(NOTEBOOK_NAME, exist_ok=True)

os.environ['NOTEBOOK_NAME'] = NOTEBOOK_NAME
os.environ['DB_PORT'] = '8309'

In [3]:
%run mysqld.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2016-07-20 16:41:47.659437


# Create database

In [4]:
CONNECTION_STRING = (
    elaspic.helper.make_connection_string(**{
        **elaspic.helper.parse_connection_string(db.connection_string),
        'db_schema': NOTEBOOK_NAME
    }))
print(CONNECTION_STRING)

mysql://root:@localhost:8309/elaspic_training_core?unix_socket=/tmp/strokach/elaspic_training_set/mysql.sock


In [5]:
!elaspic database --connection_string="$CONNECTION_STRING" create --drop_schema

Creating 'temp_dir' folder: /tmp/strokach/elaspic...
Creating 'sequence_dir' folder: /tmp/strokach/elaspic/xcx5kbyb/sequence...
No value provided for key: 'archive_type'
Creating 'model_dir' folder: /tmp/strokach/elaspic/xcx5kbyb/model...
No value provided for key: 'blast_db_dir'
Creating 'modeller_dir' folder: /tmp/strokach/elaspic/xcx5kbyb/model/modeller...
Creating 'tcoffee_dir' folder: /tmp/strokach/elaspic/xcx5kbyb/model/tcoffee...
Creating 'archive_temp_dir' folder: /tmp/strokach/elaspic/archive...
No value provided for key: 'archive_dir'
Creating 'data_dir' folder: /home/kimlab1/strokach/working/elaspic/elaspic/data...
Creating 'unique_temp_dir' folder: /tmp/strokach/elaspic/xcx5kbyb...
No value provided for key: 'pdb_dir'
Creating 'provean_temp_dir' folder: /tmp/strokach/elaspic/xcx5kbyb/sequence/provean_temp...
Opened database connection using engine: 'Engine(mysql://root:***@localhost:8309/elaspic_training_core?unix_socket=/tmp/strokach/elaspic_training_set/mysql.sock)'
Using

In [6]:
SCRIPT_NAME = 'elaspic_training_mod.sql'

In [7]:
!cp ../scripts/$SCRIPT_NAME $NOTEBOOK_NAME/$SCRIPT_NAME

In [8]:
!sed -i "s|{{{{ db_schema_name }}}}|$NOTEBOOK_NAME|g" $NOTEBOOK_NAME/$SCRIPT_NAME

In [9]:
!head $NOTEBOOK_NAME/$SCRIPT_NAME

-- Replace 'elaspic_training_core' with the actual DB_SCHEMA_NAME.

-- Use the right training schema
USE elaspic_training_core;


-- Create views
DROP TABLE IF EXISTS domain;



In [10]:
!mysql -vvv -u root --socket '{mysqld.socket}' < "$NOTEBOOK_NAME/$SCRIPT_NAME"

--------------
DROP TABLE IF EXISTS domain
--------------

Query OK, 0 rows affected (0.01 sec)

--------------
--------------

Empty set (0.00 sec)

--------------
CREATE VIEW domain AS SELECT * FROM elaspic.domain
--------------

Query OK, 0 rows affected (0.00 sec)

--------------
DROP TABLE IF EXISTS domain_contact
--------------

Query OK, 0 rows affected (0.00 sec)

--------------
CREATE VIEW domain_contact AS SELECT * FROM elaspic.domain_contact
--------------

Query OK, 0 rows affected (0.01 sec)

--------------
DROP TABLE IF EXISTS uniprot_sequence
--------------

Query OK, 0 rows affected (0.00 sec)

--------------
CREATE VIEW uniprot_sequence AS SELECT * FROM elaspic.uniprot_sequence
--------------

Query OK, 0 rows affected (0.01 sec)

--------------
DROP TABLE IF EXISTS provean
--------------

Query OK, 0 rows affected (0.00 sec)

--------------
CREATE VIEW provean AS SELECT * FROM elaspic.provean
--------------

Query OK, 0 rows affected (0.00 sec)

--------------
ALTER T

# Temporary hack

For now, data is stored in the DATAPKG central database. This will have to change next time we have to update the training set.

In [None]:
# Deliberate stop: raising here halts a "Run All" before the cells below are reached.
# NOTE(review): presumably a guard because the cells below depend on the central
# DATAPKG database (see the "Temporary hack" note) — confirm before removing.
raise Exception

# Load data

In [None]:
# Load the pickled core training sets and show which keys are available.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open(op.join('core_load_data', 'TRAINING_SETS_CORE.pkl'), 'rb') as ifh:
    TRAINING_SETS = pickle.load(ifh)
print(TRAINING_SETS.keys())

In [None]:
db_remote = datapkg.MySQL(
    connection_string=os.environ['DATAPKG_CONNECTION_STR'] + '/elaspic', 
    shared_folder=os.environ['NOTEBOOK_NAME'], 
    storage_host=None, 
    echo=False, 
    db_engine='InnoDB'
)

In [None]:
TRAINING_SETS['protherm'].head()

## all_mutations_ddg

In [None]:
# Stack the two core training sets into one frame with a fresh 0..n-1 index.
all_mutations_ddg = pd.concat(
    [TRAINING_SETS[name] for name in ('protherm', 'taipale')],
    ignore_index=True,
)

# The combined table must not contain fully duplicated rows.
assert len(all_mutations_ddg.drop_duplicates()) == len(all_mutations_ddg)

In [None]:
all_mutations_ddg.head()

## all_training_data

We can only get models at different sequence identities for the $\Delta \Delta G$ predictor.

In [None]:
all_training_data = (
    all_mutations_ddg
    [['uniprot_id', 'uniprot_domain_id', 'uniprot_sequence']]
    .drop_duplicates()
)

In [None]:
display(all_training_data.head())
print(all_training_data.shape[0])

## all_training_data_2

Add domain and sequence info.

In [None]:
# Unique domain ids of the training set, sorted so the generated query text is
# deterministic from run to run.
domain_ids = sorted(set(all_training_data['uniprot_domain_id']))
# An empty id list would render as `in ()`, which MySQL rejects as a syntax error.
assert domain_ids, "No uniprot_domain_id values to query for."

# Join each training domain with its template rows and its UniProt name.
# The table is `uniprot_domain` aliased as `ud`; the name and alias must be
# separated by a space (previously fused into `uniprot_domainud`).
sql_query = """\
select *
from elaspic.uniprot_domain ud
join elaspic.uniprot_domain_template udm using (uniprot_domain_id)
join (select uniprot_id, uniprot_name from uniprot_kb.uniprot_sequence) us using (uniprot_id)
where uniprot_domain_id in ({});
""".format(", ".join(str(i) for i in domain_ids))

In [None]:
print(sql_query[:200], '...')

In [None]:
uniprot_sequence = pd.read_sql_query(sql_query, db.engine)

In [None]:
display(uniprot_sequence.head())
print(uniprot_sequence.shape)

In [None]:
all_training_data_2 = all_training_data.merge(uniprot_sequence)

In [None]:
all_training_data_2.head(2)

In [None]:
display(all_training_data_2.head())
print(all_training_data_2.shape[0])

In [None]:
def get_domain_sequence(sequence, domain_def):
    """Return the part of `sequence` covered by `domain_def`.

    Parameters
    ----------
    sequence : str
        Full sequence.
    domain_def : str
        Domain boundaries written as ``'start:end'``, 1-based and inclusive
        on both ends.

    Returns
    -------
    str
        The residues from position `start` through position `end`.

    Raises
    ------
    ValueError
        If `domain_def` does not consist of exactly two integers separated by ':'.
    """
    first_residue, last_residue = map(int, domain_def.split(':'))
    # Convert the 1-based inclusive start into a 0-based slice offset; the
    # inclusive end coincides with Python's exclusive slice stop.
    offset = first_residue - 1
    return sequence[offset:last_residue]


# Sanity checks: prefix, whole sequence, and suffix.
for _domain_def, _expected in [('1:3', 'ABC'), ('1:7', 'ABCDEFG'), ('4:7', 'DEFG')]:
    assert get_domain_sequence('ABCDEFG', _domain_def) == _expected

In [None]:
# Cut the domain out of each full-length sequence using its `domain_def`.
# Each row is unpacked positionally as (uniprot_sequence, domain_def).
all_training_data_2['uniprot_domain_sequence'] = (
    all_training_data_2[['uniprot_sequence', 'domain_def']]
    .apply(lambda x: get_domain_sequence(*x), axis=1)
)


In [None]:
all_training_data_2['alignment_identity'].hist()

In [None]:
all_training_data_2_bak = all_training_data_2.copy()

## structural_templates

In [None]:
all_training_data_2 = all_training_data_2_bak.copy()

In [None]:
import concurrent.futures

SHARED_COLUMNS = ['uniprot_id', 'uniprot_domain_id', 'uniprot_sequence', 'uniprot_domain_sequence']
DATAPKG_DIR = op.dirname(op.dirname(op.dirname(op.abspath(NOTEBOOK_NAME))))
BLAST_DB = op.abspath(op.join(DATAPKG_DIR, 'profs', 'output', 'libraries_all_together_db', 'libraries_all'))
print(BLAST_DB)


def worker(x):
    """BLAST one training domain against `BLAST_DB` and annotate the hits.

    Parameters
    ----------
    x : tuple
        An ``(index, row)`` pair as produced by ``DataFrame.iterrows()``; the
        index is ignored. The row must provide 'uniprot_domain_sequence',
        'domain_def' and every column listed in `SHARED_COLUMNS`.

    Returns
    -------
    The annotated BLAST results with the `SHARED_COLUMNS` values of the query
    row copied onto every hit.
    """
    _, query = x
    domain_sequence = query['uniprot_domain_sequence']

    hits = ascommon.sequence_tools.blastp(
        sequence=domain_sequence,
        db=BLAST_DB,
        evalue=0.001,
        max_target_seqs=100000,
    )

    # `domain_def` is 'start:end'; only the start is passed to the annotation step.
    domain_start = int(query['domain_def'].split(':')[0])
    hits = local.annotate_blast_results(
        hits,
        domain_start=domain_start,
        domain_sequence_length=len(domain_sequence),
    )

    # Tag every hit with the identifying columns of the query domain.
    for shared_column in SHARED_COLUMNS:
        hits[shared_column] = query[shared_column]
    return hits


# Cache location for the BLAST results of all training domains.
structural_templates_file = op.join(NOTEBOOK_NAME, 'structural_templates.pkl')

if op.isfile(structural_templates_file):
    # A previous run already produced the results: reuse them.
    logger.info('Loading from file...')
    structural_templates = pd.read_pickle(structural_templates_file)
else:
    # No cache yet: run `worker` on every row in a process pool,
    # combine the per-domain frames, and store the result for next time.
    with concurrent.futures.ProcessPoolExecutor() as p:
        results_df_all = p.map(worker, all_training_data_2.iterrows())
        structural_templates = pd.concat(results_df_all, ignore_index=True)
        structural_templates.to_pickle(structural_templates_file)

In [None]:
_before = structural_templates.drop_duplicates(subset=['uniprot_id', 'uniprot_domain_id']).shape[0]
structural_templates = structural_templates.merge(all_training_data_2, on=SHARED_COLUMNS, suffixes=('', '_old'))
_after = structural_templates.drop_duplicates(subset=['uniprot_id', 'uniprot_domain_id']).shape[0]

assert _before == _after

In [None]:
display(structural_templates.head())
print(structural_templates.shape[0])
print(structural_templates.drop_duplicates(subset=['uniprot_id', 'uniprot_domain_id']).shape[0])

In [None]:
structural_templates['alignment_identity'].hist()

In [None]:
structural_templates_bak = structural_templates.copy()

# Find structural templates

In [None]:
structural_templates_2 = structural_templates_bak.copy()

In [None]:
# Map each template's alignment identity to a `max_seq_identity` value
# (presumably following the cutoff table in the summary at the top of the notebook).
# NOTE(review): `get_max_seq_identity` is not defined in any cell visible here —
# presumably it is provided by `%run imports.ipynb`; confirm on a fresh kernel.
structural_templates_2['max_seq_identity'] = (
    structural_templates_2['alignment_identity'].apply(get_max_seq_identity)
)

In [None]:
structural_templates_2['max_seq_identity'].hist()

In [None]:
structural_templates_2.plot(kind='scatter', x='alignment_identity', y='max_seq_identity')

In [None]:
display(structural_templates_2.head())
print(structural_templates_2.shape)

In [None]:
# Keep a single template per (domain, max_seq_identity) pair: after sorting by
# descending alignment score, the first row of each pair is the one retained.
structural_templates_2 = (
    structural_templates_2
    .sort_values('alignment_score', ascending=False)
    .drop_duplicates(subset=['uniprot_domain_id', 'max_seq_identity'])
)

In [None]:
print(all_training_data_2.shape)
print(structural_templates_2.shape)

In [None]:
structural_templates_2['alignment_identity'].hist()

In [None]:
structural_templates_2_bak = structural_templates_2.copy()

# Save to database

In [None]:
structural_templates_3 = structural_templates_2.copy()

structural_templates_3['uniprot_domain_id_old'] = structural_templates_3['uniprot_domain_id']
structural_templates_3['uniprot_domain_id'] = range(structural_templates_3.shape[0])

### Set *path_to_data*

In [None]:
import elaspic.elaspic_database

def get_path_to_data(X):
    """Build the `path_to_data` value for one training-set domain.

    Parameters
    ----------
    X : sequence of four items
        ``(uniprot_name, uniprot_id, pfam_clan, alignment_def)``, in that order.

    Returns
    -------
    The UniProt base path followed by the domain path, where the domain path is
    computed from `alignment_def` with '.training' appended.
    """
    uniprot_name, uniprot_id, pfam_clan, alignment_def = X
    # '.training' is appended so that these entries can never be confused with
    # real (non-training) data.
    training_alignment_def = alignment_def + '.training'
    # The base path is built the usual way so that existing Provean supporting
    # sets can still be retrieved.
    base_path = elaspic.elaspic_database.get_uniprot_base_path(
        uniprot_name=uniprot_name, uniprot_id=uniprot_id)
    domain_path = elaspic.elaspic_database.get_uniprot_domain_path(
        pfam_clan=pfam_clan, alignment_def=training_alignment_def)
    return base_path + domain_path

In [None]:
structural_templates_3['path_to_data'] = (
    structural_templates_3[['uniprot_name', 'uniprot_id', 'pfam_clan', 'alignment_def']]
    .apply(get_path_to_data, axis=1)
)

In [None]:
structural_templates_3['path_to_data'].head()

In [None]:
!mysql -u strokach -h 192.168.6.19 elaspic_training < ../scripts/create_database.sql

In [None]:
engine_et = sa.create_engine('mysql://strokach:@192.168.6.19/elaspic_training')

In [None]:
_uniprot_domain = pd.read_sql_query('select * from uniprot_domain limit 0', engine_et)
_uniprot_domain_template = pd.read_sql_query('select * from uniprot_domain_template limit 0', engine_et)
_uniprot_domain_model = pd.read_sql_query('select * from uniprot_domain_model limit 0', engine_et)
_uniprot_domain_mutation = pd.read_sql_query('select * from uniprot_domain_mutation limit 0', engine_et)

In [None]:
assert not set(_uniprot_domain.columns) - set(structural_templates_3.columns)

In [None]:
assert not set(_uniprot_domain_template.columns) - set(structural_templates_3.columns)

In [None]:
assert (structural_templates_3['uniprot_domain_id'].shape == 
        structural_templates_3['uniprot_domain_id'].drop_duplicates().shape)

In [None]:
structural_templates_3[_uniprot_domain.columns].to_sql(
    'uniprot_domain', engine_et, schema='elaspic_training', index=False, chunksize=1000, if_exists='append')

In [None]:
structural_templates_3[_uniprot_domain_template.columns].to_sql(
    'uniprot_domain_template', engine_et, schema='elaspic_training', index=False, chunksize=1000, if_exists='append')

# Run *elaspic* on training data

In [None]:
CONFIG_FILE_TRAINING = op.abspath('../scripts/config_file_training.ini')
CONFIG_FILE_TRAINING

In [None]:
SYSTEM_COMMAND_TEMPLATE = (
    "elaspic run -u {{uniprot_id}} -m {{uniprot_mutation}} -c {config_file_training}"
    .format(config_file_training=CONFIG_FILE_TRAINING)
)
SYSTEM_COMMAND_TEMPLATE

In [None]:
df = (
    all_mutations_ddg
    [['uniprot_id', 'uniprot_mutation']]
    .drop_duplicates()
)
assert len(df.index) == len(set(df.index))
df.head()

In [None]:
system_commands = list(zip(
    df.index,
    df.apply(lambda x: SYSTEM_COMMAND_TEMPLATE.format(**x), axis=1)
))

In [None]:
system_commands[:2]

In [None]:
logging.getLogger().setLevel(logging.INFO)

In [None]:
reload(ascommon)
reload(ascommon.cluster_tools)
reload(ascommon.cluster_tools._jobsubmitter)

In [None]:
# Configure the cluster job submitter for the `elaspic run` training commands.
# This cell only builds the submitter; nothing is submitted here.
js = ascommon.cluster_tools.JobSubmitter(
    job_name='_elaspic_training', 
    # Alternative PBS endpoint, currently disabled: connection_string='pbs://:@192.168.233.150', 
    connection_string='sge://:@192.168.6.201', 
    log_root_path=op.abspath(NOTEBOOK_NAME),
    # Notification and per-job resource settings.
    email='ostrokach@gmail.com', force_new_folder=False,
    nproc=1, queue='medium', walltime='04:00:00', mem='10G', vmem='12G',
    concurrent_job_limit=800,
    # Both variables must be set in the notebook's environment (KeyError otherwise).
    env={'PATH': os.environ['PATH'], 'KEY_MODELLER': os.environ['KEY_MODELLER']}
)

In [None]:
# Get job status
results_df = js.job_status(system_commands)
print(Counter(results_df['status']))

In [None]:
# Job statistics
# Collect the ids of all jobs whose status is anything other than 'done' ...
failed = set(results_df.loc[results_df['status'] != 'done', 'job_id'])
# ... and keep only the (job_id, command) pairs that still need to be run.
system_commands_remaining = [
    (job_id, command) for job_id, command in system_commands if job_id in failed
]
print(len(system_commands_remaining))

In [None]:
with js.connect():
    js.submit(system_commands_remaining)

# Done

In [None]:
!elaspic run -u P61981 -m E90G -c ../scripts/config_file.ini

In [None]:
!elaspic run -u P00648 -m Q151A -c ../scripts/config_file_training.ini

In [None]:
display(structural_templates_2.head())
print(structural_templates_2.shape)

In [None]:
# One row per sequence-identity cutoff, plus a constant `tmp` column
# (presumably a helper key for a cross join — TODO confirm).
max_seq_identity_df = (
    pd.DataFrame({'max_seq_identity': [40, 60, 80, 100]})
    .assign(tmp=1)
)
max_seq_identity_df