In [1]:
import sagemaker
# Hardcoded IAM execution-role ARN. In the cells visible here it is only referenced by a
# commented-out `role=AWS_ROLE` argument; the estimator calls get_execution_role() instead.
AWS_ROLE = 'arn:aws:iam::203378532510:role/service-role/AmazonSageMaker-ExecutionRole-Interns'

In [2]:
# Show the kernel's working directory before it is changed in the next cell.
%pwd

'/home/ec2-user/SageMaker'

In [3]:
import os
aipt_dir = '/home/ec2-user/SageMaker/antibody-in-pytorch/'
model_dir = 'AIPT/Models/Beshnova2020'
model_dir_abs = os.path.join(aipt_dir, model_dir)
os.chdir(model_dir_abs)
%pwd

'/home/ec2-user/SageMaker/antibody-in-pytorch/AIPT/Models/Beshnova2020'

In [4]:
import posixpath

import sagemaker
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

from AIPT.Utils.logging import today, current_time

In [5]:
# SageMaker session handle, plus a bucket name and key prefix.
# NOTE(review): none of these three names is referenced by the other cells shown here.
sagemaker_session = sagemaker.Session()
bucket, prefix = 'gv20interns', 'runs'

In [6]:
# --- Run locations -----------------------------------------------------------
# Everything lives under one S3 root. S3 URIs are always '/'-separated, so they
# are assembled with posixpath.join instead of the platform-dependent os.path.join.
run_root_dir = 's3://gv20interns/roger/runs'
s3_source_dir = posixpath.join(run_root_dir, 'source')
run_name = 'full_run'
timezone = 'EST'
# Layout: <run_root_dir>/<today>/<run_name>/<current_time>
run_dir = posixpath.join(
    run_root_dir,
    today(tz=timezone),
    run_name,
    current_time(tz=timezone),
)
out_dir = posixpath.join(run_dir, 'out')

# --- Hyperparameters passed to the training script ----------------------------
# An earlier baseline (seq_len=12, epoch=10, learning_rate=1e-4, no index_file and
# no architecture overrides) was replaced by the configuration below.
para_dict = {
    'seq_len': 16,
    'embedding_dim': 15,  # paper uses dim 15 PCA features
    'index_file': 'OAS_index_large.txt',
    'batch_size': 100,
    'epoch': 1000,
    'run_name': run_name,
    'run_dir': run_dir,
    'work_path': '/opt/ml/model',
    # tuned hyperparameters from https://us-east-2.console.aws.amazon.com/sagemaker/home?region=us-east-2#/hyper-tuning-jobs/roger-beshnova2020-t-201103-0809
    'learning_rate': 0.000125245489276611,
    'dropout_rate': 0.1554058115760688,
    'conv1_filter_dim1': 2,
    'conv1_n_filters': 17,
    'conv2_filter_dim1': 1,
    'conv2_n_filters': 16,
    'max_pool_filter_dim1': 1,
    'fc_hidden_dim': 83
}

# --- Metric definitions ---------------------------------------------------------
# Each regex captures the text between "<name>=" and the next ";".
# `test_mcc_key` is also the objective metric name used by the tuning cell.
test_mcc_key = "best_test_mcc"
metrics = [
    {
        "Name": "best_epoch",
        "Regex": "best_epoch=(.*?);",
    },
    {
        "Name": test_mcc_key,
        "Regex": "best_test_mcc=(.*?);",
    },
]

In [7]:
# Both the run limit and the spot wait limit are set to one day.
one_day_in_seconds = 24 * 60 * 60

# Settings for the training job, collected first so they can be inspected before
# the estimator is created.
estimator_settings = dict(
    base_job_name=f"roger-beshnova2020-{para_dict['seq_len']}",
    # Code to ship: the entry script, with source_dir given relative to the
    # working directory set by the earlier os.chdir cell.
    entry_point='submit_beshnova2020.py',
    source_dir="../../../",
    # The hardcoded AWS_ROLE constant was previously tried here (role=AWS_ROLE).
    role=get_execution_role(),
    # S3 destinations for job output and the uploaded source bundle.
    output_path=out_dir,
    code_location=s3_source_dir,
    # One ml.m5.large instance (a "local" instance type was previously tried here).
    instance_type="ml.m5.large",
    instance_count=1,
    use_spot_instances=True,
    max_run=one_day_in_seconds,
    max_wait=one_day_in_seconds,
    py_version="py3",
    framework_version="1.6.0",
    hyperparameters=para_dict,
    metric_definitions=metrics,
)

pytorch_estimator = PyTorch(**estimator_settings)

In [8]:
# Launch the training job with a single input channel named 'data'.
training_channels = {'data': 's3://gv20interns/OAS_dataset'}
pytorch_estimator.fit(training_channels)

2020-11-06 09:04:16 Starting - Starting the training job...
2020-11-06 09:04:18 Starting - Launching requested ML instances......
2020-11-06 09:05:20 Starting - Preparing the instances for training.........
2020-11-06 09:06:54 Downloading - Downloading input data.........
2020-11-06 09:08:35 Training - Downloading the training image..bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2020-11-06 09:08:49,573 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2020-11-06 09:08:49,579 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2020-11-06 09:08:49,594 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2020-11-06 09:08:49,601 sagemaker_pytorch_container.training INFO     Invoking user training script.
2020-11-06 09:08:54,295 sagemaker-training-toolkit INFO     Installing module with the following command:
/opt/conda

In [9]:
# Set to True to launch a hyperparameter search; with False this cell does nothing.
tune = False
if tune:
    from sagemaker.tuner import CategoricalParameter, ContinuousParameter
    from sagemaker.tuner import HyperparameterTuner, IntegerParameter

    # Search space. learning_rate is not searched: its range is kept below only
    # for reference, so the value in para_dict stays in effect.
    hyperparameter_ranges = dict(
        dropout_rate=ContinuousParameter(0, 0.7),
        embedding_dim=IntegerParameter(10, 100),
        conv1_n_filters=IntegerParameter(1, 20),
        conv2_n_filters=IntegerParameter(10, 30),
        conv1_filter_dim1=IntegerParameter(1, 4),
        conv2_filter_dim1=IntegerParameter(1, 4),
        max_pool_filter_dim1=IntegerParameter(1, 3),
        fc_hidden_dim=IntegerParameter(5, 100, scaling_type="Logarithmic"),
        # learning_rate=ContinuousParameter(1e-5, 1e-1, scaling_type="Logarithmic"),
    )

    # Maximize the test MCC metric across 30 jobs, 6 running at a time.
    tuner = HyperparameterTuner(
        estimator=pytorch_estimator,
        objective_metric_name=test_mcc_key,
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=metrics,
        objective_type="Maximize",
        max_jobs=30,
        max_parallel_jobs=6,
        base_tuning_job_name='roger-besh2020-tune',
    )

    # NOTE(review): tuning reads OAS_test while the full run reads OAS_dataset;
    # confirm the different dataset is intentional.
    tuning_channels = {'data': 's3://gv20interns/OAS_test'}
    tuner.fit(tuning_channels)