# Run PncA WandB Sweep

In [1]:
from IPython.display import display
import os

# Tell the reader where the kernel is running, keyed off the SSH_CONNECTION environment variable.
display("Running via SSH" if "SSH_CONNECTION" in os.environ else "Running locally")
    
import sys
import os

# Absolute location of the project checkout: the remote path is used when
# SSH_CONNECTION is set, the local path otherwise.
# NOTE(review): both paths are machine-specific; consider making this configurable.
if "SSH_CONNECTION" in os.environ:
    path = '/mnt/alphafold-volume-1/dylan2/repos/tb-pnca-gnn'
else:
    path = '/Users/dylandissanayake/Desktop/DPhil/Comp Disc/Repositories/TB-PNCA-GNN'

# Normalise once so the membership test compares the same string that gets
# appended; this keeps the cell idempotent when it is re-run.
path = os.path.abspath(path)
if path not in sys.path:
    sys.path.append(path)

import datetime
import random

import numpy as np
import pandas as pd

import torch
from torch_geometric.data import Data

import wandb

import warnings
warnings.filterwarnings('ignore')

from src import run_model, protein_graph, gcn_model, evaluation

# Re-import edited modules before each cell executes (autoreload mode 2), so
# changes to the project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Register `src` with autoreload explicitly.
%aimport src

# Last expression of the cell: displays whether torch can see a CUDA device.
torch.cuda.is_available()

'Running via SSH'



True

In [2]:
# Load the serialised dataset; the path is relative to the kernel's working directory.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code -
# only load files that come from a trusted source.
full_dataset = torch.load('datasets/full_real_dataset_v2.pth')

### Set Up Params and Sweep Config

In [3]:
# Seed the NumPy, Python and torch random number generators from one value.
seed = 42
np.random.seed(seed)
random.seed(seed)
# torch keeps its own generator; seeding only numpy and random leaves it unseeded.
torch.manual_seed(seed)

# logging params (only used for wandb metrics)
# Number of entries in the loaded dataset.
n_samples = len(full_dataset)
# cutoff_distance = 6.3  

# gcn params
# These values are the same for every sweep trial; the commented-out ones
# (hidden_channels, learning_rate, wd, and cutoff_distance above) are read
# from the sweep config inside sweep_run instead.
num_node_features = 16
batch_size = 256
# hidden_channels = 64
# learning_rate = 0.001
# wd = 5e-5
epochs = 400

# NOTE(review): presumably the wild-type PncA amino-acid sequence; no cell
# visible in this notebook reads it - confirm it is still needed.
wt_seq = 'MRALIIVDVQNDFCEGGSLAVTGGAALARAISDYLAEAADYHHVVATKDFHIDPGDHFSGTPDYSSSWPPHCVSGTPGADFHPSLDTSAIEAVFYKGAYTGAYSGFEGVDENGTPLLNWLRQRGVDEVDVVGIATDHCVRQTAEDAVRNGLATRVLVDLTAGVSADTTVAALEEMRTASVELVCS'

In [33]:
# Sweep search spaces, kept as an ordered history of stages. Each stage lists
# only the entries it changes; the stages are layered in order, so the
# configuration that is actually submitted is the cumulative result after the
# last stage.


def _log_uniform(low, high):
    """Search-space entry sampled log-uniformly between `low` and `high`."""
    return {'distribution': 'log_uniform_values', 'min': low, 'max': high}


def _uniform(low, high):
    """Search-space entry sampled uniformly between `low` and `high`."""
    return {'distribution': 'uniform', 'min': low, 'max': high}


def _choices(*options):
    """Search-space entry restricted to the listed values."""
    return {'values': list(options)}


sweep_stages = [
    ('First sweep', {
        'hidden_channels': _choices(32, 64, 128, 256),
        'weight_decay': _log_uniform(1e-8, 1e-2),
        'dropout': _choices(0.2, 0.4, 0.5, 0.6, 0.8),
        'cutoff_distance': _uniform(3.5, 10),
        'learning_rate': _log_uniform(1e-8, 1e-1),
    }),
    ('Second / third sweep', {
        'hidden_channels': _choices(64, 128, 192, 256, 320, 384),
        'weight_decay': _log_uniform(5e-7, 1e-2),
        'dropout': _choices(0.4, 0.5, 0.6, 0.8),
        'learning_rate': _log_uniform(1e-5, 1e-2),
    }),
    ('Fourth / fifth sweep', {
        'hidden_channels': _choices(128, 192, 256, 320),
        'weight_decay': _log_uniform(1e-5, 1e-3),
        'dropout': _choices(0.4, 0.5, 0.6, 0.8),
        'learning_rate': _log_uniform(5e-5, 5e-2),
    }),
    ('Sixth sweep', {
        'learning_rate': _log_uniform(1e-6, 5e-3),
    }),
    ('Seventh sweep', {
        'learning_rate': _log_uniform(5e-6, 5e-4),
    }),
    ('Eighth sweep, with cutoff varying', {
        # NOTE(review): this lower bound is far below the 5e-6 used in the
        # previous stage - confirm 1e-15 is intended and not a typo for 1e-5.
        'learning_rate': _log_uniform(1e-15, 1e-3),
        'cutoff_distance': _uniform(5.5, 14),
    }),
]

# Later stages overwrite the keys they mention and leave the rest untouched.
parameters_dict = {}
for _stage_label, _stage_overrides in sweep_stages:
    parameters_dict.update(_stage_overrides)

# Random search, ranked by the 'Test Accuracy' metric (higher is better).
metric = {'name': 'Test Accuracy', 'goal': 'maximize'}

sweep_config = {
    'method': 'random',
    'metric': metric,
    'parameters': parameters_dict,
}



In [34]:
# Print the effective sweep configuration (after all parameter updates above)
# so it can be checked before it is submitted.
import pprint
pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'goal': 'maximize', 'name': 'Test Accuracy'},
 'parameters': {'cutoff_distance': {'distribution': 'uniform',
                                    'max': 14,
                                    'min': 5.5},
                'dropout': {'values': [0.4, 0.5, 0.6, 0.8]},
                'hidden_channels': {'values': [128, 192, 256, 320]},
                'learning_rate': {'distribution': 'log_uniform_values',
                                  'max': 0.001,
                                  'min': 1e-15},
                'weight_decay': {'distribution': 'log_uniform_values',
                                 'max': 0.001,
                                 'min': 1e-05}}}


### Define Training Loop

In [4]:
# W&B project name passed to both wandb.sweep and wandb.agent below.
project = "pnca-singletons-sweep-v2"

In [None]:
# Set this to the ID of an existing sweep to attach to it instead of
# registering a new one. IDs previously pasted into this cell:
# 'nc8eahq7', 'i8r2s0bc', 'env42e7l'.
existing_sweep_id = None

# wandb.sweep registers sweep_config under `project` and returns the new
# sweep's ID. It is skipped when an existing ID is supplied, so re-running this
# cell does not create a duplicate sweep.
sweep_id = existing_sweep_id or wandb.sweep(sweep_config, project=project)

In [None]:
## Run a single run

# model = run_model.pnca_simpleGCN(
#             # sequences=sequences_dict,
#             sequences=None,
#             self_loops = False,
#             cutoff_distance = 12,
#             edge_weight_func = '1-(dist/cutoff)',
#             # edge_weight_func = 'none',
#             batch_size = batch_size,
#             num_node_features = num_node_features,
#             hidden_channels = 192,
#             learning_rate = 5e-5,
#             wd = 1e-5,
#             dropout = 0.6,
#             lr_scheduling=False,
#             epochs = epochs,
#             dataset = full_dataset,
#             normalise_ews=True,
#             wandb_params={
#                 'use_wandb' : True,
#                 'wandb_project' : project,
#                 # 'wandb_project' : 'pnca-sweep-1',
#                 'wandb_name' : 'no early stop',
#                 'n_samples' : n_samples
#             }
#         )

In [21]:
def sweep_run():
    """Train one model for a single W&B sweep trial.

    Takes no arguments so it can be passed to ``wandb.agent`` as the trial
    function. Opens a run with ``wandb.init()``, reads the sampled
    hyperparameters from ``run.config`` and passes them, together with the
    notebook-level settings (``batch_size``, ``num_node_features``, ``epochs``,
    ``full_dataset``), to ``run_model.pnca_simpleGCN``. The result is bound
    locally and not returned.
    """
    with wandb.init() as run:
        sampled = run.config

        # Settings held constant across trials.
        fixed_kwargs = dict(
            sequences=None,  # alternative: sequences_dict
            self_loops=False,
            edge_weight_func='1-(dist/cutoff)',  # alternative: 'none'
            batch_size=batch_size,
            num_node_features=num_node_features,
            lr_scheduling=False,
            epochs=epochs,
            dataset=full_dataset,
            normalise_ews=True,
            wandb_params={'use_wandb': False, 'sweep': True},
        )

        # Values drawn by the sweep for this trial.
        swept_kwargs = dict(
            cutoff_distance=sampled.cutoff_distance,
            hidden_channels=sampled.hidden_channels,
            learning_rate=sampled.learning_rate,
            wd=sampled.weight_decay,
            dropout=sampled.dropout,
        )

        model = run_model.pnca_simpleGCN(**fixed_kwargs, **swept_kwargs)

        # Optional checkpointing of the result (currently disabled):
        # os.makedirs(f'saved_models/carter_ds_aug/{project}/{sweep_id}', exist_ok=True)
        # torch.save(model, f'saved_models/carter_ds_aug/{project}/{sweep_id}/{run.name}')

In [22]:
# Start an agent on this sweep: it calls sweep_run once per trial, for at most 100 trials.
wandb.agent(sweep_id, function=sweep_run, project=project, count=100)



[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
