In [1]:
from lib.pipeline import Pipeline
# --- Run configuration ------------------------------------------------------
gpu = 0          # device index, passed both to the Pipeline and to the model
CYCLE_SIZE = 10  # full length of one cyclic-LR cycle (split evenly up / down)
DEBUG = True     # True -> use the small 'atpbind3d-minimal' dataset

# Dataset is selected by the DEBUG switch.
dataset_name = 'atpbind3d' if not DEBUG else 'atpbind3d-minimal'

# Keyword arguments forwarded to the 'lm-gearnet' model.
model_kwargs = {
    'gpu': gpu,
    'lm_type': 'esm-t33',
    'gearnet_hidden_dim_size': 512,
    'gearnet_hidden_dim_count': 4,
    'lm_freeze_layer_count': 30,
}

# Cyclic learning-rate schedule: half of the cycle ramps up, half ramps down.
half_cycle = CYCLE_SIZE / 2
scheduler_kwargs = {
    'base_lr': 1e-3,
    'max_lr': 3e-3,
    'step_size_up': half_cycle,
    'step_size_down': half_cycle,
    'cycle_momentum': False
}

pipeline = Pipeline(
    dataset=dataset_name,
    model='lm-gearnet',
    gpus=[gpu],
    model_kwargs=model_kwargs,
    valid_fold_num=0,
    batch_size=2,
    scheduler='cyclic',
    scheduler_kwargs=scheduler_kwargs,
)

load model lm-gearnet, kwargs: {'gpu': 0, 'lm_type': 'esm-t33', 'gearnet_hidden_dim_size': 512, 'gearnet_hidden_dim_count': 4, 'lm_freeze_layer_count': 30}
freeze_lm: 30
get dataset atpbind3d-minimal
Initialize Undersampling: all ones
Initialize Weighting: all ones
train samples: 4, valid samples: 1, test samples: 5
Adam parameter: all
use cyclic lr scheduler


In [None]:
import numpy as np
def resiboost_v2_preprocess(pipeline, prev_results, _, prev_weights):
    """AdaBoost-style per-residue re-weighting step run before a boosting round.

    Uses the training predictions of the most recent round to
      1. compute the weighted error rate of that learner,
      2. derive its coefficient ``alpha = 0.5 * ln((1 - err) / err)`` and store
         it under ``'alpha'`` in ``prev_results[-1]``,
      3. multiply the weight of every training residue by ``exp(-alpha)`` if it
         was classified correctly and by ``exp(alpha)`` otherwise,
      4. rescale all weights, and
      5. hand the weights to ``pipeline.apply_mask_and_weights``.

    Args:
        pipeline: object exposing ``dataset.valid_fold()`` (an indexable,
            sized collection of protein indices) and
            ``apply_mask_and_weights(masks=..., weights=...)``.
        prev_results: list of per-round result dicts. The last entry must hold
            a ``'df_train'`` DataFrame with the columns ``pred``, ``target``,
            ``protein_index`` and ``residue_index``. If the list is empty the
            function returns immediately without touching anything.
        _: unused positional slot kept for signature compatibility.
        prev_weights: mutable sequence (one entry per protein) of mutable
            per-residue weight sequences. It is modified IN PLACE; the function
            returns ``None``, so in-place mutation is the only way callers see
            the new weights.

    Raises:
        ValueError: if the training residues carry no positive total weight
            (e.g. ``df_train`` is empty), so no error rate can be defined.
    """
    if not prev_results:
        return

    weights = prev_weights
    THRESHOLD = -1.3863  # This is sigmoid^{-1}(0.2), i.e. ln(0.2 / 0.8)
    EPS = 1e-10          # keeps the error rate strictly inside (0, 1)
    df_train = prev_results[-1]['df_train']

    # Assume the valid fold is consecutive: a training protein whose index is
    # >= the first valid-fold index sits *after* the fold in the full dataset,
    # so the fold length has to be added as an offset.
    # Looked up once instead of twice per row.
    valid_fold = pipeline.dataset.valid_fold()
    valid_len = len(valid_fold)
    valid_start = valid_fold[0] if valid_len > 0 else None

    # Decode every training row once; both passes below reuse the result.
    records = []
    for pred, target, protein_index, residue_index in zip(
        df_train['pred'],
        df_train['target'],
        df_train['protein_index'],
        df_train['residue_index'],
    ):
        is_correct = bool(target) == bool(pred > THRESHOLD)
        protein_index_in_dataset = int(protein_index)
        if valid_start is not None and protein_index >= valid_start:
            protein_index_in_dataset += valid_len
        records.append((is_correct, protein_index_in_dataset, int(residue_index)))

    # 1. Compute Error of Learner (weighted error *rate*, not a raw weight sum)
    err_weight = 0.0
    weight_sum = 0.0
    for is_correct, protein_index_in_dataset, residue_index in records:
        w = float(weights[protein_index_in_dataset][residue_index])
        weight_sum += w
        if not is_correct:
            err_weight += w

    if not weight_sum > 0:
        raise ValueError(
            'resiboost_v2_preprocess: training residues have no positive total '
            'weight; cannot compute the learner error'
        )

    # Normalising by the total weight keeps err in [0, 1]; clipping avoids
    # log(0) / division by zero for a perfect or an always-wrong learner.
    err = min(max(err_weight / weight_sum, EPS), 1 - EPS)
    a = 0.5 * np.log((1 - err) / err)
    prev_results[-1] = {**prev_results[-1], 'alpha': a}
    print(f'Error: {err}, weighted sum: {weight_sum}')

    # 2. Update Weights
    shrink, grow = np.exp(-a), np.exp(a)
    for is_correct, protein_index_in_dataset, residue_index in records:
        weights[protein_index_in_dataset][residue_index] *= shrink if is_correct else grow

    # 3. Normalize Weights
    # Rescale so the grand total equals the pre-update training weight sum.
    # NOTE(review): weight_sum covers only rows of df_train while the new total
    # covers every entry of `weights` - this mirrors the original formula;
    # confirm that this is the intended normalisation target.
    new_weight_sum = sum(float(w) for weight_vec in weights for w in weight_vec)
    scale = weight_sum / new_weight_sum
    for weight_vec in weights:
        # Element-wise assignment so the caller's containers are updated in
        # place (rebinding the loop variable would silently discard the result).
        for i in range(len(weight_vec)):
            weight_vec[i] *= scale

    # 4. Apply Weights and Alpha
    pipeline.apply_mask_and_weights(masks=None, weights=weights)