In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import lz4.frame
import dill
import json
from sklearn.utils import Bunch
import torch
from matal.utils.cache import RobustJSONEncoder
from matal.settings import DATA_DIR
from matal.ann_model import X_COLS, TARGET_Y_COLS, TARGET_Y_SCALES

## Build Training Dataset with Data Augmentation

In [2]:
# Build pre-sampled, noise-augmented training batches for several dataset sizes
# and write them (plus a config file and the un-augmented train/test tensors)
# to DATA_DIR / 'batch_cache'.

# Settings shared by every generated dataset. None of them depend on n_batch,
# so they are defined once instead of on every loop iteration.
batch_size = 32
data_tag = '2503A'
# Relative frequency with which each target's table supplies a batch.
data_mix_rate = {
    'grade': 19,
    'optical': 4,
    'tensile': 5,
    'fire': 2,
}
# Compact tag made of each target's first letter and its rate, e.g. 'g19o4t5f2'.
mix_rate_tag = ''.join([f'{k[0]}{v}' for k, v in data_mix_rate.items()])

# Normalise the mixing rates into probabilities for picking a batch's target.
targets = list(data_mix_rate.keys())
target_weights = np.array([data_mix_rate[k] for k in targets])
target_weights = target_weights / target_weights.sum()

# Read each target's table once; rows whose SampleID starts with 'aTS-T' form
# the test split, every other row forms the train split.
target_dfs = {
    target: pd.read_csv(DATA_DIR / 'weighted' / f'{target}.csv') for target in targets
}
train_dfs = {
    target: df[~df.SampleID.str.startswith('aTS-T')].reset_index(drop=True) for target, df in target_dfs.items()
}
test_dfs = {
    target: df[df.SampleID.str.startswith('aTS-T')].reset_index(drop=True) for target, df in target_dfs.items()
}

# Make sure the output directory exists before the (long) sampling loop runs,
# so a missing directory cannot discard the generated batches.
# NOTE(review): assumes DATA_DIR is a pathlib.Path - confirm.
cache_dir = DATA_DIR / 'batch_cache'
cache_dir.mkdir(parents=True, exist_ok=True)

for n_batch in [2**10, 2**20, ]:
    # The batch count is encoded in hexadecimal inside the dataset name.
    data_name = f'{data_tag}_B{batch_size}_N{hex(n_batch)[2:]}_{mix_rate_tag}'
    print(data_name)

    # Expected number of sampled rows for each target divided by the size of
    # that target's table, printed in the order of `targets`.
    # NOTE(review): the denominator is the full table (train + test rows) even
    # though batches are drawn from the train split only - confirm intent.
    print(*[
        (weight * batch_size * n_batch) / len(target_dfs[target])
        for target, weight in zip(targets, target_weights)
    ])

    batch_ti_arr = []
    batch_X_arr = []
    batch_y_arr = []
    # Fixed seed: every dataset size is generated from the same random stream.
    rng = np.random.default_rng(0)
    # Target index of every batch, drawn according to the mixing probabilities.
    batch_targets = rng.choice(np.arange(len(targets), dtype=np.int8), size=n_batch, p=target_weights)

    for ti in tqdm(batch_targets):
        target = targets[ti]

        train_df = train_dfs[target]
        target_y_cols = TARGET_Y_COLS[target]
        target_y_std_cols = [f'{c}_STD' for c in target_y_cols]

        # Weighted bootstrap: rows are drawn with replacement, with probability
        # proportional to the 'Weight' column.
        batch_df = train_df.sample(
            n=batch_size, replace=True, random_state=rng, ignore_index=True, weights=train_df['Weight'])

        # Data Augmentation: add noise based on experimental standard deviation
        batch_df[target_y_cols] += rng.normal(loc=0.0, scale=batch_df[target_y_std_cols])

        batch_ti_arr.append(ti)
        batch_X_arr.append(batch_df[X_COLS].values.astype(np.float16))
        # Scale in full precision first, then cast the *result* to float16.
        # Previously `.astype` bound only to the scale array, which rounded the
        # scale factors to half precision while leaving the stored batch float64.
        batch_y = batch_df[target_y_cols].values / np.array(TARGET_Y_SCALES[target])
        batch_y_arr.append(batch_y.astype(np.float16))

    # Sampled batches: per-batch target index plus the X / scaled-y arrays.
    with lz4.frame.open(cache_dir / f'{data_name}.batch.pk.lz4', 'wb') as f:
        dill.dump(dict(batch_ti_arr=np.array(batch_ti_arr).astype(np.int8),
                       batch_X_arr=batch_X_arr,
                       batch_y_arr=batch_y_arr), f)

    # Record the settings used to generate this dataset next to it.
    config = Bunch()
    config.X_COLS = X_COLS
    config.TARGET_Y_COLS = TARGET_Y_COLS
    config.TARGET_Y_SCALES = TARGET_Y_SCALES
    config.target_weights = target_weights
    config.batch_size = batch_size
    config.n_batch = n_batch

    with open(cache_dir / f'{data_name}.config.json', 'w') as f:
        json.dump(config, f, indent=2, cls=RobustJSONEncoder)

    # Un-augmented train/test data per target, both as tensors (y divided by
    # the target's scales) and as the original DataFrame columns.
    all_dataset = {}
    for target in targets:
        train_df = train_dfs[target]
        if len(train_df) == 0:
            # No training rows: the target is left out of the dataset entirely.
            continue
        y_cols = TARGET_Y_COLS[target]
        y_scale = np.array(TARGET_Y_SCALES[target])
        all_dataset[target] = Bunch(
            scale=TARGET_Y_SCALES[target],
            train_X=torch.as_tensor(train_df[X_COLS].values),
            train_y=torch.as_tensor(train_df[y_cols].values / y_scale),

            train_X_df=train_df[X_COLS],
            train_y_df=train_df[y_cols],
        )

        test_df = test_dfs[target]
        if len(test_df) == 0:
            # No held-out rows: the entry keeps only its train_* fields.
            continue
        all_dataset[target].test_X = torch.as_tensor(test_df[X_COLS].values)
        all_dataset[target].test_y = torch.as_tensor(test_df[y_cols].values / y_scale)

        all_dataset[target].test_X_df = test_df[X_COLS]
        all_dataset[target].test_y_df = test_df[y_cols]

    with lz4.frame.open(cache_dir / f'{data_name}.all_data.pk.lz4', 'wb') as f:
        dill.dump(all_dataset, f)
    


2503A_B32_N400_g19o4t5f2
6.5467087276550995 12.307230046948357 15.648519579751671 8.0610086100861


  0%|          | 0/1024 [00:00<?, ?it/s]

2503A_B32_N100000_g19o4t5f2
6703.829737118822 12602.603568075117 16024.084049665711 8254.472816728166


  0%|          | 0/1048576 [00:00<?, ?it/s]