In [None]:
# training
#
# Builds one argument string per (seed, sparsity, noise) combination of the
# sweep, prints the `mkdir -p` line for each run's output folder, and finally
# prints the task-id -> argument-strings mapping.

# Arguments of a single training run; the string ends with a shell redirect
# into the run's log file.
# NOTE(review): --lr-tmax is 250 here while --epochs is 200; the other
# templates in this notebook use 200 for both -- confirm this is intentional.
template = (
    '--arch {arch} --seed {seed} --gpu {gpu} --workers 4 --save-loss '
    '--batch-size 128 --epochs 200 --learning-rate 0.1 '
    '--lr-scheduler CosineAnnealingLR --momentum 0.9 --weight-decay 0.0005 --lr-tmax 250 '
    '--train-labels {dataset}_noisy_labels/{base_folder}/{dataset}_noisy_labels__frac_zero_noise_rates__0.{sparsity}__noise_amount__0.{noise}.json '
    'datasets/datasets/{dataset}/{dataset}/ '
    '> {training_folder}/out_{arch}.log '
)

# Relative folder identifying one (sparsity, noise, seed) configuration.
folder_template = '{dataset}_noisy_labels__frac_zero_noise_rates__0_{sparsity}__noise_amount__0_{noise}/seed_{seed}'

### sweep configuration
dataset = 'cifar10'
seeds = [0, 1, 2]
sparsities = [0, 20, 40, 60]  # full sweep: [0, 20, 40, 60]
noise_rates = [0, 10, 20]  # full sweep: [0, 10, 20]
jobs_per_gpu = 1
arch = 'resnet50'
###


all_tasks = []
for seed in seeds:
    for sparsity in sparsities:
        for noise in noise_rates:
            # Only sparsity 0 is generated for the (near-)zero-noise setting.
            if sparsity > 0 and noise < 1e-2:
                continue
            # Both values appear in paths as zero-padded two-digit numbers.
            sparsity_str = '{:02d}'.format(sparsity)
            noise_str = '{:02d}'.format(noise)

            base_folder = folder_template.format(
                dataset=dataset, sparsity=sparsity_str, noise=noise_str, seed=seed)
            training_folder = dataset + '_training/' + base_folder
            print('mkdir -p', training_folder)
            all_tasks.append(template.format(
                dataset=dataset,
                seed=seed,
                arch=arch,
                gpu=0,
                sparsity=sparsity_str,
                noise=noise_str,
                base_folder=base_folder,
                training_folder=training_folder,
            ))

num_jobs = len(all_tasks)
# Ceiling division: number of task ids needed for jobs_per_gpu jobs each.
num_tasks = (num_jobs + jobs_per_gpu - 1) // jobs_per_gpu
print('\n' + '=' * 20, '\nnum_jobs:', num_jobs, '\nnum_tasks:', num_tasks)


# Consecutive runs of jobs_per_gpu argument strings share one task id.
config = {}
for position, job_args in enumerate(all_tasks):
    config.setdefault(position // jobs_per_gpu, []).append(job_args)
print('\ntask_assignment =', config)

In [None]:
# training with pruning
#
# Builds one argument string per (seed, sparsity, noise) combination of the
# sweep for training with pruning, prints the `mkdir -p` line for each run's
# output folder, and finally prints the task-id -> argument-strings mapping.

# Arguments of a single pruned training run; the string ends with a shell
# redirect into the run's log file. --epochs is filled from `num_epochs` so it
# cannot drift away from the prune schedules, which are derived from the same
# value.
# NOTE(review): --lr-tmax stays hard-coded at 200; update it by hand if
# num_epochs changes and the two are meant to match.
template = (
    '--arch {arch} --seed {seed} --gpu {gpu} --workers 4 --save-loss '
    '--prune-percent {prune_percent} --prune-schedule {prune_schedule} --prune-rm-schedule {prune_rm_schedule} '
    '--batch-size 128 --epochs {num_epochs} --learning-rate 0.1 '
    '--lr-scheduler CosineAnnealingLR --momentum 0.9 --weight-decay 0.0005 --lr-tmax 200 '
    '--train-labels {dataset}_noisy_labels/{base_folder}/{dataset}_noisy_labels__frac_zero_noise_rates__0.{sparsity}__noise_amount__0.{noise}.json '
    'datasets/datasets/{dataset}/{dataset}/ '
    '> {training_folder}/out_{arch}.log '
)

# Relative folder identifying one (sparsity, noise, seed) configuration.
folder_template = '{dataset}_noisy_labels__frac_zero_noise_rates__0_{sparsity}__noise_amount__0_{noise}/seed_{seed}'

### sweep configuration
dataset = 'cifar10'
seeds = [0, 1, 2]
sparsities = [0, 20, 40, 60]  # full sweep: [0, 20, 40, 60]
noise_rates = [0, 10, 20]  # full sweep: [0, 10, 20]
jobs_per_gpu = 1
arch = 'resnet50'

num_epochs = 200
prune_period = 20
prune_percent = 60
# Epoch window used by both schedules: 10% to 90% of the run.
first_prune_epoch = int(num_epochs * 0.1)
last_prune_epoch = int(num_epochs * 0.9)
# Every prune_period epochs from the window start, window end excluded.
prune_schedule = ' '.join(
    str(e) for e in range(first_prune_epoch, last_prune_epoch, prune_period))
# Same spacing shifted by half a period, window end included.
prune_rm_schedule = ' '.join(
    str(e) for e in range(first_prune_epoch + prune_period // 2, last_prune_epoch + 1, prune_period))
###


num_jobs = 0
all_tasks = []
for seed in seeds:
    for sparsity in sparsities:
        for noise in noise_rates:
            # Only sparsity 0 is generated for the (near-)zero-noise setting.
            if noise < 1e-2 and sparsity > 0:
                continue
            # Both values appear in paths as zero-padded two-digit numbers.
            sparsity_str = '{0:0=2d}'.format(sparsity)
            noise_str = '{0:0=2d}'.format(noise)

            base_folder = folder_template.format(dataset=dataset, sparsity=sparsity_str, noise=noise_str, seed=seed)
            training_folder = '{}_training_prune/'.format(dataset) + base_folder
            print('mkdir -p', training_folder)
            arg_str = template.format(
                dataset=dataset,
                seed=seed,
                arch=arch,
                gpu=0,
                sparsity=sparsity_str,
                noise=noise_str,
                base_folder=base_folder,
                training_folder=training_folder,
                num_epochs=num_epochs,
                prune_percent=prune_percent,
                prune_schedule=prune_schedule,
                prune_rm_schedule=prune_rm_schedule,
            )
            all_tasks.append(arg_str)
            num_jobs += 1
# num_tasks is the ceiling of num_jobs / jobs_per_gpu.
print('\n'+'='*20, '\nnum_jobs:',num_jobs, '\nnum_tasks:',(num_jobs+jobs_per_gpu-1)//jobs_per_gpu)


# Consecutive runs of jobs_per_gpu argument strings share one task id.
config = {}
for i, task in enumerate(all_tasks):
    config.setdefault(i // jobs_per_gpu, []).append(task)
print('\ntask_assignment =', config)

In [None]:
# train and combine cv
#
# Part 1 builds one argument string per (seed, sparsity, noise, fold) and
# prints the task-id -> argument-strings mapping. Part 2 prints, for every
# (seed, sparsity, noise) combination, a `mkdir -p` line and the command that
# combines the folds.

# Arguments of a single cross-validation fold run; the string ends with a
# shell redirect into a per-fold log file.
template = (
    '--arch {arch} --seed {seed} --gpu {gpu} --cvn {cvn} --cv {cv} --cv-seed 0 --workers 4 '
    '--batch-size 128 --epochs 200 --learning-rate 0.1 '
    '--lr-scheduler CosineAnnealingLR --momentum 0.9 --weight-decay 0.0005 --lr-tmax 200 '
    '--train-labels {dataset}_noisy_labels/{base_folder}/{dataset}_noisy_labels__frac_zero_noise_rates__0.{sparsity}__noise_amount__0.{noise}.json '
    'datasets/datasets/{dataset}/{dataset}/ '
    '> {training_folder}/out_{arch}__fold_{cv}.log '
)

# Relative folder identifying one (sparsity, noise, seed) configuration.
folder_template = '{dataset}_noisy_labels__frac_zero_noise_rates__0_{sparsity}__noise_amount__0_{noise}/seed_{seed}'

# Full command (including the interpreter) that combines the folds found in
# the training folder and targets the mask folder.
combine_template = (
    'python {dataset}_train.py '
    '--arch {arch} --cvn {cvn} --cv-seed 0 --combine-folds '
    '--combine-source-path {training_folder}/ '
    '--combine-dest-path {cl_mask_folder}/ '
    'datasets/datasets/{dataset}/{dataset}/ '
)

### sweep configuration
dataset = 'cifar10'
cvn = 4
seeds = [0, 1, 2]
sparsities = [0, 20, 40, 60]  # full sweep: [0, 20, 40, 60]
noise_rates = [0, 10, 20]  # full sweep: [0, 10, 20]
jobs_per_gpu = 1
arch = 'resnet50'
###


all_tasks = []
# (base_folder, training_folder) of every generated configuration, in sweep
# order, so the combine step below walks exactly the same configurations.
sweep_folders = []
for seed in seeds:
    for sparsity in sparsities:
        for noise in noise_rates:
            # Only sparsity 0 is generated for the (near-)zero-noise setting.
            if sparsity > 0 and noise < 1e-2:
                continue
            # Both values appear in paths as zero-padded two-digit numbers.
            sparsity_str = '{:02d}'.format(sparsity)
            noise_str = '{:02d}'.format(noise)

            base_folder = folder_template.format(
                dataset=dataset, sparsity=sparsity_str, noise=noise_str, seed=seed)
            training_folder = dataset + '_training/' + base_folder
            print('mkdir -p', training_folder)
            sweep_folders.append((base_folder, training_folder))

            # One job per fold, all sharing the same training folder.
            for cv in range(cvn):
                all_tasks.append(template.format(
                    dataset=dataset,
                    seed=seed,
                    arch=arch,
                    gpu=0,
                    cvn=cvn,
                    cv=cv,
                    sparsity=sparsity_str,
                    noise=noise_str,
                    base_folder=base_folder,
                    training_folder=training_folder,
                ))

num_jobs = len(all_tasks)
# Ceiling division: number of task ids needed for jobs_per_gpu jobs each.
num_tasks = (num_jobs + jobs_per_gpu - 1) // jobs_per_gpu
print('\n' + '=' * 20, '\nnum_jobs:', num_jobs, '\nnum_tasks:', num_tasks)


# Consecutive runs of jobs_per_gpu argument strings share one task id.
config = {}
for position, job_args in enumerate(all_tasks):
    config.setdefault(position // jobs_per_gpu, []).append(job_args)
print('\ntask_assignment =', config)


print('\n' + '=' * 20 + '\n')
for base_folder, training_folder in sweep_folders:
    cl_mask_folder = '{}_mask_cl/{}'.format(dataset, base_folder)
    print('mkdir -p', cl_mask_folder)
    combine_cmd = combine_template.format(
        dataset=dataset,
        arch=arch,
        cvn=cvn,
        training_folder=training_folder,
        cl_mask_folder=cl_mask_folder,
    )
    # A blank line separates consecutive combine commands.
    print(combine_cmd, end='\n\n')

In [None]:
# retraining

import numpy as np
import json
import os


# Builds one argument string per (seed, sparsity, noise, method) combination
# for retraining with a per-method train mask, prints the `mkdir -p` line for
# each run's output folder, and prints the task-id -> argument-strings mapping.

# Arguments of a single masked retraining run; the string ends with a shell
# redirect into the run's log file.
template = (
    '--arch {arch} --seed {seed} --gpu {gpu} --workers 4 '
    '--batch-size 128 --epochs 200 --learning-rate 0.1 '
    '--lr-scheduler CosineAnnealingLR --momentum 0.9 --weight-decay 0.0005 --lr-tmax 200 '
    '--train-labels {dataset}_noisy_labels/{base_folder}/{dataset}_noisy_labels__frac_zero_noise_rates__0.{sparsity}__noise_amount__0.{noise}.json '
    '--dir-train-mask {mask_path} '
    'datasets/datasets/{dataset}/{dataset}/ '
    '> {training_folder}/out_{arch}.log '
)

# Relative folder identifying one (sparsity, noise, seed) configuration.
folder_template = '{dataset}_noisy_labels__frac_zero_noise_rates__0_{sparsity}__noise_amount__0_{noise}/seed_{seed}'
# Full path of a noisy-label file; not referenced by the code in this cell.
s_template = '{dataset}_noisy_labels/{dataset}_noisy_labels__frac_zero_noise_rates__0_{sparsity}__noise_amount__0_{noise}/seed_{seed}/{dataset}_noisy_labels__frac_zero_noise_rates__0.{sparsity}__noise_amount__0.{noise}.json'
# Path of the train-mask file of one (configuration, method, arch).
mask_template = '{dataset}_mask{scl}/{base_folder}/{method}/model_{arch}_train_mask.npy'


### sweep configuration
dataset = 'cifar10'
seeds = [0, 1, 2]
sparsities = [0, 20, 40, 60]  # full sweep: [0, 20, 40, 60]
noise_rates = [0, 10, 20]  # full sweep: [0, 10, 20]
methods = ['kmeans', 'cl', 'no_clean']  # options: 'kmeans', 'cl', 'no_clean'
jobs_per_gpu = 1
arch = 'resnet50'
###


all_tasks = []
for seed in seeds:
    for sparsity in sparsities:
        for noise in noise_rates:
            # Only sparsity 0 is generated for the (near-)zero-noise setting.
            if sparsity > 0 and noise < 1e-2:
                continue
            # Both values appear in paths as zero-padded two-digit numbers.
            sparsity_str = '{:02d}'.format(sparsity)
            noise_str = '{:02d}'.format(noise)
            base_folder = folder_template.format(
                dataset=dataset, sparsity=sparsity_str, noise=noise_str, seed=seed)

            # One retraining job per method, each with its own mask file and
            # its own output folder.
            for method in methods:
                mask_path = mask_template.format(
                    dataset=dataset, scl='', base_folder=base_folder, method=method, arch=arch)
                training_folder = '/'.join([dataset + '_training_masked', base_folder, method])
                print('mkdir -p', training_folder)

                all_tasks.append(template.format(
                    dataset=dataset,
                    seed=seed,
                    arch=arch,
                    gpu=0,
                    sparsity=sparsity_str,
                    noise=noise_str,
                    base_folder=base_folder,
                    mask_path=mask_path,
                    training_folder=training_folder,
                ))

num_jobs = len(all_tasks)
# Ceiling division: number of task ids needed for jobs_per_gpu jobs each.
num_tasks = (num_jobs + jobs_per_gpu - 1) // jobs_per_gpu
print('\n' + '=' * 20, '\nnum_jobs:', num_jobs, '\nnum_tasks:', num_tasks)


# Consecutive runs of jobs_per_gpu argument strings share one task id.
config = {}
for position, job_args in enumerate(all_tasks):
    config.setdefault(position // jobs_per_gpu, []).append(job_args)
print('\ntask_assignment =', config)


In [None]:
# retraining with static model-pred labels

import numpy as np
import json
import os
from sklearn.metrics import accuracy_score


# Builds, for every (seed, sparsity, noise, method) combination, a pair of
# argument strings: a label-generation step (pred_template) followed by a
# retraining step (template). The file passed as --label-outfile to the first
# step is the file passed as --train-labels to the second. Also prints the
# `mkdir -p` / `cp` lines that prepare each run's folders and mask file.

# Arguments of the label-generation step (no log redirect).
pred_template = (
    '--make-train-label --label-outfile {label_outfile} '
    '--arch {arch} --gpu {gpu} --workers 4 --batch-size 256 '
    '--train-labels {label_path} '
    '--resume {model_path} '
    '--dir-train-mask {mask_path} '
    'datasets/datasets/{dataset}/{dataset}/ '
)
# Arguments of the retraining step; the string ends with a shell redirect into
# the run's log file.
template = (
    '--arch {arch} --seed {seed} --gpu {gpu} --workers 4 '
    '--batch-size 128 --epochs 200 --learning-rate 0.1 '
    '--lr-scheduler CosineAnnealingLR --momentum 0.9 --weight-decay 0.0005 --lr-tmax 200 '
    '--train-labels {new_label_path} '
    '--dir-train-mask {mask_path} '
    'datasets/datasets/{dataset}/{dataset}/ '
    '> {training_folder}/out_{arch}.log '
)

# Relative folder identifying one (sparsity, noise, seed) configuration.
folder_template = '{dataset}_noisy_labels__frac_zero_noise_rates__0_{sparsity}__noise_amount__0_{noise}/seed_{seed}'
# Full path of a noisy-label file; not referenced by the code in this cell.
s_template = '{dataset}_noisy_labels/{dataset}_noisy_labels__frac_zero_noise_rates__0_{sparsity}__noise_amount__0_{noise}/seed_{seed}/{dataset}_noisy_labels__frac_zero_noise_rates__0.{sparsity}__noise_amount__0.{noise}.json'
# Path of the train-mask file of one (configuration, method, arch).
mask_template = '{dataset}_mask{scl}/{base_folder}/{method}/model_{arch}_train_mask.npy'
# Path of the original noisy-label file of one configuration.
label_template = '{dataset}_noisy_labels/{base_folder}/{dataset}_noisy_labels__frac_zero_noise_rates__0.{sparsity}__noise_amount__0.{noise}.json'
# Path of the generated label file. The file name follows `dataset` instead of
# a hard-coded 'cifar10'; for dataset='cifar10' the path is unchanged.
new_label_template = '{dataset}_mask{scl}/{base_folder}/{method}/{dataset}_noisy_labels.json'
# Path of the checkpoint handed to --resume in the label-generation step.
model_template = '{dataset}_training/{base_folder}/model_{arch}_best.pth.tar'


def group_tasks(tasks, jobs_per_gpu):
    """Group multi-step tasks into flat per-task-id command lists.

    Args:
        tasks: sequence of tasks, each itself a sequence of argument strings
            (here: ``[pred_str, arg_str]``).
        jobs_per_gpu: number of consecutive tasks that share one task id.

    Returns:
        dict mapping task id (``index // jobs_per_gpu``) to a new flat list
        holding the argument strings of all its tasks, in order. The input
        lists are never aliased or modified.
    """
    grouped = {}
    for index, commands in enumerate(tasks):
        # extend (not append) keeps the value a flat list of strings, and
        # starting from a fresh list avoids mutating the entries of `tasks`.
        grouped.setdefault(index // jobs_per_gpu, []).extend(commands)
    return grouped


### sweep configuration
dataset = 'cifar10'
seeds = [0, 1, 2]
sparsities = [0, 20, 40, 60]  # full sweep: [0, 20, 40, 60]
noise_rates = [0, 10, 20]  # full sweep: [0, 10, 20]
methods = ['kmeans_pred']  # options: 'kmeans_pred'
jobs_per_gpu = 1
arch = 'resnet50'
###


num_jobs = 0
all_tasks = []
for seed in seeds:
    for sparsity in sparsities:
        for noise in noise_rates:
            # Only sparsity 0 is generated for the (near-)zero-noise setting.
            if noise < 1e-2 and sparsity > 0:
                continue
            # Both values appear in paths as zero-padded two-digit numbers.
            sparsity_str = '{0:0=2d}'.format(sparsity)
            noise_str = '{0:0=2d}'.format(noise)
            base_folder = folder_template.format(dataset=dataset, sparsity=sparsity_str, noise=noise_str, seed=seed)

            for method in methods:
                # The part before the first '_' names the method whose mask
                # is fed to the label-generation step ('kmeans_pred' -> 'kmeans').
                orig_method = method.split('_')[0]
                orig_mask_path = mask_template.format(dataset=dataset, scl='', base_folder=base_folder, method=orig_method, arch=arch)
                model_path = model_template.format(dataset=dataset, base_folder=base_folder, arch=arch)
                label_path = label_template.format(dataset=dataset, base_folder=base_folder, sparsity=sparsity_str, noise=noise_str)
                new_label_path = new_label_template.format(dataset=dataset, scl='', base_folder=base_folder, method=method)

                pred_str = pred_template.format(
                    dataset=dataset,
                    seed=seed,
                    arch=arch,
                    gpu=0,
                    label_path=label_path,
                    model_path=model_path,
                    mask_path=orig_mask_path,
                    label_outfile=new_label_path,
                )

                training_folder = '{}_training_masked/{}/{}'.format(dataset, base_folder, method)
                print('mkdir -p', training_folder)
                # The retraining step gets a copy of the 'no_clean' mask
                # placed under this method's own mask folder.
                no_clean_mask_path = mask_template.format(dataset=dataset, scl='', base_folder=base_folder, method='no_clean', arch=arch)
                mask_path = mask_template.format(dataset=dataset, scl='', base_folder=base_folder, method=method, arch=arch)
                print('mkdir -p', os.path.dirname(mask_path))
                print('cp', no_clean_mask_path, mask_path)

                arg_str = template.format(
                    dataset=dataset,
                    seed=seed,
                    arch=arch,
                    gpu=0,
                    new_label_path=new_label_path,
                    mask_path=mask_path,
                    training_folder=training_folder,
                )

                # Each task is the ordered pair (generate labels, retrain).
                all_tasks.append([pred_str, arg_str])
                num_jobs += 1
# num_tasks is the ceiling of num_jobs / jobs_per_gpu.
print('\n'+'='*20, '\nnum_jobs:',num_jobs, '\nnum_tasks:',(num_jobs+jobs_per_gpu-1)//jobs_per_gpu)


config = group_tasks(all_tasks, jobs_per_gpu)
print('\ntask_assignment =', config)


In [None]:
# retraining with dynamic model-pred labels

import numpy as np
import json
import os
from sklearn.metrics import accuracy_score


# Builds one argument string per (seed, sparsity, noise, method) combination
# for retraining with the --dynamic-train-label options, prints the
# `mkdir -p` / `cp` lines that prepare each run's mask file and output folder,
# and prints the task-id -> argument-strings mapping.

# Arguments of a single run; the string ends with a shell redirect into the
# run's log file. (The double space after "0.5" is part of the string.)
template = (
    '--dynamic-train-label --dynamic-frac-start 0.5  --dynamic-frac-end 0.9 '
    '--arch {arch} --seed {seed} --gpu {gpu} --workers 4 '
    '--batch-size 128 --epochs 200 --learning-rate 0.1 '
    '--lr-scheduler CosineAnnealingLR --momentum 0.9 --weight-decay 0.0005 --lr-tmax 200 '
    '--train-labels {dataset}_noisy_labels/{base_folder}/{dataset}_noisy_labels__frac_zero_noise_rates__0.{sparsity}__noise_amount__0.{noise}.json '
    '--dir-train-mask {mask_path} '
    'datasets/datasets/{dataset}/{dataset}/ '
    '> {training_folder}/out_{arch}.log '
)

# Relative folder identifying one (sparsity, noise, seed) configuration.
folder_template = '{dataset}_noisy_labels__frac_zero_noise_rates__0_{sparsity}__noise_amount__0_{noise}/seed_{seed}'
# Full path of a noisy-label file; not referenced by the code in this cell.
s_template = '{dataset}_noisy_labels/{dataset}_noisy_labels__frac_zero_noise_rates__0_{sparsity}__noise_amount__0_{noise}/seed_{seed}/{dataset}_noisy_labels__frac_zero_noise_rates__0.{sparsity}__noise_amount__0.{noise}.json'
# Path of the train-mask file of one (configuration, method, arch).
mask_template = '{dataset}_mask{scl}/{base_folder}/{method}/model_{arch}_train_mask.npy'


### sweep configuration
dataset = 'cifar10'
seeds = [0, 1, 2]
sparsities = [0, 20, 40, 60]  # full sweep: [0, 20, 40, 60]
noise_rates = [0, 10, 20]  # full sweep: [0, 10, 20]
methods = ['kmeans_pred_dyn']  # options: 'kmeans_pred_dyn'
jobs_per_gpu = 1
arch = 'resnet50'
###


all_tasks = []
for seed in seeds:
    for sparsity in sparsities:
        for noise in noise_rates:
            # Only sparsity 0 is generated for the (near-)zero-noise setting.
            if sparsity > 0 and noise < 1e-2:
                continue
            # Both values appear in paths as zero-padded two-digit numbers.
            sparsity_str = '{:02d}'.format(sparsity)
            noise_str = '{:02d}'.format(noise)
            base_folder = folder_template.format(
                dataset=dataset, sparsity=sparsity_str, noise=noise_str, seed=seed)

            for method in methods:
                # The part before the first '_' names the method whose mask is
                # copied for this run ('kmeans_pred_dyn' -> 'kmeans').
                orig_method = method.partition('_')[0]
                orig_mask_path = mask_template.format(
                    dataset=dataset, scl='', base_folder=base_folder, method=orig_method, arch=arch)
                mask_path = mask_template.format(
                    dataset=dataset, scl='', base_folder=base_folder, method=method, arch=arch)
                print('mkdir -p', os.path.dirname(mask_path))
                print('cp', orig_mask_path, mask_path)

                training_folder = '/'.join([dataset + '_training_masked', base_folder, method])
                print('mkdir -p', training_folder)

                all_tasks.append(template.format(
                    dataset=dataset,
                    seed=seed,
                    arch=arch,
                    gpu=0,
                    sparsity=sparsity_str,
                    noise=noise_str,
                    base_folder=base_folder,
                    mask_path=mask_path,
                    training_folder=training_folder,
                ))

num_jobs = len(all_tasks)
# Ceiling division: number of task ids needed for jobs_per_gpu jobs each.
num_tasks = (num_jobs + jobs_per_gpu - 1) // jobs_per_gpu
print('\n' + '=' * 20, '\nnum_jobs:', num_jobs, '\nnum_tasks:', num_tasks)


# Consecutive runs of jobs_per_gpu argument strings share one task id.
config = {}
for position, job_args in enumerate(all_tasks):
    config.setdefault(position // jobs_per_gpu, []).append(job_args)
print('\ntask_assignment =', config)
