forked from k-styles/KD-Calibration
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scripts.sh
186 lines (163 loc) · 4.81 KB
/
scripts.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
### CIFAR10/100 ###############################################################
# Train a resnet56 teacher on CIFAR-100 with plain cross-entropy.
# Single GPU, fp16 mixed precision via `accelerate launch`.
# LR 0.1 with a multi-step schedule: decayed by 0.1x at epochs 80 and 120,
# 160 epochs total, weight decay 1e-4, batch size 128.
CUDA_VISIBLE_DEVICES=0 accelerate launch --mixed_precision fp16 train_teacher.py \
--dataset cifar100 \
--model resnet56 \
--lr 0.1 \
--epochs 160 \
--scheduler multistep \
--schedule-steps 80 120 \
--lr-decay-factor 0.1 \
--wd 1e-4 \
--train-batch-size 128 \
--loss cross_entropy
# Train a resnet56 teacher on CIFAR-100 with the combined FL+MDCA loss
# (same optimizer/schedule as the cross-entropy run above).
# NOTE(review): presumably --gamma is the focal-loss exponent and --beta the
# MDCA term weight — confirm against utils/argparser.py.
CUDA_VISIBLE_DEVICES=0 accelerate launch --mixed_precision fp16 train_teacher.py \
--dataset cifar100 \
--model resnet56 \
--lr 0.1 \
--epochs 160 \
--scheduler multistep \
--schedule-steps 80 120 \
--lr-decay-factor 0.1 \
--wd 1e-4 \
--train-batch-size 128 \
--loss FL+MDCA --gamma 3.0 --beta 1.0
# Train a convnet10 teacher on CIFAR-100 with cross-entropy.
# NOTE(review): no --scheduler/--schedule-steps passed here, so whatever
# default utils/argparser.py defines is used — verify if a schedule is intended.
CUDA_VISIBLE_DEVICES=0 accelerate launch --mixed_precision fp16 train_teacher.py \
--dataset cifar100 \
--model convnet10 \
--lr 0.1 \
--epochs 160 \
--wd 1e-4 \
--train-batch-size 128 \
--loss cross_entropy
# Distill a resnet8 student from the resnet56 teacher trained above.
# --teacher_path points at a timestamped checkpoint directory from a previous
# teacher run — update it to your own run's output before launching.
# NOTE(review): --T is presumably the distillation temperature and --Lambda the
# KD/CE mixing weight — confirm against train_student.py.
CUDA_VISIBLE_DEVICES=0 accelerate launch --mixed_precision fp16 train_student.py \
--dataset cifar100 \
--model resnet8 \
--teacher resnet56 \
--teacher_path checkpoint/cifar100/2022-10-06-17:52:57.291536_resnet56_cross_entropy \
--lr 0.1 \
--epochs 160 \
--wd 1e-4 \
--train-batch-size 128 \
--T 5 --Lambda 0.95
###############################################################################################################
# Train a resnet18 teacher on Tiny-ImageNet with warmup + cosine LR scheduling
# (1000 warmup steps). Runs on 4 GPUs; with a per-GPU batch size of 64 this
# works out to ~352 optimizer steps per epoch at the time this was written.
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --mixed_precision fp16 train_teacher.py \
--dataset tiny_imagenet \
--model resnet18_tin \
--lr 0.1 \
--scheduler warmupcosine \
--warmup 1000 \
--wd 1e-4 \
--train-batch-size 64 \
--epochs 50 \
--loss cross_entropy
# Distill a resnet18 student on Tiny-ImageNet from a resnet152 teacher.
# NOTE(review): the original comment claimed "same config as above", but this
# run uses --epochs 5 versus the teacher's 50 — confirm whether 5 is a
# deliberate short run or a typo.
# NOTE(review): the teacher checkpoint is passed via --checkpoint (a file path)
# here, while the CIFAR student above used --teacher_path (a directory) —
# presumably different script revisions; verify which flag train_student.py expects.
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --mixed_precision fp16 train_student.py \
--dataset tiny_imagenet \
--model resnet18_tin \
--lr 0.1 \
--scheduler warmupcosine \
--warmup 1000 \
--wd 1e-4 \
--train-batch-size 64 \
--epochs 5 \
--dw 1.0 \
--temp 1.0 \
--teacher resnet152_tin --checkpoint checkpoint/tiny_imagenet/2022-09-29-17:03:39.337810_resnet152_tin_cross_entropy/model_best.pth
# Distill a wide-resnet-40-1 student on CIFAR-100 from a pre-trained
# OE-tuned wide-resnet-40-2 teacher checkpoint.
# NOTE(review): "OE" presumably refers to Outlier Exposure fine-tuning (the
# checkpoint name contains "oe_tune") — confirm the teacher's provenance.
# --dw and --temp are presumably the distillation weight and temperature.
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --mixed_precision fp16 train_student.py \
--dataset cifar100 \
--model wide-resnet-40-1 \
--lr 0.1 \
--scheduler warmupcosine \
--wd 5e-4 \
--train-batch-size 64 \
--epochs 100 \
--dw 1.0 \
--temp 1.0 \
--teacher wide-resnet-40-2 \
--checkpoint pretrained_models/wide-resnet-40-2_cifar100/cifar100_wrn_oe_tune_epoch_9.pt
# Train a convnet teacher on CIFAR-10 with cross-entropy.
# Plain `python` (no accelerate / no explicit GPU pinning); note the much
# smaller LR (0.001) than the resnet runs and no explicit LR schedule.
python train_teacher.py \
--dataset cifar10 \
--model convnet \
--lr 0.001 \
--wd 1e-4 \
--train-batch-size 128 \
--epochs 200 \
--loss cross_entropy
# One point of an FL+MDCA hyper-parameter sweep on resnet110 / CIFAR-100.
# Intended grid: gamma in [1, 2, 3] x beta in [1, 5, 10]; this command pins
# gamma=1.0, beta=1.0 — re-run with the other values to cover the grid
# (or generate the variants into a command file for the scheduler below).
CUDA_VISIBLE_DEVICES=7 python train_teacher.py \
--dataset cifar100 \
--model resnet110 \
--lr 0.1 \
--lr-decay-factor 0.1 \
--wd 5e-4 \
--train-batch-size 128 \
--schedule-steps 100 150 \
--epochs 200 \
--loss FL+MDCA \
--gamma 1.0 \
--beta 1.0
# One point of a focal-loss gamma sweep on resnet110 / CIFAR-100.
# Intended values: gamma in [1, 2, 3]; this command pins gamma=1.0 —
# re-run with the other values to complete the sweep.
CUDA_VISIBLE_DEVICES=7 python train_teacher.py \
--dataset cifar100 \
--model resnet110 \
--lr 0.1 \
--lr-decay-factor 0.1 \
--wd 5e-4 \
--train-batch-size 128 \
--schedule-steps 100 150 \
--epochs 200 \
--loss focal_loss \
--gamma 1.0
# Dispatch batches of training commands across GPUs with simple_gpu_scheduler,
# which reads one shell command per line from stdin and assigns each a free GPU.
# When GPUs 5-7 are free, run the "hard" (long-running) command list on them:
simple_gpu_scheduler --gpus 5 6 7 < gpu_commands_hard.txt
# When sharing all GPUs 0-7 (not fully free), run the "easy" command list:
simple_gpu_scheduler --gpus 0 1 2 3 4 5 6 7 < gpu_commands_easy.txt
# Train a resnet50 teacher with the mdca loss.
# NOTE(review): the original comments said "Train cifar10" / "teacher training
# on CIFAR10" and "replace resnet20", but the command below passes
# --dataset cifar100 and --model resnet50 — confirm which dataset/model the
# comment intended; the flags are what actually runs.
# Swap the model name (resnet18, resnet110, ...) to train other architectures;
# see utils/argparser.py for the full set of tunable hyper-parameters.
# NOTE(review): --gamma is presumably a focal-loss parameter; whether the plain
# mdca loss consumes it is not visible here — check utils/argparser.py.
CUDA_VISIBLE_DEVICES=6 python train_teacher.py \
--dataset cifar100 \
--model resnet50 \
--lr 0.1 \
--lr-decay-factor 0.1 \
--wd 5e-4 \
--train-batch-size 128 \
--schedule-steps 100 150 \
--epochs 200 \
--loss mdca \
--gamma 3.0 \
--beta 10.0
# Cross-entropy baseline for the resnet50 / CIFAR-100 run above
# (identical optimizer and schedule; only --loss differs).
CUDA_VISIBLE_DEVICES=6 python train_teacher.py \
--dataset cifar100 \
--model resnet50 \
--lr 0.1 \
--lr-decay-factor 0.1 \
--wd 5e-4 \
--train-batch-size 128 \
--schedule-steps 100 150 \
--epochs 200 \
--loss cross_entropy
# Distill a resnet18 student on CIFAR-10 from a resnet152 teacher checkpoint.
# NOTE(review): the original comment "loss = [cross_entropy, mdca]" presumably
# lists the supported loss choices; no --loss flag is passed here, so the
# script's default is used — confirm the default in utils/argparser.py.
CUDA_VISIBLE_DEVICES=7 python train_student.py \
--dataset cifar10 \
--model resnet18 \
--teacher resnet152 \
--checkpoint checkpoint/cifar10/15-May_resnet152_cross_entropy/model_best.pth \
--lr 0.1 \
--lr-decay-factor 0.1 \
--wd 5e-4 \
--train-batch-size 128 \
--schedule-steps 100 150 \
--epochs 200