/
train_boundaries.py
129 lines (112 loc) · 4.67 KB
/
train_boundaries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
from os.path import join as ojoin
import argparse
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler
from concurrent.futures import ProcessPoolExecutor, wait
def _accuracy_percent(classifier, data, labels):
    """return the percentage (rounded to 2 decimals) of rows in `data` for which
    `classifier.predict` matches the entry in `labels`
    """
    correct_num = np.sum(labels == classifier.predict(data))
    return round(correct_num / labels.size * 100, 2)


def train_boundary(data_path, save_path, split_ratio=0.7, ratio_pos_samples=0.5):
    """train one boundary and save it as a numpy array

    The latent codes are min-max scaled to [-1, 1] per feature and ordered by
    descending label score. The highest-scoring samples become the positive
    class and the lowest-scoring samples the negative class. A LinearSVC is
    fitted on them and its unit-normalised coefficient vector is written to
    `<save_path>/boundary_<suffix>.npy`, where `<suffix>` is the text after the
    last underscore of the class directory name.

    args:
        data_path: path to dir containing id_bound_latents.npy and
            id_bound_labels.npy; the directory name must end in an integer
            class index (e.g. ".../cls_12")
        save_path: path to directory where to save the boundary
        split_ratio: share of the chosen samples of each class used for
            training; the rest forms the validation set
        ratio_pos_samples: share of all samples used per class.
            0.5 results in using half of all data as positive and half as
            negative; larger values are capped at half of the data so the two
            classes never overlap
    raises:
        ValueError: if the directory name carries no integer suffix, if latents
            and labels disagree in length, or if there are too few samples to
            put at least one sample per class into the training set
    """
    # Parse the class index from the last path component only, so a trailing
    # separator or underscores in parent directories cannot break it.
    class_suffix = os.path.basename(os.path.normpath(data_path)).split("_")[-1]
    i = int(class_suffix)

    latent_codes = np.load(ojoin(data_path, "id_bound_latents.npy"))
    scores = np.load(ojoin(data_path, "id_bound_labels.npy"))
    if scores.ndim == 1:
        # Accept flat label vectors as well as (N, 1) columns.
        scores = scores[:, np.newaxis]

    num_samples = latent_codes.shape[0]
    latent_space_dim = latent_codes.shape[1]
    if scores.shape[0] != num_samples:
        raise ValueError(
            f"{data_path}: {num_samples} latent codes but {scores.shape[0]} labels"
        )

    chosen_num = min(int(num_samples * ratio_pos_samples), num_samples // 2)
    train_num = int(chosen_num * split_ratio)
    val_num = chosen_num - train_num
    if train_num < 1:
        # Fail with a readable message instead of an opaque estimator error.
        raise ValueError(
            f"{data_path}: {num_samples} samples are too few to train a boundary "
            f"(split_ratio={split_ratio}, ratio_pos_samples={ratio_pos_samples})"
        )

    scaling = MinMaxScaler(feature_range=(-1, 1)).fit(latent_codes)
    latent_codes = scaling.transform(latent_codes)

    # Descending order of the first label column: best scores first.
    sorted_idx = np.argsort(scores, axis=0)[::-1, 0]
    latent_codes = latent_codes[sorted_idx]

    # Positive samples: random train/val split of the top `chosen_num` rows.
    positive_idx = np.arange(chosen_num)
    np.random.shuffle(positive_idx)
    positive_train = latent_codes[:chosen_num][positive_idx[:train_num]]
    positive_val = latent_codes[:chosen_num][positive_idx[train_num:]]

    # Negative samples: random train/val split of the bottom `chosen_num` rows.
    negative_idx = np.arange(chosen_num)
    np.random.shuffle(negative_idx)
    negative_train = latent_codes[-chosen_num:][negative_idx[:train_num]]
    negative_val = latent_codes[-chosen_num:][negative_idx[train_num:]]

    # Training set.
    train_data = np.concatenate([positive_train, negative_train], axis=0)
    train_label = np.concatenate(
        [np.ones(train_num, dtype=int), np.zeros(train_num, dtype=int)], axis=0
    )

    # Validation set.
    val_data = np.concatenate([positive_val, negative_val], axis=0)
    val_label = np.concatenate(
        [np.ones(val_num, dtype=int), np.zeros(val_num, dtype=int)], axis=0
    )

    classifier = LinearSVC(dual=False, verbose=False).fit(train_data, train_label)

    # Diagnostics are printed only for every 100th class index, and only when
    # a validation split exists, to keep the output of parallel runs readable.
    if val_num and i % 100 == 0:
        accuracy = _accuracy_percent(classifier, val_data, val_label)
        print(f"\nValidation accuracy of SVM {i}: {accuracy}")
        print(f"Number of iterations of SVM {i}: {classifier.n_iter_}")
        accuracy = _accuracy_percent(classifier, train_data, train_label)
        print(f"Training accuracy of SVM {i}: {accuracy}")

    # The classifier was fitted on the scaled codes, so this unit normal
    # refers to the scaled feature space.
    a = classifier.coef_.reshape(1, latent_space_dim).astype(np.float32)
    boundary = a / np.linalg.norm(a)

    filename = "boundary_" + class_suffix + ".npy"
    np.save(ojoin(save_path, filename), boundary)
def train_all_boundaries(datadir, save_path, pool_size=20):
    """train all boundaries and save them

    Every sub-directory of `datadir` is treated as one class and handed to
    `train_boundary` in a worker process. Plain files in `datadir` are skipped.
    A failing class does not abort the run: its error is printed once all
    workers have finished.

    args:
        datadir: directory containing all boundary training data
        save_path: directory where to save all boundaries (created if missing)
        pool_size: size of process pool to run in parallel
    """
    os.makedirs(save_path, exist_ok=True)
    classes = sorted(
        cls for cls in os.listdir(datadir) if os.path.isdir(ojoin(datadir, cls))
    )
    print("Training boundaries...")

    # The context manager shuts the pool down on every exit path, so no
    # worker processes are left behind.
    with ProcessPoolExecutor(pool_size) as executor:
        jobs = {
            executor.submit(train_boundary, ojoin(datadir, cls), save_path): cls
            for cls in classes
        }
        wait(jobs)

    # wait() alone never surfaces worker exceptions; report them explicitly
    # so a missing boundary file can be traced back to its cause.
    for job, cls in jobs.items():
        error = job.exception()
        if error is not None:
            print(f"Training boundary for {cls} failed: {error!r}")
def main(args):
    """run the boundary training with the parsed command-line options
    args:
        args: namespace providing `datadir`, `savepath` and `processes`
    """
    train_all_boundaries(args.datadir, args.savepath, args.processes)
if __name__ == "__main__":
    # Command-line entry point: declare the three options, parse them and
    # pass the resulting namespace to main().
    cli = argparse.ArgumentParser(description="Train Identity-Separating Boundaries")
    # (flag, value type, default, help text) for each supported option.
    option_specs = (
        (
            "--datadir",
            str,
            "/data/synthetic_imgs/SG3_SVM_data",
            "path to SVM training data",
        ),
        (
            "--savepath",
            str,
            "/home/boundaries/boundaries_SG3_w_space",
            "where to save the boundaries",
        ),
        (
            "--processes",
            int,
            30,
            "number of processes to run training in parallel",
        ),
    )
    for flag, value_type, fallback, description in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback, help=description)
    main(cli.parse_args())