import argparse
import datetime
import os
import time

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Subset

from classifier import *
from dataloader import DualLoader
from encoder import *
from simsiam import *
# Experiment pipeline:
#   1. pick a set of hyperparameters via command-line args
#   2. train the self-supervised model end to end with those hyperparameters
#   3. train a classifier on the encoder outputs and evaluate it on the test set
def load_dataloader(X_path, y_path, batch_size=64, target_size=(28,28), subset_percentage=1.0):
    # load dataset
    start_time = time.time()
    dataset = DualLoader(X_path, y_path, target_size=target_size)
    # subsample the dataset before handing it to the DataLoader
    num_samples = int(len(dataset) * subset_percentage)
    loader = DataLoader(Subset(dataset, range(num_samples)), batch_size=batch_size, shuffle=True, num_workers=0)
    print(f"{len(loader.dataset)} records loaded in {round(time.time()-start_time,2)} seconds")
    print()
    return loader
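
# Example usage of load_dataloader (hypothetical paths; the CSVs must match
# what DualLoader expects):
#   loader = load_dataloader("data/X_train.csv", "data/y_train.csv",
#                            batch_size=32, target_size=(28,28), subset_percentage=0.1)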
# TODO: load data inside this function so trained models can be cross-tested on other datasets
def eval_models(encoder, classifier, device, target_shape=(28,28)):
    # stub: currently only switches both models to eval mode
    encoder.eval()
    classifier.eval()
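
# A minimal sketch of the cross-testing helper the stub above is meant to
# become: load an arbitrary dataset here, then reuse evaluate_accuracy (the
# same helper main() calls on the test set). The name and signature below are
# hypothetical, not part of the current pipeline.
def eval_models_sketch(autoencoder, classifier, X_path, y_path, target_shape=(28,28)):
    loader = load_dataloader(X_path, y_path, target_size=target_shape)
    return evaluate_accuracy(autoencoder, classifier, loader)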
def main(args):
    # set default device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Running torch on {device}")
    print()
    # create directories for logging if not present
    model_dir = os.path.join('experiments', args.self_supervised_model, 'model')
    os.makedirs(model_dir, exist_ok=True)
    # check whether a run with the same id already exists
    response = None
    if str(args.run_id) in os.listdir(model_dir):
        # if the id is the default (no manual input), just increment it
        if args.run_id == 0:
            while str(args.run_id) in os.listdir(model_dir):
                args.run_id += 1
        # if it is a named run, ask the user before overwriting
        else:
            print(f'Run ID "{args.run_id}" already in use')
            print('Would you like to overwrite? [[y]/n]')
            response = input()
            if response == 'n':
                print('quitting...')
                quit()
            print("overwriting...")
    if response is None:
        os.makedirs(os.path.join(model_dir, str(args.run_id)))
# data paths
data_dir = "/scratch/fc1132/capstone_data"
X_path_train = f"{data_dir}/X_train_split.csv"
y_path_train = f"{data_dir}/y_train_split.csv"
X_path_val = f"{data_dir}/X_val_split.csv"
y_path_val = f"{data_dir}/y_val_split.csv"
X_path_test = f"{data_dir}/X_test_sat6.csv"
y_path_test = f"{data_dir}/y_test_sat6.csv"
# SELF-SUPERVISED
# train/val data for encoder/decoder
print(f"Creating self-supervised training DataLoader: batch size = {args.batch_size}, target size = {args.encoder_image_shape}, subset % = {round(args.subset_percentage*100,2)}")
train_loader = load_dataloader(X_path_train,y_path_train,batch_size=args.batch_size,
target_size=args.encoder_image_shape,
subset_percentage=args.subset_percentage)
print(f"Creating self-supervised validation DataLoader: batch size = {args.batch_size}, target size = {args.encoder_image_shape}, subset % = {round(args.subset_percentage*100,2)}")
val_loader = load_dataloader(X_path_val,y_path_val,batch_size=args.batch_size,
target_size=args.encoder_image_shape,
subset_percentage=args.subset_percentage)
    # build and train the autoencoder or simsiam model
    if args.self_supervised_model == 'autoencoder':
        if args.latent_shape == 'conv':
            autoencoder = Autoencoder(args.latent_dim, args.encoder_image_shape)
        elif args.latent_shape == 'flat':
            autoencoder = FlatAutoencoder(args.encoder_image_shape[0], args.latent_dim, 4)
        autoencoder.to(device)
        train_autoencoder(autoencoder, train_loader, val_loader, args.epochs, loss=args.encoder_loss, run_id=args.run_id)
        # torch.save on a whole model pickles it, so loading later needs the same class definitions importable
        torch.save(autoencoder, f'experiments/{args.self_supervised_model}/model/{args.run_id}/autoencoder_final.pth')
    elif args.self_supervised_model == 'simsiam':
        # TODO: incorporate the conv autoencoder here as well
        autoencoder = FlatAutoencoder(args.encoder_image_shape[0], args.latent_dim, 4)
        autoencoder.to(device)
        train_simsiam(autoencoder, train_loader, val_loader, num_epochs=args.epochs, run_id=args.run_id)
        torch.save(autoencoder, f'experiments/{args.self_supervised_model}/model/{args.run_id}/SimSiam_final.pth')
    else:
        raise ValueError(f"Unknown self-supervised model: {args.self_supervised_model}")
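    # Sketch for the not-yet-implemented --encoder_model_path option (see the
    # commented-out argument in the parser below): since models are saved whole
    # with torch.save, loading a pretrained encoder could look like
    #   autoencoder = torch.load(args.encoder_model_path, map_location=device)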
# CLASSIFICATION
# train/val data for classifier
print(f"Creating classifier training DataLoader: batch size = {args.batch_size}, target size = {args.classifier_image_shape}, subset % = {round(args.subset_percentage*100,2)}")
train_loader = load_dataloader(X_path_train,y_path_train,batch_size=args.batch_size,
target_size=args.classifier_image_shape,
subset_percentage=args.subset_percentage)
print(f"Creating classifier validation DataLoader: batch size = {args.batch_size}, target size = {args.classifier_image_shape}, subset % = {round(args.subset_percentage*100,2)}")
val_loader = load_dataloader(X_path_val,y_path_val,batch_size=args.batch_size,
target_size=args.classifier_image_shape,
subset_percentage=args.subset_percentage)
    # build and train the classifier specified by the command-line args
    if args.latent_shape == 'conv':
        # infer the conv classifier's input shape from a single batch of embeddings
        images, _ = next(iter(train_loader))
        inputs, _, _ = images
        inputs = inputs.to(device)
        with torch.no_grad():
            embeddings = autoencoder.encoder(inputs)
        # embeddings come out NCHW; the classifier takes (height, width, channels)
        latent_shape = (embeddings.shape[2], embeddings.shape[3], embeddings.shape[1])
        classifier = ConvClassifier(latent_shape[0], latent_shape[1], in_channels=latent_shape[2],
                                    out_channels=latent_shape[2], dropout_prob=args.dropout)
    elif args.latent_shape == 'flat':
        classifier = Classifier(input_dim=args.latent_dim, dropout_prob=args.dropout)
    else:
        raise ValueError(f"Unknown latent shape: {args.latent_shape}")
    classifier.to(device)
    train_classifier(autoencoder, classifier, train_loader, val_loader, args.epochs, run_id=args.run_id)
    torch.save(classifier, f"experiments/{args.self_supervised_model}/model/{args.run_id}/classifier_final.pth")
# eval classifier on test set
print(f"Creating classifier test DataLoader: batch size = {args.batch_size}, target size = {args.classifier_image_shape}, subset % = {round(args.subset_percentage*100,2)}")
test_loader = load_dataloader(X_path_test,y_path_test,batch_size=args.batch_size,
target_size=args.classifier_image_shape,
subset_percentage=args.subset_percentage)
    test_accuracy = evaluate_accuracy(autoencoder, classifier, test_loader)
    print(f"Test accuracy: {test_accuracy}")
    # dataframe defining the run-log schema
    df = pd.DataFrame(columns=['Timestamp','Run ID/Name','Epochs','Batch Size','Subset %',
                               'Latent Dimension Depth','Latent Dimension Output Shape',
                               'Encoder Training Dimensions','Self-Supervised Model',
                               'Autoencoder Loss Function','Classifier Training Dimensions',
                               'Classifier Test Accuracy'])
if "run_logs.csv" not in os.listdir(f"experiments/{args.self_supervised_model}/"):
df.to_csv(f"experiments/{args.self_supervised_model}/run_logs.csv",index=False)
    # the encoder loss function only applies to the autoencoder
    encoder_loss = args.encoder_loss if args.self_supervised_model == 'autoencoder' else 'N/A'
    # append an entry to the log file
    entry = [datetime.datetime.now().strftime('%c'), args.run_id, args.epochs, args.batch_size,
             args.subset_percentage, args.latent_dim, args.latent_shape, args.encoder_image_shape,
             args.self_supervised_model, encoder_loss, args.classifier_image_shape, round(test_accuracy, 2)]
    df.loc[len(df.index)] = entry
    df.to_csv(f"experiments/{args.self_supervised_model}/run_logs.csv", index=False, header=False, mode='a')
return
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train a self-supervised encoder plus classifier and evaluate on the SAT-6 test set")
# shared arguments
parser.add_argument("--run_id", type=str, default=0, help="id / name for models to be stored as")
parser.add_argument("--epochs", type=int, default=15, help="Number of epochs to train for")
parser.add_argument("--subset_percentage", type=float, default=1.0, help="0.0-1.0 percentage of data to actually train on")
parser.add_argument("--batch_size", type=int, default=64, help="Batch size for data loader")
parser.add_argument("--latent_dim", type=int, default=128, help="Depth of the latent dimension (encoder outputs)")
parser.add_argument("--latent_shape", type=str, default="conv", help="Shape of the latent dimension - \"conv\" or \"flat\"")
# encoder arguments
parser.add_argument("--encoder_image_shape", type=int, nargs=2, default=(28,28), help="Shape of encoder training images")
parser.add_argument("--self_supervised_model", type=str, default='autoencoder', help="Which self-supervised model (simsiam or autoencoder) to use in training the encoder portion")
parser.add_argument("--encoder_loss", type=str, default="binary_crossentropy", help="Loss function to train on (binary_crossentropy or mse)")
# NOT YET IMPLEMENTED parser.add_argument("--encoder_model_path", type=str, default=None, help="Model path to use a pretrained encoder")
# classifier arguments
parser.add_argument("--classifier_image_shape", type=int, nargs=2, default=(28,28), help="Shape of encoder input images during classifier training")
parser.add_argument("--dropout", type=float, default=0.3, help="dropout rate for classifier")
args = parser.parse_args()
# run main
main(args)
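
# Example invocation (hypothetical values; omitted flags fall back to their defaults):
#   python experiment.py --run_id baseline --epochs 20 --batch_size 128 \
#       --self_supervised_model simsiam --latent_shape flat --latent_dim 256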