In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import matplotlib.pyplot as plt
import os

import glob

import pandas as pd
import numpy as np


In [None]:
# my home-written modules
import image_helpers
import split_sets
import pytorch_helpers


In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local helper modules imported above) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Notebook-wide figure defaults: the same doubled DPI for on-screen and saved
# figures, a half-scale (10, 6) canvas, and an opaque white background.
plt.rcParams.update({
    "savefig.dpi": 80 * 2,
    "figure.dpi": 80 * 2,
    "figure.figsize": np.array((10, 6)) * .5,
    "figure.facecolor": "white",
})

In [None]:
# Directory under which matched_galaxies.csv and images/processed/*.npy are
# read in the cells below; the location itself is configured in image_helpers.
data_dir = image_helpers.data_dir


# Load Data

In [None]:
# Read the matched-galaxy catalogue and key every row by its SpecObjID.
catalog_path = os.path.join(data_dir, "matched_galaxies.csv")
df = pd.read_csv(catalog_path).set_index("SpecObjID")
print(df.shape)
df.head()

In [None]:
# targets
# targets
# Take an explicit copy of the MEDIAN column. A later cell writes a new
# "target" column into df_Y; assigning into a bare column selection of `df`
# triggers pandas' SettingWithCopyWarning, whereas an owned copy makes the
# write unambiguous and leaves `df` untouched.
df_Y = df[["MEDIAN"]].copy()
df_Y.head()


In [None]:
# Histogram of the raw MEDIAN values, followed by the column variance.
median_values = df_Y["MEDIAN"].values
plt.hist(median_values, bins=30)
print(df_Y.var())

In [None]:
# Collect the integer IDs of every processed image found on disk.
# The ID is the file's basename with ".npy" removed, parsed as an int.
image_pattern = os.path.join(data_dir, "images", "processed", "*.npy")
image_paths = glob.glob(image_pattern)
ids_with_images = np.array(
    [os.path.basename(path).replace(".npy", "") for path in image_paths],
    dtype=int,
)


## Create a temporary directory of symlinks to images

Note: because these are symlinks, the temporary directory should not live inside a Dropbox-tracked folder. Dropbox often converts symlinks into real files, which would duplicate every image and waste disk space.

In [None]:
# Partition the IDs that have images into the three subsets.
id_sets = split_sets.split_indices(ids_with_images)
(training_ids, validation_ids, testing_ids) = id_sets

# Centre the regression target on the mean MEDIAN of the training rows only;
# validation and testing rows do not contribute to the offset.
training_mean = df_Y.loc[training_ids, "MEDIAN"].mean()
df_Y["target"] = df_Y["MEDIAN"] - training_mean


In [None]:
# Show which temporary directory pytorch_helpers is configured to use.
print("temp_directory: ", pytorch_helpers.temp_directory)

# Build the on-disk layout for the full ID list and its three subsets.
# NOTE(review): presumably this creates the symlink tree described in the
# section heading, and start_from_scratch=False reuses an existing tree
# rather than rebuilding it — confirm in pytorch_helpers.
pytorch_helpers.create_pytorch_directory_structure(
    ids_with_images,
    training_ids, validation_ids, testing_ids,
    start_from_scratch=False,
    verbose=True,
)


# Create Data Loaders

In [None]:
# Build the data loaders from the target table. The result is indexed by
# split name further down (e.g. data_loaders["training"]).
# NOTE(review): which df_Y column the loaders use as the label is decided
# inside pytorch_helpers — confirm it is "target" rather than "MEDIAN".
data_loaders = pytorch_helpers.create_data_loaders(
    df_Y,
    verbose=True,
)

In [None]:
# Sanity check: histogram of the second element of every batch yielded by the
# training loader (each item unpacks into a pair; the first element is ignored).
training_target_batches = [
    batch_targets for _, batch_targets in data_loaders["training"]
]
plt.hist(np.hstack(training_target_batches))

In [None]:
# Display the device object exposed by pytorch_helpers (for inspection only).
pytorch_helpers.device

# Build Model


In [None]:
# Instantiate the network and move it onto the device it reports for itself.
# NOTE(review): continue_training=False presumably means "do not resume from a
# saved checkpoint" — confirm in pytorch_helpers.Model.
model = pytorch_helpers.Model(data_loaders, continue_training=False)
model = model.to(model.device)

# Mean-squared-error loss for the regression target.
criterion = nn.MSELoss()

# Adam with default hyperparameters; observe that all parameters are optimized.
optimizer = optim.Adam(model.parameters())

# Record the starting learning rate on the first parameter group. The
# scheduler below is constructed with an explicit `last_epoch`, and PyTorch
# schedulers read "initial_lr" from the param groups in that case.
adam_group = optimizer.param_groups[0]
adam_group["initial_lr"] = adam_group["lr"]

# Evolve the LR using cosine annealing; T_max is currently in units of epochs.
# note: in order to set up the restarts, read https://arxiv.org/abs/1608.03983
scheduler = lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=10,
    last_epoch=model.epoch_counter,
)


In [None]:
# Run-time knobs for this training session.
num_threads = 12
num_epochs = 3

# Set the number of threads PyTorch may use for CPU parallelism.
torch.set_num_threads(num_threads)

# Train for `num_epochs` epochs; train_model's return value is rebound to
# `model`, so later cells use whatever it hands back.
model = model.train_model(criterion, optimizer, scheduler,
                          num_epochs=num_epochs)


In [None]:
# Evaluate on the validation split and compare predictions with true targets.
targets, outputs = model.apply("validation")
print(targets.shape)

# 2-D histogram of (target, prediction) pairs, with the 1:1 line over [-1, 1]
# and dashed guides through zero on both axes.
plt.hist2d(targets, outputs, bins=30, cmap="Greys")
one_to_one = [-1, 1]
plt.plot(one_to_one, one_to_one, color="black")
for draw_zero_line in (plt.axvline, plt.axhline):
    draw_zero_line(0, linestyle="dashed", color="black")

# Root-mean-square of the prediction residuals.
residuals = targets - outputs
print("RMS error: ", np.mean(residuals**2)**.5)

In [None]:
# Load the training log that the model writes to `model.filename_logger`.
# NOTE(review): this rebinds `df`, which until this cell held the galaxy
# catalogue read from matched_galaxies.csv. Re-running an earlier cell that
# reads `df` (e.g. the df_Y cell) after this one will see the log instead —
# consider a distinct name such as `df_log` here and in the plotting cell.
df = pd.read_csv(model.filename_logger)

In [None]:
# Learning curves: square root of the logged losses, i.e. RMS error per row.
for column, curve_label in (("loss", "training"), ("val_loss", "validation")):
    plt.plot(df[column]**.5, label=curve_label)

# Reference level: the standard deviation of the MEDIAN targets.
plt.axhline(
    df_Y.MEDIAN.std(),
    linestyle="dashed",
    color="black",
    label="std.dev.",
)

plt.xlabel("Epoch #")
plt.ylabel("RMS Error (dex)")

# Cap the top of the y-axis at 1 so an early spike does not flatten the curves.
current_top = max(plt.ylim())
plt.ylim(top=min(1, current_top))
plt.legend()
