In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

import pandas as pd
import numpy as np
import sys
import shutil

import types

import glob


In [None]:
# my home-written modules
import image_helpers
import split_sets
# import model_helpers

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to the local
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Notebook-wide figure defaults: double the base 80 dpi, half-size (10, 6)
# canvas, and an opaque white background.
plt.rcParams.update({
    'savefig.dpi': 80*2,
    'figure.dpi': 80*2,
    'figure.figsize': np.array((10,6))*.5,
    'figure.facecolor': "white",
})

In [None]:
# Root data directory, as configured in the local `image_helpers` module.
data_dir = image_helpers.data_dir


# Load Data

In [None]:
# Galaxy catalogue, indexed by SpecObjID so rows can be looked up by galaxy id.
df = (
    pd.read_csv(os.path.join(data_dir, "matched_galaxies.csv"))
    .set_index("SpecObjID")
)
print(df.shape)
df.head()

In [None]:
# Regression targets: only the MEDIAN column of the catalogue.
# Take an explicit copy: a `target` column is added to this frame later on, and
# assigning into a frame derived from `df` triggers pandas'
# SettingWithCopyWarning.
df_Y = df[["MEDIAN"]].copy()
df_Y.head()


In [None]:
# Distribution of the raw target, followed by its variance.
plt.hist(x=df_Y["MEDIAN"].values, bins=30)
print(df_Y.var())

In [None]:
# Galaxy ids that have a processed image on disk: every
# `<data_dir>/images/processed/<id>.npy` file contributes its integer `<id>`.
ids_with_images = np.array(
    [
        os.path.basename(filename).replace(".npy", "")
        for filename in glob.glob(
            os.path.join(data_dir, "images", "processed", "*.npy")
        )
    ],
    dtype=int,
)

# Keep the complete list around; the line below it can be uncommented to work
# on a smaller subset.
ids_with_images_full = ids_with_images.copy()
# ids_with_images = ids_with_images[:10000]

## Create a temporary directory of symlinks to images

In [None]:
# Split the available galaxy ids into training / validation / testing sets.
id_sets = split_sets.split_indices(ids_with_images)
training_ids, validation_ids, testing_ids = id_sets

# Centre the regression target on the *training-set* mean only, so no
# information from the validation/testing galaxies enters the target.
# `assign` builds a new frame instead of writing into a frame derived from
# `df`; that avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df_Y = df_Y.assign(
    target=df_Y["MEDIAN"] - df_Y.loc[training_ids, "MEDIAN"].mean()
)

In [None]:
# When True, any previously created symlink tree is wiped before rebuilding.
start_from_scratch = True

# NOTE(review): hard-coded, machine-specific scratch location; consider making
# it configurable. The notes in this notebook say it must live outside Dropbox.
temp_directory = "/Users/egentry/tmp_pytorch/"

# Only remove the tree if it actually exists: an unconditional rmtree raises
# FileNotFoundError on the very first run.
if start_from_scratch and os.path.isdir(temp_directory):
    shutil.rmtree(temp_directory)

os.makedirs(temp_directory, exist_ok=True)

# Path of the processed image for a given galaxy (anchored at the current
# working directory when `data_dir` is relative).
source_format = os.path.join(
    os.getcwd(),
    data_dir,
    "images",
    "processed",
    "{galaxy_id}.npy")

# Symlink layout: <temp_directory>/<split>/<galaxy_id>/<galaxy_id>.npy,
# i.e. one "class" folder per galaxy inside each split folder.
target_dir_format = os.path.join(temp_directory, "{val_train}", "{galaxy_id}")
target_format = os.path.join(target_dir_format, "{galaxy_id}.npy")

def get_val_train(galaxy_id):
    """Return the name of the split that contains `galaxy_id`.

    The splits are checked in the order training, validation, testing and the
    first match wins. Returns "training", "validation" or "testing", or None
    when the id is in none of the three module-level id collections.
    """
    split_names = ("training", "validation", "testing")
    split_members = (training_ids, validation_ids, testing_ids)

    for split_name, members in zip(split_names, split_members):
        if galaxy_id in members:
            return split_name

    return None
    

# Only a small subset of the images is linked for now. 1002 matches the
# original cut-off, which stopped after handling index 1001.
n_images_to_link = 1002

for i, galaxy_id in enumerate(ids_with_images[:n_images_to_link]):
    val_train = get_val_train(galaxy_id)
    if val_train is None:
        # The id belongs to no split: skip it rather than creating a bogus
        # "<temp_directory>/None/..." directory tree.
        continue

    target_dir = target_dir_format.format(galaxy_id=galaxy_id, val_train=val_train)
    os.makedirs(target_dir, exist_ok=True)

    target_filename = target_format.format(galaxy_id=galaxy_id, val_train=val_train)
    try:
        os.symlink(
            source_format.format(galaxy_id=galaxy_id),
            target_filename,
        )
    except FileExistsError:
        # The link is already there from a previous run
        # (start_from_scratch=False); leaving it alone keeps the cell re-runnable.
        pass


# Load data

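Plan: use `torchvision.datasets.DatasetFolder` on a directory with the structure `torch_data/<galaxy_id>/<galaxy_id>.npy`, together with a custom loader that reads the `.npy` file at a given path, and a target transform that turns the label "`galaxy_id`" into the regression target via `df.loc`. Note that `DatasetFolder` passes the integer class *index* to `target_transform`, not the folder name, so the index first has to be mapped back to the folder name (`dataset.classes[index]`). The image files should not be copied; they should only be symlinked (**the symlink directory must live OUTSIDE Dropbox**).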

In [None]:
# Per-phase preprocessing applied to every loaded image.
# Both phases currently only convert the array to a tensor. The torchvision
# augmentations tried earlier (RandomResizedCrop / Resize + CenterCrop,
# RandomHorizontalFlip, RandomVerticalFlip) need a PIL-able image and are
# disabled, as is Normalize with the ImageNet statistics
# ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]).
data_transforms = {
    phase: transforms.Compose([
        transforms.ToTensor(),
    ])
    for phase in ('training', 'validation')
}

In [None]:
def loader(path):
    """Read a `.npy` image, randomly mirror it, and return it channels-last.

    Only the first three entries along axis 0 are kept (presumably the array
    is stored as (channel, row, column) - TODO confirm). Two independent coin
    flips from numpy's global RNG decide whether the last axis and then the
    middle axis are reversed. The result has axes reordered to (1, 2, 0).
    """
    cube = np.load(path)[:3]

    # First draw: mirror along the last axis. Copy so the result is a real
    # array rather than a negatively-strided view.
    if np.random.choice((True, False)):
        cube = np.flip(cube, axis=2).copy()

    # Second draw: mirror along the middle axis.
    if np.random.choice((True, False)):
        cube = np.flip(cube, axis=1).copy()

    # annoying, but pytorch is going to rotate it back
    return np.transpose(cube, (1, 2, 0))

def target_transform(target):
    """transforms `target` from a galaxy_id to the metallicity (regression target)"""
    target = int(target)
    return df_Y.loc[galaxy_id].target

In [None]:
extensions = ["npy"]

image_datasets = {x: datasets.DatasetFolder(os.path.join(temp_directory, x),
                                            loader,
                                            extensions,
                                            data_transforms[x],
                                            target_transform=target_transform)
                  for x in ['training', 'validation']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=64,
                                             shuffle=True, num_workers=4)
              for x in ['training', 'validation']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['training', 'validation']}
image_datasets['training']

In [None]:
# Grab the training dataset for a quick manual inspection.
ds = image_datasets['training']

In [None]:
# First sample of the dataset (indexing is routed to `__getitem__`).
res = ds[0]

In [None]:
# Shape of the first element of the sample (presumably the transformed image).
res[0].shape

In [None]:
# Second element of the sample (presumably the regression target).
res[1]

In [None]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

# Build Model


In [None]:

def train_model(model, criterion, optimizer, scheduler, 
                num_epochs=10,
                verbose=False
               ):
    """
    Train `model` and return it loaded with its best validation weights.

    Every epoch runs a training pass followed by a validation pass. The
    weights giving the lowest validation loss seen so far are kept and loaded
    back into the model before returning.

    Reads the notebook-level globals `dataloaders`, `dataset_sizes` and `device`.

    model: the full pytorch model; expected to live on `device` already,
        since only the batches are moved there
    criterion: the loss function; callable(prediction, targets)
    optimizer: pytorch optimizer object. It is tied to the model *parameters*
        it was constructed with, not to `criterion`
    scheduler: LR scheduler (see `torch.optim.lr_scheduler`); stepped once at
        the end of every epoch
    num_epochs: number of training + validation epochs to run
    verbose: if True, print the shape of the model output for every batch

    Returns the same `model` object.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = np.inf

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['training', 'validation']:
            is_training = (phase == 'training')
            if is_training:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0

            # Iterate over data.
            for inputs, targets in dataloaders[phase]:
                inputs = inputs.to(device)
                # Reshape targets to a column so they line up with the
                # (batch, 1) model output (see the verbose shape print).
                targets = targets.reshape((-1, 1))
                targets = targets.to(device=device, dtype=torch.float)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    if verbose: print("outputs shape: ", outputs.shape)
                    loss = criterion(outputs, targets)

                    # backward + optimize only if in training phase
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # statistics: the loss is weighted by batch size below so the
                # epoch figure is a per-sample average
                running_loss += loss.item() * inputs.size(0)

            epoch_loss = running_loss / dataset_sizes[phase]

            print('{} loss: {:.4f}'.format(
                phase, epoch_loss))

            # deep copy the model
            if phase == 'validation' and best_loss > epoch_loss:
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())

        # Advance the LR schedule only after this epoch's optimizer steps.
        # Stepping before the first optimizer.step() skips the first value of
        # the schedule (and warns) on PyTorch >= 1.1.
        scheduler.step()

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # `.4f` (four decimals) rather than `4f` (minimum width four).
    print('Best val loss: {:.4f}'.format(best_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Start from torchvision's ResNet-34 with its pretrained weights; the pooling
# layer and the final fully-connected layer are replaced further down.
model_ft = models.resnet34(pretrained=True)

def forward(self, x, verbose=False):
    """Forward pass with optional shape tracing.

    Applies, in order: conv1, bn1, relu, maxpool, layer1..layer4, avgpool,
    a flatten to (batch, -1), and fc. It is bound onto the model as its
    `forward` method later in this cell.

    verbose: if True, print the tensor shape after each of layer1..layer4.
    """
    # Stem: the modules applied before the four residual stages.
    for stem_name in ("conv1", "bn1", "relu", "maxpool"):
        x = getattr(self, stem_name)(x)

    # Residual stages, with an optional shape report after each one.
    for stage_number in (1, 2, 3, 4):
        x = getattr(self, "layer{}".format(stage_number))(x)
        if verbose:
            print("after layer {}: ".format(stage_number), x.shape)

    pooled = self.avgpool(x)
    flattened = pooled.view(pooled.size(0), -1)
    return self.fc(flattened)

# Bind the tracing `forward` defined above as this model instance's forward.
model_ft.forward = types.MethodType(forward, model_ft)

# Pool down to a 1x1 map per channel (presumably so the input images need not
# have the size the pretrained network was built for - TODO confirm).
model_ft.avgpool = nn.AdaptiveAvgPool2d((1, 1))

# Swap the final layer for a single-output regression head.
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(in_features=num_ftrs, out_features=1)

model_ft = model_ft.to(device)

# Mean-squared-error loss for the scalar regression target.
criterion = nn.MSELoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.Adam(params=model_ft.parameters())

# Evolve LR using cosine annealing
# note: in order to setup the restarts, I should read: https://arxiv.org/abs/1608.03983
scheduler = lr_scheduler.CosineAnnealingLR(
    optimizer_ft,
    T_max=20,  # in units of epochs
)


In [None]:
# Cap the number of CPU threads torch uses, then run a short training job.
torch.set_num_threads(6)

model_ft = train_model(
    model=model_ft,
    criterion=criterion,
    optimizer=optimizer_ft,
    scheduler=scheduler,
    num_epochs=3,
)