In [1]:
import sys
import os
from nbfinder import NotebookFinder
sys.meta_path.append(NotebookFinder())

import netCDF4 as nc
from netCDF4 import MFDataset
from os import listdir, system
from os.path import isfile, join, isdir
import numpy as np
import imp
import itertools
import os
import sys
import time
import inspect
import copy
from util import get_camfiles,convert_nc_data_to_tensor
from labels.yolo_maker import make_yolo_masks_for_dataset, make_multiple_yolo_masks
import random
from configs import configs
import collections

importing Jupyter notebook from util.ipynb
importing Jupyter notebook from labels/util.ipynb
importing Jupyter notebook from labels/yolo_maker.ipynb
importing Jupyter notebook from labels/label_loader.ipynb
importing Jupyter notebook from labels/configs.ipynb
importing Jupyter notebook from configs.ipynb


In [2]:
class DataSet(object):

    def __init__(self,
               examples):


        self._examples = examples
        self._num_examples = self._examples.total_examples
        self._epochs_completed = 0
        self._index_in_epoch = 0

    @property
    def examples(self):
        return self._examples


    @property
    def num_examples(self):
        return self._num_examples

    @property
    def epochs_completed(self):
        return self._epochs_completed
    
    def shuffle(self):
        pass

    def next_batch(self, batch_size, shuffle=True):
        """Return the next `batch_size` examples from this data set."""
        start = self._index_in_epoch
        # Shuffle for the first epoch
        if self._epochs_completed == 0 and start == 0 and shuffle:
            
            #shuffle files
            seed = np.random.randint(0,1000)
            self._examples.shuffle(seed)

        # Go to the next epoch
        if start + batch_size > self._num_examples:
            # Finished epoch
            self._epochs_completed += 1
            
            # Get the rest examples in this epoch
            rest_num_examples = self._num_examples - start
            images_rest_part, labels_rest_part = self._examples[start:self._num_examples]
            
            # Shuffle the data
            if shuffle:
                seed = np.random.randint(0,1000)
                self._examples.shuffle(seed)
            # Start next epoch
            start = 0
            self._index_in_epoch = batch_size - rest_num_examples
            end = self._index_in_epoch
            images_new_part, labels_new_part = self._examples[start:end]
            return np.concatenate((images_rest_part, images_new_part), axis=0), np.concatenate((labels_rest_part, labels_new_part), axis=0)
        else:
            self._index_in_epoch += batch_size
            end = self._index_in_epoch
            images, labels = self._examples[start:end]
            return images,labels


In [3]:
class ClimateImageLabelPairs(object):
    def __init__(self,filepaths,labels_csv_file, variables=["TMQ", "VBOT", "PSL"],
                 time_step_sample_freq=2, time_steps_per_example=1,time_steps_per_file=8,):
        
        assert time_steps_per_example == 1, "3d not quite supported for labels"
        frame = inspect.currentframe()
        # set self.k = v for every k,v pair in __init__ except self of course
        self.set_constructor_args(frame)
        
        self.num_files = len(self.filepaths)
        self.examples_per_file = (time_steps_per_file / time_step_sample_freq) / time_steps_per_example
        self.total_examples = self.num_files * self.examples_per_file

    def set_constructor_args(self,frame):
        #set data members for object from constructor args
        _, _, _, params = inspect.getargvalues(frame)
        del params["frame"]
        for k,v in params.iteritems():
            setattr(self,k,v)
        
        
    
    def shuffle(self, seed):
        '''only shufflez files'''
        random.shuffle(self.filepaths)

    
    #overloading of bracket operators
    def __getitem__(self, slice_):
        slices = self.convert_slice_to_file_and_ex_inds(slice_)
        file_slice = slices["file_slice"]
        ex_slice = slices["ex_slice"]
        
        filepaths = self.filepaths[file_slice]
        tens = self.grab_data_chunk(filepaths)
        lbls = make_multiple_yolo_masks(camfile_paths=filepaths,
                                    labels_csv_file=self.labels_csv_file,
                                    caffe_format=True)
        images = tens[ex_slice]
        labels = lbls[ex_slice]
        return images, labels
    
    def convert_slice_to_file_and_ex_inds(self, slice_):
        if isinstance(slice_, slice):
            start, stop, step = [getattr(slice_,k) for k in ["start", "stop", "step"]]
            assert step==1 or step is None, "step must be 1 or None"
        
        elif isinstance(slice_, int):
            start, stop = [slice_, slice_]
            
        slices =  self.get_file_and_ex_inds(start, stop)
        return slices
        
        
        
            
    def get_file_and_ex_inds(self, start, stop):
        #file start stop indices to index filenames
        file_start, file_stop = self.get_file_ind(start), self.get_file_ind(stop)
        
        # get some useful numbers
        tot_examples_desired = stop - start 
        
        #relative example indices after examples read in
        ex_start = self.get_relative_ex_ind(start)
        ex_stop = ex_start + tot_examples_desired
        
        file_slice = slice(file_start,file_stop + 1)
        ex_slice = slice(ex_start,ex_stop)
        
        return {"file_slice":file_slice, "ex_slice": ex_slice}
    

    

    def get_file_ind(self,ex_ind):
        return ex_ind / self.examples_per_file

    def get_relative_ex_ind(self, ex_ind):
        return ex_ind % self.examples_per_file
        
        
        
    def grab_data_chunk(self, filepaths):
        """grabs input data (converts filepaths to np tensors)
        returns len(filepaths)*4, 16, 768,1152 array"""
        


        dataset=MFDataset(filepaths)
        
        tensor = convert_nc_data_to_tensor(dataset,self.variables, 
                                           self.time_step_sample_freq, self.time_steps_per_example)
 
        return tensor

In [4]:
def make_datasets():
    Datasets = collections.namedtuple('Datasets', ['tr', 'val', 'test', "tr_unlabelled"])
    data_dir = configs["data_dir"]
    
    tr, val, test, tr_unlabelled = [make_dataset(data_dir,type_) for type_ in ['tr', 'val', 'test', "tr_unlabelled"]]
    climate_data = Datasets(tr=tr, val=val, test=test, tr_unlabelled=tr_unlabelled)
    return climate_data
    

In [5]:
def make_dataset(data_dir, type_):
    
    camfiles = get_camfiles(data_dir, configs[type_ + "_years"], with_dir=True)
    image_lbl_pair = ClimateImageLabelPairs(filepaths=camfiles,
                      labels_csv_file=configs["labels_file"],
                      time_steps_per_example=configs["time_steps_per_example"])
    dataset = DataSet(image_lbl_pair)
    return dataset

In [6]:
if __name__ == "__main__":
    cl_data = make_datasets()

In [7]:
cl_data.tr.next_batch(batch_size=10)

(array([[[[ -9.93355779e-01,  -9.93355779e-01,  -9.93355779e-01, ...,
            -9.93355779e-01,  -9.93355779e-01,  -9.93355779e-01],
          [ -9.93154031e-01,  -9.93154344e-01,  -9.93154640e-01, ...,
            -9.93153068e-01,  -9.93153381e-01,  -9.93153677e-01],
          [ -9.92848043e-01,  -9.92849943e-01,  -9.92851937e-01, ...,
            -9.92843042e-01,  -9.92844736e-01,  -9.92846388e-01],
          ..., 
          [ -7.09872437e-01,  -7.10026638e-01,  -7.10105322e-01, ...,
            -7.09797615e-01,  -7.09799277e-01,  -7.09787667e-01],
          [ -7.08947631e-01,  -7.08963440e-01,  -7.08990926e-01, ...,
            -7.08881837e-01,  -7.08900587e-01,  -7.08933305e-01],
          [ -7.09227247e-01,  -7.09227247e-01,  -7.09227247e-01, ...,
            -7.09227247e-01,  -7.09227247e-01,  -7.09227247e-01]],
 
         [[ -7.82165890e-02,  -7.87176601e-02,  -7.92187586e-02, ...,
            -7.67136890e-02,  -7.72145885e-02,  -7.77155602e-02],
          [ -9.71674776e-02, 