# Data loading with ExternalSource operator
In this notebook, we will see how to use the `ExternalSource` operator, which allows us to use an external data source as input to the Pipeline.

In [1]:
import collections
import numpy as np
from random import shuffle
from nvidia.dali.pipeline import Pipeline
from nvidia.dali import ops, types

# Number of samples per batch; passed to ExternalInputIterator when it is constructed.
batch_size = 16

## Defining the data source
We use an infinite iterator over the sample dog and cat images as the data source.

In [2]:
class ExternalInputIterator:
    def __init__(self, batch_size):
        self.images_dir = 'data/images/'
        self.batch_size = batch_size
        with open(self.images_dir + 'file_list.txt') as file:
            self.files = [line.rstrip() for line in file if line]
        shuffle(self.files)
    
    def __iter__(self):
        """ (this is not typical __iter__ ?) """
        self.i = 0
        self.n = len(self.files)
        return self
    
    def __next__(self):
        batch  = []
        labels = []
        for _ in range(self.batch_size):
            jpg_fname, label = self.files[self.i].split(' ')
            file = open(self.images_dir + jpg_fname, 'rb')
            batch.append(np.frombuffer(f.read(), dtype=np.uint8))
            labels.append(np.array([label], dtype=np.uint8))
            self.i = (self.i + 1) % self.n
        return batch, labels

## Defining the pipeline
The next step is to define the Pipeline.

The `ExternalSource` op accepts an iterable or a callable. If the source provides multiple outputs (e.g. images and labels), that number must also be specified as the `num_outputs` argument.

Internally, the pipeline will call `source` (if callable) or run `next(source)` (if iterable) whenever more data is needed to keep the pipeline running.

In [3]:
# Instantiate the data source defined above with the notebook-wide batch size.
external_input_iter = ExternalInputIterator(batch_size)

In [4]:
class ExternalSourcePipeline(Pipeline):
    """Pipeline fed by an external source, followed by decoding and a contrast change.

    Args:
        batch_size: Forwarded to the ``Pipeline`` base constructor.
        ext_inp_iter: Source object handed to ``ops.ExternalSource``; it is
            declared as producing two outputs per call.
        num_threads: Forwarded to the ``Pipeline`` base constructor.
        device_id: Forwarded to the ``Pipeline`` base constructor.
    """

    def __init__(self, batch_size, ext_inp_iter, num_threads, device_id):
        # The base pipeline is always created with a fixed seed of 12.
        super().__init__(batch_size, num_threads, device_id, seed=12)

        # Operator instances; they are wired together in define_graph().
        self.source = ops.ExternalSource(num_outputs=2, source=ext_inp_iter)
        self.decode = ops.ImageDecoder(output_type=types.RGB, device='mixed')
        self.enhance = ops.BrightnessContrast(contrast=2, device='gpu')

    def define_graph(self):
        """Wire the graph: source -> decode -> enhance.

        Returns:
            A tuple ``(enhanced_images, labels)`` where the labels are passed
            through unchanged from the external source.
        """
        encoded, labels = self.source()
        decoded = self.decode(encoded)
        return self.enhance(decoded), labels