# DataSource

A `DataSource` node defines how a data source is accessed and how its data is structured. The base class for data sources is `podpac.data.DataSource`.

The child class must implement the method `get_data`.
This method defines how to select data from the source given input PODPAC Coordinates.

The child class often implements the method `get_native_coordinates`.
This method defines how to select the native coordinates from the source and returns PODPAC Coordinates.

## Array Example

In [1]:
from podpac.data import DataSource
from podpac import Coordinates
import traitlets as tl     # traitlets is used to enforce types
import numpy as np

class MyDataType(DataSource):
    """Example DataSource whose data lives in an in-memory numpy array.

    Inputs
    ------
    source : np.ndarray
        The array holding the data. The type is enforced by traitlets.
    native_coordinates : Coordinates
        Provided by the DataSource base class, which enforces the Coordinates type.
    """

    # traitlets rejects any "source" that is not a numpy ndarray
    source = tl.Instance(np.ndarray)

    def get_data(self, coordinates, coordinates_index):
        """Select data from "source"; this runs when the node is evaluated.

        Parameters
        ----------
        coordinates
            Coordinates passed through to ``create_output_array``.
        coordinates_index
            Index of the coordinates, applied directly to ``self.source``.

        Returns
        -------
        The array built by ``self.create_output_array`` (a UnitsDataArray)
        from ``coordinates`` and the selected data.
        """
        # pick the requested entries out of the source array
        selected = self.source[coordinates_index]

        # wrap the selection together with its coordinates
        return self.create_output_array(coordinates, data=selected)

In [2]:
# instantiate the DataSource defined above
data = np.random.rand(5, 5)    # 5x5 grid of random values
lat = np.linspace(0, 5, 5)     # 5 evenly spaced values from 0 to 5
lon = np.linspace(0, 5, 5)     # 5 evenly spaced values from 0 to 5
native_coordinates = Coordinates([lat, lon], dims=['lat', 'lon'])

node = MyDataType(source=data, native_coordinates=native_coordinates)
node

MyDataType DataSource
	source: 
[[0.45338973 0.19546453 0.85469134 0.03867773 0.45204412]
 [0.11564198 0.26785353 0.99861128 0.11012025 0.04001062]
 [0.80555105 0.02089708 0.54990447 0.89955082 0.42252872]
 [0.62862538 0.82598183 0.26089483 0.64778125 0.43701609]
 [0.3864625  0.07191054 0.68502981 0.0211749  0.68967417]]
	native_coordinates: 
		lat: ArrayCoordinates1d(lat): Bounds[0.0, 5.0], N[5], ctype['midpoint']
		lon: ArrayCoordinates1d(lon): Bounds[0.0, 5.0], N[5], ctype['midpoint']
	interpolation: nearest

In [3]:
# evaluate the node on a 3x3 grid of requested coordinates (lat and lon each 0, 1, 2),
# which covers only part of the 0-5 extent of the node's native_coordinates
coords = Coordinates([[0, 1, 2], [0, 1, 2]], dims=['lat', 'lon'])
output = node.eval(coords)
output

## DataFrame Example

In [4]:
from podpac.data import DataSource
from podpac import Coordinates

import traitlets as tl     # traitlets is used to enforce types
import numpy as np
import pandas as pd

class OtherType(DataSource):
    """Example DataSource whose data lives in a pandas DataFrame.

    Inputs
    ------
    source : pd.DataFrame
        The table holding coordinates and data. The type is enforced by traitlets.
    lat_key : str
        Column of ``source`` containing the "lat" values.
    lon_key : str
        Column of ``source`` containing the "lon" values.
    data_key : str
        Column of ``source`` containing the data of interest.
    """

    source = tl.Instance(pd.DataFrame)
    lat_key = tl.Unicode()
    lon_key = tl.Unicode()
    data_key = tl.Unicode()

    def get_native_coordinates(self):
        """Return the native coordinates of the table as podpac Coordinates."""
        frame = self.source

        # Tabular data is point data: each row is one (lat, lon) pair, so the
        # coordinates are "stacked". podpac names stacked dims by joining them
        # with "_".
        stacked = (frame[self.lat_key], frame[self.lon_key])
        return Coordinates([stacked], dims=['lat_lon'])

    def get_data(self, coordinates, coordinates_index):
        """Select data from "source"; this runs when the node is evaluated.

        Parameters
        ----------
        coordinates
            Coordinates passed through to ``create_output_array``.
        coordinates_index
            Index of the coordinates, applied positionally (``.iloc``) to the
            ``data_key`` column.

        Returns
        -------
        The array built by ``self.create_output_array`` (a UnitsDataArray)
        from ``coordinates`` and the selected data.
        """
        # pick the requested rows out of the data column
        column = self.source[self.data_key]
        selected = column.iloc[coordinates_index]

        # wrap the selection together with its coordinates
        return self.create_output_array(coordinates, data=selected)

In [5]:
# instantiate the DataFrame-backed DataSource defined above
columns = {
    'values': np.random.rand(5),        # 5 random data values
    'lon_val': np.linspace(0, 5, 5),    # 5 evenly spaced values from 0 to 5
    'lat_val': np.linspace(0, 5, 5),    # 5 evenly spaced values from 0 to 5
}
data = pd.DataFrame(data=columns)

# tell the node which column holds which quantity
node = OtherType(source=data, lat_key='lat_val', lon_key='lon_val', data_key='values')
node

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


OtherType DataSource
	source: 
     values  lon_val  lat_val
0  0.697030     0.00     0.00
1  0.385011     1.25     1.25
2  0.561560     2.50     2.50
3  0.335049     3.75     3.75
4  0.359821     5.00     5.00
	interpolation: nearest

In [6]:
# evaluate the node at three stacked (lat, lon) points: (0, 0), (1, 1), (2, 2)
# (the single 'lat_lon' dim takes one tuple of lat values and lon values)
coords = Coordinates([([0, 1, 2], [0, 1, 2])], dims=['lat_lon'])
output = node.eval(coords)
output