# Eland Demo Notebook

In [1]:
import eland as ed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NOTE(review): `Elasticsearch` is not referenced by any cell visible in this notebook — confirm it is still needed.
from elasticsearch import Elasticsearch

# Import standard test settings for consistent results
# NOTE(review): wildcard import — it is not visible from here which names this brings
# into the notebook namespace, and it could silently rebind `ed`, `pd` or `np` above.
# If only the module's import-time settings are wanted, `import eland.conftest` would
# be more explicit — confirm against eland.conftest before changing.
from eland.conftest import *

## Compare eland DataFrame vs pandas DataFrame

Create an eland.DataFrame from a `flights` index

In [2]:
# Where to read from: first argument to `read_es` and the index the demo uses.
ES_HOST = 'localhost'
FLIGHTS_INDEX = 'flights'

# eland DataFrame over the `flights` index; every `ed_flights` cell below uses it.
ed_flights = ed.read_es(ES_HOST, FLIGHTS_INDEX)

In [3]:
type(ed_flights)

eland.dataframe.DataFrame

Compare to pandas DataFrame (created from the same data)

In [4]:
# Convert the eland frame to a pandas DataFrame so the two can be compared side by side;
# every `pd_flights` cell below depends on this assignment.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt and all later cells
# are unexecuted (In [None]) — re-run the notebook top to bottom before committing.
pd_flights = ed.eland_to_pandas(ed_flights)

KeyboardInterrupt: 

In [None]:
type(pd_flights)

## Attributes and underlying data

### DataFrame.columns

In [None]:
pd_flights.columns

In [None]:
ed_flights.columns

### DataFrame.dtypes

In [None]:
pd_flights.dtypes

In [None]:
ed_flights.dtypes

### DataFrame.select_dtypes

In [None]:
pd_flights.select_dtypes(include=np.number)

In [None]:
ed_flights.select_dtypes(include=np.number)

### DataFrame.empty

In [None]:
pd_flights.empty

In [None]:
ed_flights.empty

### DataFrame.shape

In [None]:
pd_flights.shape

In [None]:
ed_flights.shape

### DataFrame.index

Note: `eland.DataFrame.index` does not mirror `pandas.DataFrame.index`.

In [None]:
pd_flights.index

In [None]:
# NBVAL_IGNORE_OUTPUT
ed_flights.index

In [None]:
ed_flights.index.index_field

### DataFrame.values

Note: `eland.DataFrame.values` is not supported.

In [None]:
pd_flights.values

In [None]:
# `.values` is not supported on the eland frame; print the error message
# instead of letting the traceback stop the notebook.
try:
    ed_flights.values
except AttributeError as err:
    print(err)

## Indexing, iteration

### DataFrame.head

In [None]:
pd_flights.head()

In [None]:
ed_flights.head()

### DataFrame.tail

In [None]:
pd_flights.tail()

In [None]:
ed_flights.tail()

### DataFrame.keys

In [None]:
pd_flights.keys()

In [None]:
ed_flights.keys()

### DataFrame.get

In [None]:
pd_flights.get('Carrier')

In [None]:
ed_flights.get('Carrier')

In [None]:
pd_flights.get(['Carrier', 'Origin'])

List input is not currently supported by `eland.DataFrame.get`.

In [None]:
# Passing a list to eland's `get` is expected to raise TypeError;
# print its message rather than a full traceback.
try:
    ed_flights.get(['Carrier', 'Origin'])
except TypeError as err:
    print(err)

### DataFrame.query

In [None]:
pd_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled == True')

`eland.DataFrame.query` requires an explicit comparison on boolean columns (e.g. `Cancelled == True`); a bare column name is not accepted, i.e.

`ed_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled')` fails

In [None]:
ed_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled == True')

#### Boolean indexing query

In [None]:
# Build one boolean mask per condition, then combine them with `&`
# to select the rows that satisfy all three.
pd_carrier_mask = pd_flights.Carrier == "Kibana Airlines"
pd_price_mask = pd_flights.AvgTicketPrice > 900.0
pd_cancelled_mask = pd_flights.Cancelled == True
pd_flights[pd_carrier_mask & pd_price_mask & pd_cancelled_mask]

In [None]:
# Build one condition per column comparison, then combine them with `&`
# to select the rows that satisfy all three.
ed_carrier_filter = ed_flights.Carrier == "Kibana Airlines"
ed_price_filter = ed_flights.AvgTicketPrice > 900.0
ed_cancelled_filter = ed_flights.Cancelled == True
ed_flights[ed_carrier_filter & ed_price_filter & ed_cancelled_filter]

## Function application, GroupBy & window

### DataFrame.aggregate

In [None]:
pd_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])

`eland.DataFrame.aggregate` currently only supports numeric columns.

In [None]:
ed_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])

## Computations / descriptive stats

### DataFrame.count

In [None]:
pd_flights.count()

In [None]:
ed_flights.count()

### DataFrame.describe

In [None]:
pd_flights.describe()

Values returned from `eland.DataFrame.describe` may differ slightly from the pandas result because they are computed by Elasticsearch aggregations, some of which (e.g. percentiles) are approximate.

In [None]:
# NBVAL_IGNORE_OUTPUT
ed_flights.describe()

### DataFrame.info

In [None]:
pd_flights.info()

In [None]:
ed_flights.info()

### DataFrame.max, DataFrame.min, DataFrame.mean, DataFrame.sum

#### max

In [None]:
pd_flights.max(numeric_only=True)

`eland.DataFrame.max`, `min`, `mean` and `sum` only aggregate numeric columns.

In [None]:
ed_flights.max(numeric_only=True)

#### min

In [None]:
pd_flights.min(numeric_only=True)

In [None]:
ed_flights.min(numeric_only=True)

#### mean

In [None]:
pd_flights.mean(numeric_only=True)

In [None]:
ed_flights.mean(numeric_only=True)

#### sum

In [None]:
pd_flights.sum(numeric_only=True)

In [None]:
ed_flights.sum(numeric_only=True)

### DataFrame.nunique

In [None]:
pd_flights[['Carrier', 'Origin', 'Dest']].nunique()

In [None]:
ed_flights[['Carrier', 'Origin', 'Dest']].nunique()

### DataFrame.drop

In [None]:
# Columns to leave out of the displayed pandas frame:
# ticket price, cancellation flag and the destination fields.
columns_to_drop = [
    'AvgTicketPrice',
    'Cancelled',
    'DestLocation',
    'Dest',
    'DestAirportID',
    'DestCityName',
    'DestCountry',
]
pd_flights.drop(columns=columns_to_drop)

In [None]:
# Columns to leave out of the displayed eland frame:
# ticket price, cancellation flag and the destination fields.
columns_to_drop = [
    'AvgTicketPrice',
    'Cancelled',
    'DestLocation',
    'Dest',
    'DestAirportID',
    'DestCityName',
    'DestCountry',
]
ed_flights.drop(columns=columns_to_drop)

### Plotting

In [None]:
# Histogram every numeric column of the pandas frame (figsize 10x10).
pd_numeric = pd_flights.select_dtypes(include=np.number)
pd_numeric.hist(figsize=[10,10])
plt.show()

In [None]:
# Histogram every numeric column of the eland frame (figsize 10x10).
ed_numeric = ed_flights.select_dtypes(include=np.number)
ed_numeric.hist(figsize=[10,10])
plt.show()

### Elasticsearch utilities

In [None]:
# Rows whose origin airport is 'AMS' and whose FlightDelayMin exceeds 60 ...
ams_filter = ed_flights.OriginAirportID == 'AMS'
delay_filter = ed_flights.FlightDelayMin > 60
ams_delayed = ed_flights[ams_filter & delay_filter]

# ... restricted to the four columns of interest ...
ams_delayed_columns = ams_delayed[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]

# ... and only the tail of that selection. The following cell reads `ed_flights2`.
ed_flights2 = ams_delayed_columns.tail()

In [None]:
# Fetch the `info_es()` report for the filtered frame and print it.
es_details = ed_flights2.info_es()
print(es_details)