<a href="https://colab.research.google.com/github/dayfine/xlab/blob/di%2Fcolab/data_coverage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Xlab Data Coverage

This notebook is used to check the coverage and stats of xlab data, e.g. what type of data exists for a particular security, over what date / time range, etc.

TODO: Optionally support correcting faulty data from this notebook.

## Setup

In [None]:
import collections
import pandas as pd

from google.protobuf import json_format

from xlab.data import store
from xlab.data.converters import mongo as mongo_converter
from xlab.data.proto import data_type_pb2
from xlab.data.store import mongo
from xlab.trading.dates import trading_days

# Short alias for data_type_pb2.DataType; its nested Enum is used further down
# to turn the 'dataType' value of each entry into a readable name.
_DataType = data_type_pb2.DataType

In [None]:
#@title Inputs
# Symbol of the security to inspect; it is passed to the data store lookup
# and echoed in the header of the final report.
symbol = "SPY" #@param {type:"string"}


## Fetch Data for a Security

In [None]:
# Create the data store with its default constructor arguments.
_store = mongo.MongoDataStore()

# Look up everything stored under the chosen symbol.
# NOTE(review): data_space=1 is a bare constant here -- confirm which data
# space it denotes.
lookup_key = store.DataStore.LookupKey(data_space=1, symbol=symbol)
all_data = _store.lookup(lookup_key)

## Generate Stats

In [None]:
# Run every fetched entry through the Mongo converter so the stats code below
# can index fields by key (e.g. 'dataType', 'timestamp').
data_as_dicts = list(map(mongo_converter.to_mongo_doc, all_data.entries))

In [None]:
def groupby(iterable, key_fn):
    """Bucket the items of `iterable` by the value of `key_fn(item)`.

    Unlike itertools.groupby, the input does not need to be pre-sorted: all
    items sharing a key end up in the same bucket regardless of position.

    Args:
        iterable: The items to group.
        key_fn: Callable returning a hashable grouping key for an item.

    Returns:
        A collections.OrderedDict mapping each key to the list of its items.
        Keys appear in first-seen order and items keep their input order.
    """
    buckets = collections.OrderedDict()
    for element in iterable:
        bucket_key = key_fn(element)
        if bucket_key not in buckets:
            buckets[bucket_key] = []
        buckets[bucket_key].append(element)
    return buckets

# Column headers of the coverage summary; every row appended to table_data
# lists its values in this same order.
table_columns = ['dataType', 'startDate', 'endDate', '# Duplicates']
table_data = []

entries_by_type = groupby(data_as_dicts, lambda doc: doc['dataType'])
for data_type, entries in entries_by_type.items():
    # Walk each data type's entries in timestamp order.
    entries.sort(key=lambda doc: doc['timestamp'])
    first_timestamp = entries[0]['timestamp']
    last_timestamp = entries[-1]['timestamp']

    seen_timestamps = set()
    duplicates = []
    expected_trading_day = first_timestamp

    for entry in entries:
        t = entry['timestamp']
        if t in seen_timestamps:
            # Repeated timestamp: record it without advancing the expectation.
            duplicates.append(entry)
        elif t == expected_trading_day:
            seen_timestamps.add(t)
            # Presumably the next trading day after the current one -- see
            # trading_days.get_next_n for the exact semantics.
            expected_trading_day = trading_days.get_next_n(
                expected_trading_day, 1)[0]
        else:
            # The sequence does not match the expected day; the whole report
            # is aborted rather than continuing past the mismatch.
            raise ValueError(
                f'Time error: expected({expected_trading_day}) got({t})')

    table_data.append([
        _DataType.Enum.Name(data_type),
        first_timestamp,
        last_timestamp,
        len(duplicates),
    ])

## Display Data Stats for the Security

In [None]:
# Header line, followed by a blank line, naming the inspected security.
print(f'Data stats for {symbol}\n')

# One row per data type; the bare `df` expression below makes the notebook
# render the frame as the cell's output.
df = pd.DataFrame(table_data, columns=table_columns)
df