<a href="https://colab.research.google.com/github/dayfine/xlab/blob/master/xlab/colab/notebooks/data_coverage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Xlab Data Coverage

This notebook checks the coverage and stats of xlab data, e.g. which types of data exist for a particular security, over what date / time range, etc.

TODO: Optionally, allow this notebook to be used to correct faulty data.

## Setup

In [None]:
import pandas as pd

from xlab.data import store
from xlab.data.store import mongo
from xlab.data.proto import data_type_pb2
from xlab.data.store.mongo import admin
from xlab.data.validation import duplicates
from xlab.net.proto import time_util
from xlab.util.itertools import groupby

In [None]:
#@title Inputs
# Symbol of the security whose stored data is looked up below. The trailing
# `#@param` annotation exposes it as a string field in the Colab form.
symbol = "X" #@param {type:"string"}


## Fetch Data for a Security

In [None]:
# Open the Mongo-backed data store and look up everything stored for `symbol`
# under data space 1.
_store = mongo.MongoDataStore()

lookup_key = store.DataStore.LookupKey(data_space=1, symbol=symbol)
all_data = _store.lookup(lookup_key)

## Generate Stats

In [None]:
# Column headers and rows of the per-data-type summary table.
table_columns = ['dataType', 'startDate', 'endDate', '# Duplicates']
table_data = []

# Duplicate information accumulated across every data type.
all_duplicate_ids = []
all_duplicates_with_different_values = {}

# Group the fetched entries by data type and summarize each group.
for data_type, data_entries in groupby.groupby(
        all_data.entries, lambda entry: entry.data_type).items():
    duplicate_ids, duplicates_with_different_values = (
        duplicates.find_all_duplicates(data_entries))
    all_duplicate_ids += duplicate_ids
    all_duplicates_with_different_values.update(
        duplicates_with_different_values)

    # The first and last entries of the group supply the reported range.
    # NOTE(review): presumably entries arrive in timestamp order -- confirm.
    type_name = data_type_pb2.DataType.Enum.Name(data_type)
    start_date = time_util.to_civil(data_entries[0].timestamp)
    end_date = time_util.to_civil(data_entries[-1].timestamp)
    table_data.append([type_name, start_date, end_date, len(duplicate_ids)])

## Display Data Stats for the Security

In [None]:
# Print a heading (followed by a blank line), then build the summary table.
print(f'Data stats for {symbol}\n')
df = pd.DataFrame(table_data, columns=table_columns)
# Bare expression so the notebook renders the table as the cell output.
df

In [None]:
# Warn when some duplicates disagree on their values; these need a manual
# decision rather than a blind delete.
if all_duplicates_with_different_values:
    print(
        f'Found {len(all_duplicates_with_different_values)} duplicates with different values',
        'Please double check if they should be deleted or kept in favor of the oldest value',
        sep='\n')

def _collect_ids(all_duplicates_with_different_values):
    return [data.id for data in data_entries 
            for id, data_entries in all_duplicates_with_different_values.items()]

if all_duplicate_ids:
    print(f'Found {len(all_duplicate_ids)} duplicates.')

# Collect the ids of duplicates with differing values once, instead of
# recomputing them for every candidate id.
# NOTE(review): kept as a list because the hashability of the ids is not
# visible here; switch to a set if the ids are known to be hashable.
ids_with_different_values = _collect_ids(all_duplicates_with_different_values)
simple_duplicate_ids = [entry_id for entry_id in all_duplicate_ids
                        if entry_id not in ids_with_different_values]

# Delete only the simple duplicates. Entries reported as having different
# values are left in place so they can be reviewed manually first.
delete_result = admin.delete_by_ids(simple_duplicate_ids)
print(f'{delete_result.deleted_count} items deleted.')