Calculate the default start date using all the data sources provided #69

Merged
Merged 1 commit on Jul 3, 2018
31 changes: 28 additions & 3 deletions bin/manuscripts
@@ -38,6 +38,8 @@ from manuscripts.report import Report
from manuscripts.config import Config
from manuscripts._version import __version__

+from manuscripts.esquery import get_first_date_of_index

def get_params():
    """Parse command line arguments"""

@@ -53,8 +55,8 @@ def get_params():
                        Use *field for the inverse filter.')
    parser.add_argument('-g', '--debug', dest='debug', action='store_true')
    parser.add_argument('-i', '--interval', default='month', help="Analysis interval (month (default), quarter, year)")
-    parser.add_argument('-s', '--start-date', default='2015-01-01',
-                        help="Start date for the report (UTC) (>=) (default: 2015-01-01)")
+    parser.add_argument('-s', '--start-date', default=None,
+                        help="Start date for the report (UTC) (>=) (default: None)")
    parser.add_argument('--offset', help="Offset to be used in date histogram aggregations (e.g.: +31d)")
    parser.add_argument('-u', '--elastic-url', help="Elastic URL with the enriched indexes")
    parser.add_argument('--data-sources', nargs='*',
@@ -78,6 +80,19 @@ def get_offset_days(offset):
    days = int(offset[1:-1])
    return days

+def get_min_date(url, indices, data_sources):
+    """Get the min date from all the data sources/indices available"""
+    if indices:
+        min_date = min([get_first_date_of_index(url, index) for index in indices])
+    else:
+        if "github" in data_sources:
+            data_sources.remove("github")
+            data_sources.append("github_issues")
+            data_sources.append("github_prs")
+        min_date = min([get_first_date_of_index(url, Report.ds2index[Report.ds2class[ds]]) \
+                        for ds in data_sources])
+    return min_date


if __name__ == '__main__':

@@ -95,6 +110,11 @@ if __name__ == '__main__':
        logging.error('Missing needed params for Report: elastic_url and data_sources')
        sys.exit(1)

+    # Check if each data source has a corresponding index, if available
+    if len(args.data_sources) < len(args.indices):
+        logging.error('Number of data sources does not match the number of indices provided')
+        sys.exit(1)

    elastic = args.elastic_url
    report_name = args.name
    data_dir = args.data_dir
@@ -111,7 +131,12 @@ if __name__ == '__main__':
    end_date = parser.parse(args.end_date).replace(tzinfo=timezone.utc)
    # The end date is not included, the report must finish the day before
    end_date += timedelta(microseconds=-1)
-    start_date = parser.parse(args.start_date).replace(tzinfo=timezone.utc)

+    start_date = args.start_date
+    # if start date is not present, it is calculated by querying all the indices given
+    if not start_date:
+        start_date = get_min_date(elastic, args.indices, args.data_sources)
+    start_date = parser.parse(start_date).replace(tzinfo=timezone.utc)

    offset = args.offset if args.offset else None

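As a quick illustration of the change in bin/manuscripts, here is a minimal sketch of what happens when --start-date is omitted. It assumes it runs inside the script where get_min_date is defined; the Elasticsearch URL and index names are made-up examples, not values from the PR.

from datetime import timezone

from dateutil import parser

# Hypothetical values, for illustration only.
elastic = "http://localhost:9200"
indices = ["git_enriched", "gerrit_enriched"]

# get_min_date() fetches the earliest grimoire_creation_date of each index
# (via get_first_date_of_index) and keeps the smallest "YYYY-MM-DD" string;
# the lexicographic minimum of that format is also the chronological minimum.
start_date = get_min_date(elastic, indices, [])

# As in the diff above, the string is then parsed into a UTC datetime.
start_date = parser.parse(start_date).replace(tzinfo=timezone.utc)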
13 changes: 13 additions & 0 deletions manuscripts/esquery.py
@@ -25,6 +25,7 @@

from datetime import timezone

+from elasticsearch import Elasticsearch
from elasticsearch_dsl import A, Search, Q
# elasticsearch_dsl is referred to as es_dsl in the comments, henceforth

@@ -383,3 +384,15 @@ def get_agg(cls, field=None, date_field=None, start=None, end=None,
        s.aggs.bucket(agg_id, query_agg)

        return s.to_dict()


+def get_first_date_of_index(elastic_url, index):
+    """Get the first/min date present in the index"""
+    es = Elasticsearch(elastic_url)
+    search = Search(using=es, index=index)
+    agg = A("min", field="grimoire_creation_date")
+    search.aggs.bucket("1", agg)
+    search = search.extra(size=0)
+    response = search.execute()
+    start_date = response.to_dict()['aggregations']['1']['value_as_string'][:10]
+    return start_date
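For reference, a rough usage sketch of the new esquery helper. The URL and index name are hypothetical, and the comments describe the request that the es_dsl calls above build under those assumptions.

from manuscripts.esquery import get_first_date_of_index

# Hypothetical URL and index name, for illustration only.
first_date = get_first_date_of_index("http://localhost:9200", "git_enriched")

# The Search built above serializes to roughly:
#   {"size": 0, "aggs": {"1": {"min": {"field": "grimoire_creation_date"}}}}
# so only the aggregation result comes back; the [:10] slice keeps the
# "YYYY-MM-DD" prefix of value_as_string (assuming the usual ISO-style date
# format of the enriched indexes), e.g. "2013-05-21".
print(first_date)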
4 changes: 0 additions & 4 deletions manuscripts/report.py
@@ -152,10 +152,6 @@ def __init__(self, es_url, start, end, data_dir=None, filters=None,
        elif self.interval == 'year':
            self.end_prev_month = end - relativedelta.relativedelta(months=12)

-        # Check if each data source has a corresponding index, if available
-        if len(data_sources) < len(indices):
-            logger.error('Insufficient data sources provided')
-            sys.exit(1)
        # Create a dict of indices which, for each data_source, will give the
        # name of the elasticsearch index that has to be used.
        self.index_dict = defaultdict(lambda: None)