
Commit

Closes #48
If a start date is not provided, then all the indices present will be
queried and the minimum "grimoire_creation_date" among those indices
will be set as the start date for that report.
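
For illustration, a minimal sketch of the fallback behaviour described above (the helper name and the sample dates are assumptions for this example, not part of this commit): when no start date is given, the earliest first date found across the queried indices becomes the report's start date.

from dateutil import parser as dateparser


def resolve_start_date(cli_start_date, first_dates):
    """first_dates: ISO date strings (YYYY-MM-DD), one per queried index."""
    if cli_start_date:
        return dateparser.parse(cli_start_date)
    # No --start-date given: fall back to the earliest date seen in any index.
    return dateparser.parse(min(first_dates))


# e.g. resolve_start_date(None, ["2016-03-01", "2015-07-20"])
# -> datetime.datetime(2015, 7, 20, 0, 0)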
aswanipranjal authored and jgbarah committed Jul 3, 2018
1 parent 2f9bb21 commit ffc3634
Showing 3 changed files with 41 additions and 7 deletions.
31 changes: 28 additions & 3 deletions bin/manuscripts
@@ -38,6 +38,8 @@ from manuscripts.report import Report
 from manuscripts.config import Config
 from manuscripts._version import __version__
 
+from manuscripts.esquery import get_first_date_of_index
+
 def get_params():
     """Parse command line arguments"""
 
@@ -53,8 +55,8 @@ def get_params():
                               Use *field for the inverse filter.')
     parser.add_argument('-g', '--debug', dest='debug', action='store_true')
     parser.add_argument('-i', '--interval', default='month', help="Analysis interval (month (default), quarter, year)")
-    parser.add_argument('-s', '--start-date', default='2015-01-01',
-                        help="Start date for the report (UTC) (>=) (default: 2015-01-01)")
+    parser.add_argument('-s', '--start-date', default=None,
+                        help="Start date for the report (UTC) (>=) (default: None)")
     parser.add_argument('--offset', help="Offset to be used in date histogram aggregations (e.g.: +31d)")
     parser.add_argument('-u', '--elastic-url', help="Elastic URL with the enriched indexes")
     parser.add_argument('--data-sources', nargs='*',
@@ -78,6 +80,19 @@ def get_offset_days(offset):
     days = int(offset[1:-1])
     return days
 
+def get_min_date(url, indices, data_sources):
+    """Get the min date from all the data sources/indices available"""
+    if indices:
+        min_date = min([get_first_date_of_index(url, index) for index in indices])
+    else:
+        if "github" in data_sources:
+            data_sources.remove("github")
+            data_sources.append("github_issues")
+            data_sources.append("github_prs")
+        min_date = min([get_first_date_of_index(url, Report.ds2index[Report.ds2class[ds]]) \
+                        for ds in data_sources])
+    return min_date
+
 
 if __name__ == '__main__':
 
@@ -95,6 +110,11 @@ if __name__ == '__main__':
         logging.error('Missing needed params for Report: elastic_url and data_sources')
         sys.exit(1)
 
+    # Check if each data source has a corresponding index, if available
+    if len(args.data_sources) < len(args.indices):
+        logging.error('Number of data sources do not match the corresponding number of indices provided')
+        sys.exit(1)
+
     elastic = args.elastic_url
     report_name = args.name
     data_dir = args.data_dir
@@ -111,7 +131,12 @@ if __name__ == '__main__':
     end_date = parser.parse(args.end_date).replace(tzinfo=timezone.utc)
     # The end date is not included, the report must finish the day before
     end_date += timedelta(microseconds=-1)
-    start_date = parser.parse(args.start_date).replace(tzinfo=timezone.utc)
+
+    start_date = args.start_date
+    # if start date is not present, it is calculated by querying all the indices given
+    if not start_date:
+        start_date = get_min_date(elastic, args.indices, args.data_sources)
+    start_date = parser.parse(start_date).replace(tzinfo=timezone.utc)
 
     offset = args.offset if args.offset else None
 
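For context, a hypothetical call to the new get_min_date helper (the URL and data source names are examples, not taken from this commit): when no explicit indices are passed, "github" is expanded into the two enriched GitHub data sources before the per-index first dates are compared.

# Hypothetical usage, assuming get_min_date from the script above is in scope
# and that the enriched indices exist on the cluster at this URL.
data_sources = ["git", "github"]
min_date = get_min_date("http://localhost:9200", indices=None,
                        data_sources=data_sources)
# data_sources is now ["git", "github_issues", "github_prs"] and min_date
# holds the earliest grimoire_creation_date found among their indices.
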
13 changes: 13 additions & 0 deletions manuscripts/esquery.py
@@ -25,6 +25,7 @@
 
 from datetime import timezone
 
+from elasticsearch import Elasticsearch
 from elasticsearch_dsl import A, Search, Q
 # elasticsearch_dsl is referred to as es_dsl in the comments, henceforth
 
@@ -383,3 +384,15 @@ def get_agg(cls, field=None, date_field=None, start=None, end=None,
         s.aggs.bucket(agg_id, query_agg)
 
         return s.to_dict()
+
+
+def get_first_date_of_index(elastic_url, index):
+    """Get the first/min date present in the index"""
+    es = Elasticsearch(elastic_url)
+    search = Search(using=es, index=index)
+    agg = A("min", field="grimoire_creation_date")
+    search.aggs.bucket("1", agg)
+    search = search.extra(size=0)
+    response = search.execute()
+    start_date = response.to_dict()['aggregations']['1']['value_as_string'][:10]
+    return start_date
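
For reference, a rough sketch of the request that get_first_date_of_index builds (my reading of the es_dsl calls above, not part of the diff) and of the response field it reads:

# Approximate body produced by search.to_dict(): no hits (size=0), just a
# min aggregation over the creation date field.
expected_body = {
    "size": 0,
    "aggs": {
        "1": {"min": {"field": "grimoire_creation_date"}}
    }
}
# For a date field the min aggregation returns both an epoch-millis "value"
# and a "value_as_string"; the helper keeps only the YYYY-MM-DD prefix of
# the latter, e.g. "2015-07-20".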
4 changes: 0 additions & 4 deletions manuscripts/report.py
@@ -152,10 +152,6 @@ def __init__(self, es_url, start, end, data_dir=None, filters=None,
         elif self.interval == 'year':
             self.end_prev_month = end - relativedelta.relativedelta(months=12)
 
-        # Check if each data source has a corresponding index, if available
-        if len(data_sources) < len(indices):
-            logger.error('Insufficient data sources provided')
-            sys.exit(1)
         # Create a dict of indices which, for each data_source, will give the
         # name of the elasticsearch index that has to be used.
         self.index_dict = defaultdict(lambda: None)
