From eff13415d5a45ab3b05637b81e2677214450333b Mon Sep 17 00:00:00 2001 From: aswanipranjal Date: Fri, 13 Jul 2018 16:38:28 +0530 Subject: [PATCH] Add initial infrastructure to generate the reports - manuscripts2/metrics folder containing files for different metrics - bin/manuscripts2 to generate report from new_functions - manuscripts2/report.py to create the report using Report class --- bin/manuscripts2 | 32 +++ manuscripts2/metrics/__init__.py | 0 manuscripts2/metrics/git.py | 69 ++++++ manuscripts2/metrics/github_issues.py | 71 ++++++ manuscripts2/metrics/github_prs.py | 71 ++++++ manuscripts2/report.py | 150 ++++++++++++ manuscripts2/test_new_functions.py | 338 -------------------------- 7 files changed, 393 insertions(+), 338 deletions(-) create mode 100644 bin/manuscripts2 create mode 100644 manuscripts2/metrics/__init__.py create mode 100644 manuscripts2/metrics/git.py create mode 100644 manuscripts2/metrics/github_issues.py create mode 100644 manuscripts2/metrics/github_prs.py create mode 100644 manuscripts2/report.py delete mode 100644 manuscripts2/test_new_functions.py diff --git a/bin/manuscripts2 b/bin/manuscripts2 new file mode 100644 index 0000000..c092097 --- /dev/null +++ b/bin/manuscripts2 @@ -0,0 +1,32 @@ +#! /usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Script for producing Reports from data in ElasticSearch +# +# Copyright (C) 2018 CHAOSS +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
import sys
sys.path.insert(0, '.')

from manuscripts2.report import Report


def main():
    """Generate the activity-metrics CSV for the default test data sources.

    Builds a Report that writes into the "PERCEVAL_TESTS" directory for the
    git, github_issues and github_prs data sources, then asks it to produce
    the activity metrics file.
    """

    test_report = Report(data_dir="PERCEVAL_TESTS",
                         data_sources=['git', 'github_issues', 'github_prs'])
    test_report.get_activity_metrics()


# Only do the work when executed as a script, so that merely loading this
# file never triggers report generation as a side effect.
if __name__ == "__main__":
    main()
class GitMetrics():
    """Metric definitions contributed by the git data source."""

    def __init__(self, index):
        """Create the commits query on top of the given index.

        :param index: index object handed straight to Query
        """

        self.name = "git"
        self.commits = Query(index)

    def get_section_metrics(self):
        """Return the metrics of this data source grouped by report section.

        :returns: dict keyed by section name; every section maps metric
                  group names to lists of metrics (or to title strings)
        """

        # Unique commit hashes bucketed by period: the only metric that the
        # git data source defines for now.
        commits_by_period = self.commits.get_cardinality("hash").by_period()

        overview = {
            "activity_metrics": [commits_by_period],
            "author_metrics": [],
            "bmi_metrics": [],
            "time_to_close_metrics": [],
            "projects_metrics": []
        }

        com_channels = {
            "activity_metrics": [],
            "author_metrics": []
        }

        project_activity = {
            # TODO: Authors is not activity but we need two metrics here
            "metrics": []
        }

        project_community = {
            "author_metrics": [],
            "people_top_metrics": [],
            "orgs_top_metrics": [],
        }

        project_process = {
            "bmi_metrics": [],
            "time_to_close_metrics": [],
            "time_to_close_title": "",
            "time_to_close_review_metrics": [],
            "time_to_close_review_title": "",
            "patchsets_metrics": []
        }

        return {
            "overview": overview,
            "com_channels": com_channels,
            "project_activity": project_activity,
            "project_community": project_community,
            "project_process": project_process
        }
class IssuesMetrics():
    """Metric definitions contributed by the GitHub issues data source."""

    def __init__(self, index):
        """Create the opened and closed issue queries on the given index.

        :param index: index object handed straight to Issues
        """

        self.name = "github_issues"
        self.opened_issues = Issues(index)

        # A second, independent query object; is_closed() is applied to it
        # only (presumably narrowing it to closed issues - confirm in Issues).
        closed = Issues(index)
        self.closed_issues = closed
        closed.is_closed()

    def get_section_metrics(self):
        """Return the metrics of this data source grouped by report section.

        :returns: dict keyed by section name; every section maps metric
                  group names to lists of metrics (or to title strings)
        """

        # Unique issue ids per period, opened first and closed second
        opened_by_period = self.opened_issues.get_cardinality("id").by_period()
        closed_by_period = self.closed_issues.get_cardinality("id").by_period()

        sections = {}

        sections["overview"] = {
            "activity_metrics": [opened_by_period, closed_by_period],
            "author_metrics": [],
            "bmi_metrics": [],
            "time_to_close_metrics": [],
            "projects_metrics": []
        }

        sections["com_channels"] = {
            "activity_metrics": [],
            "author_metrics": []
        }

        sections["project_activity"] = {
            "metrics": []
        }

        sections["project_community"] = {
            "author_metrics": [],
            "people_top_metrics": [],
            "orgs_top_metrics": [],
        }

        sections["project_process"] = {
            "bmi_metrics": [],
            "time_to_close_metrics": [],
            "time_to_close_title": "Days to close (median and average)",
            "time_to_close_review_metrics": [],
            "time_to_close_review_title": "",
            "patchsets_metrics": []
        }

        return sections
class PullRequestsMetrics():
    """Metric definitions contributed by the GitHub pull requests data source."""

    def __init__(self, index):
        """Create the opened and closed pull request queries on the given index.

        :param index: index object handed straight to PullRequests
        """

        self.name = "github_prs"

        opened, closed = PullRequests(index), PullRequests(index)
        # is_closed() is applied to the second query only (presumably
        # narrowing it to closed pull requests - confirm in PullRequests).
        closed.is_closed()

        self.opened_prs = opened
        self.closed_prs = closed

    def get_section_metrics(self):
        """Return the metrics of this data source grouped by report section.

        :returns: dict keyed by section name; every section maps metric
                  group names to lists of metrics (or to title strings)
        """

        # Unique pull request ids per period, opened first and closed second
        activity = [
            query.get_cardinality("id").by_period()
            for query in (self.opened_prs, self.closed_prs)
        ]

        return dict([
            ("overview", {
                "activity_metrics": activity,
                "author_metrics": [],
                "bmi_metrics": [],
                "time_to_close_metrics": [],
                "projects_metrics": []
            }),
            ("com_channels", {
                "activity_metrics": [],
                "author_metrics": []
            }),
            ("project_activity", {
                "metrics": []
            }),
            ("project_community", {
                "author_metrics": [],
                "people_top_metrics": [],
                "orgs_top_metrics": [],
            }),
            ("project_process", {
                "bmi_metrics": [],
                "time_to_close_metrics": [],
                "time_to_close_title": "Days to close (median and average)",
                "time_to_close_review_metrics": [],
                "time_to_close_review_title": "Days to close review (median and average)",
                "patchsets_metrics": []
            }),
        ])
def create_csv(filename, csv_data, mode="w"):
    r"""Write CSV text to a file, escaping underscores on the way.

    Every "_" in the data is written as "\_". str.replace returns a new
    string, so its result has to be kept: calling it and dropping the
    return value leaves the data unescaped.

    :param filename: path of the file to write
    :param csv_data: complete CSV content as a single string
    :param mode: mode passed to open(); "w" overwrites, "a" appends
    """

    escaped = csv_data.replace("_", r"\_")
    with open(filename, mode) as f:
        f.write(escaped)


class Report():
    """Collect the metrics of several data sources and dump them to CSV files.

    On creation the report connects to Elasticsearch, resolves the index to
    use for every data source and merges the per-section metric definitions
    of all data sources into a single configuration dict (self.config).
    """

    # Elasticsearch URL used when the caller does not provide one
    DEFAULT_ES_URL = "http://localhost:9200"

    # Elasticsearch index names in which metrics data is stored
    GIT_INDEX = 'git'
    GITHUB_ISSUES_INDEX = 'github_issues'
    GITHUB_PRS_INDEX = 'github_prs'

    # Helper dict to map a data source class with its Elasticsearch index
    class2index = {
        git.GitMetrics: GIT_INDEX,
        github_issues.IssuesMetrics: GITHUB_ISSUES_INDEX,
        github_prs.PullRequestsMetrics: GITHUB_PRS_INDEX,
    }

    # Helper dict to map a data source name with its python class
    ds2class = {
        "git": git.GitMetrics,
        "github_issues": github_issues.IssuesMetrics,
        "github_prs": github_prs.PullRequestsMetrics,
    }

    def __init__(self, es_url=None, start=None, end=None, data_dir=None, filters=None,
                 interval="month", offset=None, data_sources=None,
                 report_name=None, projects=False, indices=None, logo=None):
        """Set up the Elasticsearch client, the indices and the configuration.

        :param es_url: Elasticsearch URL; falls back to DEFAULT_ES_URL when
                       not given
        :param data_dir: directory below which the "data" folder holding the
                         CSV files is created
        :param interval: stored on Query.interval_ for all the queries
        :param data_sources: names of the data sources to report on (keys of
                             ds2class); the three known ones when empty
        :param indices: custom index names, paired by position with
                        data_sources; data sources without a custom index
                        use the default one from class2index
        :raises ValueError: if more indices than data sources are given

        The remaining parameters (start, end, filters, offset, report_name,
        projects, logo) are accepted but not used yet.
        """

        Query.interval_ = interval

        # Honour the URL given by the caller instead of always using localhost
        self.es = es_url or self.DEFAULT_ES_URL
        self.es_client = Elasticsearch(self.es)
        # Set the client for all metrics
        Index.es = self.es_client

        self.data_dir = data_dir
        self.index_dict = defaultdict(lambda: None)

        # None instead of a shared mutable list as default value
        custom_indices = indices or []
        named_sources = data_sources or []
        if len(custom_indices) > len(named_sources):
            raise ValueError("got {} indices for {} data sources".format(
                len(custom_indices), len(named_sources)))
        for data_source, index_name in zip(named_sources, custom_indices):
            self.index_dict[data_source] = Index(index_name=index_name)

        self.config = self.__get_config(data_sources=data_sources)

    def get_metric_index(self, data_source):
        """Return the index object to query for the given data source.

        :param data_source: data source name (a key of ds2class)
        :returns: the custom Index registered for the data source, or a new
                  Index on its default index name
        """

        # Membership test on purpose: subscripting the defaultdict would
        # insert a None entry for every unknown data source.
        if data_source in self.index_dict:
            return self.index_dict[data_source]

        return Index(index_name=self.class2index[self.ds2class[data_source]])

    def __get_config(self, data_sources=None):
        """Merge the section metrics of all the data sources in one dict.

        :param data_sources: data source names; the three known ones are
                             used when empty
        :returns: dict of sections; each one maps metric group names to the
                  concatenation of the values given by every data source
        """

        if not data_sources:
            # For testing
            data_sources = ["git", "github_issues", "github_prs"]

        # In new_config a dict with all the metrics for all data sources is created
        new_config = {}
        for position, ds in enumerate(data_sources, start=1):
            metric_class = self.ds2class[ds]
            metric_index = self.get_metric_index(ds)
            ds_config = metric_class(metric_index).get_section_metrics()

            for section, section_metrics in ds_config.items():
                if section not in new_config:
                    # Just create the section with the data for the ds
                    new_config[section] = dict(section_metrics)
                    continue

                merged = new_config[section]
                for metric_section, value in section_metrics.items():
                    if value is None:
                        continue
                    if merged.get(metric_section) is None:
                        merged[metric_section] = value
                    else:
                        # Build a new object instead of extending in place:
                        # the list of an earlier data source is also stored
                        # under its dsN_metrics key and must not grow.
                        merged[metric_section] = merged[metric_section] + value

            activity_metrics = ds_config['project_activity']['metrics']
            new_config['project_activity']['ds' + str(position) + "_metrics"] = activity_metrics

        # Fields that are not linked to a data source
        new_config['overview']['activity_file_csv'] = "data_source_evolution.csv"
        new_config['overview']['efficiency_file_csv'] = "efficiency.csv"
        new_config['project_process']['time_to_close_title'] = "Days to close (median and average)"
        new_config['project_process']['time_to_close_review_title'] = "Days to close review (median and average)"

        return new_config

    def get_activity_metrics(self):
        """Write the trend of every overview activity metric to a CSV file.

        The file is named after config['overview']['activity_file_csv'] and
        stored in the "data" folder inside data_dir, which is created when
        missing. data_dir has to be set for this to work.
        """

        metrics = self.config['overview']['activity_metrics']

        data_path = os.path.join(self.data_dir, "data")
        # exist_ok avoids the window between checking and creating the folder
        os.makedirs(data_path, exist_ok=True)
        file_name = os.path.join(data_path, self.config['overview']['activity_file_csv'])

        logger.debug("CSV file %s generation in progress", file_name)

        rows = ["metricsnames, netvalues, relativevalues, datasource\n"]
        for metric in metrics:
            (last, percentage) = get_trend(get_timeseries(metric))
            # NOTE(review): the metric name column is filled with the index
            # name too; looks like a placeholder - confirm.
            rows.append("{}, {}, {}, {}\n".format(metric.index.index_name, last,
                                                  percentage, metric.index.index_name))

        create_csv(file_name, "".join(rows))
-# -# Authors: -# Pranjal Aswani - -import sys -import unittest - -from datetime import datetime, timezone - -from elasticsearch_dsl import A -# Hack to make sure that tests import the right packages -# due to setuptools behaviour -sys.path.insert(0, '..') - -from manuscripts2.new_functions import Query, Index - - -class TestNewFunctions(unittest.TestCase): - """Base class to test new_functions.py""" - - maxDiff = None - - def setUp(self): - """Set up the necessary functions to run unittests""" - - self.github_data_source = "perceval_github" - - self.github_index = Index(index_name=self.github_data_source) - - self.Query_test_object = Query(self.github_index) - - self.field = "AGG_FIELD" # field to aggregate - self.date_field = "DATE_FIELD" # field for range - self.filters = [{"name1": "value1"}, {"name2": "value2"}] - self.offset = 2 - self.interval = "month" - self.timezone = "UTC" - self.start = datetime(2016, 1, 1) # from date - self.end = datetime(2018, 1, 1) # to date - self.size = 10000 - self.precision_threshold = 3000 - - def test_initialization(self): - """ - Test if we can create an Query object without parameters - """ - - with self.assertRaises(TypeError): - github_obj = Query() - - query = Query(self.github_index) - - def test_add_query(self): - """ - Test if we can add a normal query into the search variable of Query object - """ - - # Add the query - self.Query_test_object.add_query(self.filters[0]) - # check whether the query was inserted into the Search object or not - self.assertDictEqual(self.Query_test_object.search.query.to_dict()['match'], {'name1': 'value1'}) - - def test_add_inverse_query(self): - """ - Test if we can add a inverse query into the search variable of Query object - """ - - self.Query_test_object.add_inverse_query(self.filters[1]) - # check whether the query was inserted into the Search object or not - self.assertDictEqual(self.Query_test_object.search.query.to_dict()['bool']['must_not'][0], - {'match': {'name2': 'value2'}}) - 
- def test_get_sum(self): - """ - Test the sum aggregation - """ - - field = self.field - # without field param - with self.assertRaises(AttributeError): - self.Query_test_object.get_sum() - - # with field param - self.Query_test_object.get_sum(field) - test_agg = A("sum", field=field) - agg_name, agg = self.Query_test_object.aggregations.popitem() - self.assertEqual('sum_' + field, agg_name) - self.assertEqual(agg, test_agg) - - def test_get_average(self): - """ - Test the average aggregation - """ - - field = self.field - # without field param - with self.assertRaises(AttributeError): - self.Query_test_object.get_average() - - # with field param - self.Query_test_object.get_average(field) - test_agg = A("avg", field=field) - agg_name, agg = self.Query_test_object.aggregations.popitem() - self.assertEqual('avg_' + field, agg_name) - self.assertEqual(agg, test_agg) - - def test_get_percentiles(self): - """ - Test the percentiles aggregation - """ - - field = self.field - # without field param - with self.assertRaises(AttributeError): - self.Query_test_object.get_percentiles() - - # with field param - self.Query_test_object.get_percentiles(field) - test_agg = A("percentiles", field=field, percents=[1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]) - agg_name, agg = self.Query_test_object.aggregations.popitem() - self.assertEqual('percentiles_' + field, agg_name) - self.assertEqual(agg, test_agg) - - def test_get_terms(self): - """ - Test the terms aggregation - """ - - field = self.field - # without field param - with self.assertRaises(AttributeError): - self.Query_test_object.get_terms() - - # with field param - self.Query_test_object.get_terms(field) - test_agg = A("terms", field=field, size=self.size, order={"_count": "desc"}) - agg_name, agg = self.Query_test_object.aggregations.popitem() - self.assertEqual('terms_' + field, agg_name) - self.assertEqual(agg, test_agg) - - def test_get_min(self): - """ - Test the min aggregation - """ - - field = self.field - # without 
field param - with self.assertRaises(AttributeError): - self.Query_test_object.get_min() - - # with field param - self.Query_test_object.get_min(field) - test_agg = A("min", field=field) - agg_name, agg = self.Query_test_object.aggregations.popitem() - self.assertEqual('min_' + field, agg_name) - self.assertEqual(agg, test_agg) - - def test_get_max(self): - """ - Test the max aggregation - """ - - field = self.field - # without field param - with self.assertRaises(AttributeError): - self.Query_test_object.get_max() - - # with field param - self.Query_test_object.get_max(field) - test_agg = A("max", field=field) - agg_name, agg = self.Query_test_object.aggregations.popitem() - self.assertEqual('max_' + field, agg_name) - self.assertEqual(agg, test_agg) - - def test_get_cardinality(self): - """ - Test the cardniality(count) aggregation - """ - - field = self.field - # without field param - with self.assertRaises(AttributeError): - self.Query_test_object.get_cardinality() - - # with field param - self.Query_test_object.get_cardinality(field) - test_agg = A("cardinality", field=field, precision_threshold=self.precision_threshold) - agg_name, agg = self.Query_test_object.aggregations.popitem() - self.assertEqual('cardinality_' + field, agg_name) - self.assertEqual(agg, test_agg) - - def test_get_extended_stats(self): - """ - Test the extended statistics aggregation - """ - - field = self.field - # without field param - with self.assertRaises(AttributeError): - self.Query_test_object.get_extended_stats() - - # with field param - self.Query_test_object.get_extended_stats(field) - test_agg = A("extended_stats", field=field) - agg_name, agg = self.Query_test_object.aggregations.popitem() - self.assertEqual('extended_stats_' + field, agg_name) - self.assertEqual(agg, test_agg) - - def test_multiple_aggregations(self): - """ - Test when multiple aggrgations are being added - """ - field = self.field - - def test_since(self): - """ - Test the start date in range for a field - 
""" - - self.Query_test_object.since(start=self.start, field="closed_at") - self.assertEqual(self.Query_test_object.range['closed_at']['gte'], self.start.isoformat()) - - def test_until(self): - """ - Test the end date in range for a field - """ - - self.Query_test_object.until(end=self.end, field="created_at") - self.assertEqual(self.Query_test_object.range['created_at']['lte'], self.end.isoformat()) - - def test_since_and_until(self): - """ - Since the since and until functions can be for different fields, test them together - """ - - self.Query_test_object.since(start=self.start, field="closed_at") - self.Query_test_object.until(end=self.end, field="closed_at") - test_dict = {'gte': self.start.isoformat(), 'lte': self.end.isoformat()} - self.assertDictEqual(self.Query_test_object.range['closed_at'], test_dict) - - def test_by_authors(self): - """ - Test nested aggregation wrt authors - """ - - test_agg = A("terms", field="author_uuid", missing="others", size=self.size) - test_agg.metric(0, "cardinality", field="id_in_repo", precision_threshold=self.precision_threshold) - self.Query_test_object.get_cardinality("id_in_repo").by_authors("author_uuid") - agg_name, agg = self.Query_test_object.aggregations.popitem() - - self.assertEqual(agg, test_agg, msg='\n{0}\n{1}'.format(agg, test_agg)) - # 'msg' parameter gives us details between the dicts in case of a failure - - def test_by_organizations(self): - """ - Test nested aggregation wrt author organizations - """ - - test_agg = A("terms", field="author_org_name", missing="others", size=self.size) - test_agg.metric(0, "cardinality", field="id_in_repo", precision_threshold=self.precision_threshold) - self.Query_test_object.get_cardinality("id_in_repo").by_organizations("author_org_name") - agg_name, agg = self.Query_test_object.aggregations.popitem() - - self.assertEqual(agg, test_agg, msg='\n{0}\n{1}'.format(agg, test_agg)) - - def test_by_period_without_args(self): - """ - Test the date histogram aggregation with no 
parameters - """ - - test_agg = A("date_histogram", field="grimoire_creation_date", interval="month", time_zone="UTC", - min_doc_count=0, **{}) - test_agg.metric(0, "cardinality", field=self.field, precision_threshold=self.precision_threshold) - self.Query_test_object.get_cardinality(self.field).by_period() - agg_name, agg = self.Query_test_object.aggregations.popitem() - - self.assertEqual(agg, test_agg, msg='\n{0}\n{1}'.format(agg, test_agg)) - - def test_by_period_with_params(self): - """ - Test the date_histogram aggregation with all the parameters - """ - - start_date = self.start.replace(microsecond=0) - start_date = start_date.replace(tzinfo=timezone.utc).timestamp() - start_date = start_date * 1000 - end_date = self.end.replace(microsecond=0) - end_date = end_date.replace(tzinfo=timezone.utc).timestamp() - end_date = end_date * 1000 - bounds_dict = {"extended_bounds": {"min": start_date, "max": end_date}} - - test_agg = A("date_histogram", field="created_at", interval="week", time_zone="UTC", - min_doc_count=0, **bounds_dict) - test_agg.metric(0, "cardinality", field=self.field, precision_threshold=self.precision_threshold) - self.Query_test_object.since(self.start).until(self.end) - self.Query_test_object.get_cardinality(self.field).by_period(field="created_at", period="week") - agg_name, agg = self.Query_test_object.aggregations.popitem() - - self.assertEqual(agg, test_agg, msg='\n{0}\n{1}'.format(agg, test_agg)) - - def test_fetch_aggregation_results(self): - pass - - def test_fetch_results_from_source(self): - pass - - def test_get_ts(self): - pass - - def test_get_aggs(self): - pass - - def test_get_trend(self): - pass - - def test_calculate_bmi(self): - pass - - def test_buckets_to_df(self): - pass