# Sample Metrics

This notebook uses the new_functions that are being added to manuscripts and calculates some of the metrics to show how the module works.

In [1]:
import sys
# to be able to import the manuscripts2 module
sys.path.insert(0, '..')

# utility and support modules
import pandas as pd
from pprint import pprint
from datetime import datetime, timezone, timedelta

from manuscripts2.new_functions import Query, Index, calculate_bmi, buckets_to_df
from manuscripts2.new_functions import get_timeseries, get_trend, get_aggs
from manuscripts2.derived_classes import Issues, PullRequests

# declare the necessary variables
github_index_name = "perceval_github"
git_index_name = "perceval_git"

github_index = Index(index_name=github_index_name)
git_index = Index(index_name=git_index_name)

start_date = datetime(2015, 1, 1)
end_date = datetime.now()
end_date = end_date.replace(hour=0, minute=0, second=0, microsecond=0)

# Metrics currently supported by Manuscripts

In this section, we calculate the metrics which manuscripts supports currently.

#### Trends for Closed/Open issues and PRs

In [2]:
# We get the trend for closed prs by month
closed_pr = PullRequests(github_index).is_closed().get_cardinality("id_in_repo").by_period()
print("Trend for month: ", get_trend(get_timeseries(closed_pr)))

Trend for month:  (8, 37)


In [3]:
# And in a similar manner, we get the trend by quarter
closed_pr = PullRequests(github_index).is_closed().get_cardinality("id_in_repo").by_period(period="quarter")
print("Trend for quarter: ", get_trend(get_timeseries(closed_pr)))

Trend for quarter:  (24, -233)


In [4]:
# get trend by month
opened_prs = PullRequests(github_index).get_cardinality("id_in_repo").by_period()
print("Trend for month: ", get_trend(get_timeseries(opened_prs)))

# get trend by quarter:
opened_pr = PullRequests(github_index).get_cardinality("id_in_repo").by_period(period="quarter")
print("Trend for quarter: ", get_trend(get_timeseries(opened_pr)))

Trend for month:  (2, -300)
Trend for quarter:  (2, -1100)


In [5]:
# Trend of closed issues by month
closed_issues = Issues(github_index).is_closed().get_cardinality("id_in_repo").by_period(field="closed_at")
print("Trend for month: ", get_trend(get_timeseries(closed_issues)))

# Similarly, trend of closed issues by quarter
closed_issues = Issues(github_index).is_closed().get_cardinality("id_in_repo").by_period(period="quarter")
print("Trend for quarter: ", get_trend(get_timeseries(closed_issues)))

Trend for month:  (2, 100)
Trend for quarter:  (4, -250)


In [6]:
# Trend of Open issues by month
open_issues = Issues(github_index).is_open().get_cardinality("id_in_repo").by_period()
print("Trend for month: ", get_trend(get_timeseries(open_issues)))

# Similarly, trend of Open issues by quarter
open_issues = Issues(github_index).is_open().get_cardinality("id_in_repo").by_period(period="quarter")
print("Trend for quarter: ", get_trend(get_timeseries(open_issues)))

Trend for month:  (1, 0)
Trend for quarter:  (1, -200)


In [7]:
# Trend for the number of commits created per month
commits = Query(git_index).get_cardinality("hash").by_period()
print("Trend for month: ", get_trend(get_timeseries(commits)))

# Created per quarter
commits = Query(git_index).get_cardinality("hash").by_period(period="quarter")
print("Trend for quarter: ", get_trend(get_timeseries(commits)))

Trend for month:  (16, 6)
Trend for quarter:  (64, -335)


In [8]:
previous_month_date = end_date - timedelta(days=30)

# PRs closed in the last month
pr = PullRequests(github_index).is_closed().get_cardinality("id")\
                               .since(field="closed_at", start=previous_month_date)\
                               .until(field="closed_at", end=end_date)
get_aggs(pr)

8

In [9]:
previous_month_date = end_date - timedelta(days=30)

# PRs opened in the last month
pr = PullRequests(github_index).get_cardinality("id").since(start=previous_month_date).until(end=end_date)
get_aggs(pr)

10

In [10]:
previous_month_date = end_date - timedelta(days=30)

# Percentile PR closed
PR = PullRequests(github_index).is_closed().get_percentiles("time_to_close_days")\
                               .since(start=previous_month_date)\
                               .until(end=end_date)
get_aggs(PR)

0.10500000044703484

### Project Activities

In [21]:
# number of commits made by month 
commits = Query(git_index).get_cardinality("hash")\
                          .since(start=start_date)\
                          .until(end=end_date)\
                          .by_period()

print(get_timeseries(commits, dataframe=True).tail())

                               unixtime  value
date                                          
2018-03-01 00:00:00+00:00  1.519862e+09    126
2018-04-01 00:00:00+00:00  1.522541e+09     33
2018-05-01 00:00:00+00:00  1.525133e+09     15
2018-06-01 00:00:00+00:00  1.527811e+09     16
2018-07-01 00:00:00+00:00  1.530403e+09      0


In [14]:
# number of active authors per month
authors = Query(git_index).get_cardinality("author_name").by_period()

print(get_timeseries(authors, dataframe=True).tail())

                               unixtime  value
date                                          
2018-02-01 00:00:00+00:00  1.517443e+09      3
2018-03-01 00:00:00+00:00  1.519862e+09      4
2018-04-01 00:00:00+00:00  1.522541e+09      6
2018-05-01 00:00:00+00:00  1.525133e+09      4
2018-06-01 00:00:00+00:00  1.527811e+09      3


### Process

In [22]:
# Issues closed/ issues created
closed_issues = Issues(github_index).is_closed()\
                                    .get_cardinality("id")\
                                    .since(start=start_date)\
                                    .until(end=end_date)\
                                    .by_period()
closed_ts = get_timeseries(closed_issues)

opened_issues =Issues(github_index).get_cardinality("id")\
                                   .since(start=start_date).until(end=end_date)\
                                   .by_period()
opened_ts = get_timeseries(opened_issues)

print(pd.DataFrame(calculate_bmi(closed_ts, opened_ts)).tail())

                      period       bmi
38 2018-03-01 00:00:00+00:00  0.875000
39 2018-04-01 00:00:00+00:00  1.000000
40 2018-05-01 00:00:00+00:00  0.333333
41 2018-06-01 00:00:00+00:00  0.500000
42 2018-07-01 00:00:00+00:00  0.000000


In [24]:
# PRs closed/ PRs submitted

closed_pr = PullRequests(github_index).get_cardinality("id")\
                                      .is_closed()\
                                      .since(start=start_date).until(end=end_date)\
                                      .by_period()
closed_ts = get_timeseries(closed_pr)

opened_pr = PullRequests(github_index).get_cardinality("id")\
                                      .since(start=start_date).until(end=end_date)\
                                      .by_period()
opened_ts = get_timeseries(opened_pr)

print(pd.DataFrame(calculate_bmi(closed_ts, opened_ts)).tail())

                      period   bmi
38 2018-03-01 00:00:00+00:00  0.92
39 2018-04-01 00:00:00+00:00  1.00
40 2018-05-01 00:00:00+00:00  1.00
41 2018-06-01 00:00:00+00:00  1.00
42 2018-07-01 00:00:00+00:00  0.00


In [25]:
# days to close review(PR) average
closed_pr = PullRequests(github_index).get_average("time_to_close_days")\
                                      .is_closed()\
                                      .since(start=start_date).until(end=end_date)\
                                      .by_period()
print(get_timeseries(closed_pr, dataframe=True).tail())

                               unixtime     value
date                                             
2018-03-01 00:00:00+00:00  1.519862e+09  2.409130
2018-04-01 00:00:00+00:00  1.522541e+09  4.794545
2018-05-01 00:00:00+00:00  1.525133e+09  2.822000
2018-06-01 00:00:00+00:00  1.527811e+09  1.718750
2018-07-01 00:00:00+00:00  1.530403e+09  0.000000


In [20]:
# days to close review(PR) average
closed_pr = PullRequests(github_index).is_closed()\
                                      .since(start=start_date).until(end=end_date)\
                                      .get_percentiles("time_to_close_days").by_period()
print(get_timeseries(closed_pr, dataframe=True).tail())

                               unixtime  value
date                                          
2018-03-01 00:00:00+00:00  1.519862e+09  2.310
2018-04-01 00:00:00+00:00  1.522541e+09  0.850
2018-05-01 00:00:00+00:00  1.525133e+09  2.510
2018-06-01 00:00:00+00:00  1.527811e+09  0.105
2018-07-01 00:00:00+00:00  1.530403e+09  0.000


<h1 align="center">Growth Maturity and Decline Metrics</h1>

## Issue Resolution
Goal: Identify how effective the community is at addressing issues identified by community partcipants.

Name | Question | Implemented | Issue | PR | Visualisation 
--- | --- | --- | --- | --- | --- |
[Open Issues](https://github.com/chaoss/metrics/tree/master/activity-metrics/open-issues.md) | What is the number of open issues? | Yes | None | None | No
[Closed Issues](https://github.com/chaoss/metrics/tree/master/activity-metrics/closed-issues.md) | What is the number of closed issues? | Yes | None | None | No
[Issue Resolution Efficiency](https://github.com/chaoss/metrics/tree/master/activity-metrics/issue-resolution-efficiency.md) | What is the number of closed issues/number of abandoned issues? | Yes | [wg-gmd#5](https://github.com/chaoss/wg-gmd/issues/5) | None | No
[Open Issue Age](https://github.com/chaoss/metrics/tree/master/activity-metrics/open-issue-age.md) | What is the the age of open issues? | Yes | None | None | No
[First Response to Issue Duration](https://github.com/chaoss/metrics/tree/master/activity-metrics/first-response-to-issue-duration.md) | What is the duration of time for a first response to an issue? | No | [wg-gmd#8](https://github.com/chaoss/wg-gmd/issues/8) | None | No
[Closed Issue Resolution Duration](https://github.com/chaoss/metrics/tree/master/activity-metrics/closed-issue-resolution-duration.md) | What is the duration of time for issues to be resolved? | Yes | [wg-gmd#7](https://github.com/chaoss/wg-gmd/issues/7) | None | No

<a id="open_issues"></a>
### open issues

Here, we can see that we get the issues open by authors, by organizations and by the month in which they were created.

In [36]:
num_open_issues = Issues(github_index).is_open().get_cardinality("id_in_repo")
print("Number of currently open issues: ", get_aggs(num_open_issues))

Number of currently open issues:  26


<a id="closed_issues"></a>
### closed issues

In [37]:
num_closed_issues = Issues(github_index).is_closed().get_cardinality("id_in_repo")
print("Number of closed issues: ", get_aggs(num_closed_issues))
print()

closed_by_authors = Issues(github_index).get_cardinality("id_in_repo")\
                                        .by_authors("author_name")\
                                        .fetch_aggregation_results()
print(buckets_to_df(closed_by_authors['aggregations']['0']['buckets']).tail())

Number of closed issues:  115

    0  doc_count                key
26  1          1            Maëlick
27  1          1     Michael Downey
28  1          1    Phillip Furtado
29  1          1  Samuel Ytterbrink
30  1          1         Taewan Kim


<a id="closed_issues"></a>
### open issue age

As per the [discussion here](https://github.com/chaoss/metrics/blob/master/activity-metrics/open-issue-age.md), We'll calculate the percentile, mean, variance and create some visualisations for this metric.

In [38]:
issues = Issues(github_index).is_open().get_percentiles("time_open_days")
print("Percentiles: ", get_aggs(issues))
print()

issues.get_extended_stats("time_open_days")
extended_stats = issues.fetch_aggregation_results()['aggregations']['1']
pprint(extended_stats)

Percentiles:  457.7949981689453

{'avg': 433.9188483609603,
 'count': 26,
 'max': 859.280029296875,
 'min': 0.800000011920929,
 'std_deviation': 300.27044915502813,
 'std_deviation_bounds': {'lower': -166.62204994909598,
                          'upper': 1034.4597466710165},
 'sum': 11281.890057384968,
 'sum_of_squares': 7239645.649565274,
 'variance': 90162.34263576234}


#### visualizations

In [39]:
# visualisations
time_open_days_issues_df = Issues(github_index).is_open()\
                                               .fetch_results_from_source('time_open_days', \
                                                                          'id_in_repo', dataframe=True)
time_open_days_issues_df.head()

Unnamed: 0,id_in_repo,time_open_days
0,58,662.44
1,104,552.54
2,319,144.46
3,385,43.47
4,91,602.51


### Closed issue resolution duration (Time to resolution of closed issue)

#### percentiles

In [40]:
closed_issues = Issues(github_index).is_closed().get_percentiles("time_to_close_days")
print("Percentile closed issues: ", get_aggs(closed_issues))
print()

extended_stats = closed_issues.get_extended_stats("time_to_close_days")\
                              .fetch_aggregation_results()['aggregations']['1']
pprint(extended_stats)

Percentile closed issues:  3.6500000953674316

{'avg': 30.83017426458714,
 'count': 115,
 'max': 582.3300170898438,
 'min': 0.0,
 'std_deviation': 92.63210085425231,
 'std_deviation_bounds': {'lower': -154.4340274439175,
                          'upper': 216.09437597309176},
 'sum': 3545.470040427521,
 'sum_of_squares': 1096088.661693576,
 'variance': 8580.706108672372}


#### visualizations

In [42]:
# visualisations
closed_issue_age = Issues(github_index).is_closed()\
                                       .fetch_results_from_source('time_to_close_days', 'id_in_repo', dataframe=True)
print(closed_issue_age.head())

  id_in_repo  time_to_close_days
0         32                0.76
1         50                3.19
2         63                0.24
3         97                2.62
4         77               71.78


## Code Development
Goal: Identify how effective the community is at merging new code into the codebase.

Name | Question | Implemented | Issue | PR
--- | --- | --- | --- | --- |
[Code Commits](https://github.com/chaoss/metrics/tree/master/activity-metrics/code-commits.md) | What is the number of code commits? | Yes | None | None
[Lines of Code Changed](https://github.com/chaoss/metrics/tree/master/activity-metrics/lines-of-code-changed.md) | What is the number of lines of code changed? | Yes | None | None
[Code Reviews](https://github.com/chaoss/metrics/tree/master/activity-metrics/code-reviews.md) | What is the number of code reviews?
[Code Merge Duration](https://github.com/chaoss/metrics/tree/master/activity-metrics/code-merge-duration.md) | What is the duration of time between code merge request and code commit?
[Code Review Efficiency](https://github.com/chaoss/metrics/tree/master/activity-metrics/code-review-efficiency.md) | What is the number of merged code changes/number of abandoned code change requests?
[Maintainer Response to Merge Request Duration](https://github.com/chaoss/metrics/tree/master/activity-metrics/maintainer-response-to-merge-request-duration.md) | What is the duration of time for a maintainer to make a first response to a code merge request?
[Code Review Iteration](https://github.com/chaoss/metrics/tree/master/activity-metrics/code-review-iteration.md) | What is the number of iterations that occur before a merge request is accepted or declined?
[Forks](https://github.com/chaoss/metrics/tree/master/activity-metrics/forks.md) | Forks are a concept in distributed version control systems like GitHub. It is a proxy for the approximate number of developers who have taken a shot at building and deploying the codebase *for development*.
[Pull Requests Open](https://github.com/chaoss/metrics/tree/master/activity-metrics/pull-requests-open.md) | Number of open pull requests. | Yes | None | None | 
[Pull Requests Closed](https://github.com/chaoss/metrics/tree/master/activity-metrics/pull-requests-made-closed.md) | Number of closed pull requests. | Yes | None | None |
[Pull Request Comment Duration](https://github.com/chaoss/metrics/tree/master/activity-metrics/pull-requests-comment-duration.md) | The difference between the timestamp of the pull request creation date and the most recent comment on the pull request.
[Pull Request Comment Diversity](https://github.com/chaoss/metrics/tree/master/activity-metrics/pull-requests-comment-diversity.md) | Number of each people discussing each pull request.
[Pull Request Comments](https://github.com/chaoss/metrics/tree/master/activity-metrics/pull-request-comments.md) | Number of comments on each pull request. 


### code commits

**NOTE:** HERE THE INDEX WILL HAVE TO BE CHANGED

In [43]:
commits = Query(git_index).get_cardinality("hash")
print("Total commits: ", get_aggs(commits))

all_commits = commits.fetch_results_from_source("hash", "commit_date", dataframe=True)
print("All commits: ", all_commits.head())

Total commits:  1204
All commits:             commit_date                                      hash
0  2015-08-18T18:54:45  2355d18310d8e15c8e5d44f688d757df33b0e4be
1  2015-11-18T14:41:17  b0f6eb81d9b1dc5f77dce9954744016dfbb3cb4a
2  2015-11-19T18:12:47  525f43db7764d5f3cbc846fa3bf48a0c74550e4e
3  2015-11-25T20:56:29  4dd88bdefc77fd88fa51d3994f103f2b44003b99
4  2015-12-01T19:44:07  95d5e90b1ff7ff07db289c03ffe1239f6f54e232


When you go to the [perceval github repo](https://github.com/chaoss/grimoirelab-perceval), you'll see that actually 1182 commit are present. That maybe because of some empty commit messages. 

#### by months

In [44]:
buckets_to_df(commits.get_cardinality("hash")\
                     .by_period()\
                     .fetch_aggregation_results()['aggregations']['0']['buckets']).head()

Unnamed: 0_level_0,0,date_in_seconds
key,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-08-01,8,1438387200000
2015-09-01,0,1441065600000
2015-10-01,0,1443657600000
2015-11-01,23,1446336000000
2015-12-01,21,1448928000000


### Lines of code changed

In [45]:
commits = Query(git_index)
lc = get_aggs(commits.get_sum("lines_changed"))
la = get_aggs(commits.get_sum("lines_added"))
lr = get_aggs(commits.get_sum("lines_removed"))

print("Total lines changed: ", lc)
print("Total lines added: ", la)
print("Total lines removed: ", lr)

Total lines changed:  196087.0
Total lines added:  151353.0
Total lines removed:  44734.0


### Pull requests Open

In [49]:
open_prs = PullRequests(github_index).is_open()\
                                     .get_cardinality("id_in_repo")
print("Number of open PRs: ", get_aggs(open_prs))
print()


Number of open PRs:  8



In [51]:
response = open_prs.get_cardinality("id_in_repo")\
                   .by_authors("author_name")\
                   .fetch_aggregation_results()['aggregations']
open_prs_by_authors = response['0']['buckets']
print(buckets_to_df(open_prs_by_authors).head())

   0  doc_count                         key
0  2          2  Jesus M. Gonzalez-Barahona
1  2          2               Keanu Nichols
2  1          1               Gustavo Silva
3  1          1                 Jose Miguel
4  1          1      Miguel Ángel Fernández


### Pull requests closed

In [53]:
closed_prs = PullRequests(github_index).is_closed()\
                                       .get_cardinality("id_in_repo")
print("Number of closed PRs: ", get_aggs(closed_prs))
print()

response = closed_prs.get_cardinality("id_in_repo")\
          .by_authors("author_name")\
          .fetch_aggregation_results()['aggregations']
print("Closed prs by authors:")
print(buckets_to_df(response['0']['buckets']).head())

Number of closed PRs:  252

Closed prs by authors:
     0  doc_count                         key
0  138        138                     valerio
1   33         33         Alvaro del Castillo
2   22         22             Santiago Dueñas
3   18         18  Jesus M. Gonzalez-Barahona
4   10         10              Alberto Martín


## Community Growth
Goal: Identify the size of the project community and whether it's growing, shrinking, or staying the same.

Name | Question | Implemented | Issue | PR
--- | --- | --- | --- | --- |
[Contributors](https://github.com/chaoss/metrics/tree/master/activity-metrics/contributors.md) | What is the number of contributors? | Yes | None | None
[New Contributors](https://github.com/chaoss/metrics/tree/master/activity-metrics/new-contributors.md) | What is the number of new contributors? | Yes | None | None
[Contributing Organizations](https://github.com/chaoss/metrics/tree/master/activity-metrics/contributing-organizations.md) | What is the number of contributing organizations? | Yes | None | None
[New Contributing Organizations](https://github.com/chaoss/metrics/tree/master/activity-metrics/new-contributing-organizations.md) | What is the number of new contributing organizations?
[Sub-Projects](https://github.com/chaoss/metrics/tree/master/activity-metrics/sub-projects.md) | What is the number of sub-projects?

### Number of contributors

In [54]:
contributors = Query(git_index)
contributors.get_sum("lines_changed").by_authors("author_name")
contributors.get_sum("lines_added").by_authors("author_name")
contributors.get_sum("lines_removed").by_authors("author_name")
contributors.get_average("files").by_authors("author_name")
contributors.get_cardinality("author_uuid")

<manuscripts2.new_functions.Query at 0xa13f2cc50>

In [56]:
# maybe a pie chart showing the different users and the magnitude of their contributions is the total number of lines changed/removed/added??

buckets_to_df(contributors.fetch_aggregation_results()['aggregations']['0']['buckets']).head()

Unnamed: 0,0,1,2,3,doc_count,key
0,61910.0,45588.0,16322.0,2.109354,759,Santiago Dueñas
1,88034.0,61939.0,26095.0,2.221088,294,Valerio Cosentino
2,15725.0,15335.0,390.0,2.5,54,Alvaro del Castillo
3,22862.0,21781.0,1081.0,1.921569,51,Alberto Martín
4,1118.0,1048.0,70.0,2.222222,18,Jesus M. Gonzalez-Barahona


### New contributors

For new contributors, we have to get the names and counts of the people who made commits to the project. [This](https://grimoirelab.gitbooks.io/tutorial/python/pandas-for-grimoirelab-indexes.html) tutorial of Grimoirelab actually gets the dates on which the authors made their first commits. Based on that we can get the months when the authors made their first commits and those authors will be the new authors for that month. We can do a similar thing for Year. (We can also get the authors by week, but there is little point in calculating that and it will be complex to calculate that too.)

In [59]:
# new contributors by month
response = Query(git_index).get_min("author_date")\
                                   .by_authors("author_name")\
                                   .fetch_aggregation_results()
print(buckets_to_df(response['aggregations']['0']['buckets']).head())

              0  doc_count                         key
0  1.439921e+12        759             Santiago Dueñas
1  1.505391e+12        294           Valerio Cosentino
2  1.449255e+12         54         Alvaro del Castillo
3  1.455033e+12         51              Alberto Martín
4  1.451589e+12         18  Jesus M. Gonzalez-Barahona


### Contributing Organizations

In [61]:
response = Query(github_index).get_terms("user_org")\
                                       .fetch_aggregation_results()
buckets = response['aggregations']['0']['buckets']
organizations = pd.Series([item['key'] for item in buckets])
print(organizations.head())


0               @Bitergia 
1                 Bitergia
2                 GNUmedia
3      @amrita-university 
4    BBVA Data & Analytics
dtype: object


In [63]:
response = Query(github_index).get_terms("author_name")\
                                   .by_organizations("user_orgs")\
                                   .fetch_aggregation_results()
pprint(response['aggregations']['0']['buckets'])


[{'0': {'buckets': [{'doc_count': 141, 'key': 'valerio'},
                    {'doc_count': 61, 'key': 'Alvaro del Castillo'},
                    {'doc_count': 35, 'key': 'Jesus M. Gonzalez-Barahona'},
                    {'doc_count': 33, 'key': 'Alberto Martín'},
                    {'doc_count': 31, 'key': 'Santiago Dueñas'},
                    {'doc_count': 10, 'key': 'Manrique Lopez'},
                    {'doc_count': 8, 'key': 'Jose Miguel'},
                    {'doc_count': 7, 'key': 'Quan Zhou'},
                    {'doc_count': 6, 'key': 'David Pose Fernández'},
                    {'doc_count': 4, 'key': 'Daniel Izquierdo Cortazar'},
                    {'doc_count': 4, 'key': 'Keanu Nichols'},
                    {'doc_count': 3, 'key': 'Brylie Christopher Oxley'},
                    {'doc_count': 2, 'key': 'David Esler'},
                    {'doc_count': 2, 'key': 'Gustavo Silva'},
                    {'doc_count': 2, 'key': 'Israel Herraiz'},
                    {'d