#

This notebook collect information for the paper ***Promises and Perils of minining releases on Git***

# Imports

In [1]:
import datetime
import json

## Data Analysis

In [3]:
# %matplotlib inline
%matplotlib notebook

import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
pd.options.display.max_rows = 1000

from scipy.stats import wilcoxon
from scipy.stats import ranksums

##  Releasy development version

In [6]:
import os
import sys

releasy_module = os.path.abspath(os.path.join('..','..','..','dev','releasy'))
if releasy_module not in sys.path:
    sys.path.insert(0, releasy_module)

In [7]:
for mod in sorted(sys.modules.keys()):
    if mod.startswith("releasy"):
        del sys.modules[mod]

In [8]:
from releasy.miner.vcs.miner import Miner
from releasy.miner.vcs.git import GitVcs

In [9]:
from releasy.miner.vcs import miner as releasy_miner
from releasy.miner.vcs import git as releasy_git

In [10]:
import releasy

In [11]:
import importlib
importlib.reload(releasy_miner)
importlib.reload(releasy_git)

<module 'releasy.miner.vcs.git' from '/home/felipecrp/dev/releasy/releasy/miner/vcs/git.py'>

# Dataset setup

## Variables

In [12]:
repo_path = os.path.join('..','..','..','repos')

In [13]:
projects = pd.read_pickle("projects.zip")
projects["data"] = None

### Mining projects using Releasy

  - Currently, the **track_base_release** param has some performance issues. So to enable rapid mining, it is recommended to disable this param.
  
```{python}
params = {
    "track_base_release": False
}
```

In [14]:
count = 1
for name, project in projects[(projects.data.isnull())].iterrows():
    print(f"{datetime.datetime.now()} - {count:3} - Processing {name}")
    project_group, project_name = name.split("/") 
    path = os.path.join(repo_path, project_group, f"{project_name}.git")
    projects.loc[name, "path"] = path    
    
    params = {
        "track_base_release": False
    }
    miner = releasy_miner.Miner(name=name,vcs=releasy_git.GitVcs(path), **params)
    project = miner.mine_commits()
    projects.loc[name, "data"] = project
    count += 1
print(f"{datetime.datetime.now()} - Ended")

2019-12-18 16:57:47.227231 -   1 - Processing freeCodeCamp/freeCodeCamp
2019-12-18 16:57:47.241605 -   2 - Processing facebook/react
2019-12-18 16:57:47.565494 -   3 - Processing twbs/bootstrap
2019-12-18 16:57:47.980891 -   4 - Processing facebook/create-react-app
2019-12-18 16:57:48.088835 -   5 - Processing axios/axios
2019-12-18 16:57:48.114644 -   6 - Processing nodejs/node
2019-12-18 16:57:49.345882 -   7 - Processing FortAwesome/Font-Awesome
2019-12-18 16:57:49.400112 -   8 - Processing angular/angular.js
2019-12-18 16:57:49.733273 -   9 - Processing mrdoob/three.js
2019-12-18 16:57:49.757549 -  10 - Processing puppeteer/puppeteer
2019-12-18 16:57:49.807641 -  11 - Processing microsoft/vscode
2019-12-18 16:57:51.011872 -  12 - Processing microsoft/TypeScript
2019-12-18 16:57:51.753640 -  13 - Processing angular/angular
2019-12-18 16:57:52.133081 -  14 - Processing ant-design/ant-design
2019-12-18 16:57:52.577967 -  15 - Processing reduxjs/redux
2019-12-18 16:57:52.635702 -  16 -

## Projects dataset

In [15]:
projects["num_releases"] = projects["data"].apply(lambda p : len(p.releases))
projects["num_tags"] = projects["data"].apply(lambda p : len(p.tags))
projects["per_releases"] = projects["num_releases"] / projects["num_tags"]
projects.loc[projects["num_tags"] == 0, "per_releases"] = 0

In [16]:
projects.shape

(100, 11)

## Tags dataset

In [17]:
tags = []
for project in projects["data"]:
    for tag in project.tags:
        tag = {
            "project": project.name,
            "name": tag.name,
            "data": tag
        }
        tags.append(tag)
tags = pd.DataFrame(tags)
tags.set_index(["project", "name"], inplace=True)

In [18]:
tags["annotated"] = tags["data"].apply(lambda t: t.is_annotated)

## Releases dataset

In [19]:
releases = []
for project in projects["data"]:#.apply(lambda project: project):
    for release in project.releases:
        releases.append({
            "project": project.name,
            "name": release.name,
            "data": release
        })
releases = pd.DataFrame(releases)
releases.set_index(["project","name"], inplace=True)

In [20]:
releases["prefix"] = releases["data"].apply(lambda r: r.prefix)
releases["head_commit"] = releases["data"].apply(lambda r: r.head_commit.hashcode)

releases["time"] = releases["data"].apply(lambda r: r.get_time())
releases["start_development"] = releases["data"].apply(lambda r: r.get_time(of=releasy.START_DEVELOPMENT_TIME))
releases["length"] = releases["data"].apply(lambda r: r.get_length())
# releases.loc[releases["length"] == pd.to_timedelta(0), "length"] = np.NaN


In [21]:
releases.shape

(16852, 6)

In [42]:
def get_prefixes(project):
    prefixes = {}
    for release in project.releases:
        prefix = release.prefix
        if not prefix:
            prefix = "None"
        if prefix not in prefixes:
            prefixes[prefix] = 0
        prefixes[prefix] += 1
    return prefixes

projects["num_prefixes"] = projects["data"].apply(lambda project: len(get_prefixes(project).keys()))
projects["prefixes"] = projects["data"].apply(lambda project: ",".join(get_prefixes(project).keys()))

### show release prefixes

In [43]:
projects.loc["nodejs/node"]

description     Node.js JavaScript runtime :sparkles::turtle::...
stars                                                       66329
url                      https://api.github.com/repos/nodejs/node
git_url                          git://github.com/nodejs/node.git
language                                               JavaScript
created_at                                    2014-11-26 19:57:11
data                                                  nodejs/node
path                               ../../../repos/nodejs/node.git
num_releases                                                  540
num_tags                                                      543
per_releases                                             0.994475
num_prefixes                                                    2
prefixes                                           v,heads/tags/v
Name: nodejs/node, dtype: object

In [52]:
projects[projects["num_prefixes"] > 1].sort_values(["name"])["prefixes"]

name
FFmpeg/FFmpeg                                                                  v,ffmpeg-,n
FortAwesome/Font-Awesome                                                            v,None
PHPMailer/PHPMailer                                                           phpmailer-,v
Wox-launcher/Wox                                                                    v,None
airbnb/lottie-android                                                              v,v.,vv
angular/angular                                                          None,ngcontainer_
angular/angular.js                                                                  v,g3-v
ansible/ansible                                                                     None,v
antirez/redis                                                                       None,v
aspnet/AspNetCore                                                                   v,None
bcit-ci/CodeIgniter                                                                 v

## Check tags

In [None]:
# bkp = projects.copy()

In [None]:
# projects = bkp[projects.data.notnull()].copy()

In [None]:
projects

In [146]:
projects["tags"] = projects["data"].apply(lambda project: len(project.tags))
projects["releases"] = 0
projects["percent_releases"] = 0
projects["releases"] = projects["data"].apply(lambda project: len(project.releases))
projects.loc[projects["tags"] > 0,"percent_releases"] = projects["releases"]/projects["tags"]
projects.loc[projects["tags"] == 0,"percent_releases"] = 0


# Promises and Perils

## Promise 1: Projects often use tags to represent software releases

In [147]:
projects[projects["num_releases"] > 0].shape[0] / projects.shape[0]

0.95

In [191]:
np.arange(-0.1,0,0.9,0.1)

TypeError: data type not understood

In [215]:
bins = np.concatenate((np.arange(-0.1,0.9,0.1), np.arange(0.9,1.01,0.01)), axis=None)
bins = np.concatenate((np.arange(-0.1,1,0.1), 0.99, 1), axis=None)
#bins = np.concatenate((np.arange(-0.1,1.1,0.1)), axis=None)
projects["per_releases_bucket"] = pd.cut(projects["per_releases"],bins=bins)
hist = projects.groupby("per_releases_bucket")["per_releases"].count()
print(hist)

fig, ax = plt.subplots()

hist.plot(kind="bar", ax=ax)
ax.minorticks_on()
ax.grid(True, which="major")
#ax.grid(True, which="minor")

ax.set_axisbelow(True)

fig.tight_layout()
fig.savefig("figs/hist_per_releases.png")

per_releases_bucket
(-0.1, 0.0]     5
(0.0, 0.1]      0
(0.1, 0.2]      0
(0.2, 0.3]      0
(0.3, 0.4]      0
(0.4, 0.5]      1
(0.5, 0.6]      2
(0.6, 0.7]      0
(0.7, 0.8]      1
(0.8, 0.9]      3
(0.9, 0.99]    16
(0.99, 1.0]    72
Name: per_releases, dtype: int64


<IPython.core.display.Javascript object>

## Promise: Tags can provide aditional information

In [129]:
tags[tags["annotated"] == True].shape

(17944, 2)

In [130]:
tags[tags["annotated"] == False].shape

(0, 2)

## Peril 1: Some projects do not use tags at all

In [None]:
projects[projects.releases == 0].shape[0] / projects.shape[0]

In [None]:
list(projects[projects.tags == 0].index)

In [None]:
list(projects[projects.releases == 0].index)

## Release prefixes

## Peril - Different tags can reference the same release

In [98]:
release_clones = releases.groupby(["project", "head_commit"]).count()
release_clones = release_clones[release_clones.data > 1].copy()
release_clones.describe()

Unnamed: 0,data,prefix,time,start_development,length,duplicated
count,513.0,513.0,513.0,513.0,513.0,513.0
mean,2.750487,2.750487,2.750487,2.750487,2.750487,2.750487
std,2.315902,2.315902,2.315902,2.315902,2.315902,2.315902
min,2.0,2.0,2.0,2.0,2.0,2.0
25%,2.0,2.0,2.0,2.0,2.0,2.0
50%,2.0,2.0,2.0,2.0,2.0,2.0
75%,2.0,2.0,2.0,2.0,2.0,2.0
max,29.0,29.0,29.0,29.0,29.0,29.0


In [92]:
releases["duplicated"] = False
releases.loc[(releases.index.get_level_values(0).isin(release_clones.index.get_level_values(0))) & (releases["head_commit"].isin(release_clones.index.get_level_values(1))), "duplicated"] = True


In [99]:
releases[releases["duplicated"] == True]

Unnamed: 0_level_0,Unnamed: 1_level_0,data,prefix,head_commit,time,start_development,length,duplicated
project,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
facebook/create-react-app,babel-preset-react-app@0.2.1,babel-preset-react-app@0.2.1,babel-preset-react-app@,7ac8777ed1cebec6084a03e84272cfd21128b356,2016-09-23 22:31:36+01:00,2016-09-23 22:31:36+01:00,4 days 13:40:11,True
facebook/create-react-app,create-react-app@0.5.0,create-react-app@0.5.0,create-react-app@,7ac8777ed1cebec6084a03e84272cfd21128b356,2016-09-23 22:31:36+01:00,2016-09-23 22:31:36+01:00,0 days 00:00:00,True
facebook/create-react-app,eslint-config-react-app@0.2.1,eslint-config-react-app@0.2.1,eslint-config-react-app@,7ac8777ed1cebec6084a03e84272cfd21128b356,2016-09-23 22:31:36+01:00,2016-09-23 22:31:36+01:00,0 days 00:00:00,True
facebook/create-react-app,react-dev-utils@0.1.0,react-dev-utils@0.1.0,react-dev-utils@,7ac8777ed1cebec6084a03e84272cfd21128b356,2016-09-23 22:31:36+01:00,2016-09-23 22:31:36+01:00,0 days 00:00:00,True
facebook/create-react-app,react-scripts@0.5.0,react-scripts@0.5.0,react-scripts@,7ac8777ed1cebec6084a03e84272cfd21128b356,2016-09-23 22:31:36+01:00,2016-09-23 22:31:36+01:00,0 days 00:00:00,True
...,...,...,...,...,...,...,...,...
syncthing/syncthing,v1.0.1,v1.0.1,v,1e69997ecdbf87ceaad76bd0149d98f560f4fdb5,2019-02-05 07:36:23+01:00,2019-02-05 07:36:23+01:00,49 days 16:44:08,True
syncthing/syncthing,v1.1.0-rc.2,v1.1.0-rc.2,v,f0f79a3e3e861b79da2b09f563e9d373255419f0,2019-02-25 19:29:16+01:00,2019-02-25 19:29:16+01:00,0 days 08:59:45,True
syncthing/syncthing,v1.1.0,v1.1.0,v,f0f79a3e3e861b79da2b09f563e9d373255419f0,2019-03-05 16:37:54+01:00,2019-03-05 16:37:54+01:00,46 days 03:36:15,True
syncthing/syncthing,v1.1.1-rc.2,v1.1.1-rc.2,v,e4ab9d3312dfd768e269f5a7ed75eb9cfe50bffb,2019-03-25 12:45:49+01:00,2019-03-25 12:45:49+01:00,11 days 05:08:31,True


In [106]:
pd.DataFrame(release_clones.index.get_level_values(0).unique()).shape

(35, 1)

In [109]:
release_clones[release_clones.prefix == release_clones.prefix.max()]

Unnamed: 0_level_0,Unnamed: 1_level_0,data,prefix,time,start_development,length,duplicated
project,head_commit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
apple/swift,f4134ebef606f675bd372db74cd6db32d30c6178,29,29,29,29,29,29


In [112]:
releases[releases.head_commit == "f4134ebef606f675bd372db74cd6db32d30c6178"]

Unnamed: 0_level_0,Unnamed: 1_level_0,data,prefix,head_commit,time,start_development,length,duplicated
project,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
apple/swift,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-09-25-a,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-09-25-a,swift-,f4134ebef606f675bd372db74cd6db32d30c6178,2018-09-25 20:13:24-05:00,2018-09-25 20:13:24-05:00,5 days 02:24:43,True
apple/swift,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-09-26-a,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-09-26-a,swift-,f4134ebef606f675bd372db74cd6db32d30c6178,2018-09-26 20:13:22-05:00,2018-09-26 20:13:22-05:00,1 days 19:25:20,True
apple/swift,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-09-27-a,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-09-27-a,swift-,f4134ebef606f675bd372db74cd6db32d30c6178,2018-09-27 20:14:07-05:00,2018-09-27 20:14:07-05:00,2 days 19:26:05,True
apple/swift,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-09-28-a,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-09-28-a,swift-,f4134ebef606f675bd372db74cd6db32d30c6178,2018-09-28 20:13:19-05:00,2018-09-28 20:13:19-05:00,3 days 19:25:17,True
apple/swift,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-01-a,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-01-a,swift-,f4134ebef606f675bd372db74cd6db32d30c6178,2018-10-01 20:13:18-05:00,2018-10-01 20:13:18-05:00,6 days 19:25:16,True
apple/swift,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-02-a,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-02-a,swift-,f4134ebef606f675bd372db74cd6db32d30c6178,2018-10-02 20:13:12-05:00,2018-10-02 20:13:12-05:00,7 days 19:25:10,True
apple/swift,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-03-a,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-03-a,swift-,f4134ebef606f675bd372db74cd6db32d30c6178,2018-10-03 20:13:51-05:00,2018-10-03 20:13:51-05:00,8 days 19:25:49,True
apple/swift,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-05-a,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-05-a,swift-,f4134ebef606f675bd372db74cd6db32d30c6178,2018-10-05 20:14:14-05:00,2018-10-05 20:14:14-05:00,10 days 19:26:12,True
apple/swift,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-06-a,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-06-a,swift-,f4134ebef606f675bd372db74cd6db32d30c6178,2018-10-06 20:14:15-05:00,2018-10-06 20:14:15-05:00,11 days 19:26:13,True
apple/swift,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-07-a,swift-4.2-DEVELOPMENT-SNAPSHOT-2018-10-07-a,swift-,f4134ebef606f675bd372db74cd6db32d30c6178,2018-10-07 20:14:14-05:00,2018-10-07 20:14:14-05:00,12 days 19:26:12,True


In [115]:
release_clones.groupby(["project"]).count()

Unnamed: 0_level_0,data,prefix,time,start_development,length,duplicated
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PHPMailer/PHPMailer,5,5,5,5,5,5
Wox-launcher/Wox,2,2,2,2,2,2
angular/angular.js,1,1,1,1,1,1
ant-design/ant-design,1,1,1,1,1,1
antirez/redis,5,5,5,5,5,5
apple/swift,68,68,68,68,68,68
aspnet/AspNetCore,4,4,4,4,4,4
bilibili/ijkplayer,1,1,1,1,1,1
bitcoin/bitcoin,16,16,16,16,16,16
cdr/code-server,1,1,1,1,1,1


In [114]:
release_clones.groupby(["project"]).sum()

Unnamed: 0_level_0,data,prefix,time,start_development,length,duplicated
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PHPMailer/PHPMailer,10,10,10,10,10,10
Wox-launcher/Wox,4,4,4,4,4,4
angular/angular.js,2,2,2,2,2,2
ant-design/ant-design,2,2,2,2,2,2
antirez/redis,10,10,10,10,10,10
apple/swift,292,292,292,292,292,292
aspnet/AspNetCore,11,11,11,11,11,11
bilibili/ijkplayer,2,2,2,2,2,2
bitcoin/bitcoin,33,33,33,33,33,33
cdr/code-server,2,2,2,2,2,2


In [116]:
release_clones.groupby(["project"]).mean()

Unnamed: 0_level_0,data,prefix,time,start_development,length,duplicated
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PHPMailer/PHPMailer,2.0,2.0,2.0,2.0,2.0,2.0
Wox-launcher/Wox,2.0,2.0,2.0,2.0,2.0,2.0
angular/angular.js,2.0,2.0,2.0,2.0,2.0,2.0
ant-design/ant-design,2.0,2.0,2.0,2.0,2.0,2.0
antirez/redis,2.0,2.0,2.0,2.0,2.0,2.0
apple/swift,4.294118,4.294118,4.294118,4.294118,4.294118,4.294118
aspnet/AspNetCore,2.75,2.75,2.75,2.75,2.75,2.75
bilibili/ijkplayer,2.0,2.0,2.0,2.0,2.0,2.0
bitcoin/bitcoin,2.0625,2.0625,2.0625,2.0625,2.0625,2.0625
cdr/code-server,2.0,2.0,2.0,2.0,2.0,2.0


# Others

In [None]:
projects[projects.percent_releases < 1][["tags","releases","percent_releases"]]

In [None]:
project_df["created_at"] = pd.to_datetime(project_df["created_at"])

In [None]:
project_df["n_tags"] = project_df["data"].apply(lambda project: len(project.tags))
project_df["n_releases"] = 0
project_df["p_releases"] = 0
project_df["n_releases"] = project_df["data"].apply(lambda project: len(project.releases))
project_df.loc[project_df["n_tags"] > 0,"p_releases"] = project_df["n_releases"]/project_df["n_tags"]

In [None]:
project_df.loc[(project_df.n_releases < 5), "discarded_by"] = "few releases"
project_df.loc[(project_df.p_releases < 0.85), "discarded_by"] = "nom semantic"
project_df.loc[(project_df.created_at > "2018-10-01"), "discarded_by"] = "too young"
project_df.loc[(project_df.stars < 1000), "discarded_by"] = "few stars"

In [None]:
project_df[(project_df.created_at > "2018-10-01")]

In [None]:
summary = pd.DataFrame()
summary["n_projects"] = project_df[(project_df.discarded_by.isnull())].groupby(["language"])["n_releases"].count()
summary["older_project_birth"] = project_df[(project_df.discarded_by.isnull())].groupby(["language"])["created_at"].min()
summary["younger_project_birth"] = project_df[(project_df.discarded_by.isnull())].groupby(["language"])["created_at"].max()
summary["min_stars"] = project_df[(project_df.discarded_by.isnull())].groupby(["language"])["stars"].min()
summary["max_stars"] = project_df[(project_df.discarded_by.isnull())].groupby(["language"])["stars"].max()
summary["min_releases"] = project_df[(project_df.discarded_by.isnull())].groupby(["language"])["n_releases"].min()
summary["max_releases"] = project_df[(project_df.discarded_by.isnull())].groupby(["language"])["n_releases"].max()
summary["mean_releases"] = project_df[(project_df.discarded_by.isnull())].groupby(["language"])["n_releases"].mean().round(0)
summary["std_deviation_releases"] = project_df[(project_df.discarded_by.isnull())].groupby(["language"])["n_releases"].std().round(0)
summary["total_releases"] = project_df[(project_df.discarded_by.isnull())].groupby(["language"])["n_releases"].sum()
summary

In [None]:
summary.total_releases.sum()

In [None]:
project_df[(project_df.discarded_by.notnull()) & (project_df.language == "Go")].sort_values(by=["n_releases"], ascending=False)

In [None]:
project_df[(project_df.n_tags < 5)].sort_values(by=["n_releases"], ascending=False)

In [None]:
project_df[project_df.p_releases > 0.85].groupby("language").count()

In [None]:
project_df[(project_df.n_tags < 10)].head(20)

In [None]:
project_df[(project_df.p_releases < 0.85) & (project_df.discarded_by.isnull()) & (project_df.n_tags > 0)].head(20)

In [None]:
for name,project_data in projects_data.items():
    if "discarded_by" in project_data:
        continue
    project = project_data["data"]
    tags = project.tags
    nom_release_tags = [tag for tag in project.tags if not tag.release or tag.name.startswith("zone") or tag.name.startswith("ngcontainer")]
    if len(tags) > 0:
        percent_releases = round(100*(1-len(nom_release_tags)/len(tags)),2)
    else:
        percent_releases = 0
    project_data["percent_releases"] = percent_releases
    
    prefixes = []
    for release in project.releases:
        if release.prefix not in prefixes:
            prefixes.append(release.prefix)

    print(f"{project.name:30} {percent_releases:10} {len(project.releases):10}/{len(project.tags)}")
    print("  Prefixes:")
    for prefix in prefixes:
         print(f"    - {str(prefix):30}")

    print("  Non releases:")            
    for tag in tags:
        if not tag.release:
           print(f"    - {tag.name}")
#        print(f"-- {tag.name:30} {tag.release != None}")
    print("---")





In [None]:
f_releases = []
index = 0
last_project = None
for name,project_data in projects_data.items():
    if "percent_releases" in project_data and project_data["percent_releases"] > 90:
        project = project_data["data"]
        feature_releases = project.get_releases(skip_patches=True, skip_pre=True)
        feature_releases = sorted(feature_releases, key=lambda r: r.version)
        last_release = None
        for release in feature_releases:
            if release.patches:
                last_patch = release.patches[-1]
                maintenance_length = last_patch.time - release.time
                num_patches = len(release.patches)
            else:
                maintenance_length = pd.to_timedelta(0)
                last_patch = None
                num_patches = 0
            if release.pre_releases:
                first_pre_release = release.pre_releases[0]
                stage_length = release.time - first_pre_release.time 
                num_pre_releases = len(release.pre_releases)
            else:
                first_pre_release = None
                stage_length = pd.to_timedelta(0)
                num_pre_releases = 0

            if last_release and last_release.major != release.major:
                f_releases[index-1]["is_last_minor"] = True
            last_release = release
            
            if last_project and last_project != name:
                f_releases.pop()
                index -= 1
            last_project = name
            index += 1

            f_releases.append({
                "project": project.name,
                "release": release.name,
                "version": release.version,
                "is_last_minor": False,
                "time": pd.to_datetime(release.time, utc=True),
                "developtment_length": release.length,
                "last_patch": last_patch, 
                "last_patch_time": pd.to_datetime(last_patch.time, utc=True) if last_patch else None, 
                "maintenance_length": maintenance_length,
                "n_patches": num_patches,
                
                #"num_pre_releases": num_pre_releases,
                #"num_patches": num_patches,
                #"first_pre_release": first_pre_release, 
                #"stage_length": stage_length
            })

f_releases = pd.DataFrame(f_releases)    
f_releases = f_releases.sort_values(by=["project", "version"])

f_releases["maintenance_secs"] = f_releases["maintenance_length"].dt.total_seconds()
f_releases["time"] = f_releases["time"].dt.tz_convert(None)
f_releases["last_patch_time"] = f_releases["last_patch_time"].dt.tz_convert(None)

# remove releases sem manutenção
# f_releases = f_releases[f_releases.maintenance_length > pd.to_timedelta(0)].copy()

f_releases.to_excel("feature_releases_ds.xlsx")
f_releases.head(200)

In [None]:
fig = plt.figure()                                                                                                                                                                                                                                                             
ax = fig.add_subplot(111)

f_releases.boxplot("maintenance_length", by="is_last_minor", ax=ax)

def timeTicks(x, pos):
    return str(pd.to_timedelta(x))
    #d = datetime.timedelta(seconds=x)
    #return str(d)
    #return x / 60 / 60 / 24
formatter = matplotlib.ticker.FuncFormatter(timeTicks)                                                                                                                                                                                                                         
ax.yaxis.set_major_formatter(formatter)

plt.suptitle("")
plt.tight_layout()

## Statistical Test

## Analise pareada por Projeto


In [None]:
gr = f_releases.groupby(["project","is_last_minor"]).mean()

In [None]:
gr.shape

In [None]:
(gr.groupby(level=[0]).size() == 2)

In [None]:
# gr = gr.drop(index="akveo/ngx-admin")
gr = gr.drop(index="apache/dubbo")
gr = gr.drop(index="facebook/react-native")

In [None]:
gr.to_excel("pareado.xlsx")

In [None]:
gr.shape

In [None]:
a = gr.xs(True, level=1)["maintenance_secs"]

In [None]:
b = gr.xs(False, level=1)["maintenance_secs"]

In [None]:
wilcoxon(a,b)

## Análise de todas as releases

In [None]:
a = f_releases[f_releases["is_last_minor"] == True]["maintenance_secs"]

In [None]:
b = f_releases[f_releases["is_last_minor"] == False]["maintenance_secs"]

In [None]:
ranksums(a,b)

## Análise intra-projetos

In [None]:
intr_project_ds = pd.DataFrame()
intr_project_ds["project"] = f_releases["project"].unique()
intr_project_ds["n_feature_release"] = intr_project_ds["project"].apply(lambda project_name: len(f_releases[f_releases["project"] == project_name]))
intr_project_ds["n_last_minor"] = intr_project_ds["project"].apply(lambda project_name: len(f_releases[(f_releases["project"] == project_name) & (f_releases["is_last_minor"] == True)]))
intr_project_ds = intr_project_ds[intr_project_ds.n_last_minor > 0]
intr_project_ds["p-value"] = intr_project_ds["project"].apply(lambda project_name: ranksums(
    f_releases[(f_releases["is_last_minor"] == True) & (f_releases["project"] == project_name)]["maintenance_secs"],
    f_releases[(f_releases["is_last_minor"] == False) & (f_releases["project"] == project_name)]["maintenance_secs"]
)[1])
intr_project_ds["h0"] = intr_project_ds["p-value"] >= 0.05
intr_project_ds.to_excel("intr_project_ds.xlsx")
intr_project_ds

In [None]:
project_ds.head()

In [None]:
project_ds = f_releases.groupby(["project"])["project"]
a

In [None]:
f_releases.release.count() + f_releases.num_patches.sum() + f_releases.num_pre_releases.sum()

In [None]:
gr.to_csv("gr_releases.csv", sep=";")

In [None]:
f_releases.to_csv("releases.csv", sep=";")

In [None]:
f_releases[f_releases.is_last_minor == False]

In [None]:
for project in projects.values():
    feature_releases = project.get_releases(skip_patches=True, skip_pre=True)
    print(f"{project.name:20}")
    print(f"{'release':10} {'time':28} {'length':20} {'last_patch':10} {'maintenance':20}")
    for release in feature_releases:
        if release.name.startswith("ng") or release.name.startswith("zone"):
            continue
        if release.patches:
            last_patch = release.patches[-1]
            maintenance = last_patch.time - release.time
        else:
            maintenance = None
            last_patch = None
        if release.pre_releases:
            first_pre = release.pre_releases[0]
        else:
            first_pre = None
          
        print(f"{release.name:10} {str(release.time):28} {str(release.length):20} {str(first_pre):15} {str(last_patch):10} {str(maintenance):20}")

In [None]:
tags = project.tags
nom_release_tags = [tag for tag in project.tags if not tag.release or tag.name.startswith("zone") or tag.name.startswith("ngcontainer")]
percent_releases = round(100*(1-len(nom_release_tags)/len(tags)),2)
percent_releases

In [None]:
nom_release_tags

In [None]:
feature_releases = project.get_releases(skip_patches=True, skip_pre=True)
print(f"{'release':10} {'time':28} {'length':20} {'last_patch':10} {'maintenance':20}")
for release in feature_releases:
    if release.name.startswith("ng") or release.name.startswith("zone"):
        continue
    if release.patches:
        last_patch = release.patches[-1]
        maintenance = last_patch.time - release.time
    else:
        last_patch = None
    print(f"{release.name:10} {str(release.time):28} {str(release.length):20} {str(last_patch):10} {str(maintenance):20}")

In [None]:
release.patches[-1]

In [None]:
releases = project.get_releases(skip_pre=True)
for release in releases:
    if release.is_patch():
        print(f"{release.name:20} {str(release.time):30} {str(release.length):10}")