#

This notebook evaluates the releases

# Imports

In [1]:
import datetime
import json

## Data Analysis

In [2]:
# %matplotlib inline
%matplotlib notebook

import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
pd.options.display.max_rows = 1000

from scipy.stats import wilcoxon
from scipy.stats import ranksums

##  Releasy development version

In [3]:
import os
import sys

# Put the local Releasy development checkout first on sys.path (added only once).
releasy_module = os.path.join('..', '..', '..', 'dev', 'releasy')
releasy_module = os.path.abspath(releasy_module)
if sys.path.count(releasy_module) == 0:
    sys.path[:0] = [releasy_module]

In [4]:
# Forget every cached module whose name starts with "releasy" so that a later
# import executes the module code again instead of reusing the cached object.
for mod in sorted(filter(lambda key: key.startswith("releasy"), sys.modules)):
    del sys.modules[mod]

In [5]:
from releasy.miner.vcs.miner import Miner
from releasy.miner.vcs.git import GitVcs

In [6]:
from releasy.miner.vcs import miner as releasy_miner
from releasy.miner.vcs import git as releasy_git

In [7]:
import releasy

In [8]:
import importlib
# Re-execute both Releasy modules in this kernel so that changes to their source
# files take effect without restarting; the value of the last call is the cell output.
importlib.reload(releasy_miner)
importlib.reload(releasy_git)

<module 'releasy.miner.vcs.git' from '/home/felipecrp/dev/releasy/releasy/miner/vcs/git.py'>

# Dataset setup

## Variables

In [157]:
# Root directory of the git repositories, relative to the current working directory;
# mine_projects() looks for each project at <repo_path>/<group>/<name>.git.
repo_path = os.path.join('..','..','..','repos')

In [158]:
# Load the table of projects to analyse (indexed by "<group>/<name>", as used by mine_projects()).
# NOTE(review): unpickling can execute arbitrary code — only load a projects.zip from a trusted source.
projects = pd.read_pickle("projects.zip")
# Placeholder column; mine_projects() stores the mined Releasy project object here.
projects["data"] = None

### Mining projects using Releasy

  - The **track_base_release** parameter currently has performance issues, so it is recommended to disable it to keep mining fast.
  
```python
params = {
    "track_base_release": False
}
```

In [159]:
# Per-project keyword overrides passed to the Releasy miner, keyed by "<group>/<name>".
release_mine_params = {
    "curl/curl": {"version_separator": "_"},
}

In [169]:
def mine_projects(mine_commits=False):
    """Mine every project of the global ``projects`` frame with Releasy.

    The ``data`` column is cleared first, so all projects are processed again.
    For each project the repository path ``<repo_path>/<group>/<name>.git`` is
    stored in the ``path`` column and a miner is created with
    ``track_base_release=False`` plus any overrides found in
    ``release_mine_params``. The object stored in ``data`` is the result of
    ``mine_releases()`` or, when ``mine_commits`` is true, the result of the
    subsequent ``mine_commits()`` call.

    Args:
        mine_commits: also run ``miner.mine_commits()`` after mining releases.
    """
    projects["data"] = None
    pending = projects[projects.data.isnull()]
    for position, (name, _row) in enumerate(pending.iterrows(), start=1):
        print(f"{datetime.datetime.now()} - {position:3} - Processing {name}")
        project_group, project_name = name.split("/")
        path = os.path.join(repo_path, project_group, f"{project_name}.git")
        projects.loc[name, "path"] = path

        # Defaults first, so per-project settings win on key collisions.
        mine_params = {
            "track_base_release": False,
            **release_mine_params.get(name, {}),
        }
        miner = releasy_miner.Miner(
            name=name,
            vcs=releasy_git.GitVcs(path),
            **mine_params
        )
        mined = miner.mine_releases()
        if mine_commits:
            mined = miner.mine_commits()
        projects.loc[name, "data"] = mined
    print(f"{datetime.datetime.now()} - Ended")

mine_projects()

2020-01-12 23:40:42.214847 -   1 - Processing freeCodeCamp/freeCodeCamp
2020-01-12 23:40:42.215907 -   2 - Processing vuejs/vue
2020-01-12 23:40:42.224671 -   3 - Processing facebook/react
2020-01-12 23:40:42.229920 -   4 - Processing twbs/bootstrap
2020-01-12 23:40:42.232813 -   5 - Processing facebook/react-native
2020-01-12 23:40:42.243267 -   6 - Processing facebook/create-react-app
2020-01-12 23:40:42.250170 -   7 - Processing axios/axios
2020-01-12 23:40:42.252900 -   8 - Processing nodejs/node
2020-01-12 23:40:42.282474 -   9 - Processing FortAwesome/Font-Awesome
2020-01-12 23:40:42.285418 -  10 - Processing angular/angular.js
2020-01-12 23:40:42.293852 -  11 - Processing microsoft/vscode
2020-01-12 23:40:42.299788 -  12 - Processing microsoft/TypeScript
2020-01-12 23:40:42.304737 -  13 - Processing angular/angular
2020-01-12 23:40:42.315046 -  14 - Processing ant-design/ant-design
2020-01-12 23:40:42.324263 -  15 - Processing reduxjs/redux
2020-01-12 23:40:42.327185 -  16 - Pro

## Projects trial dataset

In [170]:
def gen_project_stats():
    """Add release/tag counts and ratios to the global ``projects`` frame.

    Columns written (in this order): one ``num_*_releases`` count per Releasy
    release type, ``num_releases``, ``num_tags``, and the ratios
    ``per_releases``, ``per_common_releases`` and ``per_pre_releases`` (each
    count divided by ``num_tags``).

    For projects without tags, every ratio is set to 0. Previously only
    ``per_releases`` was guarded, which left the other two ratios as NaN for
    the same rows.
    """
    release_type_columns = {
        "num_common_releases": releasy.RELEASE_TYPE_COMMON,
        "num_major_releases": releasy.RELEASE_TYPE_MAJOR,
        "num_minor_releases": releasy.RELEASE_TYPE_MINOR,
        "num_patch_releases": releasy.RELEASE_TYPE_PATCH,
        "num_duplicated_releases": releasy.RELEASE_TYPE_DUPLICATED,
        "num_pre_releases": releasy.RELEASE_TYPE_PRE,
        "num_releases": releasy.RELEASE_TYPE_ANY,
    }
    for column, release_type in release_type_columns.items():
        # Bind release_type as a default so each lambda keeps its own value.
        projects[column] = projects["data"].apply(
            lambda p, release_type=release_type: len(p.get_releases(release_type))
        )
    projects["num_tags"] = projects["data"].apply(lambda p: len(p.tags))

    ratio_columns = {
        "per_releases": "num_releases",
        "per_common_releases": "num_common_releases",
        "per_pre_releases": "num_pre_releases",
    }
    for ratio_column, count_column in ratio_columns.items():
        projects[ratio_column] = projects[count_column] / projects["num_tags"]
    # Division by zero tags yields NaN; report 0 consistently for all ratios.
    projects.loc[projects["num_tags"] == 0, list(ratio_columns)] = 0
gen_project_stats()

### Handle release prefix

Manual analysis

In [171]:
# i: number of prefixes used by fewer than 5% of their project's releases
# p: number of projects that have at least one such prefix
i = 0
p = 0
for name, project in projects.iterrows():
    prefixes = project["data"].release_prefixes
    print(f"{name} ({len(prefixes)})")
    rare_here = 0
    for prefix, count in prefixes.items():
        prefix_percent = round(count * 100 / project.num_releases,1)
        marker = 'X' if prefix_percent < 5 else ''
        if marker:
            rare_here += 1
        print(f"{marker:1} - {count:3} {prefix_percent:4}% - {prefix} ")
    i += rare_here
    if rare_here:
        p += 1
    print()
print(f"\n{i} prefixes with less than 5%")
print(f"{p} projects with prefixes with less than 5%")

freeCodeCamp/freeCodeCamp (0)

vuejs/vue (2)
  -  80 32.1% -  
  - 169 67.9% - v 

facebook/react (2)
  - 106 91.4% - v 
  -  10  8.6% -  

twbs/bootstrap (1)
  -  55 100.0% - v 

facebook/react-native (2)
  - 303 99.3% - v 
X -   2  0.7% -  

facebook/create-react-app (1)
  -  73 100.0% - v 

axios/axios (2)
  -  41 97.6% - v 
X -   1  2.4% -  

nodejs/node (2)
  - 539 99.8% - v 
X -   1  0.2% - heads/tags/v 

FortAwesome/Font-Awesome (2)
  -  22 37.9% - v 
  -  36 62.1% -  

angular/angular.js (1)
  - 202 100.0% - v 

microsoft/vscode (3)
  - 117 98.3% -  
X -   1  0.8% - release/ 
X -   1  0.8% - v 

microsoft/TypeScript (1)
  - 111 100.0% - v 

angular/angular (1)
  - 302 100.0% -  

ant-design/ant-design (1)
  - 284 100.0% -  

reduxjs/redux (1)
  -  61 100.0% - v 

storybookjs/storybook (1)
  - 693 100.0% - v 

denoland/deno (1)
  -  67 100.0% - v 

ionic-team/ionic (2)
  - 172 99.4% - v 
X -   1  0.6% -  

grafana/grafana (2)
  - 129 99.2% - v 
X -   1  0.8% - v5., 

zeit/hyper 

Results

In [172]:
# Release prefixes accepted per project, decided from the manual analysis.
release_prefixes = {
    "facebook/create-react-app": ["v"],
    "angular/angular.js": ["v"],
    "microsoft/vscode": ["", "v", "release/"],
    "angular/angular": [""],
    "google/guava": ["v"],
    "scikit-learn/scikit-learn": [""],
    "tensorflow/tensorflow": ["", "v"],
    "dotnet/roslyn": ["version-"],
    "git/git": ["v"],
    "php/php-src": ["php-", "PHP-"],
    "jekyll/jekyll": ["v"],
    "curl/curl": ["curl-", "v", "curl_"],
    "fastlane/fastlane": ["", "v"],
    "moby/moby": ["", "v"],
}

# Add the prefixes to each project's miner parameters, keeping existing keys.
for name, accepted_prefixes in release_prefixes.items():
    release_mine_params.setdefault(name, {})["release_prefixes"] = accepted_prefixes

### Handle release suffix

In [173]:
# i: number of non-empty suffixes used by more than 20% of a project's releases
i = 0
for name, project in projects.iterrows():
    suffixes = project["data"].release_suffixes
    for suffix, count in suffixes.items():
        suffix_percent = round(count * 100 / project.num_releases,1)
        is_frequent = suffix_percent > 20 and suffix != ""
        if not is_frequent:
            continue
        i += 1
        marker = 'X'  # only flagged suffixes reach this point
        print(f"{name} ({len(suffixes)})")
        print(f"{marker:1} - {count:3} {suffix_percent:4}% - {suffix} ")
print(f"\n{i} suffixes with more than 20%")

tensorflow/tensorflow (6)
X -  19 24.4% - rc0 
tensorflow/tensorflow (6)
X -  16 20.5% - rc1 
microsoft/terminal (2)
X -  30 90.9% - 0 
bitcoin/bitcoin (16)
X -  44 20.5% - rc1 
x64dbg/x64dbg (1)
X -  26 100.0% - ALPHA 
php/php-src (53)
X - 243 30.2% - RC1 
kubernetes/kubernetes (27)
X - 159 31.9% - beta.0 

7 suffixes with more than 20%


In [174]:
# Suffixes the miner should ignore, per project.
release_ignored_suffixes = {
    "spring-projects/spring-framework": ["RELEASE"],
}

for name, ignored_suffixes in release_ignored_suffixes.items():
    # Merge into the project's existing parameters instead of replacing the
    # whole dict, so settings such as "release_prefixes" or "version_separator"
    # configured earlier for the same project are not discarded.
    release_mine_params.setdefault(name, {}).update(
        {"ignored_suffixes": ignored_suffixes}
    )

In [175]:
release_mine_params

{'curl/curl': {'version_separator': '_',
  'release_prefixes': ['curl-', 'v', 'curl_']},
 'facebook/create-react-app': {'release_prefixes': ['v']},
 'angular/angular.js': {'release_prefixes': ['v']},
 'microsoft/vscode': {'release_prefixes': ['', 'v', 'release/']},
 'angular/angular': {'release_prefixes': ['']},
 'google/guava': {'release_prefixes': ['v']},
 'scikit-learn/scikit-learn': {'release_prefixes': ['']},
 'tensorflow/tensorflow': {'release_prefixes': ['', 'v']},
 'dotnet/roslyn': {'release_prefixes': ['version-']},
 'git/git': {'release_prefixes': ['v']},
 'php/php-src': {'release_prefixes': ['php-', 'PHP-']},
 'jekyll/jekyll': {'release_prefixes': ['v']},
 'fastlane/fastlane': {'release_prefixes': ['', 'v']},
 'moby/moby': {'release_prefixes': ['', 'v']},
 'spring-projects/spring-framework': {'ignored_suffixes': ['RELEASE']}}

## Project Dataset

In [176]:
# Mine all projects again, now with commit mining enabled; mine_projects() reads
# the current release_mine_params, so the per-project settings are applied.
mine_projects(True)
# Recompute the count/ratio columns from the freshly mined data.
gen_project_stats()

2020-01-12 23:41:00.934673 -   1 - Processing freeCodeCamp/freeCodeCamp
2020-01-12 23:41:00.936112 -   2 - Processing vuejs/vue
2020-01-12 23:41:01.053336 -   3 - Processing facebook/react
2020-01-12 23:41:01.317581 -   4 - Processing twbs/bootstrap
2020-01-12 23:41:01.676928 -   5 - Processing facebook/react-native
2020-01-12 23:41:02.073380 -   6 - Processing facebook/create-react-app
2020-01-12 23:41:02.606117 -   7 - Processing axios/axios
2020-01-12 23:41:02.627291 -   8 - Processing nodejs/node
2020-01-12 23:41:03.686181 -   9 - Processing FortAwesome/Font-Awesome
2020-01-12 23:41:03.710217 -  10 - Processing angular/angular.js
2020-01-12 23:41:03.962011 -  11 - Processing microsoft/vscode
2020-01-12 23:41:05.456414 -  12 - Processing microsoft/TypeScript
2020-01-12 23:41:06.070688 -  13 - Processing angular/angular
2020-01-12 23:41:06.425136 -  14 - Processing ant-design/ant-design
2020-01-12 23:41:07.124271 -  15 - Processing reduxjs/redux
2020-01-12 23:41:07.181478 -  16 - Pro

## Tags dataset

In [178]:
# One row per tag of every mined project, indexed by (project name, tag name);
# the "data" column keeps the tag object itself.
tags = pd.DataFrame([
    {"project": project.name, "name": tag.name, "data": tag}
    for project in projects["data"]
    for tag in project.tags
])
tags = tags.set_index(["project", "name"])

In [179]:
# Copy each tag object's is_annotated attribute into its own column.
tags["annotated"] = tags["data"].apply(lambda t: t.is_annotated)

## Releases dataset

In [180]:
# One row per release of every mined project, indexed by (project name, release name);
# the "data" column keeps the release object itself.
releases = pd.DataFrame([
    {"project": project.name, "name": release.name, "data": release}
    for project in projects["data"]
    for release in project.releases
])
releases = releases.set_index(["project", "name"])

In [None]:
# Flatten attributes of each release object into plain columns.
releases["prefix"] = releases["data"].apply(lambda r: r.prefix)
# Hash of the release's head commit.
releases["head_commit"] = releases["data"].apply(lambda r: r.head_commit.hashcode)

# Default get_time() value — presumably the release timestamp; confirm against Releasy.
releases["time"] = releases["data"].apply(lambda r: r.get_time())
# Same accessor, asked for the START_DEVELOPMENT_TIME variant.
releases["start_development"] = releases["data"].apply(lambda r: r.get_time(of=releasy.START_DEVELOPMENT_TIME))
# Value of get_length(); the commented-out line that follows compares it to a zero timedelta.
releases["length"] = releases["data"].apply(lambda r: r.get_length())
# releases.loc[releases["length"] == pd.to_timedelta(0), "length"] = np.NaN


In [None]:
releases.shape

In [None]:
def get_prefixes(project):
    """Count the releases of ``project`` per release prefix.

    Releases whose prefix is falsy (``None`` or an empty string) are grouped
    under the literal key ``"None"``.

    Args:
        project: object exposing ``releases``, an iterable of objects with a
            ``prefix`` attribute.

    Returns:
        dict mapping each prefix to its number of releases, in order of first
        appearance.
    """
    tally = {}
    for release in project.releases:
        key = release.prefix or "None"
        tally[key] = tally.get(key, 0) + 1
    return tally

# Number of distinct prefixes per project, and the prefixes joined by commas.
projects["num_prefixes"] = projects["data"].apply(
    lambda project: len(get_prefixes(project))
)
projects["prefixes"] = projects["data"].apply(
    lambda project: ",".join(get_prefixes(project))
)

### show release prefixes

In [None]:
# Projects with more than one prefix that have no entry in release_prefixes yet.
projects_to_screen = projects[
    (projects["num_prefixes"] > 1)
    & ~projects.index.isin(release_prefixes.keys())
]
print(len(projects_to_screen))
projects_to_screen.sort_values(["name"])["prefixes"]

### Manual inspection of releases

In [None]:
releases.loc["angular/angular"]


Other resources

In [None]:
n_tags = 0
for name, project in projects.iterrows():
    project_group, project_name = name.split("/") 
    path = os.path.join(repo_path, project_group, f"{project_name}.git")
    %cd $path
    n_tag = !(git tag | wc -l)
    n_tag = int(n_tag[0])
    n_tags += n_tag
print(n_tags)