In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import re

In [2]:
# Let pandas render up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

In [3]:
import os
import sys

# Put the local releasy2 checkout at the front of sys.path (once) so the
# `import releasy` that follows can resolve to it.
releasy_module = os.path.abspath(
    os.path.join(os.pardir, os.pardir, os.pardir, 'dev', 'releasy2')
)
already_on_path = releasy_module in sys.path
if not already_on_path:
    sys.path.insert(0, releasy_module)
    
import releasy
from releasy.miner_git import GitVcs
from releasy.miner import TagReleaseMiner, PathCommitMiner, RangeCommitMiner, TimeCommitMiner, VersionReleaseMatcher, VersionReleaseSorter, TimeReleaseSorter, VersionWoPreReleaseMatcher


In [4]:
# Load the catalogue of projects to analyse; the mining loop reads each row's `name` and `lang`.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
projects = pd.read_pickle('projects.zip')

In [15]:
# Schema of the per-release comparison table. Kept explicit so the resulting
# frame has the same columns (in the same order) even if no project is mined.
release_columns = [
    "project","name", "lang","head", "time", "commits",
    "range_commits", "range_tpos", "range_fpos","range_fneg",
    "time_commits", "time_tpos", "time_fpos","time_fneg"]

# Per-project tag suffix handed to VersionWoPreReleaseMatcher as `suffix_exception`.
# NOTE(review): presumably these suffixes mark final releases that must not be
# treated as pre-releases — confirm against the releasy matcher.
suffix_exception_catalog = {
    "spring-projects/spring-boot": ".RELEASE",
    "spring-projects/spring-framework": ".RELEASE",
    "netty/netty": ".Final",
    "godotengine/godot": "-stable",
}

# One DataFrame per successfully mined project. They are concatenated once
# after the loop: DataFrame.append was removed in pandas 2.0, and growing a
# frame inside a loop copies all previous rows on every iteration.
project_frames = []

for i,project in enumerate(projects.itertuples()):
    path = os.path.abspath(os.path.join('..','..','..','repos2',project.name))

    try:
        print(f"{i+1:3} {project.name}")
        # None when the project has no special suffix rule.
        suffix_exception = suffix_exception_catalog.get(project.name)

        vcs = GitVcs(path)
        release_matcher = VersionWoPreReleaseMatcher(suffix_exception=suffix_exception)
        time_release_sorter = TimeReleaseSorter()
        version_release_sorter = VersionReleaseSorter()

        # Same tags mined twice, once ordered by time and once by version.
        time_release_miner = TagReleaseMiner(vcs, release_matcher, time_release_sorter)
        time_sorted_release_set = time_release_miner.mine_releases()

        version_release_miner = TagReleaseMiner(vcs, release_matcher, version_release_sorter)
        version_release_set = version_release_miner.mine_releases()

        # The path miner works on the time-sorted releases; the range and time
        # miners work on the version-sorted releases.
        path_miner = PathCommitMiner(vcs, time_sorted_release_set)
        range_miner = RangeCommitMiner(vcs, version_release_set)
        time_miner = TimeCommitMiner(vcs, version_release_set)

        print(" - parsing by path")
        path_release_set = path_miner.mine_commits()
        print(" - parsing by time")
        time_commit_release_set = time_miner.mine_commits()
        print(" - parsing by range")
        range_release_set = range_miner.mine_commits()

        print("")
        # Named `release_rows` (not `stats`) so the `scipy.stats` module imported
        # at the top of the notebook is not overwritten by a list.
        release_rows = []
        for release in version_release_set:
            # The path-based commit set is the reference; range and time
            # results are scored against it.
            path_commits = set(path_release_set[release.name].commits)
            range_commits = set(range_release_set[release.name].commits)
            time_commits = set(time_commit_release_set[release.name].commits)

            release_rows.append({
                "project": project.name,
                "name": release.name,
                "lang": project.lang,
                "head": release.head,
                "time": release.time,
                "commits": len(path_commits),
                "range_commits": len(range_commits),
                "range_tpos": len(path_commits & range_commits),
                "range_fpos": len(range_commits - path_commits),
                "range_fneg": len(path_commits - range_commits),
                "time_commits": len(time_commits),
                "time_tpos": len(path_commits & time_commits),
                "time_fpos": len(time_commits - path_commits),
                "time_fneg": len(path_commits - time_commits)
            })
        # Only reached when the whole project was mined without error, so a
        # failing project contributes no partial rows (same as before).
        if release_rows:
            project_frames.append(pd.DataFrame(release_rows, columns=release_columns))
    except Exception as e:
        # Best effort: report the failure and continue with the next project.
        print(f" - error: {e}")

# Concatenate without resetting the index so every project keeps its own
# 0..n-1 row numbering, as the incremental append used to produce.
if project_frames:
    releases = pd.concat(project_frames)
else:
    releases = pd.DataFrame(columns=release_columns)

# Untouched copy of the mining result, used to reset `releases` later.
releases_bkp = releases.copy()

  1 vuejs/vue
 - parsing by path
 - parsing by time
 - parsing by range

  2 facebook/react
 - parsing by path
 - parsing by time
 - parsing by range

  3 twbs/bootstrap
 - parsing by path
 - parsing by time
 - parsing by range

  4 d3/d3
 - parsing by path
 - parsing by time
 - parsing by range

  5 facebook/react-native
 - parsing by path
 - parsing by time
 - parsing by range

  6 nodejs/node
 - parsing by path
 - parsing by time
 - parsing by range

  7 mrdoob/three.js
 - parsing by path
 - parsing by time
 - parsing by range

  8 angular/angular.js
 - parsing by path
 - parsing by time
 - parsing by range

  9 webpack/webpack
 - parsing by path
 - parsing by time
 - parsing by range

 10 jquery/jquery
 - parsing by path
 - parsing by time
 - parsing by range

 11 microsoft/TypeScript
 - parsing by path
 - parsing by time
 - parsing by range

 12 ant-design/ant-design
 - parsing by path
 - parsing by time
 - parsing by range

 13 reduxjs/redux
 - parsing by path
 - parsing by time


In [16]:
# Start again from the untouched mining result so the conversions that follow can be re-run.
releases = releases_bkp.copy()

In [17]:
# Replace each head commit object with its id and each release name object
# with its value, and parse the release time as a UTC timestamp.
releases['head'] = releases['head'].apply(lambda commit: commit.id)
releases['name'] = releases['name'].apply(lambda name: name.value)
releases['time'] = pd.to_datetime(releases['time'], utc=True)

# Every count column gets a numeric dtype.
count_columns = [
    'commits',
    'range_commits', 'range_tpos', 'range_fpos', 'range_fneg',
    'time_commits', 'time_tpos', 'time_fpos', 'time_fneg',
]
for count_column in count_columns:
    releases[count_column] = pd.to_numeric(releases[count_column])

In [18]:
# Check the column dtypes after the conversions.
releases.dtypes

project                       object
name                          object
lang                          object
head                          object
time             datetime64[ns, UTC]
commits                        int64
range_commits                  int64
range_tpos                     int64
range_fpos                     int64
range_fneg                     int64
time_commits                   int64
time_tpos                      int64
time_fpos                      int64
time_fneg                      int64
dtype: object

In [19]:
def precision(row, column):
    """Return tpos / (tpos + fpos) for one strategy of a release row.

    `row` is indexed by column name and `column` is the strategy prefix
    (e.g. "range" or "time"); the values read are `<column>_tpos` and
    `<column>_fpos`. When both are zero the result is defined as 1.0.
    """
    true_pos = row[f"{column}_tpos"]
    false_pos = row[f"{column}_fpos"]
    retrieved = true_pos + false_pos
    return true_pos / retrieved if retrieved != 0 else 1.0
    
def recall(row, column):
    """Return tpos / (tpos + fneg) for one strategy of a release row.

    `row` is indexed by column name and `column` is the strategy prefix
    (e.g. "range" or "time"); the values read are `<column>_tpos` and
    `<column>_fneg`. When both are zero the result is defined as 1.0.
    """
    true_pos = row[f"{column}_tpos"]
    false_neg = row[f"{column}_fneg"]
    relevant = true_pos + false_neg
    return true_pos / relevant if relevant != 0 else 1.0


In [20]:
# Row-wise precision and recall for each strategy, stored as
# <strategy>_precision / <strategy>_recall.
for strategy in ('range', 'time'):
    releases[f'{strategy}_precision'] = releases.apply(precision, args=(strategy,), axis=1)
    releases[f'{strategy}_recall'] = releases.apply(recall, args=(strategy,), axis=1)

In [21]:
# Preview the first rows, including the precision/recall columns.
releases.head()

Unnamed: 0,project,name,lang,head,time,commits,range_commits,range_tpos,range_fpos,range_fneg,time_commits,time_tpos,time_fpos,time_fneg,range_precision,range_recall,time_precision,time_recall
0,vuejs/vue,0.6.0,javascript,218557cdec830a629252f4a9e2643973dc1f1d2d,2013-12-08 00:32:17+00:00,354,354,354,0,0,354,354,0,0,1.0,1.0,1.0,1.0
1,vuejs/vue,v0.7.0,javascript,f4861ca9905a170b9a4b185e8a2038dc7c11c58e,2013-12-24 03:31:05+00:00,34,34,34,0,0,34,34,0,0,1.0,1.0,1.0,1.0
2,vuejs/vue,v0.7.1,javascript,590a7ee55b655f166ecdf8e7a5e22dab7a9e6dd7,2013-12-24 21:58:33+00:00,6,6,6,0,0,6,6,0,0,1.0,1.0,1.0,1.0
3,vuejs/vue,v0.7.2,javascript,e78fb82212ac229c91fc8b1f09a84b24f577541a,2013-12-28 05:43:40+00:00,3,3,3,0,0,3,3,0,0,1.0,1.0,1.0,1.0
4,vuejs/vue,v0.7.3,javascript,dcc839a4314de8825f7087f00c4e40ea2d22b46f,2014-01-06 19:13:12+00:00,15,15,15,0,0,15,15,0,0,1.0,1.0,1.0,1.0


In [22]:
# Inspect the dtypes again now that the metric columns exist.
releases.dtypes

project                         object
name                            object
lang                            object
head                            object
time               datetime64[ns, UTC]
commits                          int64
range_commits                    int64
range_tpos                       int64
range_fpos                       int64
range_fneg                       int64
time_commits                     int64
time_tpos                        int64
time_fpos                        int64
time_fneg                        int64
range_precision                float64
range_recall                   float64
time_precision                 float64
time_recall                    float64
dtype: object

In [28]:
# Persist the per-release table to releases.zip as a pickle.
releases.to_pickle("releases.zip")

In [24]:
# Mean precision/recall per project, lowest range recall first.
# The metric columns are selected *before* aggregating: averaging the whole
# frame would also cover the non-numeric columns (name, lang, head, time),
# which pandas >= 2.0 rejects with a TypeError instead of silently dropping.
metric_columns = ['range_precision', 'range_recall', 'time_precision', 'time_recall']
releases.groupby('project')[metric_columns].mean().sort_values('range_recall')

Unnamed: 0_level_0,range_precision,range_recall,time_precision,time_recall
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dotnet/efcore,0.861097,0.907146,0.363637,0.775465
taosdata/TDengine,0.899112,0.919301,0.454907,0.819983
jellyfin/jellyfin,0.92805,0.966667,0.368276,0.909911
Wox-launcher/Wox,0.973684,0.973684,0.741645,0.936863
cefsharp/CefSharp,0.980903,0.984641,0.54115,0.784172
alibaba/fastjson,0.967328,0.988636,0.584946,0.957588
apache/dubbo,0.985109,0.988803,0.439195,0.832963
ionic-team/ionic-framework,0.964719,0.990948,0.656842,0.885974
spring-projects/spring-framework,0.986764,0.992801,0.369032,0.927707
ansible/ansible,0.98711,0.993056,0.239749,0.905605


In [25]:
# Per-project mean of the four metrics, sorted by range recall (ascending).
# Only the numeric metric columns are aggregated; calling mean() on the full
# frame fails on the object/datetime columns in pandas >= 2.0.
(
    releases
    .groupby('project')[['range_precision', 'range_recall', 'time_precision', 'time_recall']]
    .mean()
    .sort_values('range_recall')
)


Unnamed: 0_level_0,range_precision,range_recall,time_precision,time_recall
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dotnet/efcore,0.861097,0.907146,0.363637,0.775465
taosdata/TDengine,0.899112,0.919301,0.454907,0.819983
jellyfin/jellyfin,0.92805,0.966667,0.368276,0.909911
Wox-launcher/Wox,0.973684,0.973684,0.741645,0.936863
cefsharp/CefSharp,0.980903,0.984641,0.54115,0.784172
alibaba/fastjson,0.967328,0.988636,0.584946,0.957588
apache/dubbo,0.985109,0.988803,0.439195,0.832963
ionic-team/ionic-framework,0.964719,0.990948,0.656842,0.885974
spring-projects/spring-framework,0.986764,0.992801,0.369032,0.927707
ansible/ansible,0.98711,0.993056,0.239749,0.905605


In [27]:
# Drill into a single project: all sinatra/sinatra releases ordered by range recall.
is_sinatra = releases['project'] == "sinatra/sinatra"
releases.loc[is_sinatra].sort_values('range_recall')

Unnamed: 0,project,name,lang,head,time,commits,range_commits,range_tpos,range_fpos,range_fneg,time_commits,time_tpos,time_fpos,time_fneg,range_precision,range_recall,time_precision,time_recall
0,sinatra/sinatra,0.0.1,ruby,72be291da2bf7a5e2dacf8b9119a258d8db53c43,2007-09-08 23:51:24+00:00,1,1,1,0,0,1,1,0,0,1.0,1.0,1.0,1.0
68,sinatra/sinatra,1.3.5,ruby,f4364d44f709b248aaa8f0661e2f883ae73ade01,2013-02-25 10:09:26+00:00,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0
67,sinatra/sinatra,v1.3.5,ruby,f4364d44f709b248aaa8f0661e2f883ae73ade01,2013-02-25 10:09:25+00:00,12,12,12,0,0,53,12,41,0,1.0,1.0,0.226415,1.0
66,sinatra/sinatra,1.3.4,ruby,bc21cf1b983384b20f5572da0db529bf376ff1a3,2013-01-26 22:18:45+00:00,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0
65,sinatra/sinatra,v1.3.4,ruby,bc21cf1b983384b20f5572da0db529bf376ff1a3,2013-01-26 22:18:44+00:00,27,27,27,0,0,144,27,117,0,1.0,1.0,0.1875,1.0
64,sinatra/sinatra,v1.3.3,ruby,77346226dbb9492993accee2d042bfe6c9ae036c,2012-08-19 12:55:02+00:00,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0
63,sinatra/sinatra,1.3.3,ruby,77346226dbb9492993accee2d042bfe6c9ae036c,2012-08-19 12:55:02+00:00,92,92,92,0,0,411,92,319,0,1.0,1.0,0.223844,1.0
62,sinatra/sinatra,v1.3.2,ruby,e111243e813ede1f0f4c6918d9a8cc029e776fc3,2011-12-30 12:55:49+00:00,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0
61,sinatra/sinatra,1.3.2,ruby,e111243e813ede1f0f4c6918d9a8cc029e776fc3,2011-12-30 12:55:49+00:00,91,91,91,0,0,136,91,45,0,1.0,1.0,0.669118,1.0
60,sinatra/sinatra,1.3.1,ruby,ea57aaaa9671124279c3ae4690c58a1c5f05be78,2011-10-05 01:29:29+00:00,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0
