In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import re

In [2]:
# Let pandas render up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

In [3]:
import os
import sys

# Put the releasy checkout (resolved relative to the current working directory)
# at the front of sys.path so the `import releasy` below picks it up.
releasy_module = os.path.abspath(os.path.join('..','..','..','dev','releasy2'))
if releasy_module not in sys.path:
    sys.path.insert(0, releasy_module)
    
import releasy
from releasy.miner_git import GitVcs
from releasy.miner import TagReleaseMiner, PathCommitMiner, RangeCommitMiner, TimeCommitMiner, VersionReleaseMatcher, VersionReleaseSorter, TimeReleaseSorter, VersionWoPreReleaseMatcher


In [4]:
# Project list to mine; the mining loop reads `name` and `lang` from each row.
# NOTE(review): unpickling can execute arbitrary code - only load a
# 'projects.zip' that was produced by a trusted source.
projects = pd.read_pickle('projects.zip')

In [5]:
# Mine every project's releases with three commit-assignment strategies
# (path, range, time) and, per release, score the range- and time-based commit
# sets against the path-based set, which is treated as the reference:
#   *_tpos = commits found by both, *_fpos = only by the strategy,
#   *_fneg = only by the path-based reference.
release_columns = [
    "project", "name", "lang", "head", "time", "commits",
    "range_commits", "range_tpos", "range_fpos", "range_fneg",
    "time_commits", "time_tpos", "time_fpos", "time_fneg"]

# One frame per successfully mined project, concatenated once after the loop.
# (DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.)
project_frames = []
for i, project in enumerate(projects.itertuples()):
    path = os.path.abspath(os.path.join('..', '..', '..', 'repos2', project.name))

    try:
        print(f"{i+1:3} {project.name}")
        vcs = GitVcs(path)
        release_matcher = VersionWoPreReleaseMatcher()
        time_release_sorter = TimeReleaseSorter()
        version_release_sorter = VersionReleaseSorter()

        # Same tags and matcher, two different sorters.
        # NOTE(review): presumably one orders releases chronologically and the
        # other by version number - confirm against releasy.
        time_release_miner = TagReleaseMiner(vcs, release_matcher, time_release_sorter)
        time_sorted_release_set = time_release_miner.mine_releases()

        version_release_miner = TagReleaseMiner(vcs, release_matcher, version_release_sorter)
        version_release_set = version_release_miner.mine_releases()

        # NOTE(review): the path miner works on the time-sorted releases while
        # the range and time miners work on the version-sorted ones - confirm
        # this asymmetry is intended.
        path_miner = PathCommitMiner(vcs, time_sorted_release_set)
        range_miner = RangeCommitMiner(vcs, version_release_set)
        time_miner = TimeCommitMiner(vcs, version_release_set)

        print(" - parsing by path")
        path_release_set = path_miner.mine_commits()
        print(" - parsing by time")
        time_release_set = time_miner.mine_commits()
        print(" - parsing by range")
        range_release_set = range_miner.mine_commits()

        print("")
        # Named `release_rows` rather than `stats` so the scipy `stats` module
        # imported at the top of the notebook is not shadowed.
        release_rows = []
        for release in version_release_set:
            path_commits = set(path_release_set[release.name].commits)
            range_commits = set(range_release_set[release.name].commits)
            time_commits = set(time_release_set[release.name].commits)

            release_rows.append({
                "project": project.name,
                "name": release.name,
                "lang": project.lang,
                "head": release.head,
                "time": release.time,
                "commits": len(path_commits),
                "range_commits": len(range_commits),
                "range_tpos": len(path_commits & range_commits),
                "range_fpos": len(range_commits - path_commits),
                "range_fneg": len(path_commits - range_commits),
                "time_commits": len(time_commits),
                "time_tpos": len(path_commits & time_commits),
                "time_fpos": len(time_commits - path_commits),
                "time_fneg": len(path_commits - time_commits)
            })
        # Only reached when the whole project was processed without error, so a
        # failing project contributes no partial rows. Empty projects are
        # skipped so they cannot affect the dtypes of the concatenated frame.
        if release_rows:
            project_frames.append(pd.DataFrame(release_rows, columns=release_columns))
    except Exception as e:
        # Best effort: report the failure and continue with the next project.
        print(f" - error: {e}")

# Each per-project frame keeps its own 0..n-1 index, as before.
if project_frames:
    releases = pd.concat(project_frames)
else:
    releases = pd.DataFrame(columns=release_columns)

# Snapshot of the raw mining result so later cells can be re-run without
# repeating the (slow) mining step.
releases_bkp = releases.copy()

  1 vuejs/vue
 - parsing by path
 - parsing by time
 - parsing by range

  2 facebook/react
 - parsing by path
 - parsing by time
 - parsing by range

  3 twbs/bootstrap
 - parsing by path
 - parsing by time
 - parsing by range

  4 d3/d3
 - parsing by path
 - parsing by time
 - parsing by range

  5 facebook/react-native
 - parsing by path
 - parsing by time
 - parsing by range

  6 facebook/create-react-app
 - parsing by path
 - parsing by time
 - parsing by range

  7 nodejs/node
 - parsing by path
 - parsing by time
 - parsing by range

  8 mrdoob/three.js
 - parsing by path
 - parsing by time
 - parsing by range

  9 mui-org/material-ui
 - parsing by path
 - parsing by time
 - parsing by range

 10 angular/angular.js
 - parsing by path
 - parsing by time
 - parsing by range

 11 microsoft/vscode
 - parsing by path
 - parsing by time
 - parsing by range

 12 denoland/deno
 - parsing by path
 - parsing by time
 - parsing by range

 13 angular/angular
 - parsing by path
 - parsing by

In [87]:
# Start from the untouched mining result so the cells below can be re-run
# safely; `releases_bkp` is the snapshot taken at the end of the mining cell.
releases = releases_bkp.copy()

In [88]:
# Swap the mined objects for plain values: each head commit becomes its `id`,
# each release name becomes its `value`.
releases['head'] = releases['head'].apply(lambda head_commit: head_commit.id)
releases['name'] = releases['name'].apply(lambda release_name: release_name.value)

# Release timestamps become timezone-aware (UTC) datetimes.
releases['time'] = pd.to_datetime(releases['time'], utc=True)

# Every commit-count column is coerced to a numeric dtype.
count_columns = [
    'commits',
    'range_commits', 'range_tpos', 'range_fpos', 'range_fneg',
    'time_commits', 'time_tpos', 'time_fpos', 'time_fneg',
]
for column_name in count_columns:
    releases[column_name] = pd.to_numeric(releases[column_name])

In [89]:
# Check the column dtypes after the conversions above.
releases.dtypes

project                       object
name                          object
lang                          object
head                          object
time             datetime64[ns, UTC]
commits                        int64
range_commits                  int64
range_tpos                     int64
range_fpos                     int64
range_fneg                     int64
time_commits                   int64
time_tpos                      int64
time_fpos                      int64
time_fneg                      int64
dtype: object

In [90]:
def precision(row, column):
    """Return the precision of one strategy for a single release row.

    Args:
        row: mapping-like object indexed by column name (e.g. a DataFrame row).
        column: strategy prefix; `<column>_tpos` and `<column>_fpos` are read.

    Returns:
        tpos / (tpos + fpos), or 0.0 when the strategy selected no commits.
    """
    true_pos = row[column+"_tpos"]
    selected = true_pos + row[column+"_fpos"]
    # Guard the division: nothing selected means precision is reported as 0.0.
    return true_pos / selected if selected != 0 else 0.0
    
def recall(row, column):
    """Return the recall of one strategy for a single release row.

    Args:
        row: mapping-like object indexed by column name (e.g. a DataFrame row).
        column: strategy prefix; `<column>_tpos` and `<column>_fneg` are read.

    Returns:
        tpos / (tpos + fneg), or 0.0 when there were no reference commits.
    """
    true_pos = row[column+"_tpos"]
    relevant = true_pos + row[column+"_fneg"]
    # Guard the division: no reference commits means recall is reported as 0.0.
    if relevant == 0:
        return 0.0
    return true_pos / relevant


In [91]:
# Add per-release precision and recall columns for each strategy, scored
# row by row with the precision() and recall() helpers.
for strategy in ('range', 'time'):
    releases[strategy + '_precision'] = releases.apply(precision, axis=1, args=(strategy,))
    releases[strategy + '_recall'] = releases.apply(recall, axis=1, args=(strategy,))

In [92]:
# Spot check: the scalar type of one value in the `commits` column.
type(releases.iloc[0]['commits'])

numpy.int64

In [93]:
# Show the resulting frame, including the new precision/recall columns.
releases

Unnamed: 0,project,name,lang,head,time,commits,range_commits,range_tpos,range_fpos,range_fneg,time_commits,time_tpos,time_fpos,time_fneg,range_precision,range_recall,time_precision,time_recall
0,vuejs/vue,0.6.0,javascript,218557cdec830a629252f4a9e2643973dc1f1d2d,2013-12-08 00:32:17+00:00,354,354,354,0,0,354,354,0,0,1.0,1.0,1.000000,1.000000
1,vuejs/vue,v0.7.0,javascript,f4861ca9905a170b9a4b185e8a2038dc7c11c58e,2013-12-24 03:31:05+00:00,34,34,34,0,0,34,34,0,0,1.0,1.0,1.000000,1.000000
2,vuejs/vue,v0.7.1,javascript,590a7ee55b655f166ecdf8e7a5e22dab7a9e6dd7,2013-12-24 21:58:33+00:00,6,6,6,0,0,6,6,0,0,1.0,1.0,1.000000,1.000000
3,vuejs/vue,v0.7.2,javascript,e78fb82212ac229c91fc8b1f09a84b24f577541a,2013-12-28 05:43:40+00:00,3,3,3,0,0,3,3,0,0,1.0,1.0,1.000000,1.000000
4,vuejs/vue,v0.7.3,javascript,dcc839a4314de8825f7087f00c4e40ea2d22b46f,2014-01-06 19:13:12+00:00,15,15,15,0,0,15,15,0,0,1.0,1.0,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,ethereum/go-ethereum,v1.9.18,go,f5382591874220287de253bfc08b10afd5244927,2020-07-27 11:53:53+00:00,19,19,19,0,0,18,18,0,1,1.0,1.0,1.000000,0.947368
154,ethereum/go-ethereum,v1.9.19,go,3e0641923d78bf1905e596a3a41a54277540bec7,2020-08-11 11:10:21+00:00,47,47,47,0,0,48,47,1,0,1.0,1.0,0.979167,1.000000
155,ethereum/go-ethereum,v1.9.20,go,979fc96899c77876e15807005eadd936da17b6c2,2020-08-25 14:20:37+00:00,26,26,26,0,0,27,26,1,0,1.0,1.0,0.962963,1.000000
156,ethereum/go-ethereum,2,go,198ef97108ec257d49a7b593a9c6fe49961319c0,2014-05-28 09:53:10+00:00,233,0,0,0,233,0,0,0,233,0.0,0.0,0.000000,0.000000


In [94]:
# Final dtype check before the frame is saved.
releases.dtypes

project                         object
name                            object
lang                            object
head                            object
time               datetime64[ns, UTC]
commits                          int64
range_commits                    int64
range_tpos                       int64
range_fpos                       int64
range_fneg                       int64
time_commits                     int64
time_tpos                        int64
time_fpos                        int64
time_fneg                        int64
range_precision                float64
range_recall                   float64
time_precision                 float64
time_recall                    float64
dtype: object

In [95]:
# Persist the scored releases as a pickle in the current working directory.
releases.to_pickle("releases.zip")