# Mine and select projects

This notebook uses releasy to mine the previously cloned projects and then selects the projects suitable for the analysis.

In [1]:
import os
import datetime
import json
import re
import traceback

from multiprocessing import Pool
import pandas as pd

from ipywidgets import IntProgress
from IPython.display import display

import releasy

In [2]:
from util import (
    DATA_PATH,
    REPO_PATH,
    TMP_PATH, 
    CPU,
    CycleType,
    RAPID_RELEASE_LIM,
    TRAD_RELEASE_LIM,
    delta2days,
    is_rapid_release,
    is_trad_release,
)

In [3]:
# Let pandas render up to 1000 rows before truncating a displayed frame.
pd.options.display.max_rows = 1000

In [4]:
# Load the list of candidate projects. JSON text is UTF-8 by specification, so
# decode it explicitly instead of relying on the platform's default encoding.
with open(DATA_PATH / '01_github_project_results.json', 'r', encoding='utf-8') as infile:
    projects_data = json.load(infile)

print(len(projects_data))

2772


## Mine Releases


In [5]:
def mine(name: str) -> dict:
    """Mine one cloned repository with releasy and summarise its releases.

    Args:
        name: Project identifier; it is used both as the releasy project name
            and as the repository path relative to ``REPO_PATH``.

    Returns:
        A flat dict with the keys ``project``, ``prefixes``, ``prefixes_names``,
        ``main_releases``, ``rapid_releases``, ``trad_releases``, ``patches``,
        ``releases`` and ``time`` (a ``datetime.timedelta`` with the mining
        duration). If anything goes wrong the error is printed and an empty
        dict is returned, so one failing project does not abort the whole run.
    """
    started_at = datetime.datetime.now()

    try:
        repo_path = str(REPO_PATH / name)
        project = releasy.Miner(repo_path, name).apply(
            releasy.FinalReleaseMiner(),
            releasy.HistoryCommitMiner(),
            releasy.BaseReleaseMiner(),
            releasy.ContributorMiner(),
            releasy.SemanticReleaseMiner()
        ).mine()

        main_releases = project.main_releases
        rapid_releases = [release for release in main_releases if is_rapid_release(release)]
        trad_releases = [release for release in main_releases if is_trad_release(release)]
        prefixes = project.releases.prefixes()

        project_data = {
            'project': project.name,
            'prefixes': len(prefixes),
            # Space-separated so the prefixes fit in a single CSV cell.
            'prefixes_names': " ".join(prefixes),
            'main_releases': len(main_releases),
            'rapid_releases': len(rapid_releases),
            'trad_releases': len(trad_releases),
            'patches': len(project.patches),
            'releases': len(project.releases),
            'time': datetime.datetime.now() - started_at,
        }
    except Exception as err:
        # Best effort: report the failure and keep going with the other projects.
        print(f"{name:40} {err=}")
        # print_exc() prints the exception being handled and, unlike the
        # single-argument print_exception(err), also works before Python 3.10.
        traceback.print_exc()
        project_data = {}

    return project_data

In [6]:
# Each entry's "nameWithOwner" identifies one repository to mine.
# To debug on a small subset, slice the list, e.g. project_names[:20].
project_names = [entry['nameWithOwner'] for entry in projects_data]

In [7]:
# Mine every project in parallel. imap_unordered yields each summary as soon as
# a worker finishes it, so the rows of `projects` follow completion order rather
# than the order of `project_names`.
with Pool(processes=CPU) as pool:
    results = []
    progress = IntProgress(min=0, max=len(project_names))
    display(progress)
    for result in pool.imap_unordered(mine, project_names):
        results.append(result)
        progress.value += 1

# One row per project; a project whose mining failed contributes an empty dict,
# i.e. a row of missing values.
projects = pd.DataFrame(results)
projects.sample(10)

IntProgress(value=0, max=2772)

Unnamed: 0,project,prefixes,prefixes_names,main_releases,rapid_releases,trad_releases,patches,releases,time
1508,python/mypy,1,v,59,38,8,7,66,0 days 00:00:00.624642
1630,oliver-moran/jimp,2,v,15,8,4,71,86,0 days 00:00:00.062354
550,filhodanuvem/gitql,2,v,10,2,6,8,18,0 days 00:00:00.024786
407,GrenderG/Toasty,1,,5,0,4,14,19,0 days 00:00:00.021850
2610,gulpjs/gulp,2,v,6,2,3,13,19,0 days 00:00:00.073568
1277,aws/aws-cdk,1,v,217,216,0,46,263,0 days 00:00:00.814102
28,operator-framework/operator-sdk,2,v scorecard-kuttl/v,42,32,0,47,89,0 days 00:00:00.204215
2602,rclone/rclone,1,v,69,36,11,21,90,0 days 00:00:00.432667
1796,GoogleChromeLabs/ndb,1,v,2,0,1,57,59,0 days 00:00:00.028360
679,cortexlabs/cortex,1,v,42,36,1,20,62,0 days 00:00:00.133786


In [8]:
# Number of rows, i.e. one per mined project.
projects.shape[0]

2772

In [9]:
# index=True is the pandas default: the row index is written as the first,
# unnamed column of this file.
projects.to_csv(DATA_PATH / '10_projects_all.csv', index=True)

## Select Releases

In [41]:
# Filter on an independent copy so that `projects` keeps every mined project.
selected_projects = projects.copy(deep=True)

### Remove projects without main releases

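These projects may contain releases, but releasy did not extract any main release from them. Hence, we need to remove these projects from the analysis. The cell below is currently commented out; the next filter, which requires at least two rapid and two traditional releases, already discards every project without main releases.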

In [42]:
# total = len(selected_projects)
# selected_projects = selected_projects.query('main_releases > 0')
# removed = total - len(selected_projects)
# print(f"Discarded {removed} projects")
# print(f"Kept {len(selected_projects)} projects")

### Remove projects with too few rapid and traditional releases

The projects must have at least two rapid and two traditional releases.

In [43]:
# Keep only the projects with at least two rapid and two traditional releases,
# then report how many projects the filter dropped.
total = len(selected_projects)
selected_projects = selected_projects[
    (selected_projects['rapid_releases'] >= 2)
    & (selected_projects['trad_releases'] >= 2)
]
removed = total - len(selected_projects)
print(
    f"Discarded {removed} projects",
    f"Kept {len(selected_projects)} projects",
    sep="\n",
)

Discarded 1121 projects
Kept 1651 projects


### Remove projects with uncommon prefixes

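These projects usually represent monorepos, i.e., multiple software projects in the same git repository. Hence, we need to remove these projects from the analysis. A project is flagged when it has more than one release prefix and at least one of them is not a common version prefix (e.g., `v`, `r`, `rel`, `release`, optionally followed by `/`, `.`, `_` or `-`).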

In [44]:
def uncommon_prefixes(qnt, names):
    """Count the release prefixes of a project that are not conventional.

    Args:
        qnt: Number of distinct release prefixes of the project.
        names: The prefixes themselves, joined by single spaces.

    Returns:
        0 when the project has at most one prefix; otherwise the number of
        prefixes that do not fully match a common version prefix (empty, v,
        vv, rel, l, rel/v, r or release, case-insensitive, optionally followed
        by one of ``/ . _ -``).
    """
    # A project with at most one prefix is never flagged, whatever the prefix is.
    if qnt <= 1:
        return 0

    common_prefix = "^((|vv?|rel|l|rel/v|r|release)[/._-]?)$"
    return sum(
        1
        for prefix in str(names).split(' ')
        if re.match(common_prefix, prefix, flags=re.IGNORECASE) is None
    )
        
# Add, for every project, the number of its prefixes that are not conventional.
selected_projects = selected_projects.assign(
    uncommon_prefixes=lambda frame: frame.apply(
        lambda project: uncommon_prefixes(project['prefixes'], project['prefixes_names']),
        axis='columns',
    )
)

In [45]:
# How many projects have at least one uncommon prefix.
int((selected_projects['uncommon_prefixes'] > 0).sum())

342

In [46]:
# Keep only the projects without uncommon prefixes, then report how many
# projects the filter dropped.
total = len(selected_projects)
selected_projects = selected_projects[selected_projects['uncommon_prefixes'] == 0]
removed = total - len(selected_projects)
print(
    f"Discarded {removed} projects",
    f"Kept {len(selected_projects)} projects",
    sep="\n",
)

Discarded 342 projects
Kept 1309 projects


In [47]:
# Eyeball ten randomly chosen projects that survived the filters.
selected_projects.sample(n=10)

Unnamed: 0,project,prefixes,prefixes_names,main_releases,rapid_releases,trad_releases,patches,releases,time,uncommon_prefixes
1122,TarsCloud/Tars,2,v,15,6,4,32,47,0 days 00:00:00.059523,0
2050,petkaantonov/bluebird,1,v,24,11,7,115,139,0 days 00:00:00.165153,0
718,theonedev/onedev,2,v,26,17,2,131,157,0 days 00:00:00.233250,0
564,burnash/gspread,2,v,19,9,8,21,40,0 days 00:00:00.067311,0
962,ffuf/ffuf,1,v,18,8,5,6,24,0 days 00:00:00.028255,0
1738,micro/micro,2,v,62,48,2,37,99,0 days 00:00:00.244219,0
1381,uber/react-vis,2,v,22,14,3,77,99,0 days 00:00:00.065358,0
647,travist/jsencrypt,2,v,7,2,3,3,10,0 days 00:00:00.023553,0
629,davidjbradshaw/iframe-resizer,2,v,26,16,7,96,122,0 days 00:00:00.104132,0
686,remix-run/history,1,v,38,24,7,40,78,0 days 00:00:00.069134,0


In [48]:
# Number of projects kept for the analysis.
selected_projects.shape[0]

1309

In [49]:
# Persist the selected projects; the row index is left out of the file.
selected_projects.to_csv(
    path_or_buf=DATA_PATH / '10_projects_selected.csv',
    index=False,
)