# Mine and select projects

This notebook uses releasy to mine the previously cloned projects.

In [1]:
import os
import datetime
import json
import re
import traceback

from multiprocessing import Pool
import pandas as pd

from ipywidgets import IntProgress
from IPython.display import display

import releasy

In [2]:
from util import (
    DATA_PATH,
    REPO_PATH,
    TMP_PATH, 
    CPU,
    CycleType,
    RAPID_RELEASE_LIM,
    TRAD_RELEASE_LIM,
    delta2days,
    is_rapid_release,
    is_trad_release,
)

In [3]:
# Let pandas render up to 1000 rows before truncating a displayed frame.
pd.options.display.max_rows = 1000

In [4]:
# Load the project metadata stored in 01_github_project_results.json.
# The encoding is given explicitly so that reading does not depend on the
# platform's default locale encoding.
with open(DATA_PATH / '01_github_project_results.json', 'r', encoding='utf-8') as infile:
    projects_data = json.load(infile)

print(len(projects_data))

2772


## Mine Releases


In [5]:
def mine(name: str) -> dict:
    """Mine one cloned repository with releasy and summarise its releases.

    Args:
        name: Project identifier. It is appended to ``REPO_PATH`` to locate
            the repository and is also passed to releasy as the project name.

    Returns:
        A dict with the keys ``project``, ``prefixes``, ``prefixes_names``,
        ``main_releases``, ``rapid_releases``, ``trad_releases``, ``patches``,
        ``releases`` and ``time``. When mining raises, the error and its
        traceback are printed and an empty dict is returned instead.
    """
    # Only main releases with at least this many commits are counted as
    # rapid or traditional releases.
    min_commits = 10
    started_at = datetime.datetime.now()

    try:
        repo_path = str(REPO_PATH / name)
        project = releasy.Miner(repo_path, name).apply(
            releasy.FinalReleaseMiner(),
            releasy.HistoryCommitMiner(),
            releasy.BaseReleaseMiner(),
            releasy.ContributorMiner(),
            releasy.SemanticReleaseMiner()
        ).mine()

        rapid_releases = [
            release for release in project.main_releases
            if is_rapid_release(release) and len(release.commits) >= min_commits
        ]
        trad_releases = [
            release for release in project.main_releases
            if is_trad_release(release) and len(release.commits) >= min_commits
        ]

        project_data = {
            'project': project.name,
            'prefixes': len(project.releases.prefixes()),
            'prefixes_names': str(" ".join(project.releases.prefixes())),
            'main_releases': len(project.main_releases),
            'rapid_releases': len(rapid_releases),
            'trad_releases': len(trad_releases),
            'patches': len(project.patches),
            'releases': len(project.releases)
        }

        project_data['time'] = datetime.datetime.now() - started_at
    except Exception as err:
        print(f"{name:40} {err=}")
        # print_exc() reports the exception being handled and works on every
        # Python 3 release; the single-argument traceback.print_exception(err)
        # form is only accepted from Python 3.10 onwards.
        traceback.print_exc()
        # An empty dict is returned so the caller still receives one result
        # per project name.
        project_data = {}

    return project_data

In [6]:
# Collect the 'nameWithOwner' identifier of every loaded project entry.
project_names = [entry['nameWithOwner'] for entry in projects_data]
# project_names = project_names[:20]

In [7]:
# Mine every project in a process pool. imap_unordered yields results in
# completion order, so the row order of `projects` does not necessarily
# follow the order of `project_names`.
with Pool(processes=CPU) as pool:
    results = []
    progress = IntProgress(min=0, max=len(project_names))
    display(progress)
    for result in pool.imap_unordered(mine, project_names):
        results.append(result)
        progress.value += 1

projects = pd.DataFrame(results)
# Cap the preview size: sample() raises when asked for more rows than exist,
# which would happen when only a few projects are mined.
projects.sample(min(10, len(projects)))

IntProgress(value=0, max=2772)

Unnamed: 0,project,prefixes,prefixes_names,main_releases,rapid_releases,trad_releases,patches,releases,time
1822,mholt/PapaParse,1,,19,1,2,55,74,0 days 00:00:00.042188
2338,flatpickr/flatpickr,2,v,23,3,4,121,144,0 days 00:00:00.149140
1135,squizlabs/PHP_CodeSniffer,1,,19,5,12,51,70,0 days 00:00:00.532551
2594,minio/minio,1,release-,1,0,0,0,1,0 days 00:00:00.098790
177,chyroc/WechatSogou,1,v,8,0,0,15,23,0 days 00:00:00.022833
1753,mail-in-a-box/mailinabox,1,v,45,13,11,0,45,0 days 00:00:00.214541
1407,sass/node-sass,4,RELEASE- napi_ v,45,7,15,114,159,0 days 00:00:00.476781
2197,BabylonJS/Babylon.js,2,v,36,10,14,36,72,0 days 00:00:02.181131
787,cryptomator/cryptomator,1,,18,7,6,73,91,0 days 00:00:00.214441
1722,tinymce/tinymce,24,@ephox/porkbun@ @ephox/polaris@ @ephox/agar@ ...,59,0,17,1408,1467,0 days 00:00:03.663213


In [8]:
# Number of rows in the mined-projects frame.
len(projects.index)

2772

In [9]:
# Persist the summary of every mined project (the frame index is written too).
projects.to_csv(path_or_buf=DATA_PATH / '10_projects_all.csv')

## Select Releases

In [10]:
# Work on an independent copy so that `projects` keeps the unfiltered data.
selected_projects = projects.copy(deep=True)

### Remove projects without main releases

These projects may contain releases, but releasy did not extract any main release from them. Hence, we need to remove these projects from the analysis.

In [11]:
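# NOTE: this filter is currently disabled. The rapid/traditional filter below
# already drops projects without main releases, because rapid and traditional
# releases are counted among the main releases only.
# total = len(selected_projects)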
# selected_projects = selected_projects.query('main_releases > 0')
# removed = total - len(selected_projects)
# print(f"Discarded {removed} projects")
# print(f"Kept {len(selected_projects)} projects")

### Remove projects with few rapid and traditional releases

The projects must have at least two rapid and two traditional releases

In [12]:
# Keep only projects with at least two rapid and two traditional releases,
# then report how many projects were dropped and how many remain.
size_before = len(selected_projects)
selected_projects = selected_projects.query('rapid_releases >= 2 and trad_releases >= 2')
removed = size_before - len(selected_projects)
print(f"Discarded {removed} projects")
print(f"Kept {len(selected_projects)} projects")

Discarded 1451 projects
Kept 1321 projects


### Remove projects with uncommon prefixes

These projects are usually monorepos, i.e., multiple software projects in the same git repository. Hence, we need to remove these projects from the analysis.

In [13]:
def uncommon_prefixes(qnt, names):
    """Count the tag prefixes of a project that are not common release prefixes.

    Args:
        qnt: Number of distinct prefixes of the project.
        names: The prefixes joined by single spaces; converted with ``str()``
            before being split.

    Returns:
        0 when ``qnt`` is at most 1; otherwise the number of space-separated
        entries in ``names`` that do not match the common-prefix pattern
        (case-insensitive).
    """
    if qnt <= 1:
        return 0

    # Common prefixes: empty, v, vv, rel, l, rel/v, r or release, optionally
    # followed by one of the separators / . _ -
    common_pattern = "^((|vv?|rel|l|rel/v|r|release)[/._-]?)$"

    count = 0
    # Split on a single space (not on arbitrary whitespace) so that an empty
    # prefix is preserved as an empty entry.
    for prefix in str(names).split(' '):
        if re.match(common_pattern, prefix, flags=re.IGNORECASE) is None:
            count += 1
    return count
        
# Compute, row by row, how many uncommon prefixes each project has and store
# the result in a new `uncommon_prefixes` column.
uncommon_counts = selected_projects.apply(
    lambda row: uncommon_prefixes(row['prefixes'], row['prefixes_names']),
    axis=1,
)
selected_projects = selected_projects.assign(uncommon_prefixes=uncommon_counts)

In [14]:
# How many projects use at least one uncommon prefix?
selected_projects.query('uncommon_prefixes > 0').shape[0]

282

In [15]:
# Drop every project that has at least one uncommon prefix, then report how
# many projects were dropped and how many remain.
size_before = len(selected_projects)
selected_projects = selected_projects.query('uncommon_prefixes == 0')
removed = size_before - len(selected_projects)
print(f"Discarded {removed} projects")
print(f"Kept {len(selected_projects)} projects")

Discarded 282 projects
Kept 1039 projects


In [16]:
# Preview up to 10 random selected projects. The size is capped because
# sample() raises when asked for more rows than the frame contains, which can
# happen when few projects survive the filters.
selected_projects.sample(min(10, len(selected_projects)))

Unnamed: 0,project,prefixes,prefixes_names,main_releases,rapid_releases,trad_releases,patches,releases,time,uncommon_prefixes
447,PHPOffice/PHPWord,1,,12,3,7,8,20,0 days 00:00:00.098578,0
1418,hediet/vscode-drawio,1,v,10,4,2,13,23,0 days 00:00:00.027145,0
1350,giampaolo/psutil,1,release-,32,8,17,52,84,0 days 00:00:00.384450,0
2333,DapperLib/Dapper,1,,31,4,6,13,44,0 days 00:00:00.069216,0
531,qemu/qemu,2,release_ v,48,2,42,86,134,0 days 00:00:05.838296,0
2474,tensorflow/models,2,v. v,19,3,9,6,25,0 days 00:00:00.733005,0
961,ffuf/ffuf,1,v,18,2,4,6,24,0 days 00:00:00.025843,0
2312,janl/mustache.js,2,v,20,3,8,23,43,0 days 00:00:00.051203,0
1539,wechaty/wechaty,1,v,15,4,7,8,23,0 days 00:00:00.373346,0
927,developit/microbundle,2,v,15,5,5,24,39,0 days 00:00:00.192207,0


In [17]:
# Number of projects left after all filters.
len(selected_projects.index)

1039

In [18]:
# Persist the selected projects without the frame index.
selected_projects.to_csv(path_or_buf=DATA_PATH / '10_projects_selected.csv', index=False)