In [1]:
import sys
# To import python scripts from other folders
sys.path.append('../')
import matplotlib.pyplot as plt
import json
from ProjectTestAnalysis import ProjectTestAnalysis
import os
import pandas as pd
import concurrent
from statistics import median, mean
# Base directory from which the other paths are derived
root = "/home/jovyan/work"
# Directory the summary CSV is written to
results_path = "%s/results/" % root
# Directory holding one sub-directory per processed project
procesed_results_path = "%s/notebooks/ProjectAnalysis/TestAnalysis/results/" % root

In [2]:
# Android-related projects left out of the analysis (per the variable name)
ignored_android_projects = [
    "ActionBarSherlock",
    "roboguice",
    "android-Ultra-Pull-To-Refresh",
    "ViewPagerIndicator",
    "SlidingMenu",
    "NineOldAndroids",
    "ListViewAnimations",
    "Android-PullToRefresh",
    "ActiveAndroid",
    "android-common",
    "drag-sort-listview",
]

# Projects left out as non-runnable (per the variable name; criteria not shown here)
ignored_non_runnable_projects = [
    "guice", "Essentials", "neo4j", "spring-cloud-microservice-example",
    "canal", "hive", "spring-boot", "YCSB",
    "wildfly", "gephi", "deeplearning4j", "DataX",
    "netty", "hbase", "zheng", "openhab",
    "jstorm", "clojure", "learning-spark", "Mycat-Server",
]

# Every project name that getProjects must skip: Android group first, then non-runnable
ignored_projects = [*ignored_android_projects, *ignored_non_runnable_projects]

In [3]:
def mean_f(x):
    """Return the arithmetic mean of ``x``, or 0.0 when ``x`` is empty."""
    if len(x) == 0:
        return 0.0
    return mean(x)
def median_f(x):
    """Return the median of ``x``, or 0.0 when ``x`` is empty."""
    if len(x) == 0:
        return 0.0
    return median(x)
def div_zero_f(x, y):
    """Divide ``x`` by ``y``, returning 0 when either operand equals zero."""
    either_is_zero = (x == 0) or (y == 0)
    return 0 if either_is_zero else x / y

In [4]:
def getProjects(dataset):
    """Return ``(dataset, project_name)`` pairs for the usable projects of a dataset.

    Every entry of ``<root>/configFiles/<dataset>Projects/`` is parsed as JSON
    and its ``"project"`` value is read. A project is kept when a directory
    with that name exists under ``procesed_results_path`` and the name is not
    listed in ``ignored_projects``.
    """
    config_dir = root + "/configFiles/%sProjects/" % dataset
    selected = []
    for file_name in os.listdir(config_dir):
        with open(config_dir + file_name) as handle:
            name = json.load(handle)["project"]
        # Skip projects without processed results
        if not os.path.isdir(procesed_results_path + name):
            continue
        # Skip projects that are explicitly excluded
        if name in ignored_projects:
            continue
        selected.append((dataset, name))
    return selected

In [5]:
def process_project(dataset, project_name):
    """Build the summary row of one project from its build/test analysis.

    Parameters
    ----------
    dataset : str
        Dataset label; copied verbatim into the returned row.
    project_name : str
        Project name handed to ``ProjectTestAnalysis``.

    Returns
    -------
    tuple
        ``(row, results_df)``. ``row`` is a list of 17 values in the order of
        the notebook's ``columns`` list: project, dataset, age, LoC, commit
        counts, build/test ratios, and mean/std of the per-commit testable
        rate. ``results_df`` is the frame returned by ``pa.getSummary()``.

    Raises
    ------
    Exception
        Any error raised while constructing ``ProjectTestAnalysis`` is
        re-raised after the project name has been printed.
    """
    # Retrieve data
    try:
        pa = ProjectTestAnalysis(project_name,  2, root=root, forceGenerate=False)
    except Exception:
        # Name the failing project, then propagate the real error. Without the
        # re-raise, execution continued with `pa` unbound and the actual cause
        # was hidden behind an UnboundLocalError on the next statement.
        print(project_name)
        raise
    results_df = pa.getSummary()
    # commit hash -> dict of that commit's summary columns
    results_dict = results_df.set_index('commit').to_dict('index')

    report_df = pa.getReport()

    total_commits = report_df['id'].count()
    total_buildable = report_df[(report_df['build'] == 'SUCCESS')]['id'].count()

    total_buildable_test_w_test = 0   # source + tests build, and at least one test exists
    total_success_test = 0            # of those, commits whose test status is SUCCESS

    # Per-commit testable rate; commits that do not qualify contribute 0.0 to
    # the "all commits" list and nothing to the "test buildable" list.
    testable_rate_all_commits = []
    testable_rate_test_buildable_commits = []

    for _, commit in report_df.iterrows():

        c_hash = commit['commit']

        if commit['build'] != 'SUCCESS':
            testable_rate_all_commits.append(0.0)
            continue

        # Looked up for every buildable commit, as before, so a commit missing
        # from the summary still surfaces as a KeyError.
        test_results = results_dict[c_hash]

        if commit['test_build'] == 'SUCCESS' and test_results['n_test'] > 0:
            total_buildable_test_w_test += 1

            testable_rate_all_commits.append(test_results['testable_rate'])
            testable_rate_test_buildable_commits.append(test_results['testable_rate'])

            if commit['test'] == 'SUCCESS':
                total_success_test += 1
        else:
            testable_rate_all_commits.append(0.0)

    # NOTE(review): the results of these two calls are not part of the returned
    # row. The calls are kept in their original order in case they have side
    # effects (e.g. generating cached artefacts) - confirm and drop if not.
    pa.getMeanAndMedianOfConsecutiveFails()
    pa.getTestCasesRank()

    testability_all_commits = div_zero_f(total_success_test, total_commits)
    testability_test_buildable_commits = div_zero_f(total_success_test, total_buildable_test_w_test)

    buildability = div_zero_f(total_buildable, total_commits)

    test_buildability_a = div_zero_f(total_buildable_test_w_test, total_commits)
    test_buildability_s = div_zero_f(total_buildable_test_w_test, total_buildable)

    testability_rate_all_commits = mean_f(testable_rate_all_commits)
    testability_rate_all_commits_std = pd.Series(testable_rate_all_commits).std()

    testability_rate_test_buildable_commits = mean_f(testable_rate_test_buildable_commits)
    testability_rate_test_buildable_commits_std = pd.Series(testable_rate_test_buildable_commits).std()

    # Age: first minus last `n_days` once snapshots are sorted in descending order
    ordered_snapshots = results_df.sort_values(by=['n_days'], ascending=False)
    oldest = ordered_snapshots.iloc[0]['n_days']
    newest = ordered_snapshots.iloc[-1]['n_days']

    # LoC is taken from the last row of the LoC report
    loc = pa.getLoCReport().iloc[-1]['loc']

    return ([
        pa.project,
        dataset,
        oldest - newest,
        loc,
        total_commits,
        total_buildable,
        buildability,

        total_buildable_test_w_test,
        test_buildability_a,
        test_buildability_s,

        total_success_test,
        testability_all_commits,
        testability_test_buildable_commits,

        testability_rate_all_commits,
        testability_rate_all_commits_std,
        testability_rate_test_buildable_commits,
        testability_rate_test_buildable_commits_std
    ], results_df)

In [6]:
# Column names of the per-project summary table, in the order of the row
# returned by process_project.
columns = [
    # Identification and size
    "Project",
    "Dataset",
    "Age",
    "LoC",
    # Source build
    "Total Commits",
    "Source buildable commits",
    "Source buildability",
    # Test build
    "Test buildable commits",
    "Test buildability_A",
    "Test buildability_S",
    # Commits whose test status is SUCCESS
    "Fully Testable commits",
    "FullyTestability_A",
    "FullyTestability_T",
    # Mean / std of the per-commit testable rate
    "TestabilityRate_A",
    "TestabilityRate_A_std",
    "TestabilityRate_T",
    "TestabilityRate_T_std",
]

In [7]:
def get_projects_resume(projects, max_workers=32):
    """Process every project concurrently and assemble the summary tables.

    Parameters
    ----------
    projects : iterable of (dataset, project_name)
        Pairs passed one by one to ``process_project``.
    max_workers : int, optional
        Size of the thread pool (default 32, the previously hard-coded value).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The per-project table sorted by ``Dataset`` and then ``Project``, and
        the concatenation of every project's snapshot frame (an empty frame
        when ``projects`` is empty).

    Raises
    ------
    Exception
        Any exception raised by ``process_project`` for one of the projects.
    """
    # `import concurrent` alone does not load the `futures` submodule; import
    # it explicitly so the attribute access below cannot fail.
    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_project, dataset, project_name)
            for dataset, project_name in projects
        ]
        # Collect in submission order so the concatenated snapshots come out
        # in the same order on every run.
        outcomes = [future.result() for future in futures]

    project_results = [project_result for project_result, _ in outcomes]
    snapshots_results = [snapshots_result for _, snapshots_result in outcomes]

    df_projects = pd.DataFrame(project_results, columns = columns)

    print("Projects: %d"%df_projects['Project'].count())

    # One multi-key sort: two consecutive single-key sorts do not keep the
    # first ordering because the default sort algorithm is not stable.
    df_projects = df_projects.sort_values(by=['Dataset', 'Project'])

    # pd.concat raises on an empty list, so fall back to an empty frame
    df_snapshots = pd.concat(snapshots_results) if snapshots_results else pd.DataFrame()
    return df_projects, df_snapshots

In [8]:
# (dataset, project_name) pairs of the ManySStub4J projects selected by getProjects
many4j = getProjects(dataset="ManySStub4J")

In [9]:
# ManySStub4J: summarise every selected project and store the table as CSV
many4j_projects, many4j_snapshots = get_projects_resume(many4j)
many4j_csv_path = results_path + 'Many4JResults.csv'
many4j_projects.to_csv(many4j_csv_path, index=False)

Projects: 66


In [10]:
def showInfoProject(name, dataset="ManySStub4J"):
    """Return the summary row of a single project as a one-row DataFrame.

    Parameters
    ----------
    name : str
        Project name passed to ``process_project``.
    dataset : str, optional
        Dataset label for the row; defaults to "ManySStub4J", the value that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row labelled with the notebook's ``columns`` list.
    """
    # process_project returns (row, snapshots); only the row is shown here
    project_row, _ = process_project(dataset, name)
    return pd.DataFrame([project_row], columns = columns)

In [11]:
showInfoProject("fastjson")# Show the full summary row; to narrow it, append a column selector such as [['Test buildability_A', 'Test buildability_S', 'FullyTestability_A', 'FullyTestability_T', 'TestabilityRate_A', 'TestabilityRate_T' ]]

Unnamed: 0,Project,Dataset,Age,LoC,Total Commits,Source buildable commits,Source buildability,Test buildable commits,Test buildability_A,Test buildability_S,Fully Testable commits,FullyTestability_A,FullyTestability_T,TestabilityRate_A,TestabilityRate_A_std,TestabilityRate_T,TestabilityRate_T_std
0,fastjson,ManySStub4J,3536,114992,3863,3641,0.942532,3413,0.88351,0.93738,0,0,0,0.87809,0.31902,0.993865,0.009896
