In [1]:
import csv
import pandas as pd
import re
import hashlib
import os
import json
import subprocess
import glob
import random 

# Root of the working directory that holds the `results/` and `analysis/results/` trees
# read by this notebook. It defaults to the original location; set the
# REGRESSION_ANALYSIS_ROOT environment variable to analyse a copy stored elsewhere.
root = os.environ.get("REGRESSION_ANALYSIS_ROOT") or "/home/jovyan/work"

# 3.5 Analysis of logs

This notebook shows the results of analyzing the logs of the first commit where the regression test cannot be built.

In [2]:
def fromJsonList(list_str):
    """Parse the textual form of a list (e.g. "['abc', 'def']") into a Python object.

    The text is first read as a Python literal, which copes with values that
    themselves contain single or double quotes. Text that is not a valid Python
    literal (for instance JSON using `true`/`null`) is parsed the way this
    function always did: single quotes are replaced by double quotes and the
    result is handed to `json.loads`.

    Raises whatever `json.loads` raises when neither form can be parsed.
    """
    import ast  # local import so that the cell does not depend on an edit of the import cell

    try:
        return ast.literal_eval(list_str)
    except (ValueError, SyntaxError, TypeError):
        return json.loads(list_str.replace("'", "\""))

In [3]:
def getParent(commit, commits_dict):
    """Return the row of the first parent of `commit`, or None when it has no parent.

    commit       -- row (dict) whose 'parents' entry is the textual form of a list of hashes
    commits_dict -- mapping from commit hash to row

    Only the first listed parent is followed. A commit is considered parentless when
    its parents list is empty or when its first entry is the empty string.

    Raises KeyError when the parent hash is not a key of `commits_dict`.
    """
    parents = fromJsonList(commit['parents'])
    if not parents or parents[0] == '':
        return None
    return commits_dict[parents[0]]

In [4]:
def _readLog(logs_path):
    """Return the content of the file at `logs_path` decoded as UTF-8, without newline translation."""
    with open(logs_path, 'rb') as logfile:
        return logfile.read().decode('utf-8')


def _classifySourceBuildError(log):
    """Reduce the text of a source-build log to a short error label."""
    match = re.search("error: (.*)", log)
    if match is not None:
        error = match.group(1)
        if "unreported exception" in error:
            return "Other source build error"
        java_version_markers = (
            "cannot implement remove(Object,Object) in Map",
            "withFilterId(Object) in BeanSerializerBase",
            "as of release 5",
        )
        if any(marker in error for marker in java_version_markers):
            return "Java version error"
        # Any other compiler message is reported verbatim
        return error

    if "BUILD FAILED" in log:
        if "does not exist" in log:
            return "File or directory not exist"
        return "Other source build error"
    if "there is no POM in this directory" in log:
        return "No pom.xml file"
    if "Could not resolve dependencies" in log:
        return "Dependency resolution"
    return "Other source build error"


def _classifyTestBuildError(log):
    """Reduce the text of a test-build log to `(error label, detailed error text)`."""
    match = re.search("error: (.*)", log)

    if match is None:
        # No "error: ..." line: look for known messages anywhere in the log
        if "cannot find symbol" in log:
            error = "cannot find symbol"
        elif "not supported in " in log:
            error = "Java version error"
        elif "incompatible types" in log:
            error = "incompatible types"
        elif re.search("method .* in class .* cannot be applied to given types", log):
            error = "method X in class Y cannot be applied to given types"
        elif re.search("reference to .* is ambiguous", log):
            error = "reference to X is ambiguous"
        elif "is not abstract and does not override abstract method" in log:
            error = "Class X is not abstract and does not override abstract method Y"
        elif "method does not override or implement a method from a supertype" in log:
            error = "method does not override or implement a method from a supertype"
        elif "cannot be applied to given types" in log:
            error = "cannot be applied to given types"
        elif "has private access" in log:
            error = "Class or method has private access"
        else:
            error = "Not detected"
        return error, log

    # Normalise the message of the first "error: ..." line; unknown messages are kept verbatim
    error = match.group(1)
    if error == "cannot find symbol":
        pass
    elif error.startswith("duplicate class"):
        error = "duplicate class"
    elif error.startswith("incompatible types"):
        error = "incompatible types"
    elif error.startswith("no suitable method found"):
        error = "no suitable method found"
    elif error.startswith("no suitable constructor found"):
        error = "no suitable constructor found"
    elif re.search("package .* does not exist", error):
        error = "package X does not exist"
    elif "cannot be applied to given types" in error:
        error = "cannot be applied to given types"
    elif "is not abstract and does not override abstract method" in error:
        # Tested against the error line, like every other branch of this chain. Testing the
        # whole log here used to hide all the branches below whenever the phrase appeared
        # anywhere in the log.
        error = "Class X is not abstract and does not override abstract method Y"
    elif "not supported in " in error:
        error = "Java version error"
    elif "unreported exception" in error:
        error = "unreported exception"
    elif "as of release 5" in error:
        error = "Java version error"
    elif "has private access" in error:
        error = "has private access"

    # The error line plus the four lines that follow it; when the log ends sooner,
    # keep everything from the error line to the end of the log.
    context = re.search("error: (.*)\n(.*)\n(.*)\n(.*)\n(.*)", log)
    detailed_error = context.group(0) if context is not None else log[match.start():]
    return error, detailed_error


def searchNoTransplantableReason(project, bug):
    """Find why the test of a bug could not be built on an older commit.

    Reads `<root>/analysis/results/<project>/Bug_<bug>/commit_history_results.csv`,
    takes as `ffc` the last row whose State is "TestFail" and follows first parents
    from it until a row whose State is "TestBuildError" is found.

    Returns the tuple `(ffc, commit, error, error_type, detailed_error)`:

    * `(None, None, "Always transplantable", "Always transplantable", "")` when `ffc`
      has no parent, or when no "TestBuildError" row is reached and the parent of
      `ffc` has no parent itself;
    * `(ffc, parent of ffc, label, "Source build error", "")` when no "TestBuildError"
      row is reached otherwise; the label comes from that commit's source-build.log;
    * `(ffc, commit, label, "Test code not compatible with source code", detail)`
      otherwise, where `commit` is the "TestBuildError" row and label/detail come
      from its test-build.log.

    Any exception raised while looking up the parent of `ffc` (including the case
    where no row is in state "TestFail") is re-raised after printing `ffc`,
    `project` and `bug`. Missing log files raise OSError.
    """
    commit_history_results_path = root+"/analysis/results/%s/Bug_%s/commit_history_results.csv"%(project,bug)

    with open(commit_history_results_path, newline='') as csvfile:
        commits = list(csv.DictReader(csvfile))

    commits_dict = {c['commit']:c for c in commits}

    # Keep the LAST row in state "TestFail"
    ffc = None
    for commit in commits:
        if commit['State'] == "TestFail":
            ffc = commit

    try:
        first_commit_no_transplantable = getParent(ffc, commits_dict)
    except Exception:
        print(ffc, project, bug)
        raise

    always_transplantable = (None, None, "Always transplantable", "Always transplantable", "")

    if first_commit_no_transplantable is None:
        # Test always can be transplanted
        return always_transplantable

    while first_commit_no_transplantable['State'] != "TestBuildError":
        first_commit_no_transplantable = getParent(first_commit_no_transplantable, commits_dict)
        if first_commit_no_transplantable is not None:
            continue

        # The end of the history was reached without finding a test build error:
        # go back to the parent of ffc
        first_commit_no_transplantable = getParent(ffc, commits_dict)

        # First not transplantable is last commit
        if getParent(first_commit_no_transplantable, commits_dict) is None:
            return always_transplantable

        # The test stops being transplantable because of a source build problem
        commit = first_commit_no_transplantable
        logs_path = root+"/results/%s/Bug_%s/commits/%s-%s/source-build.log"%(project,bug, commit['id'],commit['commit'])
        error = _classifySourceBuildError(_readLog(logs_path))
        return ffc, first_commit_no_transplantable, error, "Source build error",""

    commit = first_commit_no_transplantable
    logs_path = root+"/results/%s/Bug_%s/commits/%s-%s/test-build.log"%(project,bug, commit['id'],commit['commit'])
    error, detailed_error = _classifyTestBuildError(_readLog(logs_path))
    error_type = "Test code not compatible with source code"

    return ffc, commit, error, error_type, detailed_error

In [5]:
# Example: show the tuple returned for a single bug
searchNoTransplantableReason(project="JacksonDatabind", bug="1")

({'id': '305',
  'commit': '3fdbf62dfe38c1a17521abfeae615593f228123d',
  'Build': 'True',
  'BuildTest': 'True',
  'ExecuteTest': 'False',
  'HasTestReport': 'True',
  'State': 'TestFail',
  'parents': "['8ad49f2edcdf0526dcfb0fb5a7f63b903b36c58f']",
  'date': '2012-10-09 17:00:13 -0700'},
 {'id': '546',
  'commit': 'c277641421be2b2fe3319fa75f54b9364ca91178',
  'Build': 'True',
  'BuildTest': 'False',
  'ExecuteTest': 'False',
  'HasTestReport': 'False',
  'State': 'TestBuildError',
  'parents': "['83bcd182408b0180049a7d201e09d996ec424593']",
  'date': '2012-05-14 19:40:57 -0700'},
 'annotation type not applicable to this kind of declaration',
 'Test code not compatible with source code',
 'error: annotation type not applicable to this kind of declaration\n[ERROR] /home/regseek/workdir/projects/JacksonDatabind_Bug_1/src/test/java/com/fasterxml/jackson/databind/struct/TestPOJOAsArray.java:[81,4] error: annotation type not applicable to this kind of declaration\n[ERROR] /home/regseek/work

In [6]:
errors = []
results = []
projects = [
     "Cli", "Closure", "Time",
     "JacksonXml", "Collections", "Codec", "JxPath",
     "Compress", "Csv", "JacksonCore", "JacksonDatabind",
     "Jsoup", "Lang", "Math", "Gson", "Mockito"
]
for project in projects:
    bug_paths = glob.glob("{root}/results/{project}/Bug_*/".format(root=root, project=project))
    # glob.glob returns paths in arbitrary, filesystem-dependent order. Visiting the bugs in
    # ascending numeric order makes `errors` -- and the seeded random sample drawn from it
    # further down -- the same on every machine.
    bug_ids = sorted((re.search(r"Bug_(\d+)", bug_path).group(1) for bug_path in bug_paths), key=int)

    # FOR EACH BUG
    for bug in bug_ids:

        with open(root+"/analysis/results/"+project+"/Bug_"+bug+"/bug_result.json") as jsonfile:
            result = json.load(jsonfile)
        results.append(result)

        # Only bugs for which no regression was detected are analysed further
        if result['category'] != "No regression is detected":
            continue

        ffc, ffc_parent, error, error_type, detailed_error = searchNoTransplantableReason(project,bug)
        errors.append({
            'project': project,
            'bug': bug,
            'error': error,
            'error_type': error_type,
            'count': 1,
            'ffc': ffc,
            # second element of the returned tuple: the commit whose build log was classified
            'ffc_parent': ffc_parent,
            'executionsOnPast': result['executionsOnPast']
        })

In [7]:
# Number of bugs whose category is "No regression is detected" (one entry was appended per such bug)
len(errors)

710

In [8]:
# One row per analysed bug; number of bugs for each (error, error_type) pair, most frequent first
df = pd.DataFrame(errors)
(
    df.groupby(['error', "error_type"])[["count"]]
    .count()
    .sort_values(by='count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
error,error_type,Unnamed: 2_level_1
cannot find symbol,Test code not compatible with source code,352
Always transplantable,Always transplantable,55
cannot be applied to given types,Test code not compatible with source code,51
package X does not exist,Test code not compatible with source code,40
No pom.xml file,Source build error,35
Java version error,Test code not compatible with source code,33
duplicate class,Test code not compatible with source code,27
File or directory not exist,Source build error,25
no suitable method found,Test code not compatible with source code,21
Java version error,Source build error,19


In [9]:
# Number of bugs per error among those whose test code is not compatible with the source code
(
    df.loc[df['error_type'] == "Test code not compatible with source code"]
    .groupby(['error'])[["count"]]
    .count()
    .sort_values(by='count', ascending=False)
)

Unnamed: 0_level_0,count
error,Unnamed: 1_level_1
cannot find symbol,352
cannot be applied to given types,51
package X does not exist,40
Java version error,33
duplicate class,27
no suitable method found,21
incompatible types,11
no suitable constructor found,10
unreported exception,5
has private access,5


In [10]:
# Number of bugs per error among those stopped by a source build error
(
    df.loc[df['error_type'] == "Source build error"]
    .groupby(['error'])[["count"]]
    .count()
    .sort_values(by='count', ascending=False)
)

Unnamed: 0_level_0,count
error,Unnamed: 1_level_1
No pom.xml file,35
File or directory not exist,25
Java version error,19
Other source build error,3


In [11]:
# Number of bugs per error type, most frequent first
(
    df.groupby(['error_type'])[["count"]]
    .count()
    .sort_values(by='count', ascending=False)
)

Unnamed: 0_level_0,count
error_type,Unnamed: 1_level_1
Test code not compatible with source code,573
Source build error,82
Always transplantable,55


# 3.6 Prevalence of bugs in new functionality

In [12]:
# Rows (as dicts) of the bugs whose test code is not compatible with the source code
test_not_compatible_bugs = (
    df.loc[df['error_type'] == "Test code not compatible with source code"]
    .T
    .to_dict()
    .values()
)

# Draw 100 of them after seeding the global random generator with 2023
random.seed(2023)
random_bugs = random.sample(list(test_not_compatible_bugs), 100)

In [13]:
# Prefix of the commit URLs of each project, keyed by project name: a commit hash is
# appended to the value to build the link to that commit.
# NOTE(review): "Math" points at the Maes95/commons-math repository, unlike the other
# Commons projects which point at apache/ repositories -- presumably intentional; confirm.
repository = {
    "Cli": "https://github.com/apache/commons-cli/commit/",
    "JxPath": "https://github.com/apache/commons-jxpath/commit/",
    "Codec": "https://github.com/apache/commons-codec/commit/",
    "Time": "https://github.com/JodaOrg/joda-time/commit/",
    "Compress": "https://github.com/apache/commons-compress/commit/",
    "JacksonCore": "https://github.com/FasterXML/jackson-core/commit/",
    "JacksonXml": "https://github.com/FasterXML/jackson-dataformat-xml/commit/",
    "JacksonDatabind": "https://github.com/FasterXML/jackson-databind/commit/",
    "Gson": "https://github.com/google/gson/commit/",
    "Jsoup": "https://github.com/jhy/jsoup/commit/",
    "Lang": "https://github.com/apache/commons-lang/commit/",
    "Math": "https://github.com/Maes95/commons-math/commit/",
    "Closure": "https://github.com/google/closure-compiler/commit/",
    "Mockito": "https://github.com/mockito/mockito/commit/",
    "Csv": "https://github.com/apache/commons-csv/commit/",
    "Collections": "https://github.com/apache/commons-collections/commit/"
}

In [14]:
def getBFC(project, bug_id):
    """Return the hash stored in the `commit` column of the first row of the bug's
    commit_history_results.csv (presumably the bug-fixing commit, hence the name).

    project -- project name, used to build the path under `root`
    bug_id  -- bug number (string or int)

    Only the first row is read. Raises IndexError when the file contains no rows.
    """
    commit_history_results_path = root+"/analysis/results/{project}/Bug_{bug_id}/commit_history_results.csv".format(
        project=project, bug_id=bug_id
    )
    # newline='' is what the csv module expects from files it reads
    with open(commit_history_results_path, newline='') as csvfile:
        first_commit = next(csv.DictReader(csvfile), None)
    if first_commit is None:
        raise IndexError("no commits found in " + commit_history_results_path)
    return first_commit['commit']

In [15]:
test_not_compatible_bugs_info = []

# The copies of the logs are written here; create the directory on a fresh checkout
logs_dest_dir = "random_bugs_analyzed/logs/"
os.makedirs(logs_dest_dir, exist_ok=True)

for idx, bug in enumerate(random_bugs, start=1):

    ffc = bug['ffc']
    ffc_parent = bug['ffc_parent']
    logs_path = root+"/results/%s/Bug_%s/commits/%s-%s/test-build.log"%(
        bug["project"],bug["bug"], ffc_parent['id'],ffc_parent['commit']
    )

    # A plain file copy would be enough, but the decoded text is kept in `log`
    # in case it has to be inspected here
    with open(logs_path, 'rb') as source_log:
        log = source_log.read().decode('utf-8')
    file_name= "%d-%s-Bug_%s.log"%(idx,bug["project"],bug["bug"])
    dest_path = logs_dest_dir+file_name
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write(log)

    test_not_compatible_bugs_info.append({
        'idx': idx,
        'project': bug["project"],
        'bug_id': bug["bug"],
        'BFC': repository[bug["project"]]+getBFC(bug["project"], bug["bug"]),
        'FFC': repository[bug["project"]]+ffc['commit'],
        'FFC_parent_error': bug['error'],
        'FFC_parent_log':  file_name
    })

In [16]:
# Save the summary of the sampled bugs, one row per bug, without the index column
pd.DataFrame(test_not_compatible_bugs_info).to_csv(
    'random_bugs_analyzed/100_random_bugs.csv',
    index=False,
)

In [17]:
bugs_with_comment = []
for idx, bug in enumerate(random_bugs, start=1):

    ffc = bug['ffc']

    commit_history_path = root+"/results/%s/Bug_%s/commit_history.csv"%(
        bug["project"],bug["bug"]
    )

    # newline='' lets the csv module handle line breaks inside quoted fields itself
    # (relevant here if a commit comment spans several lines)
    with open(commit_history_path, newline='') as csvfile:
        commits = list(csv.DictReader(csvfile))

    commits_dict = {c['hash']:c for c in commits}

    bugs_with_comment.append({
        'idx': idx,
        'project': bug["project"],
        'bug_id': bug["bug"],
        'BFC': repository[bug["project"]]+getBFC(bug["project"], bug["bug"]),
        'FFC': repository[bug["project"]]+ffc['commit'],
        # comment of the row whose hash is the FFC
        'comment': commits_dict[ffc['commit']]['comment']
    })

## 3.6.1 Using NLP

In [21]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

In [27]:
# If not available, downloads the NLTK 'stopwords' corpus and the 'punkt' tokenizer data
# used for the analysis of commit messages
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [28]:
# Keyword sets used to classify commit messages. Messages are stemmed with the Porter
# stemmer before the lookup, so entries must be written in their stemmed form
# ('buggi', 'issu', 'resolv', ...).
bugs = {'close', 'hotfix', 'incorrect', 'bug', 'buggi', 'bugfix', 'correct', 'typo', 'resolv', 'issu', 'fix', 'error', 'debug', 'fail', 'repair', 'crash', 'broken', 'miss'}
# 'disabl' is the Porter stem of disable/disabled/disabling; the unstemmed 'disable' is
# kept for backward compatibility although stemmed text never contains it.
refactorings = {'delet', 'reduc', 'tune', 'revis', 'modifi', 'renam', 'replac', 'chang', 'remov', 'rollback', 'readd', 'refactor', 'rewrit', 'rework', 'optim', 'deoptim', 'exclud', 'deprec', 'cleanup', 'improv', 'clean', 'format', 'simplifi', 'adjust', 'updat', 'reformat', 'avoid', 'refin', 'suppress', 'revert', 'simplific', 'extend', 'disable', 'disabl', 'upgrad', 'react', 'restor', 'eliminat', 'prepar', 'better', 'elimin', 'move', 'allow'}
adds = {'add', 'new', 'develop', 'creat', 'featur', 'implement', 'ad', 'introduc', 'support', 'migrat'}
tests = {'test', 'pass', 'unit'}


def in_common(words_set, words_list):
    """
    Given a set of words and a list of words
    returns the number of words in the list of words found in the set of words

    Every occurrence counts: a word of the set that appears twice in the list
    adds two to the result. Returns 0 when nothing matches.
    """
    # Direct membership test per word; no intermediate intersection or list is needed
    return sum(1 for word in words_list if word in words_set)

def get_commit_type(text):
    """
    Count, for a commit message, how many of its words belong to each keyword set

    The message is split into `\\w+` tokens, English stop words are dropped
    (case-insensitively) and the remaining tokens are stemmed with the Porter stemmer.

    Returns a tuple with the number of stemmed tokens found in `bugs`,
    `refactorings`, `adds` and `tests`, in that order.

    Code partially taken from https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
    """
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = [
        porter.stem(token)
        for token in RegexpTokenizer(r'\w+').tokenize(text)
        if token.lower() not in english_stop_words
    ]

    return tuple(in_common(keywords, stems) for keywords in (bugs, refactorings, adds, tests))

In [29]:
# Attach to every sampled bug the keyword counts of its commit comment
for bug in bugs_with_comment:
    isBugRelated, isRefactoringRelated, isAddRelated, isTestRelated = get_commit_type(bug["comment"])
    bug.update(
        isBugRelated=isBugRelated,
        isRefactoringRelated=isRefactoringRelated,
        isAddRelated=isAddRelated,
        isTestRelated=isTestRelated,
    )

In [31]:
# Save the sampled bugs with their commit comments and keyword counts, without the index column
pd.DataFrame(bugs_with_comment).to_csv(
    'random_bugs_analyzed/100_random_bugs_comments.csv',
    index=False,
)