codeclone_detection: Add ClangCloneDetectionBear
sils committed May 21, 2015
1 parent c9d5eab commit dfeb632
Showing 25 changed files with 715 additions and 0 deletions.
1 change: 1 addition & 0 deletions .misc/.install.sh
@@ -2,5 +2,6 @@ if python --version | grep 3\.4 ; then
pip install coveralls codecov
fi

pip install munkres3
sudo apt-get install espeak libclang1-3.4
sudo ln -s /usr/lib/x86_64-linux-gnu/libclang.so.1 /usr/lib/x86_64-linux-gnu/libclang.so
42 changes: 42 additions & 0 deletions bears/codeclone_detection/ClangCloneDetectionBear.py
@@ -0,0 +1,42 @@
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY
from coalib.results.Result import Result
from coalib.bears.GlobalBear import GlobalBear
from coalib.misc.i18n import _
from bears.codeclone_detection.ClangSimilarityBear import ClangSimilarityBear


class ClangCloneDetectionBear(GlobalBear):
def run(self,
dependency_results: dict,
max_clone_difference: float=0.2):
'''
Checks the given code for similar functions that are probably
redundant.
:param max_clone_difference: The maximum difference two functions may
have to still be reported as clones.
'''
differences = dependency_results["ClangSimilarityBear"][0].contents

self.debug("Creating results...")
results = []
for function_1, function_2, difference in differences:
if difference < max_clone_difference:
results.append(Result(
self.__class__.__name__,
_("Code clone found. The other occurrence is at file "
"{file}, line {line}, function {function}. The "
"similarity is {similarity}.").format(
file=function_2[0],
line=function_2[1],
function=function_2[2],
similarity=1-difference),
file=function_1[0],
severity=RESULT_SEVERITY.MAJOR,
line_nr=function_1[1]))

return results

@staticmethod
def get_dependencies():
return [ClangSimilarityBear]
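
As a rough illustration of the filtering in run above (not part of this commit; the function identifiers and difference values below are made up), each entry coming from ClangSimilarityBear is a (function_1, function_2, difference) triple where a function is identified by a (file, line, name) tuple, and only pairs below max_clone_difference become results:

# Illustrative sketch only; identifiers and difference values are made up.
differences = [
    (("a.c", 1, "faculty1"), ("a.c", 10, "faculty2"), 0.05),   # near clone
    (("b.c", 3, "original"), ("b.c", 40, "unrelated"), 0.75),  # dissimilar
]

max_clone_difference = 0.2
clones = [(f1, f2, diff)
          for f1, f2, diff in differences
          if diff < max_clone_difference]

# Only the first pair survives; it would be reported at file "a.c", line 1,
# with similarity = 1 - 0.05 = 0.95.
print(clones)
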
100 changes: 100 additions & 0 deletions bears/codeclone_detection/ClangSimilarityBear.py
@@ -0,0 +1,100 @@
from itertools import combinations
import multiprocessing


from coalib.processes.SectionExecutor import get_cpu_count
from coalib.results.HiddenResult import HiddenResult
from coalib.settings.Setting import typed_dict
from coalib.bears.GlobalBear import GlobalBear
from bears.codeclone_detection.ClangCountVectorCreator import \
ClangCountVectorCreator
from bears.codeclone_detection.ClangCountingConditions import condition_dict
from bears.codeclone_detection.CloneDetectionRoutines import \
compare_functions, \
get_count_matrices


"""
counting_condition_dict is a function object generated by typed_dict. It
takes a setting and creates a dictionary from it, converting all keys to
counting condition function objects (via condition_dict) and all values to
floats; unset values default to 1.
"""
counting_condition_dict = typed_dict(
lambda setting: condition_dict[str(setting).lower()],
float,
1)


# Coverage cannot be measured because this is in another process
def get_difference(args): # pragma: no cover
"""
Retrieves the difference between two functions using the munkres algorithm.
:param args: A tuple holding the first function id, the second and the
count matrices dictionary holding the count matrices for
each function with the function id as key.
:return: A tuple containing both function ids and their difference.
"""
function_1, function_2, count_matrices = args
return (function_1,
function_2,
compare_functions(count_matrices[function_1],
count_matrices[function_2]))


class ClangSimilarityBear(GlobalBear):
def run(self,
condition_list: counting_condition_dict):
'''
Retrieves similarities for code clone detection. Those can be reused in
another bear to produce results.
:param condition_list: A comma separated list of counting
conditions. Possible values are: used,
returned, is_condition, in_condition,
is_assignee, is_assigner, loop_content.
A weighting can be assigned to each
condition by providing a dict value,
e.g. weighting used half as much as the
other conditions would simply be:
"used: 0.5, is_assignee". Weightings
default to 1 if unset.
'''
if not isinstance(condition_list, dict):
self.err("The condition_list setting is invalid. Code clone "
"detection cannot run.")
return

self.debug("Using the following counting conditions:")
for key, val in condition_list.items():
self.debug(" *", key.__name__, "(weighting: {})".format(val))

self.debug("Creating count matrices...")
count_matrices = get_count_matrices(
ClangCountVectorCreator(list(condition_list.keys()),
list(condition_list.values())),
list(self.file_dict.keys()),
lambda prog: self.debug("{:2.4f}%...".format(prog)))

self.debug("Calculating differences...")
# Code clone detection may take ages for a larger code base. It is
# highly probable that no other bears are running in parallel,
# so we do parallel execution within this bear.
pool = multiprocessing.Pool(get_cpu_count())

differences = []
function_count = len(count_matrices)
# That's n choose 2, hardcoded to simplify the calculation
combination_length = function_count * (function_count-1) / 2
function_combinations = [(f1, f2, count_matrices)
for f1, f2 in combinations(count_matrices, 2)]

for i, elem in enumerate(pool.imap_unordered(get_difference,
function_combinations,
chunksize=100)):
if i % 1000 == 0:
self.debug("{:2.4f}%...".format(100*i/combination_length))
differences.append(elem)

return [HiddenResult(self.__class__.__name__, differences)]
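
The condition_list format described in the docstring above is easiest to see with a small worked example. The sketch below is illustrative only: it emulates the conversion that counting_condition_dict is meant to perform, with stand-in condition functions instead of the real ones from ClangCountingConditions and without the actual typed_dict implementation.

# Illustrative sketch only: parsing "used: 0.5, is_assignee" the way
# counting_condition_dict is described to. The condition functions are
# stand-ins, not the real counting conditions.
def used(*args):
    pass


def is_assignee(*args):
    pass


condition_dict = {"used": used, "is_assignee": is_assignee}

setting_value = "used: 0.5, is_assignee"
condition_list = {}
for entry in setting_value.split(","):
    name, _, weight = entry.partition(":")
    condition_list[condition_dict[name.strip().lower()]] = (
        float(weight) if weight.strip() else 1.0)

# used is weighted half as much as is_assignee, which defaults to 1.
assert condition_list == {used: 0.5, is_assignee: 1.0}
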
91 changes: 91 additions & 0 deletions bears/codeclone_detection/CloneDetectionRoutines.py
@@ -0,0 +1,91 @@
from munkres import Munkres
# Instantiate globally since this class holds stateless public methods.
munkres = Munkres()


def exclude_function(count_matrix):
"""
Determines heuristically whether or not it makes sense for clone
detection to take this function into account.
Applied heuristics:
* Functions whose count vectors all sum to 1 or 0 are very likely
only declarations or empty functions and are to be ignored.
:param count_matrix: A dictionary with count vectors representing all
variables for a function.
:return: True if the function is useless for evaluation.
"""
return all(sum(cv.count_vector) < 2 for cv in count_matrix.values())


def get_count_matrices(count_vector_creator,
filenames,
progress_callback=lambda x: x):
"""
Retrieves matrices holding count vectors for all variables for all
functions in the given file.
:param count_vector_creator: An object with a get_vectors_for_file method
taking a filename as argument.
:param filenames: The files to create count vectors for.
:param progress_callback: A function that is called after processing
each file with the progress percentage
(a float) as its only argument.
:return: A dict holding a tuple of (file, line,
function) as key and as value a dict with
variable names as key and count vector
objects as value.
"""
result = {}
maxlen = len(filenames)

for i, filename in enumerate(filenames):
progress_callback(100*(i/maxlen))
count_dict = count_vector_creator.get_vectors_for_file(filename)
for function in count_dict:
if not exclude_function(count_dict[function]):
result[(filename,
function[0],
function[1])] = count_dict[function]

return result


# Coverage cannot be measured because this is in another process
def compare_functions(cm1, cm2): # pragma: no cover
"""
Compares the functions represented by the given count matrices.
:param cm1: Count vector dict for the first function.
:param cm2: Count vector dict for the second function.
:return: The difference between these functions, where 0 means
identical and 1 means not similar at all.
"""
assert isinstance(cm1, dict)
assert isinstance(cm2, dict)
if len(cm1) == 0 or len(cm2) == 0:
return 1 if len(cm1) != len(cm2) else 0

# The cost matrix holds the difference between the two variables i and
# j in the i/j field. This is a representation of a bipartite weighted
# graph with nodes representing the variables of the first function on
# the one side (rows) and nodes representing the variables of the second
# function on the other side (columns). The fields in the matrix are the
# weighted edges connecting each element from one side to the other.
cost_matrix = [[cv1.difference(cv2)
for cv2 in cm2.values()]
for cv1 in cm1.values()]

# Pad manually with ones. If a variable in one function has no
# counterpart in the other, that is a 100% difference, so 1.
cost_matrix = munkres.pad_matrix(cost_matrix, pad_value=1)

# The munkres algorithm calculates a matching such that the sum of
# the taken fields is minimal. It thus associates each variable
# from one function with one in the other function.
matching = munkres.compute(cost_matrix)

# Sum it up, normalize it so we have a value in [0, 1]
return (sum(cost_matrix[x][y] for x, y in matching) /
max(len(cm1), len(cm2)))
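
To make the matching and normalization at the end of compare_functions concrete, here is a small worked example. It is illustrative only: the pairwise differences 0.1 and 0.8 are made up, and it relies on the munkres3 package that this commit adds to the install script.

# Illustrative sketch only: a function with variables a1, a2 compared
# against a function with a single variable b1.
from munkres import Munkres

munkres = Munkres()

cost_matrix = [[0.1],   # difference of a1 vs. b1 (made up)
               [0.8]]   # difference of a2 vs. b1 (made up)

# Pad with ones, as compare_functions does, so that the unmatched
# variable counts as a 100% difference.
cost_matrix = munkres.pad_matrix(cost_matrix, pad_value=1)

matching = munkres.compute(cost_matrix)

# Normalize by max(len(cm1), len(cm2)), which is 2 here. The matching
# pairs a1 with b1 (0.1) and a2 with the padding column (1.0), giving
# (0.1 + 1.0) / 2 = 0.55.
difference = sum(cost_matrix[x][y] for x, y in matching) / 2
print(difference)
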
105 changes: 105 additions & 0 deletions bears/tests/codeclone_detection/ClangCloneDetectionBearTest.py
@@ -0,0 +1,105 @@
import sys
import unittest
import os
import inspect
from queue import Queue

sys.path.insert(0, ".")

from bears.codeclone_detection.ClangSimilarityBear import ClangSimilarityBear
from bears.codeclone_detection.ClangCloneDetectionBear import \
ClangCloneDetectionBear
from coalib.bearlib.parsing.clang.cindex import Index, LibclangError
from coalib.settings.Section import Section
from coalib.settings.Setting import Setting


class ClangCloneDetectionBearTest(unittest.TestCase):
def setUp(self):
self.base_test_path = os.path.abspath(os.path.join(
os.path.dirname(inspect.getfile(ClangCloneDetectionBearTest)),
"clone_detection_samples"))
self.section = Section("default")
self.section.append(Setting("condition_list",
"returned, "
"is_condition, "
"in_condition, "
"in_second_level_condition, "
"in_third_level_condition, "
"is_assignee, "
"is_assigner, "
"loop_content, "
"second_level_loop_content, "
"third_level_loop_content, "
"is_param, "
"in_sum, "
"in_product, "
"in_binary_operation,"
"member_accessed"))
self.clone_files = os.listdir(os.path.join(self.base_test_path,
"clones"))

def test_dependencies(self):
self.assertIn(ClangSimilarityBear,
ClangCloneDetectionBear.get_dependencies())

def test_invalid_conditions(self):
self.section.append(Setting("condition_list", "bullshit"))

self.uut = ClangSimilarityBear({}, self.section, Queue())
self.assertEqual(self.uut.run_bear_from_section([], {}), None)

def test_non_clones(self):
self.non_clone_files = [
os.path.join(self.base_test_path, "non_clones", elem)
for elem in os.listdir(os.path.join(self.base_test_path,
"non_clones"))]

self.check_clone_detection_bear(self.non_clone_files,
lambda results:
self.assertEqual(results, []))

def test_clones(self):
self.clone_files = [
os.path.join(self.base_test_path, "clones", elem)
for elem in os.listdir(os.path.join(self.base_test_path,
"clones"))]

self.check_clone_detection_bear(self.clone_files,
lambda results:
self.assertNotEqual(results, []))

def check_clone_detection_bear(self, files, result_check_function):
"""
Checks the results of the ClangCloneDetectionBear with the given function.
:param files: The files to check. Each will be checked
on its own.
:param result_check_function: A function raising an exception if the
results are invalid.
"""
for file in files:
similarity_results = ClangSimilarityBear(
{file: ""},
self.section,
Queue()).run_bear_from_section([], {})
uut = ClangCloneDetectionBear(
{file: ""},
self.section,
Queue())
arg_dict = {"dependency_results":
{"ClangSimilarityBear": similarity_results}}

result_check_function(uut.run_bear_from_section([], arg_dict))


def skip_test():
try:
Index.create()
return False
except LibclangError as error:
return str(error)


if __name__ == '__main__':
unittest.main(verbosity=2)
@@ -0,0 +1,17 @@
int faculty1(int x) {
int result = x;
while(x > 2) {
result *= --x;
}

return result;
}

int faculty2(int x) {
int result = x;
for(x--; x > 1; --x) {
result *= x;
}

return result;
}
@@ -0,0 +1,15 @@
void original(int n) {
float sum=0.0; //C1
float prod=1.0;
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}

void sumProd(int n) {
float sum=0.0; //C1
float prod=1.0;
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}
@@ -0,0 +1,15 @@
void original(int n) {
float sum=0.0; //C1
float prod=1.0;
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}

void sumProd(int n) {
float sum=0.0; //C1'
float prod=1.0; //C
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}
