-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
codeclone_detection: Add ClangCloneDetectionBear
- Loading branch information
Showing
25 changed files
with
710 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY | ||
from coalib.results.Result import Result | ||
from coalib.bears.GlobalBear import GlobalBear | ||
from coalib.misc.i18n import _ | ||
from bears.codeclone_detection.ClangSimilarityBear import ClangSimilarityBear | ||
|
||
|
||
class ClangCloneDetectionBear(GlobalBear):
    def run(self,
            dependency_results: dict,
            max_clone_difference: float = 0.2):
        '''
        Checks the given code for similar functions that are probably
        redundant.

        :param max_clone_difference: The maximum difference a clone should
                                     have.
        '''
        # The similarity bear hands over exactly one (hidden) result whose
        # contents are (function_1, function_2, difference) triples.
        similarity_result = dependency_results["ClangSimilarityBear"][0]

        self.debug("Creating results...")
        # Only pairs strictly below the threshold are reported as clones.
        return [self._create_result(origin, other, difference)
                for origin, other, difference in similarity_result.contents
                if difference < max_clone_difference]

    def _create_result(self, origin, other, difference):
        # Builds the result located at ``origin`` which points to ``other``.
        # Both are indexed as (file, line, function); the reported
        # similarity is the complement of the difference.
        message = _("Code clone found. The other occurrence is at file "
                    "{file}, line {line}, function {function}. The "
                    "similarity is {similarity}.").format(
            file=other[0],
            line=other[1],
            function=other[2],
            similarity=1-difference)

        return Result(self.__class__.__name__,
                      message,
                      file=origin[0],
                      severity=RESULT_SEVERITY.MAJOR,
                      line_nr=origin[1])

    @staticmethod
    def get_dependencies():
        # The differences evaluated in run() are produced by this bear.
        return [ClangSimilarityBear]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
from itertools import combinations | ||
import multiprocessing | ||
|
||
|
||
from coalib.processes.SectionExecutor import get_cpu_count | ||
from coalib.results.HiddenResult import HiddenResult | ||
from coalib.settings.Setting import typed_dict | ||
from coalib.bears.GlobalBear import GlobalBear | ||
from bears.codeclone_detection.ClangCountVectorCreator import \ | ||
ClangCountVectorCreator | ||
from bears.codeclone_detection.ClangCountingConditions import condition_dict | ||
from bears.codeclone_detection.CloneDetectionRoutines import \ | ||
compare_functions, \ | ||
get_count_matrices | ||
|
||
|
||
# counting_condition_dict is a function object generated by typed_dict. It
# takes a setting and creates a dictionary out of it:
#
#  * every key is looked up (lower-cased) in condition_dict and thereby
#    converted to the corresponding counting condition function object,
#  * every value is converted to a float,
#  * values that are not set default to 1.
counting_condition_dict = typed_dict(
    lambda raw_key: condition_dict[str(raw_key).lower()],
    float,
    1)
|
||
|
||
# Coverage cannot be measured because this is in another process | ||
def get_difference(args):  # pragma: no cover
    """
    Retrieves the difference between two functions using the munkres
    algorithm.

    Everything is passed as one tuple so this function can be mapped over
    an iterable of work items.

    :param args: A tuple holding the first function id, the second function
                 id and the count matrices dictionary holding the count
                 matrices for each function with the function id as key.
    :return:     A tuple containing both function ids and the difference
                 between the two functions as computed by
                 compare_functions.
    """
    first_id, second_id, matrices = args
    difference = compare_functions(matrices[first_id], matrices[second_id])

    return first_id, second_id, difference
|
||
|
||
class ClangSimilarityBear(GlobalBear):
    def run(self,
            condition_list: counting_condition_dict):
        '''
        Retrieves similarities for code clone detection. Those can be reused in
        another bear to produce results.

        :param condition_list: A comma separated list of counting
                               conditions. Possible values are: used,
                               returned, is_condition, in_condition,
                               is_assignee, is_assigner, loop_content.
                               Weightings can be assigned to each
                               condition by providing a dict value, i.e.
                               weighting used half as much as the other
                               conditions would simply be:
                               "used: 0.5, is_assignee".
                               Weightings default to 1 if unset.
        '''
        # Anything but a dict means the setting could not be converted;
        # in that case nothing is returned.
        if not isinstance(condition_list, dict):
            self.err("The condition_list setting is invalid. Code clone "
                     "detection cannot run.")
            return

        self.debug("Using the following counting conditions:")
        for key, val in condition_list.items():
            self.debug(" *", key.__name__, "(weighting: {})".format(val))

        self.debug("Creating count matrices...")
        count_matrices = get_count_matrices(
            ClangCountVectorCreator(list(condition_list.keys()),
                                    list(condition_list.values())),
            list(self.file_dict.keys()))

        self.debug("Calculating differences...")
        function_count = len(count_matrices)
        # That's n over 2, hardcoded to simplify calculation
        combination_length = function_count * (function_count-1) / 2
        function_combinations = [(f1, f2, count_matrices)
                                 for f1, f2 in combinations(count_matrices, 2)]

        differences = []
        # Code clone detection may take ages for a larger code basis. It is
        # highly probable, that no other bears are running in parallel,
        # thus we do parallel execution within this bear.
        pool = multiprocessing.Pool(get_cpu_count())
        try:
            for i, elem in enumerate(pool.imap_unordered(get_difference,
                                                         function_combinations,
                                                         chunksize=100)):
                # Report progress only now and then. The division is safe
                # since the loop body only runs if there is a combination.
                if i % 1000 == 0:
                    self.debug("{:2.4f}%".format(100*i/combination_length))
                differences.append(elem)
        finally:
            # Always release the worker processes, even if a comparison
            # failed. Otherwise they would leak for every run of this bear.
            pool.terminate()
            pool.join()

        return [HiddenResult(self.__class__.__name__, differences)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
from munkres import Munkres | ||
# Instantiate globally since this class is holding stateless public methods. | ||
munkres = Munkres() | ||
|
||
|
||
def exclude_function(count_matrix):
    """
    Determines heuristically whether or not it makes sense for clone
    detection to take this function into account.

    Applied heuristics:

     * Functions with only count vectors with a sum of all elements of 1
       or 0 are very likely only declarations or empty and to be ignored.

    :param count_matrix: A dictionary with count vectors representing all
                         variables for a function.
    :return:             True if the function is useless for evaluation.
    """
    for variable in count_matrix.values():
        # One variable with a count sum of two or more is enough to keep
        # the function.
        if not sum(variable.count_vector) < 2:
            return False

    return True
|
||
|
||
def get_count_matrices(count_vector_creator,
                       filenames,
                       progress_callback=None):
    """
    Retrieves matrices holding count vectors for all variables for all
    functions in the given files.

    Functions for which exclude_function returns True are left out.

    :param count_vector_creator: An object with a get_vectors_for_file
                                 method taking a filename as argument.
    :param filenames:            The files to create count vectors for.
    :param progress_callback:    Optional callable invoked once per file
                                 with the progress in percent (a float in
                                 (0, 100]). If None, the progress is
                                 printed to stdout.
    :return:                     A dict holding a tuple of (file, line,
                                 function) as key and a dict of count vector
                                 objects with variable names as key as
                                 content.
    """
    result = {}
    maxlen = len(filenames)

    for i, filename in enumerate(filenames, 1):
        # maxlen cannot be zero here, the loop would not run otherwise.
        percentage = 100*(i/maxlen)
        if progress_callback is None:
            print("{:2.4f}%".format(percentage))
        else:
            progress_callback(percentage)

        count_dict = count_vector_creator.get_vectors_for_file(filename)
        for function, count_matrix in count_dict.items():
            if not exclude_function(count_matrix):
                # The first two elements of the function key are combined
                # with the filename to form the result key.
                result[(filename, function[0], function[1])] = count_matrix

    return result
|
||
|
||
# Coverage cannot be measured because this is in another process | ||
def compare_functions(cm1, cm2):  # pragma: no cover
    """
    Compares the functions represented by the given count matrices.

    :param cm1: Count vector dict for the first function.
    :param cm2: Count vector dict for the second function.
    :return:    The difference between these functions, 0 is identical and
                1 is not similar at all.
    """
    assert isinstance(cm1, dict)
    assert isinstance(cm2, dict)

    # Without variables on at least one side there is nothing to match:
    # two empty functions are identical, otherwise they differ completely.
    if not cm1 or not cm2:
        return 0 if len(cm1) == len(cm2) else 1

    # Build the cost matrix of a weighted bipartite graph: one row per
    # variable of the first function, one column per variable of the second
    # function. Each field holds the difference between the two variables.
    differences = []
    for cv1 in cm1.values():
        differences.append([cv1.difference(cv2) for cv2 in cm2.values()])

    # Pad manually with ones. A variable without a counterpart in the other
    # function counts as 100% difference, so 1.
    padded = munkres.pad_matrix(differences, pad_value=1)

    # The munkres algorithm calculates a matching with a minimal sum of the
    # taken fields, thus associating each variable of one function with one
    # of the other function.
    matching = munkres.compute(padded)
    total_cost = sum(padded[row][column] for row, column in matching)

    # Normalize so we have a value in [0, 1]
    return total_cost/max(len(cm1), len(cm2))
105 changes: 105 additions & 0 deletions
105
bears/tests/codeclone_detection/ClangCloneDetectionBearTest.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
import sys | ||
import unittest | ||
import os | ||
import inspect | ||
from queue import Queue | ||
|
||
sys.path.insert(0, ".") | ||
|
||
from bears.codeclone_detection.ClangSimilarityBear import ClangSimilarityBear | ||
from bears.codeclone_detection.ClangCloneDetectionBear import \ | ||
ClangCloneDetectionBear | ||
from coalib.bearlib.parsing.clang.cindex import Index, LibclangError | ||
from coalib.settings.Section import Section | ||
from coalib.settings.Setting import Setting | ||
|
||
|
||
class ClangCloneDetectionBearTest(unittest.TestCase):
    def setUp(self):
        # The sample files live next to this test module.
        self.base_test_path = os.path.abspath(os.path.join(
            os.path.dirname(inspect.getfile(ClangCloneDetectionBearTest)),
            "clone_detection_samples"))
        self.section = Section("default")
        self.section.append(Setting("condition_list",
                                    "returned, "
                                    "is_condition, "
                                    "in_condition, "
                                    "in_second_level_condition, "
                                    "in_third_level_condition, "
                                    "is_assignee, "
                                    "is_assigner, "
                                    "loop_content, "
                                    "second_level_loop_content, "
                                    "third_level_loop_content, "
                                    "is_param, "
                                    "in_sum, "
                                    "in_product, "
                                    "in_binary_operation,"
                                    "member_accessed"))

    def _get_sample_files(self, directory):
        """
        Lists the sample files of one sample directory.

        :param directory: Name of a directory below the sample base path.
        :return:          A sorted list of full paths to all entries in that
                          directory.
        """
        sample_path = os.path.join(self.base_test_path, directory)
        # Sorted, since os.listdir returns entries in arbitrary order.
        return [os.path.join(sample_path, elem)
                for elem in sorted(os.listdir(sample_path))]

    def test_dependencies(self):
        self.assertIn(ClangSimilarityBear,
                      ClangCloneDetectionBear.get_dependencies())

    def test_invalid_conditions(self):
        # An unknown counting condition makes the similarity bear give up
        # without returning any result.
        self.section.append(Setting("condition_list", "bullshit"))

        self.uut = ClangSimilarityBear({}, self.section, Queue())
        self.assertIsNone(self.uut.run_bear_from_section([], {}))

    def test_non_clones(self):
        self.non_clone_files = self._get_sample_files("non_clones")

        self.check_clone_detection_bear(self.non_clone_files,
                                        lambda results:
                                        self.assertEqual(results, []))

    def test_clones(self):
        self.clone_files = self._get_sample_files("clones")

        self.check_clone_detection_bear(self.clone_files,
                                        lambda results:
                                        self.assertNotEqual(results, []))

    def check_clone_detection_bear(self, files, result_check_function):
        """
        Checks the results of the CloneDetectionBear with the given function.

        :param files:                 The files to check. Each will be checked
                                      on its own.
        :param result_check_function: A function raising an exception if the
                                      results are invalid.
        """
        for file in files:
            # Run the dependency manually and feed its results to the bear
            # under test.
            similarity_results = ClangSimilarityBear(
                {file: ""},
                self.section,
                Queue()).run_bear_from_section([], {})
            uut = ClangCloneDetectionBear(
                {file: ""},
                self.section,
                Queue())
            arg_dict = {"dependency_results":
                        {"ClangSimilarityBear": similarity_results}}

            result_check_function(uut.run_bear_from_section([], arg_dict))
|
||
|
||
def skip_test():
    """
    Checks whether libclang is usable by trying to create an index.

    :return: False if the index could be created, otherwise the message of
             the raised LibclangError as a string.
    """
    try:
        Index.create()
    except LibclangError as error:
        return str(error)
    else:
        return False
|
||
|
||
if __name__ == '__main__': | ||
unittest.main(verbosity=2) |
17 changes: 17 additions & 0 deletions
17
bears/tests/codeclone_detection/clone_detection_samples/clones/faculty.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
int faculty1(int x) { | ||
int result = x; | ||
while(x > 2) { | ||
result *= --x; | ||
} | ||
|
||
return result; | ||
} | ||
|
||
int faculty2(int x) { | ||
int result = x; | ||
for(x--; x > 1; --x) { | ||
result *= x; | ||
} | ||
|
||
return result; | ||
} |
15 changes: 15 additions & 0 deletions
15
bears/tests/codeclone_detection/clone_detection_samples/clones/s1a.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
void original(int n) { | ||
float sum=0.0; //C1 | ||
float prod=1.0; | ||
for (int i=1; i<=n; i++) | ||
{sum=sum + i; | ||
prod = prod * i; | ||
foo(sum, prod); }} | ||
|
||
void sumProd(int n) { | ||
float sum=0.0; //C1 | ||
float prod=1.0; | ||
for (int i=1; i<=n; i++) | ||
{sum=sum + i; | ||
prod = prod * i; | ||
foo(sum, prod); }} |
15 changes: 15 additions & 0 deletions
15
bears/tests/codeclone_detection/clone_detection_samples/clones/s1b.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
void original(int n) { | ||
float sum=0.0; //C1 | ||
float prod=1.0; | ||
for (int i=1; i<=n; i++) | ||
{sum=sum + i; | ||
prod = prod * i; | ||
foo(sum, prod); }} | ||
|
||
void sumProd(int n) { | ||
float sum=0.0; //C1' | ||
float prod=1.0; //C | ||
for (int i=1; i<=n; i++) | ||
{sum=sum + i; | ||
prod = prod * i; | ||
foo(sum, prod); }} |
Oops, something went wrong.