Skip to content

Commit

Permalink
codeclone_detection: Add ClangCloneDetectionBear
Browse files Browse the repository at this point in the history
  • Loading branch information
sils committed May 24, 2015
1 parent b4b95bc commit db047bf
Show file tree
Hide file tree
Showing 25 changed files with 715 additions and 0 deletions.
1 change: 1 addition & 0 deletions .misc/.install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@ if python --version | grep 3\.4 ; then
pip install coveralls codecov
fi

pip install munkres3
sudo apt-get install espeak libclang1-3.4
sudo ln -s /usr/lib/x86_64-linux-gnu/libclang.so.1 /usr/lib/x86_64-linux-gnu/libclang.so
42 changes: 42 additions & 0 deletions bears/codeclone_detection/ClangCloneDetectionBear.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY
from coalib.results.Result import Result
from coalib.bears.GlobalBear import GlobalBear
from coalib.misc.i18n import _
from bears.codeclone_detection.ClangSimilarityBear import ClangSimilarityBear


class ClangCloneDetectionBear(GlobalBear):
    # Turns the pairwise function differences computed by ClangSimilarityBear
    # into user visible results.
    def run(self,
            dependency_results: dict,
            max_clone_difference: float=0.2):
        '''
        Checks the given code for similar functions that are probably
        redundant.
        :param max_clone_difference: The maximum difference a clone should
                                     have.
        '''
        bear_name = self.__class__.__name__

        def clone_result(origin, other, difference):
            """
            Builds the result reported at ``origin`` which points to ``other``.

            Both function ids are indexed as (file, line, function name).
            """
            message = _("Code clone found. The other occurrence is at file "
                        "{file}, line {line}, function {function}. The "
                        "similarity is {similarity}.").format(
                file=other[0],
                line=other[1],
                function=other[2],
                similarity=1-difference)
            return Result(bear_name,
                          message,
                          file=origin[0],
                          severity=RESULT_SEVERITY.MAJOR,
                          line_nr=origin[1])

        # The similarity bear hands over exactly one result whose contents
        # are (function id, function id, difference) triples.
        similarity_result = dependency_results["ClangSimilarityBear"][0]
        differences = similarity_result.contents

        self.debug("Creating results...")
        # Only pairs that differ by less than the threshold count as clones.
        return [clone_result(origin, other, difference)
                for origin, other, difference in differences
                if difference < max_clone_difference]

    @staticmethod
    def get_dependencies():
        # The similarity bear has to run first, its result is consumed above.
        return [ClangSimilarityBear]
100 changes: 100 additions & 0 deletions bears/codeclone_detection/ClangSimilarityBear.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from itertools import combinations
import multiprocessing


from coalib.processes.SectionExecutor import get_cpu_count
from coalib.results.HiddenResult import HiddenResult
from coalib.settings.Setting import typed_dict
from coalib.bears.GlobalBear import GlobalBear
from bears.codeclone_detection.ClangCountVectorCreator import \
ClangCountVectorCreator
from bears.codeclone_detection.ClangCountingConditions import condition_dict
from bears.codeclone_detection.CloneDetectionRoutines import \
compare_functions, \
get_count_matrices


"""
counting_condition_dict is a function object generated by typed_dict. This
function takes a setting and creates a dictionary out of it while it
converts all keys to counting condition function objects (via the
condition_dict) and all values to floats while unset values default to 1.
"""
# This converter is used as the annotation of the condition_list parameter of
# ClangSimilarityBear.run.
counting_condition_dict = typed_dict(
# Key conversion: the condition name is lower-cased before it is looked up in
# condition_dict, so condition names are matched case-insensitively.
lambda setting: condition_dict[str(setting).lower()],
# Value conversion: weightings are converted with float().
float,
# Weighting used for conditions that are given without an explicit value.
1)


# Coverage cannot be measured because this is in another process
def get_difference(args):  # pragma: no cover
    """
    Computes the difference between two functions.

    Everything is passed in one tuple so that this function can be mapped
    over a sequence of work items by a process pool.

    :param args: A tuple holding the first function id, the second function
                 id and the dictionary that maps each function id to the
                 count matrix of that function.
    :return:     A tuple containing both function ids and the value that
                 compare_functions returns for their count matrices.
    """
    first_id, second_id, matrices_by_function = args
    difference = compare_functions(matrices_by_function[first_id],
                                   matrices_by_function[second_id])

    return first_id, second_id, difference


class ClangSimilarityBear(GlobalBear):
    # NOTE: The docstring of run() uses ''' on purpose. According to the
    # review discussion of this commit, triple single quotes mark text that
    # may be shown to the user and thus has to be translatable, while triple
    # double quotes are used for text that is not translated.
    def run(self,
            condition_list: counting_condition_dict):
        '''
        Retrieves similarities for code clone detection. Those can be reused in
        another bear to produce results.
        :param condition_list: A comma separated list of counting
                               conditions. Possible values are: used,
                               returned, is_condition, in_condition,
                               is_assignee, is_assigner, loop_content.
                               Weightings can be assigned to each
                               condition due to providing a dict
                               value, i.e. having used weighted in
                               half as much as other conditions would
                               simply be: "used: 0.5, is_assignee".
                               Weightings default to 1 if unset.
        '''
        # Anything that is not a dict cannot be used as a mapping of counting
        # conditions to weightings; report it and yield no result (None).
        if not isinstance(condition_list, dict):
            self.err("The condition_list setting is invalid. Code clone "
                     "detection cannot run.")
            return

        self.debug("Using the following counting conditions:")
        for key, val in condition_list.items():
            self.debug(" *", key.__name__, "(weighting: {})".format(val))

        self.debug("Creating count matrices...")
        count_matrices = get_count_matrices(
            ClangCountVectorCreator(list(condition_list.keys()),
                                    list(condition_list.values())),
            list(self.file_dict.keys()),
            lambda prog: self.debug("{:2.4f}%...".format(prog)))

        self.debug("Calculating differences...")
        function_count = len(count_matrices)
        # Thats n over 2, hardcoded to simplify calculation
        combination_length = function_count * (function_count-1) / 2
        # Generated lazily: the number of pairs grows quadratically with the
        # number of functions, so they are not materialized in a list.
        function_combinations = ((f1, f2, count_matrices)
                                 for f1, f2 in combinations(count_matrices, 2))

        differences = []
        # Code clone detection may take ages for a larger code basis. It is
        # highly probable, that no other bears are running in parallel,
        # thus we do parallel execution within this bear.
        pool = multiprocessing.Pool(get_cpu_count())
        try:
            for i, elem in enumerate(pool.imap_unordered(
                    get_difference,
                    function_combinations,
                    chunksize=100)):
                if i % 1000 == 0:
                    self.debug(
                        "{:2.4f}%...".format(100*i/combination_length))
                differences.append(elem)
        finally:
            # Either all results were consumed or an error occurred; in both
            # cases the workers are not needed anymore. Without this the
            # worker processes would be left behind after every run.
            pool.terminate()
            pool.join()

        return [HiddenResult(self.__class__.__name__, differences)]
91 changes: 91 additions & 0 deletions bears/codeclone_detection/CloneDetectionRoutines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from munkres import Munkres
# Instantiate globally since this class is holding stateless public methods.
munkres = Munkres()


def exclude_function(count_matrix):
    """
    Decides heuristically whether a function should be left out of clone
    detection.

    Applied heuristics:
     * A function is excluded when every one of its count vectors sums up to
       less than 2; such functions are very likely only declarations or
       empty.

    :param count_matrix: A dictionary whose values are count vector objects
                         (each providing a ``count_vector`` sequence) for the
                         variables of a function.
    :return:             True if the function is useless for evaluation.
    """
    for variable_counts in count_matrix.values():
        # A single variable with a meaningful count keeps the function.
        if not sum(variable_counts.count_vector) < 2:
            return False

    return True


def get_count_matrices(count_vector_creator,
                       filenames,
                       progress_callback=lambda x: x):
    """
    Collects the count matrices of all functions in the given files.

    Functions for which exclude_function returns True are left out.

    :param count_vector_creator: An object with a get_vectors_for_file method
                                 taking a filename as argument.
    :param filenames:            The files to create count vectors for. Its
                                 length is needed to compute the progress.
    :param progress_callback:    A function with one float argument. It is
                                 called right before each file is processed
                                 with the percentage of files handled so far.
    :return:                     A dict holding a tuple of (file, line,
                                 function) as key and as value a dict with
                                 variable names as key and count vector
                                 objects as value.
    """
    file_count = len(filenames)
    matrices = {}

    for index, filename in enumerate(filenames):
        progress_callback(100*(index/file_count))
        vectors_by_function = count_vector_creator.get_vectors_for_file(
            filename)

        for function_id, count_matrix in vectors_by_function.items():
            if exclude_function(count_matrix):
                continue

            # Prefix the per-file function id with the filename so the key is
            # unique across all files.
            matrices[(filename, function_id[0], function_id[1])] = count_matrix

    return matrices


# Coverage cannot be measured because this is in another process
def compare_functions(cm1, cm2):  # pragma: no cover
    """
    Compares the functions represented by the given count matrices.

    :param cm1: Count vector dict for the first function.
    :param cm2: Count vector dict for the second function.
    :return:    The difference between these functions, 0 is identical and
                1 is not similar at all.
    """
    assert isinstance(cm1, dict)
    assert isinstance(cm2, dict)

    size_1 = len(cm1)
    size_2 = len(cm2)
    if size_1 == 0 or size_2 == 0:
        # Two functions without any variables are treated as identical, a
        # function without variables has nothing in common with one that
        # has some.
        return 0 if size_1 == size_2 else 1

    # Row i, column j holds the difference between variable i of the first
    # function and variable j of the second one. This represents a weighted
    # bipartite graph: the rows are the nodes of the first function, the
    # columns the nodes of the second function and each field is the weight
    # of the edge connecting the two.
    cost_matrix = []
    for cv1 in cm1.values():
        cost_matrix.append([cv1.difference(cv2) for cv2 in cm2.values()])

    # A variable without a counterpart in the other function is 100%
    # different, so the matrix is made square by padding with ones.
    cost_matrix = munkres.pad_matrix(cost_matrix, pad_value=1)

    # The munkres algorithm yields an assignment of rows to columns with a
    # minimal sum of the chosen fields, i.e. it associates every variable of
    # one function with one of the other function.
    matching = munkres.compute(cost_matrix)
    total_difference = sum(cost_matrix[row][column]
                           for row, column in matching)

    # Normalize so the value lies in [0, 1].
    return total_difference / max(size_1, size_2)
105 changes: 105 additions & 0 deletions bears/tests/codeclone_detection/ClangCloneDetectionBearTest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import sys
import unittest
import os
import inspect
from queue import Queue

sys.path.insert(0, ".")

from bears.codeclone_detection.ClangSimilarityBear import ClangSimilarityBear
from bears.codeclone_detection.ClangCloneDetectionBear import \
ClangCloneDetectionBear
from coalib.bearlib.parsing.clang.cindex import Index, LibclangError
from coalib.settings.Section import Section
from coalib.settings.Setting import Setting


class ClangCloneDetectionBearTest(unittest.TestCase):
    """
    Runs the clone detection bears on the sample files located in the
    "clone_detection_samples" directory next to this test file.
    """

    def setUp(self):
        self.base_test_path = os.path.abspath(os.path.join(
            os.path.dirname(inspect.getfile(ClangCloneDetectionBearTest)),
            "clone_detection_samples"))
        self.section = Section("default")
        self.section.append(Setting("condition_list",
                                    "returned, "
                                    "is_condition, "
                                    "in_condition, "
                                    "in_second_level_condition, "
                                    "in_third_level_condition, "
                                    "is_assignee, "
                                    "is_assigner, "
                                    "loop_content, "
                                    "second_level_loop_content, "
                                    "third_level_loop_content, "
                                    "is_param, "
                                    "in_sum, "
                                    "in_product, "
                                    "in_binary_operation,"
                                    "member_accessed"))

    def _sample_files(self, directory):
        """
        Lists the sample files of one sample directory.

        :param directory: Name of a directory below the sample base path.
        :return:          The paths of all entries in that directory, each
                          joined with the sample base path.
        """
        sample_path = os.path.join(self.base_test_path, directory)
        return [os.path.join(sample_path, elem)
                for elem in os.listdir(sample_path)]

    def test_dependencies(self):
        self.assertIn(ClangSimilarityBear,
                      ClangCloneDetectionBear.get_dependencies())

    def test_invalid_conditions(self):
        # An unknown counting condition must make the similarity bear give
        # up without producing a result.
        self.section.append(Setting("condition_list", "bullshit"))

        self.uut = ClangSimilarityBear({}, self.section, Queue())
        self.assertIsNone(self.uut.run_bear_from_section([], {}))

    def test_non_clones(self):
        self.non_clone_files = self._sample_files("non_clones")

        self.check_clone_detection_bear(self.non_clone_files,
                                        lambda results:
                                        self.assertEqual(results, []))

    def test_clones(self):
        self.clone_files = self._sample_files("clones")

        self.check_clone_detection_bear(self.clone_files,
                                        lambda results:
                                        self.assertNotEqual(results, []))

    def check_clone_detection_bear(self, files, result_check_function):
        """
        Checks the results of the CloneDetectionBear with the given function.

        :param files:                 The files to check. Each will be checked
                                      on its own.
        :param result_check_function: A function raising an exception if the
                                      results are invalid.
        """
        for file in files:
            # Run the dependency manually and hand its results over the same
            # way the detection bear reads them (keyed by the bear name).
            similarity_results = ClangSimilarityBear(
                {file: ""},
                self.section,
                Queue()).run_bear_from_section([], {})
            uut = ClangCloneDetectionBear(
                {file: ""},
                self.section,
                Queue())
            arg_dict = {"dependency_results":
                        {"ClangSimilarityBear": similarity_results}}

            result_check_function(uut.run_bear_from_section([], arg_dict))


def skip_test():
    """
    Tells whether libclang can be used by trying to create an index.

    :return: False if Index.create() succeeds, otherwise the message of the
             raised LibclangError as a string.
    """
    try:
        Index.create()
    except LibclangError as error:
        return str(error)
    else:
        return False


if __name__ == '__main__':
unittest.main(verbosity=2)
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
 * Clone detection sample: both functions start with result = x and multiply
 * it with every smaller integer down to 2. faculty1 does so with a while
 * loop, faculty2 with a for loop.
 */
int faculty1(int x) {
int result = x;
while(x > 2) {
result *= --x;
}

return result;
}

int faculty2(int x) {
int result = x;
for(x--; x > 1; --x) {
result *= x;
}

return result;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*
 * Clone detection sample: original and sumProd are identical except for
 * their names. foo is not declared within this sample.
 */
void original(int n) {
float sum=0.0; //C1
float prod=1.0;
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}

void sumProd(int n) {
float sum=0.0; //C1
float prod=1.0;
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*
 * Clone detection sample: original and sumProd contain the same statements
 * and differ only in their names and in their trailing comments. foo is not
 * declared within this sample.
 */
void original(int n) {
float sum=0.0; //C1
float prod=1.0;
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}

void sumProd(int n) {
float sum=0.0; //C1'
float prod=1.0; //C
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}
Loading

8 comments on commit db047bf

@Udayan12167
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just add something with weighted conditions in the test?

@Udayan12167
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ack otherwise

@sils
Copy link
Member Author

@sils sils commented on db047bf May 24, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that wouldn't change the outcome of the test (and shouldn't).

@sils
Copy link
Member Author

@sils sils commented on db047bf May 24, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so one could do an extra test for that... at least weightings are tested for the ClangCountVectorCreator.

@Udayan12167
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe you can add a test code sample in the documentation. My only concern is to show the users how weightings work.

@sils
Copy link
Member Author

@sils sils commented on db047bf May 24, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Udayan12167
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay that's cool then. Good to finally see it all come together :D

@sils
Copy link
Member Author

@sils sils commented on db047bf May 24, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, pushed that to master :) we just got codeclone detection :)

Please sign in to comment.