codeclone_detection: Add ClangCloneDetectionBear
sils committed May 21, 2015
1 parent c9d5eab commit dfeb632
Showing 25 changed files with 715 additions and 0 deletions.
1 change: 1 addition & 0 deletions .misc/.install.sh
@@ -2,5 +2,6 @@ if python --version | grep 3\.4 ; then
pip install coveralls codecov
fi

pip install munkres3
sudo apt-get install espeak libclang1-3.4
sudo ln -s /usr/lib/x86_64-linux-gnu/libclang.so.1 /usr/lib/x86_64-linux-gnu/libclang.so
42 changes: 42 additions & 0 deletions bears/codeclone_detection/ClangCloneDetectionBear.py
@@ -0,0 +1,42 @@
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY
from coalib.results.Result import Result
from coalib.bears.GlobalBear import GlobalBear
from coalib.misc.i18n import _
from bears.codeclone_detection.ClangSimilarityBear import ClangSimilarityBear


class ClangCloneDetectionBear(GlobalBear):
def run(self,
dependency_results: dict,
max_clone_difference: float=0.2):
'''
Checks the given code for similar functions that are probably
redundant.
:param max_clone_difference: The maximum difference two functions may
have to still be reported as clones.
'''
differences = dependency_results["ClangSimilarityBear"][0].contents

self.debug("Creating results...")
results = []
for function_1, function_2, difference in differences:
if difference < max_clone_difference:
results.append(Result(
self.__class__.__name__,
_("Code clone found. The other occurrence is at file "
"{file}, line {line}, function {function}. The "
"similarity is {similarity}.").format(
file=function_2[0],
line=function_2[1],
function=function_2[2],
similarity=1-difference),
file=function_1[0],
severity=RESULT_SEVERITY.MAJOR,
line_nr=function_1[1]))

return results

@staticmethod
def get_dependencies():
return [ClangSimilarityBear]
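
As a rough illustration of the filtering in run above (not part of this commit; the function identifiers and difference values below are made up), each entry coming from ClangSimilarityBear is a (function_1, function_2, difference) triple where a function is identified by a (file, line, name) tuple, and only pairs below max_clone_difference become results:

# Illustrative sketch only; identifiers and difference values are made up.
differences = [
    (("a.c", 1, "faculty1"), ("a.c", 10, "faculty2"), 0.05),   # near clone
    (("b.c", 3, "original"), ("b.c", 40, "unrelated"), 0.75),  # dissimilar
]

max_clone_difference = 0.2
clones = [(f1, f2, diff)
          for f1, f2, diff in differences
          if diff < max_clone_difference]

# Only the first pair survives; it would be reported at file "a.c", line 1,
# with similarity = 1 - 0.05 = 0.95.
print(clones)
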
100 changes: 100 additions & 0 deletions bears/codeclone_detection/ClangSimilarityBear.py
@@ -0,0 +1,100 @@
from itertools import combinations
import multiprocessing


from coalib.processes.SectionExecutor import get_cpu_count
from coalib.results.HiddenResult import HiddenResult
from coalib.settings.Setting import typed_dict
from coalib.bears.GlobalBear import GlobalBear
from bears.codeclone_detection.ClangCountVectorCreator import \
ClangCountVectorCreator
from bears.codeclone_detection.ClangCountingConditions import condition_dict
from bears.codeclone_detection.CloneDetectionRoutines import \
compare_functions, \
get_count_matrices


"""
counting_condition_dict is a function object generated by typed_dict. It
takes a setting and creates a dictionary from it, converting all keys to
counting condition function objects (via condition_dict) and all values to
floats; unset values default to 1.
"""
counting_condition_dict = typed_dict(
lambda setting: condition_dict[str(setting).lower()],
float,
1)


# Coverage cannot be measured because this is in another process
def get_difference(args): # pragma: no cover
"""
Retrieves the difference between two functions using the munkres algorithm.
:param args: A tuple holding the first function id, the second and the
count matrices dictionary holding the count matrices for
each function with the function id as key.
:return: A tuple containing both function ids and their difference.
"""
function_1, function_2, count_matrices = args
return (function_1,
function_2,
compare_functions(count_matrices[function_1],
count_matrices[function_2]))


class ClangSimilarityBear(GlobalBear):
def run(self,
condition_list: counting_condition_dict):
'''
Retrieves similarities for code clone detection. Those can be reused in
another bear to produce results.
:param condition_list: A comma separated list of counting
conditions. Possible values are: used,
returned, is_condition, in_condition,
is_assignee, is_assigner, loop_content.
A weighting can be assigned to each
condition by providing a dict value,
e.g. weighting used half as much as the
other conditions would simply be:
"used: 0.5, is_assignee". Weightings
default to 1 if unset.
'''
if not isinstance(condition_list, dict):
self.err("The condition_list setting is invalid. Code clone "
"detection cannot run.")
return

self.debug("Using the following counting conditions:")
for key, val in condition_list.items():
self.debug(" *", key.__name__, "(weighting: {})".format(val))

self.debug("Creating count matrices...")
count_matrices = get_count_matrices(
ClangCountVectorCreator(list(condition_list.keys()),
list(condition_list.values())),
list(self.file_dict.keys()),
lambda prog: self.debug("{:2.4f}%...".format(prog)))

self.debug("Calculating differences...")
# Code clone detection may take ages for a larger code base. It is
# highly probable that no other bears are running in parallel,
# so we do parallel execution within this bear.
pool = multiprocessing.Pool(get_cpu_count())

differences = []
function_count = len(count_matrices)
# That's n choose 2, hardcoded to simplify the calculation
combination_length = function_count * (function_count-1) / 2
function_combinations = [(f1, f2, count_matrices)
for f1, f2 in combinations(count_matrices, 2)]

for i, elem in enumerate(pool.imap_unordered(get_difference,
function_combinations,
chunksize=100)):
if i % 1000 == 0:
self.debug("{:2.4f}%...".format(100*i/combination_length))
differences.append(elem)

return [HiddenResult(self.__class__.__name__, differences)]
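
The condition_list format described in the docstring above is easiest to see with a small worked example. The sketch below is illustrative only: it emulates the conversion that counting_condition_dict is meant to perform, with stand-in condition functions instead of the real ones from ClangCountingConditions and without the actual typed_dict implementation.

# Illustrative sketch only: parsing "used: 0.5, is_assignee" the way
# counting_condition_dict is described to. The condition functions are
# stand-ins, not the real counting conditions.
def used(*args):
    pass


def is_assignee(*args):
    pass


condition_dict = {"used": used, "is_assignee": is_assignee}

setting_value = "used: 0.5, is_assignee"
condition_list = {}
for entry in setting_value.split(","):
    name, _, weight = entry.partition(":")
    condition_list[condition_dict[name.strip().lower()]] = (
        float(weight) if weight.strip() else 1.0)

# used is weighted half as much as is_assignee, which defaults to 1.
assert condition_list == {used: 0.5, is_assignee: 1.0}
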
91 changes: 91 additions & 0 deletions bears/codeclone_detection/CloneDetectionRoutines.py
@@ -0,0 +1,91 @@
from munkres import Munkres
# Instantiate globally since this class holds stateless public methods.
munkres = Munkres()


def exclude_function(count_matrix):
"""
Determines heuristically whether or not it makes sense for clone
detection to take this function into account.
Applied heuristics:
* Functions whose count vectors all sum to 1 or 0 are very likely
only declarations or empty functions and are to be ignored.
:param count_matrix: A dictionary with count vectors representing all
variables for a function.
:return: True if the function is useless for evaluation.
"""
return all(sum(cv.count_vector) < 2 for cv in count_matrix.values())


def get_count_matrices(count_vector_creator,
filenames,
progress_callback=lambda x: x):
"""
Retrieves matrices holding count vectors for all variables for all
functions in the given file.
:param count_vector_creator: An object with a get_vectors_for_file method
taking a filename as argument.
:param filenames: The files to create count vectors for.
:param progress_callback: A function that is called after processing
each file with the progress percentage
(a float) as its only argument.
:return: A dict holding a tuple of (file, line,
function) as key and as value a dict with
variable names as key and count vector
objects as value.
"""
result = {}
maxlen = len(filenames)

for i, filename in enumerate(filenames):
progress_callback(100*(i/maxlen))
count_dict = count_vector_creator.get_vectors_for_file(filename)
for function in count_dict:
if not exclude_function(count_dict[function]):
result[(filename,
function[0],
function[1])] = count_dict[function]

return result


# Coverage cannot be measured because this is in another process
def compare_functions(cm1, cm2): # pragma: no cover
"""
Compares the functions represented by the given count matrices.
:param cm1: Count vector dict for the first function.
:param cm2: Count vector dict for the second function.
:return: The difference between these functions, where 0 means
identical and 1 means not similar at all.
"""
assert isinstance(cm1, dict)
assert isinstance(cm2, dict)
if len(cm1) == 0 or len(cm2) == 0:
return 1 if len(cm1) != len(cm2) else 0

# The cost matrix holds the difference between the two variables i and
# j in the i/j field. This is a representation of a bipartite weighted
# graph with nodes representing the variables of the first function on
# the one side (rows) and nodes representing the variables of the second
# function on the other side (columns). The fields in the matrix are the
# weighted edges connecting each element from one side to the other.
cost_matrix = [[cv1.difference(cv2)
for cv2 in cm2.values()]
for cv1 in cm1.values()]

# Pad manually with ones. If a variable in one function has no
# counterpart in the other, that is a 100% difference, so 1.
cost_matrix = munkres.pad_matrix(cost_matrix, pad_value=1)

# The munkres algorithm calculates a matching such that the sum of
# the taken fields is minimal. It thus associates each variable
# from one function with one in the other function.
matching = munkres.compute(cost_matrix)

# Sum it up, normalize it so we have a value in [0, 1]
return (sum(cost_matrix[x][y] for x, y in matching) /
max(len(cm1), len(cm2)))
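
To make the matching and normalization at the end of compare_functions concrete, here is a small worked example. It is illustrative only: the pairwise differences 0.1 and 0.8 are made up, and it relies on the munkres3 package that this commit adds to the install script.

# Illustrative sketch only: a function with variables a1, a2 compared
# against a function with a single variable b1.
from munkres import Munkres

munkres = Munkres()

cost_matrix = [[0.1],   # difference of a1 vs. b1 (made up)
               [0.8]]   # difference of a2 vs. b1 (made up)

# Pad with ones, as compare_functions does, so that the unmatched
# variable counts as a 100% difference.
cost_matrix = munkres.pad_matrix(cost_matrix, pad_value=1)

matching = munkres.compute(cost_matrix)

# Normalize by max(len(cm1), len(cm2)), which is 2 here. The matching
# pairs a1 with b1 (0.1) and a2 with the padding column (1.0), giving
# (0.1 + 1.0) / 2 = 0.55.
difference = sum(cost_matrix[x][y] for x, y in matching) / 2
print(difference)
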
105 changes: 105 additions & 0 deletions bears/tests/codeclone_detection/ClangCloneDetectionBearTest.py
@@ -0,0 +1,105 @@
import sys
import unittest
import os
import inspect
from queue import Queue

sys.path.insert(0, ".")

from bears.codeclone_detection.ClangSimilarityBear import ClangSimilarityBear
from bears.codeclone_detection.ClangCloneDetectionBear import \
ClangCloneDetectionBear
from coalib.bearlib.parsing.clang.cindex import Index, LibclangError
from coalib.settings.Section import Section
from coalib.settings.Setting import Setting


class ClangCloneDetectionBearTest(unittest.TestCase):
def setUp(self):
self.base_test_path = os.path.abspath(os.path.join(
os.path.dirname(inspect.getfile(ClangCloneDetectionBearTest)),
"clone_detection_samples"))
self.section = Section("default")
self.section.append(Setting("condition_list",
"returned, "
"is_condition, "
"in_condition, "
"in_second_level_condition, "
"in_third_level_condition, "
"is_assignee, "
"is_assigner, "
"loop_content, "
"second_level_loop_content, "
"third_level_loop_content, "
"is_param, "
"in_sum, "
"in_product, "
"in_binary_operation,"
"member_accessed"))
self.clone_files = os.listdir(os.path.join(self.base_test_path,
"clones"))

def test_dependencies(self):
self.assertIn(ClangSimilarityBear,
ClangCloneDetectionBear.get_dependencies())

def test_invalid_conditions(self):
self.section.append(Setting("condition_list", "bullshit"))

self.uut = ClangSimilarityBear({}, self.section, Queue())
self.assertEqual(self.uut.run_bear_from_section([], {}), None)

def test_non_clones(self):
self.non_clone_files = [
os.path.join(self.base_test_path, "non_clones", elem)
for elem in os.listdir(os.path.join(self.base_test_path,
"non_clones"))]

self.check_clone_detection_bear(self.non_clone_files,
lambda results:
self.assertEqual(results, []))

def test_clones(self):
self.clone_files = [
os.path.join(self.base_test_path, "clones", elem)
for elem in os.listdir(os.path.join(self.base_test_path,
"clones"))]

self.check_clone_detection_bear(self.clone_files,
lambda results:
self.assertNotEqual(results, []))

def check_clone_detection_bear(self, files, result_check_function):
"""
Checks the results of the ClangCloneDetectionBear with the given function.
:param files: The files to check. Each will be checked
on its own.
:param result_check_function: A function raising an exception if the
results are invalid.
"""
for file in files:
similarity_results = ClangSimilarityBear(
{file: ""},
self.section,
Queue()).run_bear_from_section([], {})
uut = ClangCloneDetectionBear(
{file: ""},
self.section,
Queue())
arg_dict = {"dependency_results":
{"ClangSimilarityBear": similarity_results}}

result_check_function(uut.run_bear_from_section([], arg_dict))


def skip_test():
try:
Index.create()
return False
except LibclangError as error:
return str(error)


if __name__ == '__main__':
unittest.main(verbosity=2)
@@ -0,0 +1,17 @@
int faculty1(int x) {
int result = x;
while(x > 2) {
result *= --x;
}

return result;
}

int faculty2(int x) {
int result = x;
for(x--; x > 1; --x) {
result *= x;
}

return result;
}
@@ -0,0 +1,15 @@
void original(int n) {
float sum=0.0; //C1
float prod=1.0;
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}

void sumProd(int n) {
float sum=0.0; //C1
float prod=1.0;
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}
@@ -0,0 +1,15 @@
void original(int n) {
float sum=0.0; //C1
float prod=1.0;
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}

void sumProd(int n) {
float sum=0.0; //C1'
float prod=1.0; //C
for (int i=1; i<=n; i++)
{sum=sum + i;
prod = prod * i;
foo(sum, prod); }}
