coala · sils · May 17, 2015 · May 18, 2015 · May 20, 2015 · May 19, 2015
diff --git a/.misc/.install.sh b/.misc/.install.sh
@@ -2,5 +2,6 @@ if python --version | grep 3\.4 ; then
   pip install coveralls codecov
 fi
 
+pip install munkres3
 sudo apt-get install espeak libclang1-3.4
 sudo ln -s /usr/lib/x86_64-linux-gnu/libclang.so.1 /usr/lib/x86_64-linux-gnu/libclang.so
diff --git a/appveyor/appveyor.yml → .misc/appveyor.yml b/appveyor/appveyor.yml → .misc/appveyor.yml
@@ -3,7 +3,7 @@ environment:
     # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the
     # /E:ON and /V:ON options are not enabled in the batch script intepreter
     # See: http://stackoverflow.com/a/13751649/163740
-    CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\run_with_env.cmd"
+    CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\.misc\\run_with_env.cmd"
 
   matrix:
     - PYTHON: "C:\\Python33"

diff --git a/appveyor/run_with_env.cmd → .misc/run_with_env.cmd b/appveyor/run_with_env.cmd → .misc/run_with_env.cmd
diff --git a/README.md b/README.md
@@ -49,8 +49,11 @@ Lasse Schuirmann  <lasse@schuirmann.net> and Fabian Neuschmidt
 Build status
 ------------
 
-[![Build Status](https://travis-ci.org/coala-analyzer/coala.svg?branch=master)](https://travis-ci.org/coala-analyzer/coala)
-(Test and build)
+[![Linux Build Status](https://travis-ci.org/coala-analyzer/coala.svg?branch=master)](https://travis-ci.org/coala-analyzer/coala)
+(Linux build and test, Python 3.2, 3.3, 3.4)
+
+[![Windows Build status](https://ci.appveyor.com/api/projects/status/jevcxfo48mc4e09p/branch/master?svg=true)](https://ci.appveyor.com/project/sils1297/coala/branch/master)
+(Windows build and test, Python 3.3, 3.4)
 
 [![Codacy Code Quality](https://www.codacy.com/project/badge/f0ac979fa93f49509cba9086754a50d4)](https://www.codacy.com/app/lasse/coala)
 (Code quality 1)
@@ -59,10 +62,10 @@ Build status
 (Code quality 2)
 
 [![Coverage Status](https://coveralls.io/repos/coala-analyzer/coala/badge.svg?branch=master)](https://coveralls.io/r/coala-analyzer/coala?branch=master)
-(Statement coverage)
+(Statement coverage, measured on Linux with Python 3.4)
 
 [![codecov.io](https://codecov.io/github/coala-analyzer/coala/coverage.svg?branch=master)](https://codecov.io/github/coala-analyzer/coala?branch=master)
-(Branch coverage)
+(Branch coverage, measured on Linux with Python 3.4)
 
 [![Documentation Status](https://readthedocs.org/projects/coala/badge/?version=latest)](https://readthedocs.org/projects/coala/?badge=latest)
 (Documentation Status)

diff --git a/bears/codeclone_detection/ClangCloneDetectionBear.py b/bears/codeclone_detection/ClangCloneDetectionBear.py
@@ -0,0 +1,42 @@
+from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY
+from coalib.results.Result import Result
+from coalib.bears.GlobalBear import GlobalBear
+from coalib.misc.i18n import _
+from bears.codeclone_detection.ClangSimilarityBear import ClangSimilarityBear
+
+
+class ClangCloneDetectionBear(GlobalBear):
+    def run(self,
+            dependency_results: dict,
+            max_clone_difference: float=0.2):
+        '''
+        Checks the given code for similar functions that are probably
+        redundant.
+
+        :param max_clone_difference: The maximum difference a clone should
+                                     have.
+        '''
+        differences = dependency_results["ClangSimilarityBear"][0].contents
+
+        self.debug("Creating results...")
+        results = []
+        for function_1, function_2, difference in differences:
+            if difference < max_clone_difference:
+                results.append(Result(
+                    self.__class__.__name__,
+                    _("Code clone found. The other occurrence is at file "
+                      "{file}, line {line}, function {function}. The "
+                      "similarity is {similarity}.").format(
+                        file=function_2[0],
+                        line=function_2[1],
+                        function=function_2[2],
+                        similarity=1-difference),
+                    file=function_1[0],
+                    severity=RESULT_SEVERITY.MAJOR,
+                    line_nr=function_1[1]))
+
+        return results
+
+    @staticmethod
+    def get_dependencies():
+        return [ClangSimilarityBear]
diff --git a/bears/codeclone_detection/ClangCountVectorCreator.py b/bears/codeclone_detection/ClangCountVectorCreator.py
@@ -108,7 +108,9 @@ def _get_vectors_for_cursor(self, cursor, filename):
 
         if str(file) == str(filename) and self.is_function_declaration(cursor):
             self._get_vector_for_function(cursor)
-            result = {self.get_identifier_name(cursor): self.count_vectors}
+
+            result = {(cursor.extent.start.line,
+                       self.get_identifier_name(cursor)): self.count_vectors}
             # Reset local states
             self.count_vectors = {}
             self.stack = []

diff --git a/bears/codeclone_detection/ClangCountingConditions.py b/bears/codeclone_detection/ClangCountingConditions.py
@@ -7,7 +7,6 @@
 
 from coalib.bearlib.parsing.clang.cindex import CursorKind
 from coalib.misc.Enum import enum
-from coalib.settings.Setting import Setting
 
 
 def _stack_contains_kind(stack, kind):
@@ -36,19 +35,20 @@ def _is_nth_child_of_kind(stack, allowed_nums, kind):
                          and the child number.
     :param allowed_nums: List/iterator of child numbers allowed.
     :param kind:         The kind of the parent element.
-    :return:             True if the described situation matches.
+    :return:             Number of matches.
     """
     is_kind_child = False
+    count = 0
     for elem, child_num in stack:
         if is_kind_child and child_num in allowed_nums:
-            return True
+            count += 1
 
         if elem.kind == kind:
             is_kind_child = True
         else:
             is_kind_child = False
 
-    return False
+    return count
 
 
 FOR_POSITION = enum("UNKNOWN", "INIT", "COND", "INC", "BODY")
@@ -119,12 +119,67 @@ def _get_positions_in_for_loop(cursor, stack):
     return results
 
 
-ARITH_BINARY_OPERATORS = ['+', '-', '*', '/', '&', '|']
+def _get_binop_operator(cursor):
+    """
+    Returns the operator token of a binary operator cursor.
+
+    :param cursor: A cursor of kind BINARY_OPERATOR.
+    :return:       The token object containing the actual operator or None.
+    """
+    children = list(cursor.get_children())
+    operator_min_begin = (children[0].location.line,
+                          children[0].location.column)
+    operator_max_end = (children[1].location.line,
+                        children[1].location.column)
+
+    for token in cursor.get_tokens():
+        if (operator_min_begin < (token.extent.start.line,
+                                  token.extent.start.column) and
+            operator_max_end >= (token.extent.end.line,
+                                token.extent.end.column)):
+            return token
+
+    return None  # pragma: no cover
+
+
+def _stack_contains_operators(stack, operators):
+    for elem, child_num in stack:
+        if elem.kind in [CursorKind.BINARY_OPERATOR,
+                         CursorKind.COMPOUND_ASSIGNMENT_OPERATOR]:
+            operator = _get_binop_operator(elem)
+            # Not known how to reproduce but may be possible when evil macros
+            # join the game.
+            if operator is None:  # pragma: no cover
+                continue
+
+            if operator.spelling.decode() in operators:
+                return True
+
+    return False
+
+
+ARITH_BINARY_OPERATORS = ['+', '-', '*', '/', '%', '&', '|']
 COMPARISION_OPERATORS = ["==", "<=", ">=", "<", ">", "!=", "&&", "||"]
 ADV_ASSIGNMENT_OPERATORS = [op + "=" for op in ARITH_BINARY_OPERATORS]
 ASSIGNMENT_OPERATORS = ["="] + ADV_ASSIGNMENT_OPERATORS
 
 
+def in_sum(cursor, stack):
+    return _stack_contains_operators(stack, ['+', '-', '+=', '-='])
+
+
+def in_product(cursor, stack):
+    return _stack_contains_operators(stack, ['*', '/', '%', '*=', '/=', '%='])
+
+
+def in_binary_operation(cursor, stack):
+    return _stack_contains_operators(stack, ['&', '|', '&=', '|='])
+
+
+def member_accessed(cursor, stack):
+    return _stack_contains_kind(stack, CursorKind.MEMBER_REF_EXPR)
+
+
 def used(cursor, stack):
     return True
 
@@ -144,15 +199,23 @@ def is_inc_or_dec(cursor, stack):
 
 
 def is_condition(cursor, stack):
-    return (_is_nth_child_of_kind(stack, [0], CursorKind.WHILE_STMT) or
-            _is_nth_child_of_kind(stack, [0], CursorKind.IF_STMT) or
+    return (_is_nth_child_of_kind(stack, [0], CursorKind.WHILE_STMT) != 0 or
+            _is_nth_child_of_kind(stack, [0], CursorKind.IF_STMT) != 0 or
             FOR_POSITION.COND in _get_positions_in_for_loop(cursor, stack))
 
 
 def in_condition(cursor, stack):
     # In every case the first child of IF_STMT is the condition itself
     # (non-NULL) so the second and third child are in the then/else branch
-    return _is_nth_child_of_kind(stack, [1, 2], CursorKind.IF_STMT)
+    return _is_nth_child_of_kind(stack, [1, 2], CursorKind.IF_STMT) == 1
+
+
+def in_second_level_condition(cursor, stack):
+    return _is_nth_child_of_kind(stack, [1, 2], CursorKind.IF_STMT) == 2
+
+
+def in_third_level_condition(cursor, stack):
+    return _is_nth_child_of_kind(stack, [1, 2], CursorKind.IF_STMT) > 2
 
 
 def is_assignee(cursor, stack):
@@ -192,32 +255,55 @@ def is_assigner(cursor, stack):
     return is_inc_or_dec(cursor, stack)
 
 
-def loop_content(cursor, stack):
+def _loop_level(cursor, stack):
     positions_in_for = _get_positions_in_for_loop(cursor, stack)
-    return (_is_nth_child_of_kind(stack, [1], CursorKind.WHILE_STMT) or
-            FOR_POSITION.INC in positions_in_for or
-            FOR_POSITION.BODY in positions_in_for)
+    return (positions_in_for.count(FOR_POSITION.INC) +
+            positions_in_for.count(FOR_POSITION.BODY) +
+            _is_nth_child_of_kind(stack, [1], CursorKind.WHILE_STMT))
+
+
+def loop_content(cursor, stack):
+    return _loop_level(cursor, stack) == 1
+
+
+def second_level_loop_content(cursor, stack):
+    return _loop_level(cursor, stack) == 2
+
+
+def third_level_loop_content(cursor, stack):
+    return _loop_level(cursor, stack) > 2
+
+
+def is_param(cursor, stack):
+    return cursor.kind == CursorKind.PARM_DECL
 
 
 condition_dict = {"used": used,
                   "returned": returned,
                   "is_condition": is_condition,
                   "in_condition": in_condition,
+                  "in_second_level_condition": in_second_level_condition,
+                  "in_third_level_condition": in_third_level_condition,
                   "is_assignee": is_assignee,
                   "is_assigner": is_assigner,
-                  "loop_content": loop_content}
+                  "loop_content": loop_content,
+                  "second_level_loop_content": second_level_loop_content,
+                  "third_level_loop_content": third_level_loop_content,
+                  "is_param": is_param,
+                  "in_sum": in_sum,
+                  "in_product": in_product,
+                  "in_binary_operation": in_binary_operation,
+                  "member_accessed": member_accessed}
 
 
 def counting_condition(value):
     """
     This is a custom converter to convert a setting from coala into counting
     condition function objects for this bear only.
 
-    :param value: A Setting
+    :param value: An object that can be converted to a list.
     :return:      A list of functions (counting conditions)
     """
-    assert isinstance(value, Setting)
-
     str_list = list(value)
     result_list = []
     for elem in str_list:

diff --git a/bears/codeclone_detection/ClangSimilarityBear.py b/bears/codeclone_detection/ClangSimilarityBear.py
@@ -0,0 +1,100 @@
+from itertools import combinations
+import multiprocessing
+
+
+from coalib.processes.SectionExecutor import get_cpu_count
+from coalib.results.HiddenResult import HiddenResult
+from coalib.settings.Setting import typed_dict
+from coalib.bears.GlobalBear import GlobalBear
+from bears.codeclone_detection.ClangCountVectorCreator import \
+    ClangCountVectorCreator
+from bears.codeclone_detection.ClangCountingConditions import condition_dict
+from bears.codeclone_detection.CloneDetectionRoutines import \
+    compare_functions, \
+    get_count_matrices
+
+
+"""
+counting_condition_dict is a function object generated by typed_dict. This
+function takes a setting and creates a dictionary out of it while it
+converts all keys to counting condition function objects (via the
+condition_dict) and all values to floats while unset values default to 1.
+"""
+counting_condition_dict = typed_dict(
+    lambda setting: condition_dict[str(setting).lower()],
+    float,
+    1)
+
+
+# Coverage cannot be measured because this is in another process
+def get_difference(args):  # pragma: no cover
+    """
+    Retrieves the difference between two functions using the munkres algorithm.
+
+    :param args: A tuple holding the first function id, the second function
+                 id and the dictionary of count matrices, keyed by function
+                 id.
+    :return:     A tuple containing both function ids and their difference.
+    """
+    function_1, function_2, count_matrices = args
+    return (function_1,
+            function_2,
+            compare_functions(count_matrices[function_1],
+                              count_matrices[function_2]))
+
+
+class ClangSimilarityBear(GlobalBear):
+    def run(self,
+            condition_list: counting_condition_dict):
+        '''
+        Retrieves similarities for code clone detection. Those can be reused in
+        another bear to produce results.
+
+        :param condition_list:       A comma-separated list of counting
+                                     conditions. Possible values are the
+                                     keys of condition_dict (e.g. used,
+                                     returned, is_condition, loop_content).
+                                     A weighting can be assigned to each
+                                     condition by providing a dict value,
+                                     i.e. weighting used half as much as
+                                     other conditions would simply be:
+                                     "used: 0.5, is_assignee".
+                                     Weightings default to 1 if unset.
+        '''
+        if not isinstance(condition_list, dict):
+            self.err("The condition_list setting is invalid. Code clone "
+                     "detection cannot run.")
+            return
+
+        self.debug("Using the following counting conditions:")
+        for key, val in condition_list.items():
+            self.debug(" *", key.__name__, "(weighting: {})".format(val))
+
+        self.debug("Creating count matrices...")
+        count_matrices = get_count_matrices(
+            ClangCountVectorCreator(list(condition_list.keys()),
+                                    list(condition_list.values())),
+            list(self.file_dict.keys()),
+            lambda prog: self.debug("{:2.4f}%...".format(prog)))
+
+        self.debug("Calculating differences...")
+        # Code clone detection may take ages for a larger code base. It is
+        # highly probable that no other bears are running in parallel,
+        # thus we do parallel execution within this bear.
+        pool = multiprocessing.Pool(get_cpu_count())
+
+        differences = []
+        function_count = len(count_matrices)
+        # That's n choose 2, hardcoded to simplify calculation
+        combination_length = function_count * (function_count-1) / 2
+        function_combinations = [(f1, f2, count_matrices)
+                                 for f1, f2 in combinations(count_matrices, 2)]
+
+        for i, elem in enumerate(pool.imap_unordered(get_difference,
+                                                     function_combinations,
+                                                     chunksize=100)):
+            if i % 1000 == 0:
+                self.debug("{:2.4f}%...".format(100*i/combination_length))
+            differences.append(elem)
+
+        return [HiddenResult(self.__class__.__name__, differences)]