Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wip/sils/cc v6 #564

Closed
wants to merge 13 commits into from
1 change: 1 addition & 0 deletions .misc/.install.sh
Expand Up @@ -2,5 +2,6 @@ if python --version | grep 3\.4 ; then
pip install coveralls codecov
fi

pip install munkres3
sudo apt-get install espeak libclang1-3.4
sudo ln -s /usr/lib/x86_64-linux-gnu/libclang.so.1 /usr/lib/x86_64-linux-gnu/libclang.so
2 changes: 1 addition & 1 deletion appveyor/appveyor.yml → .misc/appveyor.yml
Expand Up @@ -3,7 +3,7 @@ environment:
# SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the
# /E:ON and /V:ON options are not enabled in the batch script interpreter
# See: http://stackoverflow.com/a/13751649/163740
CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\run_with_env.cmd"
CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\.misc\\run_with_env.cmd"

matrix:
- PYTHON: "C:\\Python33"
Expand Down
File renamed without changes.
11 changes: 7 additions & 4 deletions README.md
Expand Up @@ -49,8 +49,11 @@ Lasse Schuirmann <lasse@schuirmann.net> and Fabian Neuschmidt
Build status
------------

[![Build Status](https://travis-ci.org/coala-analyzer/coala.svg?branch=master)](https://travis-ci.org/coala-analyzer/coala)
(Test and build)
[![Linux Build Status](https://travis-ci.org/coala-analyzer/coala.svg?branch=master)](https://travis-ci.org/coala-analyzer/coala)
(Linux build and test, python 3.2, 3.3, 3.4)

[![Windows Build status](https://ci.appveyor.com/api/projects/status/jevcxfo48mc4e09p/branch/master?svg=true)](https://ci.appveyor.com/project/sils1297/coala/branch/master)
(Windows build and test, python 3.3, 3.4)

[![Codacy Code Quality](https://www.codacy.com/project/badge/f0ac979fa93f49509cba9086754a50d4)](https://www.codacy.com/app/lasse/coala)
(Code quality 1)
Expand All @@ -59,10 +62,10 @@ Build status
(Code quality 2)

[![Coverage Status](https://coveralls.io/repos/coala-analyzer/coala/badge.svg?branch=master)](https://coveralls.io/r/coala-analyzer/coala?branch=master)
(Statement coverage)
(Statement coverage, measured on linux with python 3.4)

[![codecov.io](https://codecov.io/github/coala-analyzer/coala/coverage.svg?branch=master)](https://codecov.io/github/coala-analyzer/coala?branch=master)
(Branch coverage)
(Branch coverage, measured on linux with python 3.4)

[![Documentation Status](https://readthedocs.org/projects/coala/badge/?version=latest)](https://readthedocs.org/projects/coala/?badge=latest)
(Documentation Status)
Expand Down
42 changes: 42 additions & 0 deletions bears/codeclone_detection/ClangCloneDetectionBear.py
@@ -0,0 +1,42 @@
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY
from coalib.results.Result import Result
from coalib.bears.GlobalBear import GlobalBear
from coalib.misc.i18n import _
from bears.codeclone_detection.ClangSimilarityBear import ClangSimilarityBear


class ClangCloneDetectionBear(GlobalBear):
    """
    Reports pairs of functions as code clones, based on the function
    differences provided by the ClangSimilarityBear.
    """

    @staticmethod
    def get_dependencies():
        # The differences evaluated in run() are produced by this bear.
        return [ClangSimilarityBear]

    def run(self,
            dependency_results: dict,
            max_clone_difference: float=0.2):
        '''
        Checks the given code for similar functions that are probably
        redundant.

        :param max_clone_difference: The maximum difference a clone should
                                     have.
        '''
        # The first result of the dependency bear carries an iterable of
        # (function_1, function_2, difference) tuples.
        differences = dependency_results["ClangSimilarityBear"][0].contents

        self.debug("Creating results...")
        clones = []
        for origin, other, difference in differences:
            # Only pairs below the threshold are reported as clones.
            if not difference < max_clone_difference:
                continue

            message = _("Code clone found. The other occurrence is at file "
                        "{file}, line {line}, function {function}. The "
                        "similarity is {similarity}.").format(
                file=other[0],
                line=other[1],
                function=other[2],
                similarity=1-difference)
            clones.append(Result(self.__class__.__name__,
                                 message,
                                 file=origin[0],
                                 severity=RESULT_SEVERITY.MAJOR,
                                 line_nr=origin[1]))

        return clones
4 changes: 3 additions & 1 deletion bears/codeclone_detection/ClangCountVectorCreator.py
Expand Up @@ -108,7 +108,9 @@ def _get_vectors_for_cursor(self, cursor, filename):

if str(file) == str(filename) and self.is_function_declaration(cursor):
self._get_vector_for_function(cursor)
result = {self.get_identifier_name(cursor): self.count_vectors}

result = {(cursor.extent.start.line,
self.get_identifier_name(cursor)): self.count_vectors}
# Reset local states
self.count_vectors = {}
self.stack = []
Expand Down
118 changes: 102 additions & 16 deletions bears/codeclone_detection/ClangCountingConditions.py
Expand Up @@ -7,7 +7,6 @@

from coalib.bearlib.parsing.clang.cindex import CursorKind
from coalib.misc.Enum import enum
from coalib.settings.Setting import Setting


def _stack_contains_kind(stack, kind):
Expand Down Expand Up @@ -36,19 +35,20 @@ def _is_nth_child_of_kind(stack, allowed_nums, kind):
and the child number.
:param allowed_nums: List/iterator of child numbers allowed.
:param kind: The kind of the parent element.
:return: True if the described situation matches.
:return: Number of matches.
"""
is_kind_child = False
count = 0
for elem, child_num in stack:
if is_kind_child and child_num in allowed_nums:
return True
count += 1

if elem.kind == kind:
is_kind_child = True
else:
is_kind_child = False

return False
return count


FOR_POSITION = enum("UNKNOWN", "INIT", "COND", "INC", "BODY")
Expand Down Expand Up @@ -119,12 +119,67 @@ def _get_positions_in_for_loop(cursor, stack):
return results


ARITH_BINARY_OPERATORS = ['+', '-', '*', '/', '&', '|']
def _get_binop_operator(cursor):
"""
Returns the operator token of a binary operator cursor.

:param cursor: A cursor of kind BINARY_OPERATOR.
:return: The token object containing the actual operator or None.
"""
children = list(cursor.get_children())
operator_min_begin = (children[0].location.line,
children[0].location.column)
operator_max_end = (children[1].location.line,
children[1].location.column)

for token in cursor.get_tokens():
if (operator_min_begin < (token.extent.start.line,
token.extent.start.column) and
operator_max_end >= (token.extent.end.line,
token.extent.end.column)):
return token

return None # pragma: no cover


def _stack_contains_operators(stack, operators):
    """
    Checks whether the stack holds a binary or compound assignment operator
    cursor whose operator is one of the given ones.

    :param stack:     Iterable of (cursor, child number) tuples.
    :param operators: Collection of operator spellings (strings) to look for.
    :return:          True if a matching operator was found, False otherwise.
    """
    operator_kinds = [CursorKind.BINARY_OPERATOR,
                      CursorKind.COMPOUND_ASSIGNMENT_OPERATOR]

    for elem, child_num in stack:
        if elem.kind not in operator_kinds:
            continue

        token = _get_binop_operator(elem)
        # Not known how to reproduce but may be possible when evil macros
        # join the game.
        if token is None:  # pragma: no cover
            continue

        # Token spellings are bytes, the given operators are strings.
        if token.spelling.decode() in operators:
            return True

    return False


# Arithmetic and bitwise operators that take two operands.
ARITH_BINARY_OPERATORS = ['+', '-', '*', '/', '%', '&', '|']
# Comparison and logical operators (identifier spelling kept for callers).
COMPARISION_OPERATORS = ["==", "<=", ">=", "<", ">", "!=", "&&", "||"]
# Compound assignments, derived by appending "=" to each operator above.
ADV_ASSIGNMENT_OPERATORS = [op + "=" for op in ARITH_BINARY_OPERATORS]
# Plain assignment plus all compound assignments.
ASSIGNMENT_OPERATORS = ["="] + ADV_ASSIGNMENT_OPERATORS


def in_sum(cursor, stack):
    """
    Counting condition: the stack contains an addition or subtraction,
    either as plain operator or as compound assignment.
    """
    additive_operators = ['+', '-', '+=', '-=']
    return _stack_contains_operators(stack, additive_operators)


def in_product(cursor, stack):
    """
    Counting condition: the stack contains a multiplication, division or
    modulo operation, either as plain operator or as compound assignment.
    """
    multiplicative_operators = ['*', '/', '%', '*=', '/=', '%=']
    return _stack_contains_operators(stack, multiplicative_operators)


def in_binary_operation(cursor, stack):
    """
    Counting condition: the stack contains an ``&`` or ``|`` operation,
    either as plain operator or as compound assignment.
    """
    bit_operators = ['&', '|', '&=', '|=']
    return _stack_contains_operators(stack, bit_operators)


def member_accessed(cursor, stack):
    """
    Counting condition: the stack contains a cursor of kind MEMBER_REF_EXPR.
    """
    member_kind = CursorKind.MEMBER_REF_EXPR
    return _stack_contains_kind(stack, member_kind)


def used(cursor, stack):
    """
    Counting condition that holds unconditionally, i.e. for every occurrence
    regardless of the cursor and the stack.
    """
    return True

Expand All @@ -144,15 +199,23 @@ def is_inc_or_dec(cursor, stack):


def is_condition(cursor, stack):
return (_is_nth_child_of_kind(stack, [0], CursorKind.WHILE_STMT) or
_is_nth_child_of_kind(stack, [0], CursorKind.IF_STMT) or
return (_is_nth_child_of_kind(stack, [0], CursorKind.WHILE_STMT) != 0 or
_is_nth_child_of_kind(stack, [0], CursorKind.IF_STMT) != 0 or
FOR_POSITION.COND in _get_positions_in_for_loop(cursor, stack))


def in_condition(cursor, stack):
# In every case the first child of IF_STMT is the condition itself
# (non-NULL) so the second and third child are in the then/else branch
return _is_nth_child_of_kind(stack, [1, 2], CursorKind.IF_STMT)
return _is_nth_child_of_kind(stack, [1, 2], CursorKind.IF_STMT) == 1


def in_second_level_condition(cursor, stack):
    """
    Counting condition: exactly two stack elements are the second or third
    child of an IF_STMT (the then/else branches).
    """
    branch_matches = _is_nth_child_of_kind(stack,
                                           [1, 2],
                                           CursorKind.IF_STMT)
    return branch_matches == 2


def in_third_level_condition(cursor, stack):
    """
    Counting condition: more than two stack elements are the second or third
    child of an IF_STMT (the then/else branches).
    """
    branch_matches = _is_nth_child_of_kind(stack,
                                           [1, 2],
                                           CursorKind.IF_STMT)
    return branch_matches > 2


def is_assignee(cursor, stack):
Expand Down Expand Up @@ -192,32 +255,55 @@ def is_assigner(cursor, stack):
return is_inc_or_dec(cursor, stack)


def loop_content(cursor, stack):
def _loop_level(cursor, stack):
positions_in_for = _get_positions_in_for_loop(cursor, stack)
return (_is_nth_child_of_kind(stack, [1], CursorKind.WHILE_STMT) or
FOR_POSITION.INC in positions_in_for or
FOR_POSITION.BODY in positions_in_for)
return (positions_in_for.count(FOR_POSITION.INC) +
positions_in_for.count(FOR_POSITION.BODY) +
_is_nth_child_of_kind(stack, [1], CursorKind.WHILE_STMT))


def loop_content(cursor, stack):
    """
    Counting condition: the loop level of the cursor, as determined by
    _loop_level, is exactly one.
    """
    level = _loop_level(cursor, stack)
    return level == 1


def second_level_loop_content(cursor, stack):
    """
    Counting condition: the loop level of the cursor, as determined by
    _loop_level, is exactly two.
    """
    level = _loop_level(cursor, stack)
    return level == 2


def third_level_loop_content(cursor, stack):
    """
    Counting condition: the loop level of the cursor, as determined by
    _loop_level, is greater than two.
    """
    level = _loop_level(cursor, stack)
    return level > 2


def is_param(cursor, stack):
    """
    Counting condition: the cursor itself is of kind PARM_DECL. The stack is
    not inspected.
    """
    cursor_kind = cursor.kind
    return cursor_kind == CursorKind.PARM_DECL


condition_dict = {"used": used,
"returned": returned,
"is_condition": is_condition,
"in_condition": in_condition,
"in_second_level_condition": in_second_level_condition,
"in_third_level_condition": in_third_level_condition,
"is_assignee": is_assignee,
"is_assigner": is_assigner,
"loop_content": loop_content}
"loop_content": loop_content,
"second_level_loop_content": second_level_loop_content,
"third_level_loop_content": third_level_loop_content,
"is_param": is_param,
"in_sum": in_sum,
"in_product": in_product,
"in_binary_operation": in_binary_operation,
"member_accessed": member_accessed}


def counting_condition(value):
"""
This is a custom converter to convert a setting from coala into counting
condition function objects for this bear only.

:param value: A Setting
:param value: An object that can be converted to a list.
:return: A list of functions (counting conditions)
"""
assert isinstance(value, Setting)

str_list = list(value)
result_list = []
for elem in str_list:
Expand Down
100 changes: 100 additions & 0 deletions bears/codeclone_detection/ClangSimilarityBear.py
@@ -0,0 +1,100 @@
from itertools import combinations
import multiprocessing


from coalib.processes.SectionExecutor import get_cpu_count
from coalib.results.HiddenResult import HiddenResult
from coalib.settings.Setting import typed_dict
from coalib.bears.GlobalBear import GlobalBear
from bears.codeclone_detection.ClangCountVectorCreator import \
ClangCountVectorCreator
from bears.codeclone_detection.ClangCountingConditions import condition_dict
from bears.codeclone_detection.CloneDetectionRoutines import \
compare_functions, \
get_count_matrices


"""
counting_condition_dict is a function object generated by typed_dict. This
function takes a setting and creates a dictionary out of it while it
converts all keys to counting condition function objects (via the
condition_dict) and all values to floats while unset values default to 1.
"""
# Keys are lowercased before being looked up in condition_dict, so an unknown
# condition name raises a KeyError from that lookup. Values are converted
# with float; 1 is passed as the default for unset values.
counting_condition_dict = typed_dict(
    lambda setting: condition_dict[str(setting).lower()],
    float,
    1)


# Coverage cannot be measured because this is in another process
def get_difference(args):  # pragma: no cover
    """
    Retrieves the difference between two functions via compare_functions.

    Everything is packed into one tuple so this function can be mapped over
    a sequence of work items by a process pool.

    :param args: A tuple holding the first function id, the second and the
                 count matrices dictionary holding the count matrices for
                 each function with the function id as key.
    :return:     A tuple containing both function ids and their difference.
    """
    first_id, second_id, matrices = args
    difference = compare_functions(matrices[first_id], matrices[second_id])

    return first_id, second_id, difference


class ClangSimilarityBear(GlobalBear):
    def run(self,
            condition_list: counting_condition_dict):
        '''
        Retrieves similarities for code clone detection. Those can be reused in
        another bear to produce results.

        :param condition_list: A comma separated list of counting
                               conditions. Possible values are: used,
                               returned, is_condition, in_condition,
                               in_second_level_condition,
                               in_third_level_condition, is_assignee,
                               is_assigner, loop_content,
                               second_level_loop_content,
                               third_level_loop_content, is_param, in_sum,
                               in_product, in_binary_operation,
                               member_accessed.
                               Weightings can be assigned to each
                               condition due to providing a dict
                               value, i.e. having used weighted in
                               half as much as other conditions would
                               simply be: "used: 0.5, is_assignee".
                               Weightings default to 1 if unset.
        '''
        if not isinstance(condition_list, dict):
            self.err("The condition_list setting is invalid. Code clone "
                     "detection cannot run.")
            return

        self.debug("Using the following counting conditions:")
        for key, val in condition_list.items():
            self.debug(" *", key.__name__, "(weighting: {})".format(val))

        self.debug("Creating count matrices...")
        count_matrices = get_count_matrices(
            ClangCountVectorCreator(list(condition_list.keys()),
                                    list(condition_list.values())),
            list(self.file_dict.keys()),
            lambda prog: self.debug("{:2.4f}%...".format(prog)))

        self.debug("Calculating differences...")
        differences = []
        function_count = len(count_matrices)
        # That's n over 2, hardcoded to simplify calculation
        combination_length = function_count * (function_count-1) / 2
        function_combinations = [(f1, f2, count_matrices)
                                 for f1, f2 in combinations(count_matrices, 2)]

        # Code clone detection may take ages for a larger code basis. It is
        # highly probable, that no other bears are running in parallel,
        # thus we do parallel execution within this bear.
        pool = multiprocessing.Pool(get_cpu_count())
        try:
            for i, elem in enumerate(pool.imap_unordered(get_difference,
                                                         function_combinations,
                                                         chunksize=100)):
                if i % 1000 == 0:
                    self.debug("{:2.4f}%...".format(100*i/combination_length))
                differences.append(elem)
        finally:
            # Always shut the workers down, otherwise the processes leak
            # (especially if an exception aborts the loop). On success all
            # results have been consumed already, so terminating is safe.
            # Not using the pool as context manager keeps python 3.2 working.
            pool.terminate()
            pool.join()

        return [HiddenResult(self.__class__.__name__, differences)]