Added cc metrics
Added lizard dependency and codemetrics.get_complexity()
Added LogEntry.copyfrompath when files are copied from an existing path.
elmotec committed Feb 13, 2019
1 parent 60b4ea8 commit 6af71f8
Showing 18 changed files with 1,034 additions and 372 deletions.
2 changes: 1 addition & 1 deletion codemetrics/__init__.py
@@ -19,6 +19,6 @@
===========
Code metrics is a simple Python module that leverages your source control
management (SCM) tool to generate insight on your code base.
management (SCM) tool and pandas to generate insight on your code base.
"""
4 changes: 2 additions & 2 deletions codemetrics/cloc.py
@@ -27,11 +27,11 @@ def get_cloc(path='.', cloc_program='cloc'):
pandas.DataFrame.
"""
internals._check_run_in_root(path)
internals.check_run_in_root(path)
cmdline = f'{cloc_program} --csv --by-file {path}'
records = []
try:
output = internals._run(cmdline)
output = internals.run(cmdline).split('\n')
except FileNotFoundError as err:
msg = f'{err}. Is {cloc_program} available? Please pass ' \
'cloc_program=<cloc location> to get_cloc'
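For reference, a minimal usage sketch of get_cloc as shown above; it assumes the cloc tool is installed and on the PATH, and that the call is made from the project root (the check_run_in_root guard above enforces this):

import codemetrics.cloc

# Count lines of code per file under the current directory; requires the
# external cloc program. cloc_program can point to a custom location.
cloc_df = codemetrics.cloc.get_cloc(path='.', cloc_program='cloc')
print(cloc_df.head())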
88 changes: 83 additions & 5 deletions codemetrics/core.py
@@ -3,8 +3,10 @@

import os.path
import typing
import datetime as dt

import pandas as pd
import lizard
import sklearn
import sklearn.cluster
import sklearn.feature_extraction.text
@@ -17,7 +19,8 @@
'get_ages',
'get_hot_spots',
'get_co_changes',
'guess_components'
'guess_components',
'get_complexity',
]


@@ -127,10 +130,9 @@ def get_co_changes(log=None, by=None, on=None):
if on is None:
on = 'revision'
df = log[[on, by]].drop_duplicates()
sj = pd.merge(df, df, on=on)
sj = sj.rename(columns={by + '_x': by, by + '_y': 'dependency'})
sj.drop_duplicates(inplace=True) # FIXME: needs a test
sj = sj.groupby([by, 'dependency']).count().reset_index()
sj = pd.merge(df, df, on=on).\
rename(columns={by + '_x': by, by + '_y': 'dependency'}).\
groupby([by, 'dependency']).count().reset_index()
result = pd.merge(sj[sj[by] == sj['dependency']][[by, on]],
sj[sj[by] != sj['dependency']], on=by). \
rename(columns={on + '_x': 'changes', on + '_y': 'cochanges'})
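To make the refactored self-merge concrete, a small worked sketch follows; the toy log values are invented for illustration, and the columns of the printed result depend on the remainder of get_co_changes, which is collapsed in this diff:

import pandas as pd

from codemetrics import core

# Toy change log: a.py and b.py change together in r1 and r2, a.py alone in r3.
log = pd.DataFrame({
    'revision': ['r1', 'r1', 'r2', 'r2', 'r3'],
    'path': ['a.py', 'b.py', 'a.py', 'b.py', 'a.py'],
})
# For each path, counts its total changes and how often each other path
# changed in the same revision (e.g. a.py: changes=3, cochanges with b.py=2).
print(core.get_co_changes(log=log, by='path'))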
@@ -179,3 +181,79 @@ def __cluster_name(center, threshold):
rv = pd.DataFrame(data={'path': data, 'component': components})
rv.sort_values(by='component', inplace=True)
return rv


# Exclude the parameters field for now.
_lizard_fields = [fld for fld in vars(lizard.FunctionInfo('', '')).keys()
if fld not in ['filename', 'parameters']]
_complexity_fields = _lizard_fields + \
'file_tokens file_nloc path revision'.split()


def _get_complexity(path_ref_df: pd.DataFrame,
download_func: typing.Callable) -> pd.DataFrame:
"""Downloads and run complexity metrics on a specific path and revision.
Args:
path_rev_df: DataFrame of path and revision.
download_func: function to download a particular revision of a file.
Returns:
DataFrame with metrics at the function levels.
"""
# FIXME Check the type of download_func: taking (path, revision) or df?
assert callable(download_func), 'download_func is not callable'
for dld in download_func(path_ref_df):
assert isinstance(dld, scm.FileDownloadResult), \
'download_func is expected to return scm.FileDownloadResult objs'
assert isinstance(dld.code, str), 'code is expected to be 1 long string'
info = lizard.analyze_file.analyze_source_code(dld.path, dld.code)
if info.function_list:
df = pd.DataFrame.from_records(
[vars(d) for d in info.function_list],
columns=_lizard_fields)
df['file_tokens'] = info.token_count
df['file_nloc'] = info.nloc
df['path'] = dld.path
df['revision'] = dld.revision
else:
df = pd.DataFrame({k: [] for k in _complexity_fields})
# For consistency with the input.
yield df
return


def get_complexity(df: pd.DataFrame,
download_func: typing.Callable) -> pd.DataFrame:
"""Generate complexity information for files and revisions in dataframe.
For each pair of (path, revision) in the input dataframe, analyze the code
with lizard and return the output.
Args:
df: expected to contain at least 2 columns (path, revision).
download_func: callable that downloads a path at a given revision and
returns its content as an object of type
`codemetrics.scm.FileDownloadResult`.
Returns:
Dataframe containing output of function-level lizard.analyze_
Example::
>>> import codemetrics as cm
>>> list(df.columns)
['path', 'revision']
>>> complexity = get_complexity(df, download_func=cm.git.download_files)
.. _lizard.analyze: https://github.com/terryyin/lizard
"""
for expected in ['path', 'revision']:
if expected not in df.columns:
raise ValueError(f"'{expected}' column not found in input")
dfs = list(
_get_complexity(df[['path', 'revision']], download_func=download_func)
)
return pd.concat(dfs).reset_index(drop=True)
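A hedged end-to-end sketch combining the new get_complexity with the download_files helper added to git.py further down; it assumes the current directory is the root of a git work tree, that lizard and pandas are installed, and that tz-aware UTC dates are acceptable:

import datetime as dt

from codemetrics import core, git

# One year of git history for the current repository.
year_ago = dt.datetime.now(tz=dt.timezone.utc) - dt.timedelta(days=365)
log = git.get_git_log(path='.', after=year_ago)

# Function-level metrics for every (path, revision) pair in the log.
# Columns come from lizard.FunctionInfo attributes (e.g. name,
# cyclomatic_complexity, nloc) plus file_tokens, file_nloc, path and revision.
complexity = core.get_complexity(log[['path', 'revision']].drop_duplicates(),
                                 download_func=git.download_files)
print(complexity.head())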
179 changes: 129 additions & 50 deletions codemetrics/git.py
@@ -4,10 +4,12 @@
"""Git related functions."""

import datetime as dt
import typing
import re

import tqdm
import numpy as np
import pandas as pd

from . import internals
from . import scm
@@ -19,19 +21,19 @@ class _GitLogCollector(scm._ScmLogCollector):

_args = 'log --pretty=format:"[%h] [%an] [%ad] [%s]" --date=iso --numstat'

def __init__(self, git_program='git', **kwargs):
def __init__(self, git_client='git', **kwargs):
"""Initialize.
Compiles regular expressions to be used during parsing of log.
Args:
git_program: name of svn client.
**kwargs: passed to parent :class:`_ScmLogCollector`
git_client: name of git client.
**kwargs: passed to parent :class:`codemetrics.scm._ScmLogCollector`
"""
super().__init__(**kwargs)
self.git_program = git_program
self.log_moved_re = re.compile(r"\{(?:\S* )?=> (\S*)\}")
self.git_client = git_client
self.log_moved_re = re.compile(r"([-\d]+)\s+([-\d]+)\s+(\S*)\{(\S*) => (\S*)\}(\S*)")

def process_entry(self, log_entry):
"""Convert a single xml <logentry/> element to csv rows.
@@ -48,35 +50,43 @@ def process_entry(self, log_entry):
"""
try:
hash, author, date_str, *remainder = log_entry[0][1:-1].split('] [')
except ValueError as err:
rev, author, date_str, *remainder = log_entry[0][1:-1].split('] [')
except ValueError:
log.warning('failed to parse %s', log_entry[0])
raise
msg = '] ['.join(remainder)
date = dt.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S %z')
for path_elem in log_entry[1:]:
copyfrompath = None
path_elem = path_elem.strip()
if not path_elem:
break
# git log shows special characters in paths to indicate moves.
substed_path_elem = self.log_moved_re.sub(r'\1', path_elem)
substed_path_elem = substed_path_elem.replace('//', '/')
try:
added, removed, relpath = substed_path_elem.split()
except ValueError:
log.warning('failed to parse the following line:\n%s\n%s',
log_entry[0], path_elem)
continue
if '{' not in path_elem:
added, removed, relpath = path_elem.split()
else:
match = self.log_moved_re.match(path_elem)
if not match:
log.warning('failed to parse the following line:\n%s\n%s',
log_entry[0], path_elem)
continue
added = match.group(1)
removed = match.group(2)
relpath = match.group(3) + match.group(5) + match.group(6)
relpath = relpath.replace('//', '/')
copyfrompath = match.group(3) + match.group(4) + match.group(6)
copyfrompath = copyfrompath.replace('//', '/')
# A '-' indicates a binary file.
added_as_int = int(added) if added != '-' else np.nan
removed_as_int = int(removed) if removed != '-' else np.nan
entry = scm.LogEntry(hash, author, date, None, 'f', None,
None, relpath, msg, added_as_int,
removed_as_int)
entry = scm.LogEntry(rev, author=author, date=date, path=relpath,
message=msg, kind='f', added=added_as_int,
removed=removed_as_int,
copyfrompath=copyfrompath)
yield entry
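To show what the new log_moved_re captures, a short sketch against a hand-written rename line in git's `{old => new}` numstat notation (the sample line is an assumption, not taken from this repository):

import re

log_moved_re = re.compile(r"([-\d]+)\s+([-\d]+)\s+(\S*)\{(\S*) => (\S*)\}(\S*)")

line = '1\t2\tcodemetrics/{scm.py => core.py}'
match = log_moved_re.match(line)
added, removed = match.group(1), match.group(2)
# The new path keeps the common prefix/suffix around the braces; the old
# path becomes copyfrompath.
relpath = (match.group(3) + match.group(5) + match.group(6)).replace('//', '/')
copyfrompath = (match.group(3) + match.group(4) + match.group(6)).replace('//', '/')
print(added, removed, relpath, copyfrompath)
# -> 1 2 codemetrics/core.py codemetrics/scm.py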

def get_log_entries(self, text):
"""See :member:`_ScmLogCollector.get_log_entries`."""
def process_log_entries(self, text):
"""See :member:`_ScmLogCollector.process_log_entries`."""
log_entry = []
for line in text:
if line.startswith('['):
@@ -90,33 +100,51 @@ def get_log_entries(self, text):
yield from self.process_entry(log_entry)
log_entry = []

def get_log(self):
"""Call git log and return output as a DataFrame."""
command = f'{self.git_program} {self._args}'
if self.after:
command += f' --after {self.after:%Y-%m-%d}'
if self.before:
command += f' --before {self.before:%Y-%m-%d}'
command_with_path = f'{command} {self.path}'
results = internals._run(command_with_path)
return self.process_output_to_df(results)


def get_git_log(
after: dt.datetime=None,
before: dt.datetime=None,
path: str='.',
git_program: str='git',
progress_bar: tqdm.tqdm=None):
"""Entry point to retrieve Subversion log.
def get_log(self,
path: str = '.',
after: dt.datetime = None,
before: dt.datetime = None,
progress_bar: tqdm.tqdm = None) -> pd.DataFrame:
"""Retrieve log from git.
Args:
path: location of checked out git repository root. Defaults to '.'.
after: only get the log after time stamp. Defaults to one year ago.
before: only get the log before time stamp. Defaults to now.
progress_bar: tqdm.tqdm progress bar.
Returns:
pandas.DataFrame with columns matching the fields of
codemetrics.scm.LogEntry.
"""
internals.check_run_in_root(path)
after, before = internals.handle_default_dates(after, before)
if progress_bar is not None and after is None:
raise ValueError("progress_bar requires 'after' parameter")
command = f'{self.git_client} {self._args}'
if after:
command += f' --after {after:%Y-%m-%d}'
if before:
command += f' --before {before:%Y-%m-%d}'
command_with_path = f'{command} {path}'
results = internals.run(command_with_path).split('\n')
return self.process_log_output_to_df(results, after=after,
progress_bar=progress_bar)


def get_git_log(path: str = '.',
after: dt.datetime = None,
before: dt.datetime = None,
progress_bar: tqdm.tqdm = None,
git_client: str = 'git') -> pd.DataFrame:
"""Entry point to retrieve git log.
Args:
after: only get the log after time stamp
(defaults to one year ago).
before: only get the log before time stamp
(defaults to now).
path: location of checked out subversion repository root.
svn_program: svn client (defaults to svn).
path: location of checked out git repository root. Defaults to '.'.
after: only get the log after time stamp. Defaults to one year ago.
before: only get the log before time stamp. Defaults to now.
git_client: git client executable (defaults to git).
progress_bar: tqdm.tqdm progress bar.
Returns:
@@ -129,8 +157,59 @@ def get_git_log(
log_df = cm.git.get_git_log(path='src', after=last_year)
"""
internals._check_run_in_root(path)
collector = _GitLogCollector(after=after, before=before, path=path,
git_program=git_program,
progress_bar=progress_bar)
return collector.get_log()
collector = _GitLogCollector(git_client=git_client)
return collector.get_log(after=after, before=before, path=path,
progress_bar=progress_bar)
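A hedged sketch of the progress-bar variant; note the check above that requires 'after' whenever a bar is passed, and treat the bare tqdm setup as an assumption, since the way the collector drives the bar is not visible in this diff:

import datetime as dt

import tqdm

from codemetrics import git

after = dt.datetime.now(tz=dt.timezone.utc) - dt.timedelta(days=90)
with tqdm.tqdm() as progress_bar:
    log = git.get_git_log(path='.', after=after, progress_bar=progress_bar)
print(len(log), 'file changes')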


def _download_file(base_command, filename, revision) -> scm.FileDownloadResult:
"""Download specific file and revision from git."""
command = f'{base_command} {revision}:{filename}'
content = internals.run(command)
yield scm.FileDownloadResult(filename, revision, content)


class _GitFileDownloader:
"""Download files from Subversion."""

def __init__(self, git_client: str = 'git'):
"""Initialize downloader.
Args:
git_client: name of git client.
"""
self.command = f'{git_client} show '

def download_files(self,
df: pd.DataFrame) -> typing.Sequence[scm.FileDownloadResult]:
"""Downloads files from Subversion.
Args:
df: dataframe containing at least a (path, revision) columns to
identify the files to download.
Returns:
list of file locations.
"""
for _, (filename, revision) in df[['path', 'revision']].iterrows():
yield from _download_file(self.command, filename, revision)
return


def download_files(df: pd.DataFrame,
git_client: str = 'git') -> typing.Sequence[scm.FileDownloadResult]:
"""Downloads files from Subversion.
Args:
df: dataframe containing at least a (path, revision) columns to
identify the files to download.
git_client: Subversion client executable. Defaults to git.
Returns:
list of scm.FileDownloadResult.
"""
downloader = _GitFileDownloader(git_client=git_client)
yield from downloader.download_files(df)
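A small standalone sketch of the downloader; the revisions are the ones named in this commit (6af71f8 and its parent 60b4ea8), and codemetrics/core.py is assumed to exist in both:

import pandas as pd

from codemetrics import git

df = pd.DataFrame({'path': ['codemetrics/core.py', 'codemetrics/core.py'],
                   'revision': ['6af71f8', '60b4ea8']})
# Each row is fetched with `git show <revision>:<path>`; code holds the file
# content as a single string (see the scm.FileDownloadResult usage in core.py).
for dld in git.download_files(df):
    print(dld.path, dld.revision, len(dld.code))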
