# Code metrics analysis

This notebooks demonstrates how to use __[codemetrics](https://github.com/elmotec/codemetrics)__ to gain insight on a code base.

In [21]:
import os
import pathlib as pl
import datetime as dt
import textwrap
import json
import pathlib as pth
import logging

import joblib
import pandas as pd
import numpy as np
import altair as alt
from altair.vega.v5 import Vega
from tqdm import tqdm

import codemetrics as cm
import codemetrics.vega

%matplotlib inline

In [22]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
log = cm.log
if not log.handlers:
    cm.log.addHandler(logging.StreamHandler())
cm.log.setLevel(logging.DEBUG)

## Setup

Sets up a few useful things:
    - define a few useful variables like `year_ago` 
    - change directory to the location of the project being analyzed.
    - joblib.Memory to cache some outputs and clearing out the cache when we execute this cell.


In [24]:
# Useful variables
year_ago = dt.datetime.now(tz=dt.timezone.utc) - dt.timedelta(365)

# Change current directory to the project under being analyzed.
target_location = str(pl.Path().absolute() / '..' / '..'/  'pandas')

# Sets up caching and wipes out cache if any.
disk = joblib.Memory(location=os.getenv('TEMP'), verbose=0)
get_cloc = disk.cache(cm.get_cloc)
get_git_log = disk.cache(cm.get_git_log)
get_cloc.clear()
get_git_log.clear()

## Lines of code (loc)

Leverage cloc to count the lines of code and infer some basic information about the languages used in this project.

In [26]:
cloc_program = 'cloc.exe'
loc = cm.get_cloc(path=target_location, cloc_program=cloc_program)

cloc.exe --csv --by-file C:\Users\jlecomte\Documents\Github\codemetrics\notebooks\..\..\pandas


> [1;32mc:\users\jlecomte\documents\github\codemetrics\codemetrics\internals.py[0m(101)[0;36mrun[1;34m()[0m
[1;32m     99 [1;33m        [0mcmd_list[0m [1;33m=[0m [0mshlex[0m[1;33m.[0m[0msplit[0m[1;33m([0m[0mcommand[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    100 [1;33m        [1;32mimport[0m [0mpdb[0m[1;33m;[0m [0mpdb[0m[1;33m.[0m[0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m--> 101 [1;33m        result = subprocess.run(
[0m[1;32m    102 [1;33m            [0mcmd_list[0m[1;33m,[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    103 [1;33m            [0mcheck[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m[0m[1;33m[0m[0m
[0m


ipdb>  cmd_list


['cloc.exe', '--csv', '--by-file', 'C:UsersjlecomteDocumentsGithubcodemetricsnotebooks....pandas']


ipdb>  command


'cloc.exe --csv --by-file C:\\Users\\jlecomte\\Documents\\Github\\codemetrics\\notebooks\\..\\..\\pandas'


ipdb>  shlex.split(command)


['cloc.exe', '--csv', '--by-file', 'C:UsersjlecomteDocumentsGithubcodemetricsnotebooks....pandas']


ipdb>  shlex.split(command, posix=False)


['cloc.exe', '--csv', '--by-file', 'C:\\Users\\jlecomte\\Documents\\Github\\codemetrics\\notebooks\\..\\..\\pandas']


ipdb>  q


BdbQuit: 

In [13]:
loc

Unnamed: 0,language,path,blank,comment,code


In [None]:
loc_sum = loc.groupby('language').sum().reset_index().melt(id_vars=['language']).rename(columns={'variable': 'type', 'value': 'lines'})
alt.Chart(loc_sum).mark_bar().encode(
    x=alt.X('lines:Q'),
    y=alt.Y('language:N', sort=alt.EncodingSortField(field='lines', op='sum', order='descending')),
    color=alt.Color('type:N', scale=alt.Scale(scheme='accent')), 
    tooltip=['lines:Q', 'type:O'],
).properties(title='Lines of code')

## Retrieve git log data

Now that we know what the code base looks like today, we turn our attention to history and build a data frame of the git log history for the past year. 

We then calculate the age of each file and generate a graph of recent changes as well as a circle visualization of the code base. 

In [None]:
log = get_git_log(path='.', after=year_ago, git_client='git.exe')
log['issue'] = log['message'].str.extract(r'\(#(\d+)\)')
log = pd.merge(log, loc[['path']], left_on='path', right_on='path')

In [None]:
ages = cm.get_ages(log).merge(loc)
ages['last_change'] = ages['age'].apply(lambda a: pd.to_datetime('today') - dt.timedelta(a))
ages['age_in_week'] = ages['age'].apply(lambda d: int(d / 7))

width = 1000
weeks = list(range(int(400 / 7)))
chart = alt.Chart(ages).encode(color='language')
top = chart.mark_bar().\
    encode(x=alt.X('age_agg:O', sort='ascending', title='age in weeks', scale=alt.Scale(domain=weeks)),
           y=alt.Y('count(path):Q', title='Number of files'),
           color=alt.Color('language', scale=alt.Scale(scheme='tableau10')),
           tooltip=['count(path)', 'language']
          ).\
    transform_calculate(age_agg='floor(datum.age / 7)').\
    properties(width=width)
bottom = chart.mark_tick(size=60, thickness=2, opacity=.3).\
    encode(x=alt.X('age:Q', title='age in days'),
           tooltip='path').properties(width=width)
alt.vconcat(top, bottom)

In [None]:
code_ages = ages.query("path.str.endswith('.py') | path.str.endswith('.c')")
desc = cm.vega.vis_ages(code_ages, height=500, width=500)
Vega(desc)

## Complexity

On to complexity calculation. Here we calculate the complexity of the current code base, file by file,
function by function so it could take a little long. We run it on .py files only and we leverage tqdm to
show a progress bar.

In [None]:
# Get Python file with the most recent revision
python_df = (log[['path', 'date']]
             .groupby('path', as_index=False)
             .max()
             .query("path.str.endswith('.py')")
             .merge(log[['path', 'date', 'revision']])
             .assign(path=lambda x: x["path"].astype("string"))
            )
# Progress bar for pandas
tqdm.pandas(desc="calculating")
# Calculates complexity on python files.
complexity = (python_df[['revision', 'path']]
              .groupby(['revision', 'path'])
              .progress_apply(cm.get_complexity, download_func=cm.git.download)
             )


In [None]:
# Calculates 80% percentile of complexity for each file
path_complexity = (complexity
                   .reset_index()[['path', 'cyclomatic_complexity', 'token_count']]
                   .groupby('path').quantile(0.8)
                   .sort_values(by='cyclomatic_complexity', ascending=False)
                   .reset_index()
                   .rename(columns={'cyclomatic_complexity': 'complexity'})
                  )
# Merge to lines of code. 
loc_cc = pd.merge(loc, path_complexity)

# Hot spots

Hot spots are files that exhibit high complexity __and__ changed a lot recently. This is typically where you will find bugs.

The graph below will show files harboring complexity as large circle while an increased number of change will make the color trend from yellow to more red shades.

In [None]:
hspots = cm.get_hot_spots(log.head(1), loc_cc.head(1))

In [None]:
hspots = cm.get_hot_spots(log, loc_cc.assign(path=lambda x: x['path'].astype('string'))) 
hspots.query("language == 'Python'").sort_values(by=['changes', 'complexity'], ascending=False).head()
desc = cm.vega.vis_hot_spots(hspots, width=500, height=500, size_column='complexity')
Vega(desc)

## Co-changes or inter-file coupling

Co-changes builds on the idea that files or functions that change together imply a hidden dependency and may need refactoring.

We calculate how often each file change in relation to the other and display high level of coupling. Again, we focus on the files that have changed a lot recently.

In [None]:
co_changes = cm.get_co_changes(log, by='path', on='issue').\
    query("(coupling > .6) & (changes > 20)").\
    sort_values(by=['changes', 'cochanges'], ascending=False)
co_changes

## File level analysis

It can also be useful to dive into the complexity history of one particular file or function. Let's consider the following file and calculate historical complexity of each function in the file. 

In [None]:
path = 'pandas/core/frame.py'
func_df = log[log['path'] == path][['date', 'revision', 'path']]
func_df.head()

In [None]:
func_cplx_df = func_df.groupby(['revision', 'path']).progress_apply(cm.get_complexity, download_func=cm.git.download)

In [None]:
func_cpx_df = pd.merge(func_df, func_cplx_df.reset_index(), on=['revision', 'path'])
top_cpx_func_df = func_cpx_df.groupby('name')[['cyclomatic_complexity']].mean().nlargest(8, 'cyclomatic_complexity')
filt_func_cpx_df = func_cpx_df[func_cpx_df['name'].isin(set(top_cpx_func_df.index))]
filt_func_cpx_df.head()

In [None]:
alt.Chart(filt_func_cpx_df).\
    mark_line().encode(
        x='date:T',
        y='cyclomatic_complexity:Q',
        color='name:N',
        tooltip=['name', 'revision']
    )

To be continued...