#### Generate Commits Summary by Authors and Repos for a range of Weeks
This code consolidates contributors with similar names (using [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)) and generates a summary and graphs of commits/files/changes for a range of weeks and a set of repos.
<br/>The code relies on [git-quick-stats](https://github.com/amiller/git-quick-stats) to generate the git log output.

In [None]:
# Import some libraries
import os
import glob
import datetime
import pandas as pd
from typing import List, Dict
from collections import namedtuple
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Some utility functions
def parseNumber(key, return_pos, line):
    """Return the integer found at a given token position of a stats line.

    The line is split on spaces (empty tokens produced by repeated spaces
    are dropped).  When the first token equals ``key``, the token at index
    ``return_pos`` is converted with ``int`` and returned.

    Args:
        key: Label expected as the first token of ``line``.
        return_pos: Index of the token that holds the number.
        line: Text to parse.

    Returns:
        The parsed integer, or 0 when the line is blank, does not start
        with ``key``, or has no token at ``return_pos``.

    Raises:
        ValueError: If the selected token is not a valid integer literal.
    """
    section = [token for token in line.split(' ') if token]
    # A blank line yields no tokens; indexing it would raise IndexError.
    if not section or section[0] != key:
        return 0
    # Same for a matching line that is too short to contain the number.
    if not -len(section) <= return_pos < len(section):
        return 0
    return int(section[return_pos])

In [None]:
# Shell command template for git-quick-stats.  The {path}, {from_date},
# {to_date} and {output_filename} placeholders are filled in before the
# command is executed.
gitLogCommandTemplate = (
    'cd {path}'
    ' && export _GIT_SINCE="{from_date}"'
    ' && export _GIT_UNTIL="{to_date}"'
    ' && git-quick-stats -T > {output_filename}'
)

In [None]:
# Input parameters
# Point sourcePath at the local directory that holds your repos
# (the trailing slash matters: the glob pattern is appended directly to it).
sourcePath = '{set-your-local-repo-path-here}/'
# With recursive=False, "**" behaves like "*", so only the immediate
# sub-directories of sourcePath are listed.
repos = glob.glob(sourcePath + "**/", recursive=False)
print(repos)

In [None]:
# Output model: type alias for the collected statistics, a dict that maps
# string keys to lists.
gitInfo = Dict[str, List]

In [None]:
# Main Function
def Run(weeks: int) -> gitInfo:
    """Collect per-author git statistics for every repo over the past weeks.

    For each path in the module-level ``repos`` list and each 7-day window
    going back ``weeks`` weeks from today, the command built from
    ``gitLogCommandTemplate`` is run through the shell.  The command writes
    a report file, which is then read from the repo directory and parsed.

    Args:
        weeks: Number of consecutive 7-day windows to scan, counted back
            from today.

    Returns:
        A dict of equally long lists keyed by 'repo', 'author',
        'dateending', 'insertions', 'deletions', 'files', 'commits' and
        'lineschanged'.  Index ``i`` of every list describes one author in
        one repo for one window; 'dateending' is the ISO date on which that
        window ends.
    """
    # (info key, label the report line starts with, token index of the number),
    # listed in the order the lines are expected right after an author line.
    stat_fields = (
        ('insertions', 'insertions', 1),
        ('deletions', 'deletions', 1),
        ('files', 'files', 1),
        ('commits', 'commits', 1),
        ('lineschanged', 'lines', 2),
    )
    info = {'repo': [], 'author': [], 'dateending': [], 'insertions': [],
            'deletions': [], 'files': [], 'commits': [], 'lineschanged': []}
    # Read the date once so that every window of this run shares one anchor,
    # even if the run crosses midnight.
    today = datetime.date.today()
    for repo in repos:
        # repo ends with '/', so the directory name is the second-to-last piece
        repoName = repo.strip().split('/')[-2]
        cmd = gitLogCommandTemplate.replace('{path}', repo)
        for week in range(1, weeks + 1):
            # map: run git-quick-stats for the window [from_date, to_date]
            from_date = (today - datetime.timedelta(days=week * 7)).isoformat()
            to_date = (today - datetime.timedelta(days=(week - 1) * 7)).isoformat()
            output_filename = '{0}_{1}.txt'.format(from_date, to_date)
            cmdw = (cmd.replace('{from_date}', from_date)
                       .replace('{to_date}', to_date)
                       .replace('{output_filename}', output_filename))
            # The exit status is not inspected here; if the command never
            # produced its output file, the open() below raises.
            os.system(cmdw)

            # reduce: the command ran inside the repo, so the file lives there
            with open(repo + output_filename) as f:
                lines = f.readlines()
            if len(lines) <= 5:
                # Too short to contain an author section
                continue

            # Parsing starts at the third line; the first two are skipped
            line_pos = 2
            # The guard keeps the look-ahead below inside the file
            while line_pos + 8 < len(lines):
                author = lines[line_pos].strip().replace(':', '')
                if author.startswith('total'):
                    break
                if not author or author.startswith(('last commit', 'first commit')):
                    line_pos += 1
                    continue
                info['repo'].append(repoName)
                info['author'].append(author)
                info['dateending'].append(to_date)

                for info_key, label, number_pos in stat_fields:
                    line_pos += 1
                    # Test the same line that is parsed.  The original
                    # insertions check looked one line ahead, so it never
                    # matched the line it then parsed.
                    stat_line = lines[line_pos].strip()
                    if stat_line.startswith(label):
                        info[info_key].append(
                            parseNumber(label, number_pos, stat_line.replace(':', '')))
                    else:
                        info[info_key].append(0)

                line_pos += 1

    return info

In [None]:
# Driver: collect the statistics for the past 8 weeks
data = Run(weeks=8)

In [None]:
# Consolidate authors with similar names using cosine similarity.
# Every unique author name becomes a token-count vector; cos_sim is the
# square matrix (one row and one column per unique author) of pairwise
# similarities between those vectors.

authors = sorted(set(data['author']))
# Note: despite its name, `vectorizer` holds the transformed count matrix
# returned by fit_transform, not the CountVectorizer object itself.
vectorizer = CountVectorizer().fit_transform(authors)
vectors = vectorizer.toarray()
cos_sim = cosine_similarity(vectors)

# Uncomment below to view the content
#print(authors)
#cos_sim 

In [None]:
# Now consolidate the authors in the info data model.
# Two names count as the "same author" when their cosine similarity is at
# least 0.6.
similarity_threshold = 0.6

# author_table maps every author name to the name chosen to represent it;
# skipList holds the indices that have already been assigned.
author_table = {}
skipList = set()
for i in range(len(cos_sim)):
    if i in skipList:
        # Already merged into an earlier author
        continue
    author_table[authors[i]] = authors[i]
    skipList.add(i)
    # Merge every later name that is similar enough.  A name that was merged
    # earlier is re-pointed to this author if it also matches here.
    for j in range(i + 1, len(cos_sim)):
        if cos_sim[j][i] >= similarity_threshold:
            author_table[authors[j]] = authors[i]
            skipList.add(j)
#print(author_table)
# Rewrite the author column in place with the consolidated names
for i, original_name in enumerate(data['author']):
    data['author'][i] = author_table[original_name]

In [None]:
# Build the data frame and group it per author and week-ending date,
# keeping only the grouping columns and the commit counts.
df = pd.DataFrame(data)
_group_keys = ['author', 'dateending']
commits_grp = df.groupby(_group_keys)[_group_keys + ['commits']]

In [None]:
# Plot one commits-per-week chart per consolidated author on a 2-column grid.
# Sorting makes the panel order reproducible; plain set iteration order is not.
plot_authors = sorted(set(author_table.values()))
ncols = 2
# Keep the 6-row layout for up to 12 authors, and add rows beyond that so
# axes[row][col] never indexes past the grid.
nrows = max(6, (len(plot_authors) + ncols - 1) // ncols)
# Height grows with the row count so panels keep their size (25 for 6 rows)
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(22, 25 * nrows / 6))
col, row = 0, 0
for a in plot_authors:
    # Keep the groups belonging to this author, then total them per week
    a1 = commits_grp.filter(lambda x: (x['author'] == a).any()).groupby(['dateending']).sum()
    a1plot = a1.plot(ax=axes[row][col], title=a, marker='o')
    _ = a1plot.set(xlabel='Date', ylabel='Commits')
    # Advance left to right, then wrap to the next row
    col += 1
    if col == ncols:
        col = 0
        row += 1
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

In [None]:
# # Save the data frame to CSV
# df = pd.DataFrame(data)
# df.to_csv('output.csv')

# # Group by commits and save to CSV
# filterby = 'commits'

# df1 = df.groupby(['repo', 'dateending', 'author'], as_index=False)[filterby].sum().pivot('repo', 'author','dateending').fillna(0)
# df1.to_csv('output-repo-filter-commits.csv')

# df2 = df.groupby(['dateending', 'author'], as_index=False)[filterby].sum().pivot('author','dateending').fillna(0)
# df2.to_csv('output-filter-commits.csv')