# Extract GitHub contributions over time

Uses [GitPython](http://gitpython.readthedocs.io/en/stable/).

In [1]:
import collections
import re

import git
import pandas

In [2]:
repo = git.Repo(path = 'deep-review')
# State of deep-review submodule
print(repo.head.commit)

d4c316393779a9248d924abae83bc672aafa39df


In [3]:
# These patterns are reversed because our diffs are backwards
# -R (R=True, suggested in https://git.io/vdigv) doesn't seem to work
added_pattern = re.compile(r'^-(.+)', flags=re.MULTILINE)
deleted_pattern = re.compile(r'^\+(.+)', flags=re.MULTILINE)
pattern_alphanum = re.compile(r'\w')

def get_word_count(text):
    """
    Compute word count of text. Splits by whitespace and only retains
    words with at least one alphanumeric character [a-zA-Z0-9_].
    """
    words = text.split()
    words = list(filter(pattern_alphanum.match, words))
    return len(words)

def get_commit_stats(commit):
    """
    Return addition and deletion (based on `git diff --word-diff`) for a commit.
    
    https://git-scm.com/docs/git-diff
    http://gitpython.readthedocs.io/en/stable/reference.html#module-git.diff
    """
    diffs = commit.diff(
        other=commit.parents[0] if commit.parents else git.NULL_TREE,
        paths='sections/*.md',
        create_patch=True,
        word_diff='porcelain',
    )

    delta = collections.OrderedDict()
    for key in 'words_added', 'words_deleted', 'characters_added', 'characters_deleted':
        delta[key] = 0
    for diff in diffs:
        diff_str = str(diff)
        for added in added_pattern.findall(diff_str):
            delta['words_added'] += get_word_count(added)
            delta['characters_added'] += len(added)
        for deleted in deleted_pattern.findall(diff_str):
            delta['words_deleted'] += get_word_count(deleted)
            delta['characters_deleted'] += len(deleted)
    return(delta)

In [4]:
rows = list()
for commit in repo.iter_commits():
    row = collections.OrderedDict()
    row['commit'] = commit.hexsha
    row['author_name'] = commit.author.name
    row['author_email'] = commit.author.email
    row['committer_name'] = commit.committer.name
    row['authored_datetime'] = commit.authored_datetime
    row['committed_datetime'] = commit.committed_datetime
    row['summary'] = commit.summary
    row['count'] = commit.count()
    row['merge'] = int(len(commit.parents) > 1)
    row['parents'] = ', '.join(x.hexsha for x in commit.parents)
    row.update(get_commit_stats(commit))
    rows.append(row)

rows = list(reversed(rows))
commit_df = pandas.DataFrame(rows)

In [5]:
commit_df.tail(5)

Unnamed: 0,commit,author_name,author_email,committer_name,authored_datetime,committed_datetime,summary,count,merge,parents,words_added,words_deleted,characters_added,characters_deleted
433,8bc23969901a8adf76f438e83f3c4da75f931220,Michael Hoffman,michaelmhoffman@users.noreply.github.com,Anthony Gitter,2017-08-11 07:47:11-04:00,2017-08-11 06:47:11-05:00,Fix clear errors (#618),434,0,a82c67d7b7445ef52c57f3501114bfe913d42b5d,73,23,655,186
434,4b629b3a34917086d09de44fa25bb5a7f4416561,Michael Hoffman,michaelmhoffman@users.noreply.github.com,Anthony Gitter,2017-08-26 08:27:34-04:00,2017-08-26 07:27:34-05:00,"Rework promoter/enhancer section (#623), close...",435,0,8bc23969901a8adf76f438e83f3c4da75f931220,509,794,4143,6055
435,6d95f407b414ee84c899150f7f9a4482e3c98fb5,Robert Gieseke,rob.g@web.de,Anthony Gitter,2017-09-28 20:38:20+02:00,2017-09-28 13:38:20-05:00,Mention MIT licensed 3rd-party code (#665) [ci...,436,0,4b629b3a34917086d09de44fa25bb5a7f4416561,0,0,0,0
436,2e721e0c9ee8314688581e0e4dd465222d867b01,Yifan Peng,yifan.peng@nih.gov,Anthony Gitter,2017-10-04 07:20:57-04:00,2017-10-04 06:20:57-05:00,Add ChestX-ray data link (#668) closes #666,437,0,6d95f407b414ee84c899150f7f9a4482e3c98fb5,13,2,167,13
437,d4c316393779a9248d924abae83bc672aafa39df,Yifan Peng,yifan.peng@nih.gov,Anthony Gitter,2017-10-04 14:34:28-04:00,2017-10-04 13:34:28-05:00,ChestX-ray dataset reference formatting (#669)...,438,0,2e721e0c9ee8314688581e0e4dd465222d867b01,0,0,0,0


In [6]:
commit_df.to_csv('commits.tsv', sep='\t', index=False)