# Extract GitHub contributions over time

Uses [GitPython](http://gitpython.readthedocs.io/en/stable/).

In [1]:
import collections
import re

import git
import pandas

In [2]:
repo = git.Repo(path = 'deep-review')
# State of deep-review submodule
print(repo.head.commit)

1db0c13e0d5828d325ac8e1eba523b77bc33a7d1


In [3]:
# These patterns are reversed because our diffs are backwards
# -R (R=True, suggested in https://git.io/vdigv) doesn't seem to work
added_pattern = re.compile(r'^-(.+)', flags=re.MULTILINE)
deleted_pattern = re.compile(r'^\+(.+)', flags=re.MULTILINE)
pattern_alphanum = re.compile(r'\w')

def get_word_count(text):
    """
    Compute word count of text. Splits by whitespace and only retains
    words with at least one alphanumeric character [a-zA-Z0-9_].
    """
    words = text.split()
    words = list(filter(pattern_alphanum.match, words))
    return len(words)

def get_commit_stats(commit):
    """
    Return addition and deletion (based on `git diff --word-diff`) for a commit.
    
    https://git-scm.com/docs/git-diff
    http://gitpython.readthedocs.io/en/stable/reference.html#module-git.diff
    """
    diffs = commit.diff(
        other=commit.parents[0] if commit.parents else git.NULL_TREE,
        paths=['sections/*.md', 'content/*.md'],
        create_patch=True,
        word_diff='porcelain',
    )

    delta = collections.OrderedDict()
    for key in 'words_added', 'words_deleted', 'characters_added', 'characters_deleted':
        delta[key] = 0
    for diff in diffs:
        diff_str = str(diff)
        additions = added_pattern.findall(diff_str)
        deletions = deleted_pattern.findall(diff_str)

        # Attempt to ignore relocated lines
        additions, deletions = (
            [x for x in additions if get_word_count(x) < 4 or x not in set(deletions)],
            [x for x in deletions if get_word_count(x) < 4 or x not in set(additions)],
        )
        for added in additions:
            delta['words_added'] += get_word_count(added)
            delta['characters_added'] += len(added)
        for deleted in deletions:
            delta['words_deleted'] += get_word_count(deleted)
            delta['characters_deleted'] += len(deleted)
    return(delta)

In [4]:
rows = list()
for commit in repo.iter_commits():
    row = collections.OrderedDict()
    row['commit'] = commit.hexsha
    row['author_name'] = commit.author.name
    row['author_email'] = commit.author.email
    row['committer_name'] = commit.committer.name
    row['authored_datetime'] = commit.authored_datetime
    row['committed_datetime'] = commit.committed_datetime
    row['summary'] = commit.summary
    row['count'] = commit.count()
    row['merge'] = int(len(commit.parents) > 1)
    row['parents'] = ', '.join(x.hexsha for x in commit.parents)
    row.update(get_commit_stats(commit))
    rows.append(row)

rows = list(reversed(rows))
commit_df = pandas.DataFrame(rows)

In [5]:
commit_df.tail(5)

Unnamed: 0,commit,author_name,author_email,committer_name,authored_datetime,committed_datetime,summary,count,merge,parents,words_added,words_deleted,characters_added,characters_deleted
723,b3b57d37c4accabc2e15fd1d3f320a52ce25ed8b,Anthony Gitter,agitter@users.noreply.github.com,GitHub,2018-01-18 21:29:00-06:00,2018-01-18 21:29:00-06:00,Adjust relative column width in Table 1 (#806),724,0,6bbf4c7e4a28f1d214db16d10eb3095fbd5173f0,9,0,131,125
724,7cef20b43a8f40037d261c51ced55880082645bd,Anthony Gitter,agitter@users.noreply.github.com,Casey Greene,2018-01-19 06:01:25-06:00,2018-01-19 07:01:25-05:00,Fix typos (#808),725,0,b3b57d37c4accabc2e15fd1d3f320a52ce25ed8b,42,7,383,76
725,a01dd71f8603412b1cd80fb4415ce579999aaf8f,Anthony Gitter,agitter@users.noreply.github.com,Casey Greene,2018-01-19 06:26:51-06:00,2018-01-19 07:26:51-05:00,Update acknowledgements (#809),726,0,7cef20b43a8f40037d261c51ced55880082645bd,14,2,116,12
726,57563a3036ada1eec12e6071bce0fac8f85613ad,Anthony Gitter,agitter@users.noreply.github.com,Casey Greene,2018-01-20 09:44:58-06:00,2018-01-20 10:44:58-05:00,Readme status update (#811),727,0,a01dd71f8603412b1cd80fb4415ce579999aaf8f,0,0,0,0
727,1db0c13e0d5828d325ac8e1eba523b77bc33a7d1,Michael Hoffman,michaelmhoffman@users.noreply.github.com,Anthony Gitter,2018-01-23 08:30:20-08:00,2018-01-23 10:30:20-06:00,"Fix non-semantic newlines for `e.g. `, `i.e. ...",728,0,57563a3036ada1eec12e6071bce0fac8f85613ad,80,0,610,0


In [6]:
commit_df.to_csv('commits.tsv', sep='\t', index=False)