# Extract GitHub contributions over time

Uses [GitPython](http://gitpython.readthedocs.io/en/stable/).

In [1]:
import collections
import pathlib
import re

import git
import pandas

In [2]:
repo = git.Repo(path = 'deep-review')
# State of deep-review submodule
print(repo.head.commit)

63d2468883ea69ad7ad638c39efab0fcbe026298


In [3]:
# These patterns are reversed because our diffs are backwards
# -R (R=True, suggested in https://git.io/vdigv) doesn't seem to work
added_pattern = re.compile(r'^-(.+)', flags=re.MULTILINE)
deleted_pattern = re.compile(r'^\+(.+)', flags=re.MULTILINE)
pattern_alphanum = re.compile(r'\w')

def get_word_count(text):
    """
    Compute word count of text. Splits by whitespace and only retains
    words with at least one alphanumeric character [a-zA-Z0-9_].
    """
    words = text.split()
    words = list(filter(pattern_alphanum.match, words))
    return len(words)

def get_commit_stats(commit):
    """
    Return addition and deletion (based on `git diff --word-diff`) for a commit.
    
    https://git-scm.com/docs/git-diff
    http://gitpython.readthedocs.io/en/stable/reference.html#module-git.diff
    """
    diffs = commit.diff(
        other=commit.parents[0] if commit.parents else git.NULL_TREE,
        paths=['sections/*.md', 'content/*.md'],
        create_patch=True,
        word_diff='porcelain',
    )

    delta = collections.OrderedDict()
    for key in 'words_added', 'words_deleted', 'characters_added', 'characters_deleted':
        delta[key] = 0
    for diff in diffs:
        diff_str = str(diff)
        additions = added_pattern.findall(diff_str)
        deletions = deleted_pattern.findall(diff_str)

        # Attempt to ignore relocated lines
        additions, deletions = (
            [x for x in additions if get_word_count(x) < 4 or x not in set(deletions)],
            [x for x in deletions if get_word_count(x) < 4 or x not in set(additions)],
        )
        for added in additions:
            delta['words_added'] += get_word_count(added)
            delta['characters_added'] += len(added)
        for deleted in deletions:
            delta['words_deleted'] += get_word_count(deleted)
            delta['characters_deleted'] += len(deleted)
    return(delta)

In [4]:
rows = list()
for commit in repo.iter_commits():
    row = collections.OrderedDict()
    row['commit'] = commit.hexsha
    row['author_name'] = commit.author.name
    row['author_email'] = commit.author.email
    row['committer_name'] = commit.committer.name
    row['authored_datetime'] = commit.authored_datetime
    row['committed_datetime'] = commit.committed_datetime
    row['summary'] = commit.summary
    row['count'] = commit.count()
    row['merge'] = int(len(commit.parents) > 1)
    row['parents'] = ', '.join(x.hexsha for x in commit.parents)
    row.update(get_commit_stats(commit))
    rows.append(row)

rows = list(reversed(rows))
commit_df = pandas.DataFrame(rows)

In [5]:
commit_df.tail(5)

Unnamed: 0,commit,author_name,author_email,committer_name,authored_datetime,committed_datetime,summary,count,merge,parents,words_added,words_deleted,characters_added,characters_deleted
750,77fa6be69b5060563988bbd0031fa45958b81ccb,Anthony Gitter,agitter@users.noreply.github.com,GitHub,2018-03-05 10:45:28-06:00,2018-03-05 10:45:28-06:00,Add end to end protein structure prediction (#...,751,0,7ea8733a9bbf66ecd48dbdd4467d2f9f94def3ce,37,27,321,153
751,09cb8c6ff2278e69108bd80fc112a3fdc09f02be,Alexandr Kalinin,1107762+alxndrkalinin@users.noreply.github.com,Anthony Gitter,2018-03-05 11:55:55-05:00,2018-03-05 10:55:55-06:00,2 examples of small data training in medical i...,752,0,77fa6be69b5060563988bbd0031fa45958b81ccb,165,0,1255,0
752,fafc92abf2716166060a22a3bc60d123011d3e5f,Stephen Ra,stephenrra@gmail.com,Casey Greene,2018-03-05 16:03:54-05:00,2018-03-05 16:03:54-05:00,Minor fix VAE table entry (#822),753,0,09cb8c6ff2278e69108bd80fc112a3fdc09f02be,38,21,322,142
753,9c2d9c26b8fd40c321a80b854e2e74677cd38c79,Anthony Gitter,agitter@users.noreply.github.com,GitHub,2018-03-05 16:23:27-06:00,2018-03-05 16:23:27-06:00,Remove newline (#830),754,0,fafc92abf2716166060a22a3bc60d123011d3e5f,36,1,320,9
754,63d2468883ea69ad7ad638c39efab0fcbe026298,Anthony Gitter,agitter@users.noreply.github.com,Casey Greene,2018-03-06 07:17:34-06:00,2018-03-06 08:17:34-05:00,Remove duplicate reference (#831),755,0,9c2d9c26b8fd40c321a80b854e2e74677cd38c79,26,1,241,46


## Manual author name fixes

In [6]:
path = pathlib.Path('renamer.tsv')
renamer = dict()
if path.exists():
    df = pandas.read_table(path)
    renamer = dict(zip(df.name, df.rename_to))
len(renamer)

13

In [7]:
for column in 'author_name', 'committer_name':
    print(f"Unique {column}s begore renaming {commit_df[column].nunique()}")
    commit_df[column].replace(renamer, inplace=True)
    print(f"Unique {column}s after renaming {commit_df[column].nunique()}")

Unique author_names begore renaming 53
Unique author_names after renaming 47
Unique committer_names begore renaming 12
Unique committer_names after renaming 11


In [8]:
commit_df.head(2)

Unnamed: 0,commit,author_name,author_email,committer_name,authored_datetime,committed_datetime,summary,count,merge,parents,words_added,words_deleted,characters_added,characters_deleted
0,e1529c48fe2dd83c81cc91a09d3b80fdf40e16bb,Casey Greene,cgreene@users.noreply.github.com,Casey Greene,2016-08-02 13:41:56-04:00,2016-08-02 13:41:56-04:00,Initial commit,1,0,,0,0,0,0
1,c98f9789c6fdc9a7313456fb543fbe44b805907e,Casey Greene,cgreene@users.noreply.github.com,GitHub,2016-08-02 13:54:09-04:00,2016-08-02 13:54:09-04:00,draft description of repo,2,0,e1529c48fe2dd83c81cc91a09d3b80fdf40e16bb,0,0,0,0


## Export

In [9]:
commit_df.to_csv('commits.tsv', sep='\t', index=False)