# Extract GitHub contributions over time

Uses [GitPython](http://gitpython.readthedocs.io/en/stable/).

In [1]:
import collections
import pathlib
import re
import datetime
import git
import pandas
import pytz

In [2]:
repo = git.Repo(path = '../')
repo.head.reference = "master"
print(repo.head.commit)

2c88132aa6e259244b474470bfe72722f89f1683


In [3]:
# These patterns are reversed because our diffs are backwards
# -R (R=True, suggested in https://git.io/vdigv) doesn't seem to work
added_pattern = re.compile(r'^-(.+)', flags=re.MULTILINE)
deleted_pattern = re.compile(r'^\+(.+)', flags=re.MULTILINE)
pattern_alphanum = re.compile(r'\w')

def get_word_count(text):
    """
    Compute word count of text. Splits by whitespace and only retains
    words with at least one alphanumeric character [a-zA-Z0-9_].
    """
    words = text.split()
    words = list(filter(pattern_alphanum.match, words))
    return len(words)

def get_commit_stats(commit):
    """
    Return addition and deletion (based on `git diff --word-diff`) for a commit.
    
    https://git-scm.com/docs/git-diff
    http://gitpython.readthedocs.io/en/stable/reference.html#module-git.diff
    """
    diffs = commit.diff(
        other=commit.parents[0] if commit.parents else git.NULL_TREE,
        paths=['sections/*.md', 'content/*.md'],
        create_patch=True,
        word_diff='porcelain',
    )

    delta = collections.OrderedDict()
    for key in 'words_added', 'words_deleted', 'characters_added', 'characters_deleted':
        delta[key] = 0
    for diff in diffs:
        diff_str = str(diff)
        additions = added_pattern.findall(diff_str)
        deletions = deleted_pattern.findall(diff_str)

        # Attempt to ignore relocated lines
        additions, deletions = (
            [x for x in additions if get_word_count(x) < 4 or x not in set(deletions)],
            [x for x in deletions if get_word_count(x) < 4 or x not in set(additions)],
        )
        for added in additions:
            delta['words_added'] += get_word_count(added)
            delta['characters_added'] += len(added)
        for deleted in deletions:
            delta['words_deleted'] += get_word_count(deleted)
            delta['characters_deleted'] += len(deleted)
    return(delta)

In [4]:
rows = list()
# We want to filter out anything from the Manubot development project that predates the COVID-19 project
# The COVID-19 project began on 3/20/2020, but let's build in one extra day just in case timezones cause issues
tz = pytz.timezone('America/New_York')
project_start = datetime.datetime(2020, 3, 19, 0, 0, 0, tzinfo=tz)
for commit in repo.iter_commits():
    if commit.authored_datetime < project_start:
        continue
    row = collections.OrderedDict()
    row['commit'] = commit.hexsha
    row['author_name'] = commit.author.name
    row['author_email'] = commit.author.email
    row['committer_name'] = commit.committer.name
    row['authored_datetime'] = commit.authored_datetime
    row['committed_datetime'] = commit.committed_datetime
    row['summary'] = commit.summary
    row['count'] = commit.count()
    row['merge'] = int(len(commit.parents) > 1)
    row['parents'] = ', '.join(x.hexsha for x in commit.parents)
    row.update(get_commit_stats(commit))
    rows.append(row)

rows = list(reversed(rows))
commit_df = pandas.DataFrame(rows)
backup_times = commit_df["authored_datetime"]

In [5]:
commit_df.tail(5)

Unnamed: 0,commit,author_name,author_email,committer_name,authored_datetime,committed_datetime,summary,count,merge,parents,words_added,words_deleted,characters_added,characters_deleted
4830,afa719f455502e4a1db8af12f1c515384ffce9c3,HM Rando,halie.rando@cuanschutz.edu,GitHub,2021-08-27 10:36:27-04:00,2021-08-27 10:36:27-04:00,Merge branch 'master' into dziakj1-patch-1,5144,1,"e424d1f22568754e1aa8f28c0b4107580d23707b, 7b87...",86,8,711,86
4831,74e07d945a23835c39f7d5e8a5d8ff8abfb6e05f,Anthony Gitter,agitter@users.noreply.github.com,GitHub,2021-08-27 09:39:51-05:00,2021-08-27 09:39:51-05:00,Merge branch 'master' into more-proofreading,5143,1,"8801b30c4ac2f2162025e267e90891ce4931ab0e, 7b87...",86,8,711,86
4832,0a25b75c7283bf512e1e3f9207e324c2e7a171e3,Anthony Gitter,agitter@users.noreply.github.com,GitHub,2021-08-27 10:03:49-05:00,2021-08-27 10:03:49-05:00,Merge pull request #1013 from greenelab/more-p...,5144,1,"7b87214ae8186c65de3b642c42e9fb6b8670680e, 74e0...",41,36,575,581
4833,d38aec4ed60a9cd95fa242f9fef99a14c294ef00,HM Rando,halie.rando@cuanschutz.edu,GitHub,2021-08-27 11:04:27-04:00,2021-08-27 11:04:27-04:00,Merge branch 'master' into dziakj1-patch-1,5149,1,"afa719f455502e4a1db8af12f1c515384ffce9c3, 0a25...",41,36,575,581
4834,2c88132aa6e259244b474470bfe72722f89f1683,HM Rando,halie.rando@cuanschutz.edu,GitHub,2021-08-27 11:46:27-04:00,2021-08-27 11:46:27-04:00,Merge pull request #1011 from greenelab/dziakj...,5150,1,"0a25b75c7283bf512e1e3f9207e324c2e7a171e3, d38a...",54,6,417,37


## Manual author name fixes

In [6]:
path = pathlib.Path('renamer.tsv')
renamer = dict()
if path.exists():
    df = pandas.read_csv(path, sep='\t')
    renamer = dict(zip(df.name, df.rename_to))
len(renamer)

35

In [7]:
for column in 'author_name', 'committer_name':
    print(f"Unique {column}s before renaming {commit_df[column].nunique()}")
    commit_df[column].replace(renamer, inplace=True)
    print(f"Unique {column}s after renaming {commit_df[column].nunique()}")

Unique author_names before renaming 62
Unique author_names after renaming 52
Unique committer_names before renaming 20
Unique committer_names after renaming 19


In [8]:
commit_df.head(2)

Unnamed: 0,commit,author_name,author_email,committer_name,authored_datetime,committed_datetime,summary,count,merge,parents,words_added,words_deleted,characters_added,characters_deleted
0,3f8cdd531916e7cfb3a295f53142c6930ad5c124,Halie Rando,halie.rando@pennmedicine.upenn.edu,Halie Rando,2020-03-20 15:18:05-04:00,2020-03-20 15:18:05-04:00,initial commit,316,0,1780fac0ac6bba1260a9da3886061730fa5d2765,146,0,1000,0
1,02b07f0f32bdcd7075dde7370bb08e6e36d908a1,Halie Rando,halie.rando@pennmedicine.upenn.edu,Halie Rando,2020-03-20 18:11:40-04:00,2020-03-20 18:11:40-04:00,updated readme and abstract,317,0,3f8cdd531916e7cfb3a295f53142c6930ad5c124,85,61,630,414


In [9]:
# Fix date format to be compatible with R
utc_dt = list()
for index, value in commit_df["authored_datetime"].items():
    utc_dt.append(value.astimezone(pytz.utc))
commit_df["authored_datetime"] = pandas.Series(utc_dt)
commit_df["authored_datetime"] = commit_df["authored_datetime"].dt.date

## Export

In [10]:
commit_df.to_csv('commits.tsv', sep='\t', index=False)

In [11]:
# Check and see whether these need to be cleaned up!
print(commit_df["author_name"].unique())

['Halie Rando' 'Casey Greene' 'Anthony Gitter' 'Michael Robson'
 'Ronnie Russell' 'Simina Boca' 'Nils Wellhausen' 'Christian Brueffer'
 'Ronan Lordan' "Lucy D'Agostino McGowan" 'Sandipan Ray'
 'Anna Ada Dattoli' 'Ryan Hagenson' 'Jeff Field' 'Adam MacLean'
 'Ryan Velazquez' 'Fengling Hu' 'John P Barton' 'Bharath Ramsundar'
 'Alexandra Lee' 'Diane Rafizadeh' 'John J. Dziak' 'Nafisa Jadavji'
 'Elizabeth Sell' 'Marouen Ben Guebila' 'Jinhui Wang' 'David Mai'
 'Yoson Park' 'Likhitha Kolla' 'Ashwin Skelly' 'Daniel Himmelstein'
 'J. Brian Byrd' 'Soumita Ghosh' 'David Manheim' 'Matthias Fax'
 'Yael Marshall' 'Vikas Bansal' 'Vincent Rubinetti' 'Sebastian Karcher'
 'Stephen Capone' 'Yanjun Qi' 'Yuchen Sun' 'Lamonica Shinholster'
 'Sergey Knyazev' 'Dimitri Perrin' 'Greg Szeto' 'C. Titus Brown'
 'Temitayo Lukan' 'Tiago Lubiana' 'Nick Fry' 'Amruta Naik' 'Yusha Sun']


In [12]:
# Go back to the external-resources branch
repo.head.reference = "contrib-viz"