# Extract GitHub contributions over time

Uses [GitPython](http://gitpython.readthedocs.io/en/stable/).

In [29]:
import collections
import pathlib
import re
import datetime
import git
import pandas
import pytz

In [2]:
repo = git.Repo(path = '../')
repo.head.reference = "master"
# State of deep-review submodule
print(repo.head.commit)

05acaf59cd3a3701df8ce0ad113773e154774d82


In [3]:
# These patterns are reversed because our diffs are backwards
# -R (R=True, suggested in https://git.io/vdigv) doesn't seem to work
added_pattern = re.compile(r'^-(.+)', flags=re.MULTILINE)
deleted_pattern = re.compile(r'^\+(.+)', flags=re.MULTILINE)
pattern_alphanum = re.compile(r'\w')

def get_word_count(text):
    """
    Compute word count of text. Splits by whitespace and only retains
    words with at least one alphanumeric character [a-zA-Z0-9_].
    """
    words = text.split()
    words = list(filter(pattern_alphanum.match, words))
    return len(words)

def get_commit_stats(commit):
    """
    Return addition and deletion (based on `git diff --word-diff`) for a commit.
    
    https://git-scm.com/docs/git-diff
    http://gitpython.readthedocs.io/en/stable/reference.html#module-git.diff
    """
    diffs = commit.diff(
        other=commit.parents[0] if commit.parents else git.NULL_TREE,
        paths=['sections/*.md', 'content/*.md'],
        create_patch=True,
        word_diff='porcelain',
    )

    delta = collections.OrderedDict()
    for key in 'words_added', 'words_deleted', 'characters_added', 'characters_deleted':
        delta[key] = 0
    for diff in diffs:
        diff_str = str(diff)
        additions = added_pattern.findall(diff_str)
        deletions = deleted_pattern.findall(diff_str)

        # Attempt to ignore relocated lines
        additions, deletions = (
            [x for x in additions if get_word_count(x) < 4 or x not in set(deletions)],
            [x for x in deletions if get_word_count(x) < 4 or x not in set(additions)],
        )
        for added in additions:
            delta['words_added'] += get_word_count(added)
            delta['characters_added'] += len(added)
        for deleted in deletions:
            delta['words_deleted'] += get_word_count(deleted)
            delta['characters_deleted'] += len(deleted)
    return(delta)

In [36]:
rows = list()
# We want to filter out anything from the Manubot development project that predates COVID-19
tz = pytz.timezone('America/New_York')
project_start = datetime.datetime(2020, 3, 19, 0, 0, 0, tzinfo=tz)
for commit in repo.iter_commits():
    if commit.authored_datetime < project_start:
        continue
    row = collections.OrderedDict()
    row['commit'] = commit.hexsha
    row['author_name'] = commit.author.name
    row['author_email'] = commit.author.email
    row['committer_name'] = commit.committer.name
    row['authored_datetime'] = commit.authored_datetime
    row['committed_datetime'] = commit.committed_datetime
    row['summary'] = commit.summary
    row['count'] = commit.count()
    row['merge'] = int(len(commit.parents) > 1)
    row['parents'] = ', '.join(x.hexsha for x in commit.parents)
    row.update(get_commit_stats(commit))
    rows.append(row)

rows = list(reversed(rows))
commit_df = pandas.DataFrame(rows)

2020-03-03 15:02:15-05:00
2020-02-28 12:46:34-05:00
2020-02-18 15:31:45-05:00
2020-02-18 11:25:59-05:00
2020-02-18 11:08:48-05:00
2020-02-18 09:33:23-05:00
2020-02-18 09:24:55-05:00
2020-02-17 17:25:12-05:00
2020-02-14 14:55:51-08:00
2020-01-30 13:44:14-08:00
2020-01-30 14:10:22-05:00
2020-01-29 10:45:25-05:00
2020-01-28 14:08:25-05:00
2020-01-18 19:33:09-05:00
2020-01-18 19:26:15-05:00
2020-01-17 15:57:48-05:00
2020-01-17 15:35:17-05:00
2020-01-14 17:31:56-05:00
2020-01-14 10:00:34-05:00
2020-01-04 17:27:12-05:00
2020-01-04 16:49:26-05:00
2019-12-18 13:11:22-05:00
2019-12-13 15:12:12-05:00
2019-11-20 15:53:36-05:00
2019-11-20 12:03:06-05:00
2019-11-20 10:55:00-05:00
2019-11-01 13:41:38-04:00
2019-10-31 17:44:04-04:00
2019-10-11 12:05:31-04:00
2019-09-18 09:28:40-05:00
2019-09-11 13:14:44-04:00
2019-09-06 12:07:38-04:00
2019-09-05 17:34:58-04:00
2019-08-30 12:16:45-04:00
2019-08-09 11:41:12-05:00
2019-08-04 15:26:38-04:00
2019-08-02 11:30:32-04:00
2019-08-01 11:01:38-04:00
2019-07-09 1

In [89]:
commit_df.tail(5)

Unnamed: 0,commit,author_name,author_email,committer_name,authored_datetime,committed_datetime,summary,count,merge,parents,words_added,words_deleted,characters_added,characters_deleted
4362,f2cc7e529db8e7f056d0229cc5a4db1b74dc96ba,Halie M. Rando,halie.rando@pennmedicine.upenn.edu,Halie M. Rando,2021-04-26,2021-04-26 17:12:38-04:00,intiial edits to intro,4677,0,421d256c01d8e6de59bb747445d34768ee9ae1ac,107,38,734,255
4363,1248a331b90c3baab6ac2c0697473423bb9856f5,Halie M. Rando,halie.rando@pennmedicine.upenn.edu,Halie M. Rando,2021-04-26,2021-04-26 17:13:26-04:00,resolve merge,4679,1,"f2cc7e529db8e7f056d0229cc5a4db1b74dc96ba, d53f...",43,3,309,27
4364,a824ec6b91a77bf839ce561c83db5f221613f9ad,Halie M. Rando,halie.rando@pennmedicine.upenn.edu,Halie M. Rando,2021-04-26,2021-04-26 17:35:27-04:00,restructure intro,4680,0,1248a331b90c3baab6ac2c0697473423bb9856f5,309,215,2132,1458
4365,889a954939a92b177c86efd5154f240cf2620d05,Halie M. Rando,halie.rando@pennmedicine.upenn.edu,Halie M. Rando,2021-04-26,2021-04-26 19:17:23-04:00,responses to @agitter's structural suggestions,4681,0,a824ec6b91a77bf839ce561c83db5f221613f9ad,1198,954,9057,7595
4366,05acaf59cd3a3701df8ce0ad113773e154774d82,Halie M. Rando,halie.rando@cuanschutz.edu,GitHub,2021-04-27,2021-04-27 07:59:36-04:00,Merge pull request #939 from rando2/methods,4682,1,"89dff08a5bb813c1bbc224275a347891ac8a532d, 889a...",2365,1491,17380,12011


## Manual author name fixes

In [46]:
path = pathlib.Path('renamer.tsv')
renamer = dict()
if path.exists():
    df = pandas.read_csv(path, sep='\t')
    renamer = dict(zip(df.name, df.rename_to))
len(renamer)

32

In [47]:
for column in 'author_name', 'committer_name':
    print(f"Unique {column}s before renaming {commit_df[column].nunique()}")
    commit_df[column].replace(renamer, inplace=True)
    print(f"Unique {column}s after renaming {commit_df[column].nunique()}")

Unique author_names before renaming 61
Unique author_names after renaming 53
Unique committer_names before renaming 19
Unique committer_names after renaming 18


In [92]:
commit_df.head(2)

Unnamed: 0,commit,author_name,author_email,committer_name,authored_datetime,committed_datetime,summary,count,merge,parents,words_added,words_deleted,characters_added,characters_deleted
0,3f8cdd531916e7cfb3a295f53142c6930ad5c124,Halie M. Rando,halie.rando@pennmedicine.upenn.edu,Halie M. Rando,2020-03-20,2020-03-20 15:18:05-04:00,initial commit,316,0,1780fac0ac6bba1260a9da3886061730fa5d2765,146,0,1000,0
1,02b07f0f32bdcd7075dde7370bb08e6e36d908a1,Halie M. Rando,halie.rando@pennmedicine.upenn.edu,Halie M. Rando,2020-03-20,2020-03-20 18:11:40-04:00,updated readme and abstract,317,0,3f8cdd531916e7cfb3a295f53142c6930ad5c124,85,61,630,414


In [85]:
# Fix date format to be compatible with R
# Run through again to confirm this actually works, accidentally overwrote the back-up df
utc_dt = list()
for index, value in commit_df["authored_datetime"].items():
    print(value.astimezone(pytz.utc))
    break
commit_df["authored_datetime"] = pandas.Series(utc_dt)
commit_df["authored_datetime"] = commit_df["authored_datetime"].dt.date

                                        commit      author_name  \
0     3f8cdd531916e7cfb3a295f53142c6930ad5c124   Halie M. Rando   
1     02b07f0f32bdcd7075dde7370bb08e6e36d908a1   Halie M. Rando   
2     19467047694bfcb22ed0cda4903bac7ff8bbb921  Casey S. Greene   
3     22efa5998f384ceb18addb8fa77a3ef9caff3bde  Casey S. Greene   
4     365bffd068da598336a18a253657e6be8b51fd94   Halie M. Rando   
5     7302c3523d2aff8de7570d3778ae90f2085e3748   Halie M. Rando   
6     b92547e40b0e8ccd59d3f278506adfc97b470125   Halie M. Rando   
7     7c38327a63c39afabfaf11a4e2136826effbeb98   Halie M. Rando   
8     b0638accfc8dfb2adb240f8e70a4d93f29de7457   Halie M. Rando   
9     5d06ae8bb5d784458c8cb7b5a1f2262c127eb0ef   Halie M. Rando   
10    ca508e1a79e3c932575504731f3dcae2328f62e7   Halie M. Rando   
11    59de7848336358a8c705df79b2cdda9b75ab205a   Halie M. Rando   
12    6c201b68d0d2621fa70b3c25ed772c0a0ea1048a   Halie M. Rando   
13    be27fbb90ef9f51987a03da978c2cdf6bc732926   Halie M. Rand

AttributeError: 'datetime.date' object has no attribute 'astimezone'

## Export

In [90]:
commit_df.to_csv('commits.tsv', sep='\t', index=False)

In [50]:
# These need to be cleaned up! But at least they are de-duped
print(commit_df["author_name"].unique())

['Halie M. Rando' 'Casey S. Greene' 'Anthony Gitter' 'Michael Robson'
 'Ronnie M. Russell' 'Simina M. Boca' 'Nils Wellhausen'
 'Christian Brueffer' 'Ronan Lordan' "Lucy D'Agostino McGowan"
 'Sandipan Ray' 'Anna Ada Dattoli' 'Ryan Hagenson' 'Jeff Field'
 'Adam L. MacLean' 'Adam L MacLean' 'Ryan Velazquez' 'Fengling Hu'
 'John P Barton' 'Bharath Ramsundar' 'Alexandra Lee' 'Diane Rafi'
 'John J. Dziak' 'Nafisa Jadavji' 'Elizabeth Sell' 'Marouen' 'Jinhui'
 'David Mai' 'Yoson Park' 'Likhitha Kolla' 'Ashwin Skelly'
 'Daniel Himmelstein' 'J. Brian Byrd' 'soumitagh' 'David Manheim'
 'Matthias Fax' 'Yael E. Marshall' 'Bansalvi' 'Vincent Rubinetti'
 'Sebastian Karcher' 'Stephen Capone' 'Yanjun Qi' 'Yuchen Sun'
 'Lamonica Shinholster' 'Sergey Knyazev' 'Dimitri Perrin' 'rlordan'
 'Greg Szeto' 'C. Titus Brown' 'Temitayo Lukan' 'Tiago Lubiana' 'Naika'
 'Yushua Sun']


In [91]:
repo.head.reference = "contrib-viz"