# git2net analysis for the seaborn repository

First we clone the repo

In [1]:
import copy
import json
import math
import os
import shutil
import sqlite3
from datetime import date, datetime, timedelta

import git2net
import matplotlib.pyplot as plt
import pandas as pd
import pathpy as pp
import pygit2 as git2

# Remote repository that is cloned and analysed.
git_repo_url = 'https://github.com/mwaskom/seaborn.git'
# NOTE(review): not referenced by any cell visible in this notebook.
local_directory = '.'
# Target directory of the clone; an existing directory at this path is deleted before cloning.
git_repo_dir = 'notebooks/repos/seaborn4analysis'
# SQLite database that git2net mines into and that the analysis cells read from.
sqlite_db_file = 'notebooks/databases/seaborn.db'

Clone repo:

In [None]:
# Start from a clean checkout: delete any clone left over from a previous run.
if os.path.exists(git_repo_dir):
    shutil.rmtree(git_repo_dir)

# Clone the remote into git_repo_dir as a non-bare repository.
repo = git2.clone_repository(git_repo_url, git_repo_dir)

Then we create the database for it. We use `max_modifications=100`, so that most commits are processed.

Mine repo:

In [None]:
# Mine the cloned repository into the SQLite database.
# Uncomment the next two lines to delete an existing database and mine from scratch.
#if os.path.exists(sqlite_db_file):
#    os.remove(sqlite_db_file)

# Threshold handed to git2net. Per the notebook text, commits that modify more
# files than this are left unprocessed (TODO: confirm against the git2net docs).
max_modifications = 100

git2net.mine_git_repo(git_repo_dir, sqlite_db_file, max_modifications=max_modifications)

Let's check the commits that had more than 100 files modified

In [None]:
# Ask git2net for a summary of the mining state of this repository/database pair.
git2net.mining_state_summary(git_repo_dir, sqlite_db_file)

### Database modification
Replacing the aliases belonging to the same person.

In [None]:
# Open the mined database and load the complete commits table.
con = sqlite3.connect(sqlite_db_file)
data = pd.read_sql("""SELECT * FROM commits""", con)

# List every distinct (email, name) pair together with how often it occurs,
# so that aliases belonging to the same person can be spotted by eye.
names = data[['author_email', 'author_name']].copy()
names.groupby(['author_email', 'author_name']).size().reset_index(name='count')

# Once the aliases found above have been collected in aliases.json, apply them:
#with open('aliases.json') as file:
#    aliases = json.load(file)

#data = data.replace(aliases)

# ...and write the cleaned table back to the database:
#data.to_sql('commits', con, if_exists='replace')

In [22]:
# Normalise characters in author/committer names and write the result back
# to the commits table.
con = sqlite3.connect(sqlite_db_file)
try:
    data = pd.read_sql("""SELECT * FROM commits""", con)

    # Mapping of substring -> replacement, maintained by hand in a JSON file.
    with open('notebooks/replace_chars.json', encoding='utf-8') as file:
        replace_dict = json.load(file)

    # Series.str.replace returns a new Series, so the result has to be assigned
    # back; otherwise the table is rewritten unchanged.
    # regex=False treats the keys as literal text (assumes replace_chars.json
    # holds plain characters, not patterns -- TODO confirm).
    for column in ('author_name', 'committer_name'):
        for key, value in replace_dict.items():
            data[column] = data[column].str.replace(key, value, regex=False)

    # index=False keeps the DataFrame's positional index out of the table, so
    # the cell can be re-run without adding an extra 'index' column.
    data.to_sql('commits', con, if_exists='replace', index=False)
finally:
    con.close()

In [23]:
# Display the `data` DataFrame (the contents of the commits table) as the cell output.
data

Unnamed: 0,hash,author_email,author_name,committer_email,committer_name,author_date,committer_date,author_timezone,committer_timezone,no_of_modifications,commit_message_len,project_name,parents,merge,in_main_branch,branches
0,31eda8fc053ff3aacc20e7c01570efac01c35db1,mwaskom@stanford.edu,Michael Waskom,mwaskom@stanford.edu,Michael Waskom,2012-06-18 12:30:41,2012-06-18 12:30:41,25200,25200,1,13,seaborn4analysis,36dd99374a1d6857af7652a9fb832a712a066e16,0,1,master
1,36dd99374a1d6857af7652a9fb832a712a066e16,mwaskom@stanford.edu,Michael Waskom,mwaskom@stanford.edu,Michael Waskom,2012-06-18 11:40:27,2012-06-18 11:40:27,25200,25200,6,14,seaborn4analysis,,0,1,master
2,9caebbff04919e8db78baf544d825190d3fc49d8,mwaskom@stanford.edu,Michael Waskom,mwaskom@stanford.edu,Michael Waskom,2012-06-18 13:15:31,2012-06-18 13:15:31,25200,25200,1,31,seaborn4analysis,31eda8fc053ff3aacc20e7c01570efac01c35db1,0,1,master
3,f39de6924a5ae1d77c013177f177ac9f48e9a72f,mwaskom@stanford.edu,Michael Waskom,mwaskom@stanford.edu,Michael Waskom,2012-06-18 13:40:57,2012-06-18 13:40:57,25200,25200,1,24,seaborn4analysis,9caebbff04919e8db78baf544d825190d3fc49d8,0,1,master
4,22d9c3b4e73adc2f9699a4e00141373f695b8e2e,mwaskom@stanford.edu,Michael Waskom,mwaskom@stanford.edu,Michael Waskom,2012-06-18 16:19:02,2012-06-18 16:19:02,25200,25200,1,44,seaborn4analysis,f39de6924a5ae1d77c013177f177ac9f48e9a72f,0,1,master
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2754,adcc3fbd1e182bbf7e12a5500265f3b6a65d7af9,mwaskom@nyu.edu,Michael Waskom,mwaskom@nyu.edu,Michael Waskom,2020-01-18 18:19:29,2020-01-18 18:19:29,18000,18000,0,79,seaborn4analysis,"11f9de9f6b327f70ed7e4f9f6feb3dc17f172c1c,087f0...",1,1,master
2755,0b7697dea9f421e65bbd2c6e8fca73fee33e4191,mwaskom@users.noreply.github.com,Michael Waskom,noreply@github.com,GitHub,2020-09-08 07:18:28,2020-09-08 07:18:28,14400,14400,81,254,seaborn4analysis,0547bb1cef8f7a1d9080ec96bb99841afb6bdeba,0,1,master
2756,b9d649e9af5ec722f4c7fbdb807b5768adcd8bb3,mwaskom@nyu.edu,Michael Waskom,mwaskom@nyu.edu,Michael Waskom,2020-08-17 18:02:30,2020-08-17 18:02:30,14400,14400,0,97,seaborn4analysis,"c0c040cc0db547246b58bc7e73202fa2f01453e7,a40e4...",1,1,master
2757,0547bb1cef8f7a1d9080ec96bb99841afb6bdeba,mwaskom@users.noreply.github.com,Michael Waskom,noreply@github.com,GitHub,2020-09-07 15:59:15,2020-09-07 15:59:15,14400,14400,43,856,seaborn4analysis,89f71465c76d447cc5a22e38e4d8813cba2f84dc,0,1,master


### Co-author networks

Now let's visualize the result

In [None]:
# Build the temporal co-editing network from the mined database ...
t, node_info, edge_info = git2net.get_coediting_network(sqlite_db_file)
# ... and convert it to a static pathpy Network, shown as the cell output.
pp.Network.from_temporal_network(t)

Which files were edited by which authors:

In [None]:
# Author/file bipartite network, converted from its temporal form into a
# static pathpy Network for plotting.
t, node_info, edge_info = git2net.get_bipartite_network(sqlite_db_file)
n = pp.Network.from_temporal_network(t)

# One colour per node class, so authors and files can be told apart.
colour_map = {'author': '#73D2DE', 'file': '#2E5EAA'}
node_classes = node_info['class']
node_color = {name: colour_map[node_classes[name]] for name in n.nodes}
pp.visualisation.plot(n, node_color=node_color)

Co-authorship network

In [None]:
# Co-authorship network; no time filter is passed here, so the call is not
# restricted to any particular period.
n, node_info, edge_info = git2net.get_coauthorship_network(sqlite_db_file)
# Show the network as the cell output.
n

The network is too complex because we consider the whole timeframe of the repository since its creation. Therefore we need to filter along the time dimension, and in order to do that we need to find the first and last commit in the repo.

In [None]:
# Connection reused by the following cell.
db = sqlite3.connect(sqlite_db_file)

# committer_date is stored as text in this layout.
date_format = '%Y-%m-%d %H:%M:%S'

# Latest and earliest committer dates in the commits table, parsed to datetime.
latest = pd.read_sql_query("SELECT max(committer_date) as max FROM commits", db)
max_date = datetime.strptime(latest['max'].item(), date_format)

earliest = pd.read_sql_query("SELECT min(committer_date) as min FROM commits", db)
min_date = datetime.strptime(earliest['min'].item(), date_format)

print('Min date: ', min_date)
print('Max date: ', max_date)

In order not to select an 'empty' time period (relatively few commits, e.g. holiday season), it's also worth observing the number of commits over time.

In [None]:
pdCommits = pd.read_sql_query("SELECT * FROM commits", db)

# Zero-filled calendar between the first and last commit, so that days
# without any commit still get a (zero-height) bar.
n_days = (max_date - min_date).days + 1
days = {}
for offset in range(n_days):
    days[(min_date + timedelta(days=offset)).date()] = 0


def _to_day(stamp):
    """Parse a committer_date string and keep only its calendar date."""
    return datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S').date()


# Number of commits per calendar day.
commit_dates = pdCommits['committer_date'].apply(_to_day).value_counts()

# Overwrite the zero placeholders with the actual counts.
for day in commit_dates.index:
    days[day] = commit_dates[day]

keys = days.keys()
values = days.values()
plt.figure(figsize=(20, 5))
plt.bar(keys, values)

Let's choose the year 2020 based on the plot, as that was the busiest year.

In [None]:
# Restrict the co-authorship network to calendar year 2020.
time_from = datetime(2020, 1, 1)
# datetime(2020, 12, 31) alone means midnight at the *start* of 31 December;
# extend the bound to the last second of the day so that day is included.
time_to = datetime(2020, 12, 31, 23, 59, 59)
n, node_info, edge_info = git2net.get_coauthorship_network(sqlite_db_file, time_from=time_from, time_to=time_to)
# Show the filtered network as the cell output.
n

We can set the node size based on each author's degree in the network. This emphasizes who collaborates most with others in the given timeframe.

In [None]:
# Pair every node with its degree and size it as degree + 3, so that even
# nodes of degree zero remain visible.
node_style = zip(n.nodes, n.node_properties('degree'))
style = {'node_size': {name: 3 + degree for name, degree in node_style}}
pp.visualisation.plot(n, **style)

## Bipartite networks
Because our network changes over time, we would like to visualize each year consecutively one after the other. We can use the pathpy temporal networks for this. 

In [24]:
def collab_network(sqlite_db_file, min_date, max_date):
    """Build an author-to-author network for the window [min_date, max_date].

    The author/file bipartite network is requested from git2net for the given
    window, flattened into a static pathpy network, and then projected onto
    the authors: file nodes are dropped and two distinct authors are linked
    whenever both have an edge to the same file.

    Args:
        sqlite_db_file: Path of the SQLite database mined by git2net.
        min_date: ``datetime`` marking the start of the window.
        max_date: ``datetime`` marking the end of the window.

    Returns:
        A pathpy ``Network`` that contains only the non-file nodes of the
        bipartite network plus the added author-author edges.
    """
    # Hand the window to git2net; without it every call would return the
    # network of the repository's whole history, whatever dates were passed.
    t, node_info, edge_info = git2net.get_bipartite_network(
        sqlite_db_file, time_from=min_date, time_to=max_date
    )
    n = pp.Network.from_temporal_network(t)
    node_class = node_info['class']

    # Work on a copy so that `n` keeps the file nodes needed for the lookups below.
    new_n = copy.deepcopy(n)
    for node in n.nodes:
        if node_class[node] == 'file':
            new_n.remove_node(node)

    # Snapshot the node names: edges are added to new_n inside the loop.
    for node in list(new_n.nodes):
        for f in n.successors[node]:
            for pre in n.predecessors[f]:
                if node != pre:
                    new_n.add_edge(node, pre)

    return new_n


In [None]:
# Slide a 30-day window over the period one day at a time and record, for
# each window position i, the collaboration edges found in that window.
time_from = datetime(2013, 11, 1)
time_to = datetime(2013, 12, 5)
interval = timedelta(days=30)
delta = timedelta(days=1)

# Rebinding t2 is enough to discard a previous network; an explicit `del t2`
# would raise NameError when this cell runs before t2 has ever been defined.
t2 = pp.TemporalNetwork()

n_windows = math.ceil((time_to - time_from - interval) / delta)
for i in range(n_windows):
    start = time_from + i * delta
    end = start + interval
    network = collab_network(sqlite_db_file, start, end)

    # The window index serves as the timestamp of the temporal edge.
    for edge in network.edges:
        t2.add_edge(edge[0], edge[1], i)

t2


In [None]:
# help(pp.temporal_network)

# Temporal network with one time step per calendar year: every co-authorship
# edge found in a year is added with that year as its timestamp.
t2 = pp.TemporalNetwork()
years = []  # the per-year co-authorship networks, in chronological order

# range() excludes its upper bound, so add 1 to include the year of the last commit.
for year in range(min_date.year, max_date.year + 1):
    print('Processing year: ', year)
    n, node_info, edge_info = git2net.get_coauthorship_network(
        sqlite_db_file,
        time_from=datetime(year, 1, 1),
        # Last second of the year; datetime(year, 12, 31) alone is midnight
        # at the start of 31 December and would drop that day's commits.
        time_to=datetime(year, 12, 31, 23, 59, 59),
    )
    years.append(n)
    # Edge keys are (source, target) pairs; spaces in the names are replaced
    # by underscores before they are used as node names.
    for source, target in n.edges:
        t2.add_edge(source.replace(' ', '_'), target.replace(' ', '_'), year)

# t2 = t.filter_edges(lambda v, w, time: True if (time_from <= time <= time_to) else False)

In [None]:
# Playback and appearance settings forwarded to the pathpy plot of the
# temporal network.
style = dict(
    ts_per_frame=1,
    ms_per_frame=2000,
    look_ahead=0,
    look_behind=0,
    node_size=15,
    inactive_edge_width=2,
    active_edge_width=4,
    label_color='#000000',
    label_size='8px',
    label_offset=[0, 5],
)
print(t2)
pp.visualisation.plot(t2, **style, width=1000, height=1000)