In [18]:
import json

# Each line of the file is parsed as one JSON document (JSON Lines layout).
# Read it as UTF-8 explicitly, since the platform default encoding is not
# guaranteed to be UTF-8, and skip blank lines, which json.loads rejects.
data = []
with open("dataset/github_commits.json", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            data.append(json.loads(line))

In [19]:
import pandas as pd

# Flatten the nested commit records into a table; nested keys become
# dot-separated column names such as "author.name".
df = pd.json_normalize(data, sep=".")

In [20]:
# Report the table dimensions, then preview the first five rows.
frame_shape = df.shape
print("shape:", frame_shape)
df.head(n=5)

shape: (633631, 11)


Unnamed: 0,commit,tree,parent,subject,message,repo,author.name,author.email,author.time_sec,author.tz_offset,author.date.seconds
0,aa891f17e0c1125126c12e1b287014dd69806e93,a993a3bd7a9cbb1c58e530b1fad0d469343e24ad,[9f57e19170ed93da4a2e308b356919ebbc301cc7],[clang] Add clangd documentation,[clang] Add clangd documentation\n\nAdd docume...,chromium/chromium,Haojian Wu,ca27d53ab9097d87024465d88d5a742b507e57d9@gmail...,1554476212,0,1554476212
1,cd4b25edba76a6ecd4b6ff2cd9e6663649aa1d67,14e1b8c595b27c2df04a2bf8ce8e5c0d62322a73,[ab6b83a2f81557c6cd8b0eff1f3306ef1ba87cfc],Add OWNERS file for content/network.,Add OWNERS file for content/network.\n\nBUG=No...,chromium/chromium,rdsmith,8692e49a5d99feee623028da27611d5ca92db77f@chrom...,1492638383,-420,1492638383
2,d9d12f298ddc011d5c621a47513889c56cce20a2,0ca9835a68adf80f8c98f24170b50f29f653d7c7,[5e55f2568b344cf8fac05eb5c535e7cc6aaacadb],RenderSubtree: Add a fieldset perftest.,RenderSubtree: Add a fieldset perftest.\n\nThi...,chromium/chromium,Vladimir Levin,8725865b9599790ed3778b39dc8e6dce7c901da3@chrom...,1578515323,0,1578515323
3,5b5906fc316e7c9cbd1b869ffa5499c30fbdb9b9,6574664bd669a6f4e442d0e87d3c9b8e50d9155c,[db789cec2a9c6c96c1c0770db8fd20117d897d48],CSS: parsing tests for sizing properties,CSS: parsing tests for sizing properties\n\nWe...,chromium/chromium,Eric Willigers,1f4abbcd3e1e2b3265a21aff61b02343fb361457@chrom...,1557142569,0,1557142569
4,7eec667b47b35671945e6d9b20238636c303e50c,8beb2e0c4e30acd5da6fbdd92eeb13a7e3b66f63,[1cecab463353bdbb4ad45bef32d5f7b0f6c3663c],"Reland ""[PE] Ensure update of LayoutSVGShape::...","Reland ""[PE] Ensure update of LayoutSVGShape::...",chromium/chromium,Xianzhu Wang,56bf68553a6eeecbbaf7c80da87195a59c9681bb@chrom...,1528322487,0,1528322487


In [6]:
# Number of distinct author names. dropna=False keeps a missing name (if any)
# in the count, exactly as len(Series.unique()) would.
df["author.name"].nunique(dropna=False)

6705

In [7]:
# Time of the first and last commit in the dataset.
from datetime import datetime, timezone

# author.time_sec is treated as Unix epoch seconds; take the extremes.
fts = int(df["author.time_sec"].min())
lts = int(df["author.time_sec"].max())

# Convert in UTC. A bare fromtimestamp() uses the local timezone of the
# machine running the notebook, so the printed times would differ by host.
print("first commit:", datetime.fromtimestamp(fts, tz=timezone.utc))
print("last commit:", datetime.fromtimestamp(lts, tz=timezone.utc))

first commit: 2013-09-27 13:58:57
last commit: 2022-04-14 00:40:02


In [25]:
from neo4j import GraphDatabase
from dotenv import load_dotenv
import os

# Connection settings are read from the environment. Manually create a .env
# file with NEO4J_URI, NEO4J_PASSWORD and (optionally) NEO4J_USERNAME in the
# same directory first; load_dotenv() copies them into the environment.
load_dotenv()
user = os.getenv("NEO4J_USERNAME", "neo4j")
pwd = os.getenv("NEO4J_PASSWORD")
uri = os.getenv("NEO4J_URI")

# Fail fast with a clear message instead of falling back to a hard-coded
# password or a placeholder URI that can never connect.
_missing_vars = [
    var_name
    for var_name, value in (("NEO4J_PASSWORD", pwd), ("NEO4J_URI", uri))
    if not value
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_vars)
        + ". Define them in a .env file next to this notebook."
    )

driver = GraphDatabase.driver(uri, auth=(user, pwd))

**Reminder:** We use the driver to create a session to access the Neo4j database. While the transactions are executing, do not run any other cells; otherwise the session will be interrupted.

In [27]:
def create_user(tx, name, email):
    """Create one ``User`` node carrying the given name and email.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        name: Value stored in the node's ``name`` property.
        email: Value stored in the node's ``email`` property.

    The query uses ``CREATE``, so every call adds a new node.
    """
    cypher = "CREATE (:User {name: $name, email: $email})"
    parameters = {"name": name, "email": email}
    tx.run(cypher, **parameters)

def create_commit(tx, hash, title):
    """Create one ``Commit`` node carrying the given hash and title.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        hash: Value stored in the node's ``hash`` property.
        title: Value stored in the node's ``title`` property.

    The query uses ``CREATE``, so every call adds a new node.
    """
    cypher = "CREATE (:Commit {hash: $hash, title: $title})"
    parameters = {"hash": hash, "title": title}
    tx.run(cypher, **parameters)

def create_authorship(tx, user_name, commit_hash):
    """Link a user to a commit with an ``AUTHOR`` relationship.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        user_name: Matched against the ``name`` property of ``User`` nodes.
        commit_hash: Matched against the ``hash`` property of ``Commit`` nodes.

    The user is matched by name only, so every ``User`` node sharing that
    name is linked. ``MERGE`` is used for the relationship itself.
    """
    cypher = """
        MATCH (a:User {name: $name})
        MATCH (b:Commit {hash: $hash})
        MERGE (a)-[:AUTHOR]->(b)"""
    parameters = {"name": user_name, "hash": commit_hash}
    tx.run(cypher, **parameters)

In [17]:
# Add contributors to the graph: one User node per distinct
# (author name, author email) pair found in the commits.
contirbutors = df[["author.name", "author.email"]].drop_duplicates()
with driver.session() as s:
    contributor_pairs = zip(contirbutors["author.name"], contirbutors["author.email"])
    for author_name, author_email in contributor_pairs:
        s.write_transaction(create_user, author_name, author_email)

In [None]:
# Add commits and their authorship relationships to the graph.
def _create_commit_with_author(tx, commit_hash, title, author_name):
    """Create a Commit node and its AUTHOR relationship in a single transaction.

    Running both statements through the same ``tx`` keeps the pair atomic
    (a commit is never stored without its authorship link) and halves the
    number of transactions compared with issuing them separately.
    """
    create_commit(tx, commit_hash, title)
    create_authorship(tx, author_name, commit_hash)


with driver.session() as s:
    # Iterate over the column values directly rather than df.loc[i, ...]:
    # this does not depend on df having a default 0..n-1 index and avoids a
    # label lookup per cell on every row.
    commit_rows = zip(df["commit"], df["subject"], df["author.name"])
    for commit_hash, title, author_name in commit_rows:
        s.write_transaction(_create_commit_with_author, commit_hash, title, author_name)