In [21]:
import pandas as pd
pd.set_option('display.max_rows', 500)


In [22]:
actions = pd.read_csv("github-action-run.csv.gz")
                          
actions.createdat = pd.to_datetime(actions.createdat * 1000000, utc = True)
actions.updatedat = pd.to_datetime(actions.updatedat * 1000000, utc = True)
actions.firstjobstarted = pd.to_datetime(actions.firstjobstarted * 1000000, utc = True)
actions.lastjobcompleted = pd.to_datetime(actions.lastjobcompleted * 1000000, utc = True)                       

In [23]:
def asf_project(pr):
    name = pr.split("-")[0]
    if name == "incubator":
        return pr.split("-")[1]
    return name

actions["project"] = actions.repo.apply(asf_project)


In [24]:
actions["wall"] = actions.lastjobcompleted - actions.firstjobstarted

In [25]:
actions["wallhours"] = actions.wall.dt.seconds / 60 / 60 

In [26]:
actions["jobhours"] = actions.jobseconds / 60 / 60

In [27]:
actions["month"] = actions.createdat.dt.strftime("%Y-%m")

In [28]:
actions =  actions[actions.month.isin(["2021-03","2021-02"])]
last_month = "2021-03"

## Number of projects using github actions

In [29]:
actions[["id"]].groupby([actions.project,actions.month]).count().reset_index().groupby("month").count()

Unnamed: 0_level_0,project,id
month,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-02,84,84
2021-03,91,91


## Number of jobhours used by month

Maximum available value is 180 * 24 * days == 129600 (30 days) .. 133920 (31 days)

In [30]:
actions[["jobhours"]].groupby([actions.project,actions.month]).count().reset_index().groupby("month").sum()

Unnamed: 0_level_0,jobhours
month,Unnamed: 1_level_1
2021-02,90586
2021-03,102817


## Number of jobs executed by apache projects (last month)

In [31]:
actions[actions.month == last_month][["project","id"]].groupby("project").count().sort_values('id', ascending=False).head(20)

Unnamed: 0_level_0,id
project,Unnamed: 1_level_1
pulsar,22529
arrow,12492
apisix,11196
skywalking,7145
airflow,6659
shardingsphere,5672
camel,5621
nuttx,4108
beam,4042
iotdb,3274


## User job hours per project (last month)


In [39]:
x = actions[actions.month == last_month][["project","jobhours"]].groupby("project").agg({"jobhours":["sum","mean", "max"]}).sort_values(('jobhours',"sum"), ascending=False)
x["percentage"] = (x["jobhours"]["sum"] / x["jobhours"]["sum"].sum())
x["cumpercentage"] = x.percentage.cumsum()
x.head(20)

Unnamed: 0_level_0,jobhours,jobhours,jobhours,percentage,cumpercentage
Unnamed: 0_level_1,sum,mean,max,Unnamed: 4_level_1,Unnamed: 5_level_1
project,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
nuttx,14138.818889,3.441777,17.845833,0.173619,0.173619
pulsar,10785.601944,0.478743,2.011667,0.132443,0.306061
airflow,8305.211111,1.247216,20.768056,0.101984,0.408046
skywalking,6852.736667,0.959095,7.520278,0.084149,0.492194
arrow,6290.633889,0.503573,24.359444,0.077246,0.56944
ozone,5484.444722,4.440846,17.473333,0.067347,0.636787
camel,4241.184722,0.754525,18.681389,0.05208,0.688867
iotdb,4007.576667,1.224061,36.676944,0.049211,0.738078
shardingsphere,2858.329444,0.503937,21.633056,0.035099,0.773177
beam,2782.366111,0.688364,46.451667,0.034166,0.807344


In [40]:
x[x["jobhours"]["sum"] > 1000].to_csv("heavy_users.csv", header=False)