# Data Analysis

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

In [2]:
def mine_frequent_patterns(
    transactions: list[list], support: float
) -> pd.DataFrame:
    te = TransactionEncoder()
    encoding = te.fit(transactions).transform(transactions)
    encoding_df = pd.DataFrame(encoding, columns=te.columns_)
    frequent_itemsets = apriori(encoding_df, min_support=support, use_colnames=True)
    frequent_itemsets["length"] = frequent_itemsets["itemsets"].apply(
        lambda x: len(x)
    )
    return frequent_itemsets

In [3]:
workflows_df = pd.read_pickle("../dumps/workflows_df.pkl")
actions_df = pd.read_pickle("../dumps/actions_df.pkl")
frequent_actions_df = pd.read_pickle("../dumps/frequent_actions_df.pkl")
frequent_actions_noTags_df = pd.read_pickle("../dumps/frequent_actions_noTags_df.pkl")
frequent_docker_commands_subsample_df = pd.read_pickle("../dumps/frequent_docker_commands_subsample_df.pkl")

## Descriptive statistics

### Repo Filtering

- Original [RepoReapers dataset](https://reporeapers.github.io/results/1.html): **1,853,195**
- Repositories classified as containing an _engineered software project_: **446,511** (not available: **38,742**)
- Repositories with _DS-related keywords_ in topics or description: **2516**
- Repositories with at least one workflow: **155**

### Workflows

- Total number of workflows found: **399**
- Valid workflows (valid YAML file): **397**
- Invalid workflows (invalid YAML file): **2**

First of all, let's ensure that all repositories and valid workflows are being analyzed.

In [4]:
workflows_df["repository"].unique().shape

(155,)

In [5]:
workflows_df["path"] = workflows_df["repository"] + "/" + workflows_df["filename"]
workflows_df["path"].unique().shape

(397,)

#### Number of workflows per repository

In [6]:
workflows_df.groupby("repository").count()["path"].describe()

count    155.000000
mean       2.561290
std        2.401525
min        1.000000
25%        1.000000
50%        2.000000
75%        3.000000
max       14.000000
Name: path, dtype: float64

#### Most common events triggering workflows

In [7]:
transactions = workflows_df['trigger_events'].tolist()
mine_frequent_patterns(transactions, support=0.05)

Unnamed: 0,support,itemsets,length
0,0.612091,(pull_request),1
1,0.670025,(push),1
2,0.057935,(release),1
3,0.13602,(schedule),1
4,0.13602,(workflow_dispatch),1
5,0.493703,"(pull_request, push)",2
6,0.060453,"(schedule, pull_request)",2
7,0.078086,"(pull_request, workflow_dispatch)",2
8,0.057935,"(push, workflow_dispatch)",2


## RQ1 -  "Is GitHub Actions used to automate project deployment?"

- ratio of workflows presenting actions / run commands which relate to Docker
- ratio of workflows that upload a container image to Docker-Hub or to GitHub Packages
- ratio of workflows that upload any type of software package to GitHub Packages.

#### Workflows with keyword `deploy` in name or filename

In [8]:
temp_df = workflows_df.loc[
    workflows_df["filename"].str.contains("deploy") | workflows_df["name"].str.contains("deploy")
]
print("Size:", temp_df.shape[0])
temp_df

Size: 2


Unnamed: 0,repository,filename,name,trigger_events,n_of_actions,docker_related_actions,n_of_run_commands,docker_related_commands,docker_commands,path
54,m09/syllable-counter,deploy.yml,Deploy,[push],2,False,2,False,[],m09/syllable-counter/deploy.yml
295,insideout10/wordlift-plugin,qa.deplyment.yml,docker_build_and_k8s_deployment,"[push, workflow_dispatch]",9,True,0,False,[],insideout10/wordlift-plugin/qa.deplyment.yml


#### Actions with keyword `deploy` in the slug

##### Number of distinct actions used in the dataset

Considering tags

In [12]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("deploy"), "action_slug"].unique().shape

(3,)

Not considering tags

In [13]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("deploy"), "action_slug_noTag"].unique().shape

(3,)

##### Number of workflows using at least one of such actions

In [14]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("deploy"), "workflow"].unique().shape

(4,)

#### Actions of category "Deployment"

In [155]:
temp_list = actions_df.loc[
    (actions_df["category_1"] == "Deployment") | (actions_df["category_2"] == "Deployment"),
    "action_slug_noTag"
].unique().tolist()
print(len(temp_list))
temp_list

13


['pypa/gh-action-pypi-publish',
 'uibcdf/action-build-and-upload-conda-packages',
 'larsoner/circleci-artifacts-redirector-action',
 'RalfG/python-wheels-manylinux-build',
 'JamesIves/github-pages-deploy-action',
 'svenstaro/upload-release-action',
 'release-drafter/release-drafter',
 'hashicorp/setup-terraform',
 'shimataro/ssh-key-action',
 'steebchen/kubectl',
 'Azure/k8s-deploy',
 'crazy-max/ghaction-github-pages',
 'mad9000/actions-find-and-replace-string']

#### Run commands containing keyword `deploy`

- Number of workflows containing at least one of such commands

#### Workflows with keyword `publish` in name or filename

In [10]:
temp_df = workflows_df.loc[
    workflows_df["filename"].str.contains("publish") | workflows_df["name"].str.contains("publish")
]
print("Size:", temp_df.shape[0])
temp_df.head()

Size: 17


Unnamed: 0,repository,filename,name,trigger_events,n_of_actions,docker_related_actions,n_of_run_commands,docker_related_commands,docker_commands,path
82,gunthercox/ChatterBot,pythonpublish.yml,Upload Python Package,[release],2,False,2,False,[],gunthercox/ChatterBot/pythonpublish.yml
100,project8/katydid,publish.yaml,Publish,"[push, pull_request, release, workflow_dispatc...",9,True,8,True,[build],project8/katydid/publish.yaml
102,HIT-SCIR/ltp,python-publish.yml,Upload Python Package,[release],3,False,2,False,[],HIT-SCIR/ltp/python-publish.yml
121,hpcc-systems/HPCC-Platform,nightly-publish.yml,Nightly master build and publish,[schedule],2,True,1,False,[],hpcc-systems/HPCC-Platform/nightly-publish.yml
124,hpcc-systems/HPCC-Platform,build-and-publish.yml,Build and publish,[push],4,True,0,False,[],hpcc-systems/HPCC-Platform/build-and-publish.yml


#### Actions with keyword `publish` in the slug

##### Number of distinct actions used in the dataset

Considering tags

In [15]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("publish"), "action_slug"].unique().shape

(5,)

Not considering tags

In [16]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("publish"), "action_slug_noTag"].unique().shape

(3,)

##### Number of workflows using at least one of such actions

In [17]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("publish"), "workflow"].unique().shape

(13,)

#### Run commands containing keyword `publish`

- Number of workflows containing at least one of such commands

#### Workflows with keyword `docker` in name or filename

In [23]:
temp_df = workflows_df.loc[
    workflows_df["filename"].str.contains("docker") | workflows_df["name"].str.contains("docker")
]
print("Size:", temp_df.shape[0])
temp_df

Size: 7


Unnamed: 0,repository,filename,name,trigger_events,n_of_actions,docker_related_actions,n_of_run_commands,docker_related_commands,docker_commands,path
295,insideout10/wordlift-plugin,qa.deplyment.yml,docker_build_and_k8s_deployment,"[push, workflow_dispatch]",9,True,0,False,[],insideout10/wordlift-plugin/qa.deplyment.yml
297,weecology/retriever,docker-publish.yml,Docker,"[push, pull_request]",6,True,5,False,[],weecology/retriever/docker-publish.yml
323,tiborsimko/invenio-data,docker-build-server.yml,Build CAP server image,[push],1,False,1,False,[],tiborsimko/invenio-data/docker-build-server.yml
324,tiborsimko/invenio-data,docker-build-ui.yml,Build CAP UI image,[push],1,False,1,False,[],tiborsimko/invenio-data/docker-build-ui.yml
328,tiborsimko/data.cern.ch,docker-build-server.yml,Build CAP server image,[push],1,False,1,False,[],tiborsimko/data.cern.ch/docker-build-server.yml
329,tiborsimko/data.cern.ch,docker-build-ui.yml,Build CAP UI image,[push],1,False,1,False,[],tiborsimko/data.cern.ch/docker-build-ui.yml
383,linkedin/gobblin,docker_build_publish.yaml,Build and Publish Docker image,"[push, pull_request, release]",4,True,1,False,[],linkedin/gobblin/docker_build_publish.yaml


#### Actions with the keyword `docker` in the slug

##### Number of distinct actions used in the dataset

Considering tags

In [43]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("docker"), "action_slug"].unique().shape

(14,)

Not considering tags

In [44]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("docker"), "action_slug_noTag"].unique().shape

(10,)

##### Number of workflows using at least one of such actions

In [48]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("docker"), "workflow"].unique().shape

(14,)

#### Run commands containing keyword `docker`

I.e., number of workflows containing at least one of such commands

In [45]:
workflows_df.loc[workflows_df["docker_related_commands"]].shape[0]

20

#### Docker commands

In [47]:
temp_df = workflows_df.loc[workflows_df["docker_commands"].map(len) > 0]
print(temp_df.shape[0])
temp_df

9


Unnamed: 0,repository,filename,name,trigger_events,n_of_actions,docker_related_actions,n_of_run_commands,docker_related_commands,docker_commands,path
76,mgbellemare/Arcade-Learning-Environment,build-ale-py.yml,Build ALE Python,[push],8,False,4,True,[login],mgbellemare/Arcade-Learning-Environment/build-...
100,project8/katydid,publish.yaml,Publish,"[push, pull_request, release, workflow_dispatc...",9,True,8,True,[build],project8/katydid/publish.yaml
115,hpcc-systems/HPCC-Platform,build-containers-target-branch.yml,Docker target branch build,[push],6,True,3,True,[manifest],hpcc-systems/HPCC-Platform/build-containers-ta...
119,hpcc-systems/HPCC-Platform,build-containers-pr.yml,Docker smoketest build,[pull_request],6,True,3,True,[manifest],hpcc-systems/HPCC-Platform/build-containers-pr...
218,cernopendata/opendata.cern.ch,ci.yml,CI,"[push, pull_request]",14,False,10,True,[run],cernopendata/opendata.cern.ch/ci.yml
296,insideout10/wordlift-plugin,main.yml,CI,"[push, pull_request]",3,False,2,True,"[run, network]",insideout10/wordlift-plugin/main.yml
355,apache/incubator-calcite,main.yml,CI,"[push, pull_request]",26,False,7,True,[logs],apache/incubator-calcite/main.yml
378,bgruening/docker-galaxy-stable,pull-request.yml,pr-test,[pull_request],3,False,9,True,"[buildx, ps]",bgruening/docker-galaxy-stable/pull-request.yml
379,bgruening/docker-galaxy-stable,compose.yml,build-and-test,[push],9,True,18,True,"[login, buildx, ps]",bgruening/docker-galaxy-stable/compose.yml


#### Workflows with the keyword `package` in name or filename

In [11]:
temp_df = workflows_df.loc[
    workflows_df["filename"].str.contains("package") | workflows_df["name"].str.contains("package")
]
print("Size:", temp_df.shape[0])
temp_df

Size: 9


Unnamed: 0,repository,filename,name,trigger_events,n_of_actions,docker_related_actions,n_of_run_commands,docker_related_commands,docker_commands,path
45,rasbt/mlxtend,python-package-conda.yml,Python Package using Conda,[push],2,False,4,False,[],rasbt/mlxtend/python-package-conda.yml
81,ipython/ipython,python-package.yml,Python package,"[push, pull_request]",2,False,2,False,[],ipython/ipython/python-package.yml
103,HIT-SCIR/ltp,python-package.yml,Python package,"[push, pull_request]",2,False,3,False,[],HIT-SCIR/ltp/python-package.yml
275,jmschrei/pomegranate,python-package.yml,build,"[push, pull_request]",3,False,3,False,[],jmschrei/pomegranate/python-package.yml
298,weecology/retriever,python-package.yml,Python package,"[push, pull_request]",3,False,6,False,[],weecology/retriever/python-package.yml
301,TheTorProject/ooni-pipeline,build_deb_packages.yml,Build deb packages,[pull_request],1,False,4,False,[],TheTorProject/ooni-pipeline/build_deb_packages...
302,arcturusannamalai/open-tamil,regression.yml,Open-Tamil Python package,[push],2,False,2,False,[],arcturusannamalai/open-tamil/regression.yml
311,nipy/nipype,package.yml,Packaging,[push],3,False,5,False,[],nipy/nipype/package.yml
333,hankcs/HanLP,python-package.yml,Python package,"[push, pull_request]",3,False,4,False,[],hankcs/HanLP/python-package.yml


#### Actions with the keyword `package` in the slug

##### Number of distinct actions used in the dataset

Considering tags

In [18]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("package"), "action_slug"].unique().shape

(1,)

Not considering tags

In [19]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("package"), "action_slug_noTag"].unique().shape

(1,)

##### Number of workflows using at least one of such actions

In [20]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("package"), "workflow"].unique().shape

(1,)

#### Run commands containing keyword `package`

- Number of workflows containing at least one of such commands

## RQ2 - "What are the most frequently used Actions?"

#### Total number of actions used in the dataset

Considering slugs

In [50]:
actions_df.action_slug.unique().shape

(203,)

Not considering slugs

In [51]:
actions_df.action_slug_noTag.unique().shape

(160,)

#### Average number of actions per workflow

In [74]:
actions_df.groupby("workflow")['action_slug'].describe()

Unnamed: 0_level_0,count,unique,top,freq
workflow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Berico-Technologies/CLAVIN-NERD/develop.yml,2,2,actions/checkout@v2,1
Berico-Technologies/CLAVIN-NERD/feature.yml,2,2,actions/checkout@v2,1
Berico-Technologies/CLAVIN-NERD/featureWindows.yml,2,2,actions/checkout@v2,1
Berico-Technologies/CLAVIN-NERD/master.yml,2,2,actions/checkout@v2,1
Berico-Technologies/CLAVIN/develop.yml,2,2,actions/checkout@v2,1
...,...,...,...,...
xSAVIKx/AndroidScreencast/ci.yaml,3,3,actions/checkout@v2,1
xraypy/xraylarch/test-ubuntu.yml,2,2,actions/checkout@v2,1
xraypy/xraylarch/test-windows.yml,2,2,actions/checkout@v2,1
zblz/naima/examples.yaml,4,2,actions/checkout@v2,2


Total number of actions

In [73]:
temp_df = actions_df.groupby("workflow")['action_slug'].describe()
temp_df["count"].astype("int").describe()

count    391.000000
mean       3.976982
std        4.287966
min        1.000000
25%        2.000000
50%        3.000000
75%        4.000000
max       42.000000
Name: count, dtype: float64

Total unique actions

In [75]:
temp_df["unique"].astype("int").describe()

count    391.000000
mean       2.820972
std        1.612029
min        1.000000
25%        2.000000
50%        3.000000
75%        4.000000
max       13.000000
Name: unique, dtype: float64

#### Actions / Run Commands

In [85]:
(workflows_df["n_of_actions"] / (workflows_df["n_of_actions"] + workflows_df["n_of_run_commands"])).describe()

count    397.000000
mean       0.543910
std        0.258788
min        0.000000
25%        0.333333
50%        0.500000
75%        0.727273
max        1.000000
dtype: float64

#### Number of actions available on the GitHub Marketplace
Ratio of actions available in the Marketplace vs custom actions not registered in the marketplace

In [101]:
total_distinct_actions_noSlug = actions_df.loc[
    :, ["action_slug_noTag", "available_in_marketplace"]
].drop_duplicates().shape[0]
total_distinct_actions_noSlug

160

In [102]:
available_in_marketplace = actions_df.loc[
    actions_df["available_in_marketplace"], ["action_slug_noTag", "available_in_marketplace"]
].drop_duplicates().shape[0]
available_in_marketplace

104

In [104]:
available_in_marketplace / total_distinct_actions_noSlug

0.65

#### Number of actions by verified creators

In [109]:
by_verified_creator = actions_df.loc[
    actions_df["from_verified_creator"], ["action_slug_noTag", "from_verified_creator"]
].drop_duplicates().shape[0]
by_verified_creator

30

#### Distribution of action categories

Distinct predefined (i.e., available in marketplace) actions

In [170]:
distinct_predef_actions = actions_df.loc[
    actions_df["available_in_marketplace"], ["action_slug_noTag", "category_1", "category_2"]
].drop_duplicates()
distinct_predef_actions_count = distinct_predef_actions.shape[0]
print("Count:", distinct_predef_actions_count, end="\n\n")
distinct_predef_actions.head()

Count: 104



Unnamed: 0,action_slug_noTag,category_1,category_2
0,actions/checkout,Utilities,
1,actions/setup-python,Utilities,
3,actions/cache,Utilities,Dependency management
4,actions/setup-java,Utilities,
6,actions/setup-node,Utilities,


Uncategorized

In [187]:
uncategorized = distinct_predef_actions.loc[
    distinct_predef_actions["category_1"].isnull() & distinct_predef_actions["category_2"].isnull()
]
uncategorized

Unnamed: 0,action_slug_noTag,category_1,category_2
1168,ts-graphviz/setup-graphviz,,


In [190]:
uncategorized_series = pd.Series([1], index=["Uncategorized"])
uncategorized_series

Uncategorized    1
dtype: int64

In [167]:
cat_1 = distinct_predef_actions["category_1"].value_counts()
cat_1

Utilities                 30
Continuous integration    21
Code quality               7
Dependency management      7
Deployment                 7
Container CI               6
Publishing                 5
Testing                    5
Code review                5
Reporting                  2
Security                   2
Chat                       2
Project management         1
Support                    1
Open Source management     1
Localization               1
Name: category_1, dtype: int64

In [168]:
cat_2 = distinct_predef_actions["category_2"].value_counts()
cat_2

Continuous integration    19
Utilities                 18
Deployment                 6
Publishing                 6
Testing                    6
Reporting                  4
Container CI               4
Code quality               4
Dependency management      3
Code review                3
Project management         2
Community                  1
Desktop tools              1
Chat                       1
Open Source management     1
Name: category_2, dtype: int64

In [193]:
value_counts_sum = cat_1.add(cat_2, fill_value=0).sort_values(ascending=False)
value_counts_sum = value_counts_sum.append(uncategorized_series)
value_counts_sum

Utilities                 48.0
Continuous integration    40.0
Deployment                13.0
Testing                   11.0
Code quality              11.0
Publishing                11.0
Container CI              10.0
Dependency management     10.0
Code review                8.0
Reporting                  6.0
Project management         3.0
Chat                       3.0
Open Source management     2.0
Security                   2.0
Desktop tools              1.0
Support                    1.0
Community                  1.0
Localization               1.0
Uncategorized              1.0
dtype: float64

Replication of Table 1 of *"How Do Software Developers Use GitHub Actions to Automate Their Workflows?"*

In [194]:
df_temp = pd.DataFrame({"# of Actions": value_counts_sum, "%": ((value_counts_sum / distinct_predef_actions_count)*100).round(2)})
df_temp.append(df_temp.sum(), ignore_index=True)

Unnamed: 0,# of Actions,%
0,48.0,46.15
1,40.0,38.46
2,13.0,12.5
3,11.0,10.58
4,11.0,10.58
5,11.0,10.58
6,10.0,9.62
7,10.0,9.62
8,8.0,7.69
9,6.0,5.77


#### 10 most popular actions

In [113]:
actions_df.loc[:, "action_slug_noTag"].value_counts().head(10)

actions/checkout                   526
actions/setup-python               171
actions/cache                      123
actions/upload-artifact            117
actions/setup-java                 100
codecov/codecov-action              27
conda-incubator/setup-miniconda     25
svenstaro/upload-release-action     22
Gottox/irc-message-action           22
jurplel/install-qt-action           21
Name: action_slug_noTag, dtype: int64

#### 10 most popular actions available on the GitHub Marketplace

In [116]:
actions_df.loc[actions_df["available_in_marketplace"], "action_slug_noTag"].value_counts().head(10)

actions/checkout                   526
actions/setup-python               171
actions/cache                      123
actions/upload-artifact            117
actions/setup-java                 100
codecov/codecov-action              27
conda-incubator/setup-miniconda     25
svenstaro/upload-release-action     22
Gottox/irc-message-action           22
jurplel/install-qt-action           21
Name: action_slug_noTag, dtype: int64

## RQ3 - "What are the sets of actions that typically co-occur in workflows"

#### Frequent 2-item sets of Actions

##### With tags

In [119]:
frequent_actions_df.loc[frequent_actions_df["length"] == 2]

Unnamed: 0,support,itemsets,length
7,0.161209,"(actions/checkout@v2, actions/cache@v2)",2
8,0.050378,"(actions/upload-artifact@v2, actions/cache@v2)",2
9,0.118388,"(actions/setup-java@v1, actions/checkout@v2)",2
10,0.050378,"(actions/checkout@v2, actions/setup-python@v1)",2
11,0.2267,"(actions/setup-python@v2, actions/checkout@v2)",2
12,0.123426,"(actions/upload-artifact@v2, actions/checkout@v2)",2
13,0.052897,"(actions/upload-artifact@v2, actions/setup-pyt...",2


##### Without tags

In [121]:
frequent_actions_noTags_df.loc[frequent_actions_noTags_df["length"] == 2]

Unnamed: 0,support,itemsets,length
7,0.191436,"(actions/cache, actions/checkout)",2
8,0.083123,"(actions/cache, actions/setup-java)",2
9,0.065491,"(actions/cache, actions/upload-artifact)",2
10,0.183879,"(actions/setup-java, actions/checkout)",2
11,0.292191,"(actions/setup-python, actions/checkout)",2
12,0.156171,"(actions/checkout, actions/upload-artifact)",2
13,0.062972,"(actions/checkout, codecov/codecov-action)",2
14,0.052897,"(conda-incubator/setup-miniconda, actions/chec...",2
15,0.062972,"(actions/setup-python, actions/upload-artifact)",2


In [132]:
frequent_actions_noTags_df.loc[frequent_actions_noTags_df["length"] == 2].loc[14,'itemsets']

frozenset({'actions/checkout', 'conda-incubator/setup-miniconda'})

#### Frequent 3-item sets of Actions

##### With tags

In [124]:
frequent_actions_df.loc[frequent_actions_df["length"] == 3]

Unnamed: 0,support,itemsets,length
14,0.050378,"(actions/upload-artifact@v2, actions/setup-pyt...",3


Readable version

In [131]:
frequent_actions_df.loc[frequent_actions_df["length"] == 3]['itemsets'].tolist()

[frozenset({'actions/checkout@v2',
            'actions/setup-python@v2',
            'actions/upload-artifact@v2'})]

##### Without tags

In [126]:
frequent_actions_noTags_df.loc[frequent_actions_noTags_df["length"] == 3]

Unnamed: 0,support,itemsets,length
16,0.083123,"(actions/setup-java, actions/cache, actions/ch...",3
17,0.062972,"(actions/cache, actions/checkout, actions/uplo...",3
18,0.062972,"(actions/setup-python, actions/checkout, actio...",3


Readable version

In [129]:
frequent_actions_noTags_df.loc[frequent_actions_noTags_df["length"] == 3]['itemsets'].tolist()

[frozenset({'actions/cache', 'actions/checkout', 'actions/setup-java'}),
 frozenset({'actions/cache', 'actions/checkout', 'actions/upload-artifact'}),
 frozenset({'actions/checkout',
            'actions/setup-python',
            'actions/upload-artifact'})]