# Data Analysis

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

In [2]:
def mine_frequent_patterns(
    transactions: list[list], support: float
) -> pd.DataFrame:
    te = TransactionEncoder()
    encoding = te.fit(transactions).transform(transactions)
    encoding_df = pd.DataFrame(encoding, columns=te.columns_)
    frequent_itemsets = apriori(encoding_df, min_support=support, use_colnames=True)
    frequent_itemsets["length"] = frequent_itemsets["itemsets"].apply(
        lambda x: len(x)
    )
    return frequent_itemsets

In [88]:
workflows_df = pd.read_pickle("../dumps/workflows_df.pkl")
actions_df = pd.read_pickle("../dumps/actions_df.pkl")
frequent_actions_df = pd.read_pickle("../dumps/frequent_actions_df.pkl")
frequent_actions_noTags_df = pd.read_pickle("../dumps/frequent_actions_noTags_df.pkl")
frequent_docker_commands_subsample_df = pd.read_pickle("../dumps/frequent_docker_commands_subsample_df.pkl")
frequent_cml_commands_subsample_df = pd.read_pickle("../dumps/frequent_cml_commands_subsample_df.pkl")

## Descriptive statistics

First of all, let's ensure that all repositories and valid workflows are being analyzed.

In [4]:
workflows_df["repository"].unique().shape

(29,)

In [5]:
workflows_df["path"] = workflows_df["repository"] + "/" + workflows_df["filename"]
workflows_df["path"].unique().shape

(38,)

#### Number of workflows per repository

In [6]:
workflows_df.groupby("repository").count()["path"].describe()

count    29.000000
mean      1.310345
std       0.806379
min       1.000000
25%       1.000000
50%       1.000000
75%       1.000000
max       4.000000
Name: path, dtype: float64

#### Most common events triggering workflows

In [7]:
transactions = workflows_df['trigger_events'].tolist()
mine_frequent_patterns(transactions, support=0.05)

Unnamed: 0,support,itemsets,length
0,0.078947,(issue_comment),1
1,0.184211,(pull_request),1
2,0.789474,(push),1
3,0.052632,(release),1
4,0.052632,(schedule),1
5,0.131579,"(push, pull_request)",2


#### Workflows with keyword `docker` in name or filename

In [8]:
temp_df = workflows_df.loc[
    workflows_df["filename"].str.contains("docker") | workflows_df["name"].str.contains("docker")
]
print("Size:", temp_df.shape[0])
temp_df

Size: 2


Unnamed: 0,repository,filename,name,trigger_events,n_of_actions,cml_related_actions,docker_related_actions,n_of_run_commands,cml_related_commands,cml_commands,docker_related_commands,docker_commands,path
5,2796gaurav/automate,docker_destroy.yml,Docker Image destroy,[issue_comment],2,False,False,4,False,[],False,[],2796gaurav/automate/docker_destroy.yml
7,2796gaurav/automate,dockerize.yml,Docker Image CI,[issue_comment],3,False,False,5,False,[],True,"[build, push]",2796gaurav/automate/dockerize.yml


#### Actions with the keyword `docker` in the slug

##### Number of distinct actions used in the dataset

Considering tags

In [9]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("docker"), "action_slug"].unique().shape

(0,)

Not considering tags

In [59]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("docker"), "action_slug_noTag"].unique().shape

(0,)

##### Number of workflows using at least one of such actions

In [11]:
actions_df.loc[actions_df.loc[:,"action_slug"].str.contains("docker"), "workflow"].unique().shape

(0,)

#### Run commands containing keyword `docker`

I.e., number of workflows containing at least one of such commands

In [12]:
workflows_df.loc[workflows_df["docker_related_commands"]].shape[0]

1

#### Docker commands

In [13]:
temp_df = workflows_df.loc[workflows_df["docker_commands"].map(len) > 0]
print(temp_df.shape[0])
temp_df

1


Unnamed: 0,repository,filename,name,trigger_events,n_of_actions,cml_related_actions,docker_related_actions,n_of_run_commands,cml_related_commands,cml_commands,docker_related_commands,docker_commands,path
7,2796gaurav/automate,dockerize.yml,Docker Image CI,[issue_comment],3,False,False,5,False,[],True,"[build, push]",2796gaurav/automate/dockerize.yml


## Workflows containing commands with the substring `cml`

In [85]:
workflows_df.loc[workflows_df["cml_related_commands"]].shape[0]

28

### cml commands

In [86]:
temp_df = workflows_df.loc[workflows_df["cml_commands"].map(len) > 0]
print(temp_df.shape[0])
temp_df

27


Unnamed: 0,repository,filename,name,trigger_events,n_of_actions,cml_related_actions,docker_related_actions,n_of_run_commands,cml_related_commands,cml_commands,docker_related_commands,docker_commands,path
2,YannickLecroart/pmr,cml.yml,model-CHD,[push],1,False,False,1,True,"[publish, send-comment]",False,[],YannickLecroart/pmr/cml.yml
3,YannickLecroart/pmr_dvc_demo,cml.yml,model-CHD,[push],1,False,False,1,True,[send-comment],False,[],YannickLecroart/pmr_dvc_demo/cml.yml
4,YannickLecroart/pmr_cml_pipeline,cml.yml,PMR-CML-DVC-Pipeline,[push],1,False,False,1,True,"[publish, send-comment]",False,[],YannickLecroart/pmr_cml_pipeline/cml.yml
6,2796gaurav/automate,cml_report.yml,cml-report,[push],4,True,False,1,True,"[publish, send-comment]",False,[],2796gaurav/automate/cml_report.yml
9,akdsingh/cml,cml.yml,Model Selection,[push],3,True,False,1,True,"[publish, send-comment]",False,[],akdsingh/cml/cml.yml
10,akdsingh/cml_data,cml.yml,Model Selection,[push],3,True,False,1,True,"[publish, send-comment]",False,[],akdsingh/cml_data/cml.yml
11,tue-5ARA0/mlops-demo-live,cml.yml,train-my-model,"[push, pull_request, workflow_dispatch]",4,True,False,1,True,[send-comment],False,[],tue-5ARA0/mlops-demo-live/cml.yml
12,hacheemaster/wine,cml.yml,model-wine-quality,[push],1,False,False,1,True,"[publish, send-comment]",False,[],hacheemaster/wine/cml.yml
13,hurshd0/train_ml_with_github_actions,cml.yml,Titanic Model - train & test,[pull_request],2,False,False,2,True,"[publish, send-comment]",False,[],hurshd0/train_ml_with_github_actions/cml.yml
14,cheesama/morphine,cml.yml,Model Train,[push],1,False,False,1,True,"[tensorboard-dev, send-comment]",False,[],cheesama/morphine/cml.yml


In [94]:
temp_df.loc[:, ["cml_commands"]]

Unnamed: 0,cml_commands
2,"[publish, send-comment]"
3,[send-comment]
4,"[publish, send-comment]"
6,"[publish, send-comment]"
9,"[publish, send-comment]"
10,"[publish, send-comment]"
11,[send-comment]
12,"[publish, send-comment]"
13,"[publish, send-comment]"
14,"[tensorboard-dev, send-comment]"


In [90]:
frequent_cml_commands_subsample_df

Unnamed: 0,support,itemsets,length
0,0.62963,(publish),1
1,1.0,(send-comment),1
2,0.62963,"(publish, send-comment)",2


## "What are the most frequently used Actions?"

#### Total number of actions used in the dataset

Considering slugs

In [14]:
actions_df.action_slug.unique().shape

(15,)

Not considering slugs

In [15]:
actions_df.action_slug_noTag.unique().shape

(14,)

#### Average number of actions per workflow

In [16]:
actions_df.groupby("workflow")['action_slug'].describe()

Unnamed: 0_level_0,count,unique,top,freq
workflow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2796gaurav/automate/cml_report.yml,4,4,actions/checkout@v2,1
2796gaurav/automate/deploy.yml,3,3,machine-learning-apps/actions-chatops@1.41,1
2796gaurav/automate/docker_destroy.yml,2,2,actions/checkout@v2,1
2796gaurav/automate/dockerize.yml,3,3,actions/checkout@v2,1
ArilessTir/MLOPS_wine/cml.yml,1,1,actions/checkout@v2,1
AscendNTNU/perception_testing_21/cml.yml,4,2,actions/checkout@v2,3
AscendNTNU/perception_testing_21/mainci.yml,5,3,actions/checkout@v2,3
CasualModel/CancerCausality/cml.yml,2,2,actions/checkout@v2,1
CasualModel/CancerCausality/tests.yml,2,2,actions/checkout@v2,1
DavidGOrtega/terraform-provider-tpitest/build-cml-ami.yml,1,1,actions/checkout@v2,1


Total number of actions

In [18]:
temp_df = actions_df.groupby("workflow")['action_slug'].describe()
temp_df["count"].astype("int").describe()

count    38.000000
mean      2.552632
std       1.201291
min       1.000000
25%       1.250000
50%       3.000000
75%       3.000000
max       5.000000
Name: count, dtype: float64

Total unique actions

In [19]:
temp_df["unique"].astype("int").describe()

count    38.000000
mean      2.315789
std       0.961566
min       1.000000
25%       1.250000
50%       2.500000
75%       3.000000
max       4.000000
Name: unique, dtype: float64

Total number of actions within workflows with "CML" in the filename

In [28]:
CMLrestricted_actions_df = actions_df.loc[actions_df.workflow.str.contains("cml", case=False)]
CMLrestricted_temp_df = actions_df.groupby("workflow")['action_slug'].describe()
CMLrestricted_temp_df["count"].astype("int").describe()

count    38.000000
mean      2.552632
std       1.201291
min       1.000000
25%       1.250000
50%       3.000000
75%       3.000000
max       5.000000
Name: count, dtype: float64

Number of unique actions within workflows with "CML" in the filename

In [33]:
CMLrestricted_temp_df["unique"].astype("int").describe()

count    38.000000
mean      2.315789
std       0.961566
min       1.000000
25%       1.250000
50%       2.500000
75%       3.000000
max       4.000000
Name: unique, dtype: float64

#### Actions / Run Commands

In [20]:
(workflows_df["n_of_actions"] / (workflows_df["n_of_actions"] + workflows_df["n_of_run_commands"])).describe()

count    38.000000
mean      0.608678
std       0.167876
min       0.333333
25%       0.500000
50%       0.535714
75%       0.750000
max       1.000000
dtype: float64

#### Number of actions available on the GitHub Marketplace
Ratio of actions available in the Marketplace vs custom actions not registered in the marketplace

In [21]:
total_distinct_actions_noSlug = actions_df.loc[
    :, ["action_slug_noTag", "available_in_marketplace"]
].drop_duplicates().shape[0]
total_distinct_actions_noSlug

14

In [22]:
available_in_marketplace = actions_df.loc[
    actions_df["available_in_marketplace"], ["action_slug_noTag", "available_in_marketplace"]
].drop_duplicates().shape[0]
available_in_marketplace

11

In [23]:
available_in_marketplace / total_distinct_actions_noSlug

0.7857142857142857

#### Number of actions by verified creators

In [60]:
by_verified_creator = actions_df.loc[
    actions_df["from_verified_creator"], ["action_slug_noTag", "from_verified_creator"]
].drop_duplicates().shape[0]
by_verified_creator

8

Total distinct actions (no slug) found in workflows with "cml" in the filename

In [61]:
total_distinct_actions_noSlug_CMLworkflows = actions_df.loc[
    actions_df.workflow.str.contains("cml", case=False),
    ["action_slug_noTag", "available_in_marketplace"]
].drop_duplicates().shape[0]
total_distinct_actions_noSlug_CMLworkflows

6

In [62]:
actions_df.loc[
    actions_df.workflow.str.contains("cml", case=False),
    ["action_slug_noTag", "available_in_marketplace"]
].drop_duplicates()

Unnamed: 0,action_slug_noTag,available_in_marketplace
5,actions/checkout,True
8,ros-industrial/industrial_ci,False
15,iterative/setup-cml,False
16,iterative/setup-dvc,False
17,actions/setup-python,True
36,aws-actions/configure-aws-credentials,True


In [63]:
actions_df.loc[
    actions_df.workflow.str.contains("cml", case=False),
    ["action_slug_noTag"]
]["action_slug_noTag"].value_counts()

actions/checkout                         31
iterative/setup-cml                      17
actions/setup-python                     13
iterative/setup-dvc                       2
ros-industrial/industrial_ci              1
aws-actions/configure-aws-credentials     1
Name: action_slug_noTag, dtype: int64

In [64]:
actions_df.loc[
    actions_df.workflow.str.contains("cml", case=False),
    ["action_slug_noTag", "workflow"]
].drop_duplicates()["action_slug_noTag"].value_counts()

actions/checkout                         28
iterative/setup-cml                      17
actions/setup-python                     13
iterative/setup-dvc                       2
ros-industrial/industrial_ci              1
aws-actions/configure-aws-credentials     1
Name: action_slug_noTag, dtype: int64

Total number of workflows with "CML" in their filename

In [65]:
actions_df.loc[
    actions_df.workflow.str.contains("cml", case=False)
]["workflow"].drop_duplicates().shape

(28,)

#### Distribution of action categories

Distinct predefined (i.e., available in marketplace) actions

In [66]:
distinct_predef_actions = actions_df.loc[
    actions_df["available_in_marketplace"], ["action_slug_noTag", "category_1", "category_2"]
].drop_duplicates()
distinct_predef_actions_count = distinct_predef_actions.shape[0]
print("Count:", distinct_predef_actions_count, end="\n\n")
distinct_predef_actions.head()

Count: 11



Unnamed: 0,action_slug_noTag,category_1,category_2
0,actions/checkout,Utilities,
3,actions/cache,Utilities,Dependency management
13,aws-actions/configure-aws-credentials,Continuous integration,Deployment
17,actions/setup-python,Utilities,
20,aws-actions/amazon-ecr-login,Continuous integration,Deployment


Uncategorized

In [67]:
uncategorized = distinct_predef_actions.loc[
    distinct_predef_actions["category_1"].isnull() & distinct_predef_actions["category_2"].isnull()
]
uncategorized

Unnamed: 0,action_slug_noTag,category_1,category_2


In [68]:
uncategorized_series = pd.Series([1], index=["Uncategorized"])
uncategorized_series

Uncategorized    1
dtype: int64

In [69]:
cat_1 = distinct_predef_actions["category_1"].value_counts()
cat_1

Utilities                 5
Continuous integration    3
Chat                      1
Publishing                1
Code quality              1
Name: category_1, dtype: int64

In [70]:
cat_2 = distinct_predef_actions["category_2"].value_counts()
cat_2

Deployment                4
Continuous integration    2
Dependency management     1
Utilities                 1
Name: category_2, dtype: int64

In [71]:
value_counts_sum = cat_1.add(cat_2, fill_value=0).sort_values(ascending=False)
value_counts_sum = value_counts_sum.append(uncategorized_series)
value_counts_sum

  value_counts_sum = value_counts_sum.append(uncategorized_series)


Utilities                 6.0
Continuous integration    5.0
Deployment                4.0
Chat                      1.0
Code quality              1.0
Dependency management     1.0
Publishing                1.0
Uncategorized             1.0
dtype: float64

Replication of Table 1 of *"How Do Software Developers Use GitHub Actions to Automate Their Workflows?"*

In [72]:
df_temp = pd.DataFrame({"# of Actions": value_counts_sum, "%": ((value_counts_sum / distinct_predef_actions_count)*100).round(2)})
df_temp.append(df_temp.sum(), ignore_index=True)

  df_temp.append(df_temp.sum(), ignore_index=True)


Unnamed: 0,# of Actions,%
0,6.0,54.55
1,5.0,45.45
2,4.0,36.36
3,1.0,9.09
4,1.0,9.09
5,1.0,9.09
6,1.0,9.09
7,1.0,9.09
8,20.0,181.81


#### 10 most popular actions

In [73]:
actions_df.loc[:, "action_slug_noTag"].value_counts().head(10)

actions/checkout                         46
iterative/setup-cml                      18
actions/setup-python                     14
aws-actions/configure-aws-credentials     4
actions/setup-go                          4
ros-industrial/industrial_ci              2
iterative/setup-dvc                       2
actions/cache                             1
aws-actions/amazon-ecr-login              1
machine-learning-apps/actions-chatops     1
Name: action_slug_noTag, dtype: int64

#### 10 most popular actions available on the GitHub Marketplace

In [74]:
actions_df.loc[actions_df["available_in_marketplace"], "action_slug_noTag"].value_counts().head(10)

actions/checkout                         46
actions/setup-python                     14
aws-actions/configure-aws-credentials     4
actions/setup-go                          4
actions/cache                             1
aws-actions/amazon-ecr-login              1
machine-learning-apps/actions-chatops     1
pypa/gh-action-pypi-publish               1
goreleaser/goreleaser-action              1
hashicorp/setup-terraform                 1
Name: action_slug_noTag, dtype: int64

## RQ3 - "What are the sets of actions that typically co-occur in workflows"

#### Frequent 2-item sets of Actions

##### With tags

In [75]:
frequent_actions_df.loc[frequent_actions_df["length"] == 2]

Unnamed: 0,support,itemsets,length
7,0.078947,"(actions/checkout@v2, actions/setup-go@v2)",2
8,0.342105,"(actions/checkout@v2, actions/setup-python@v2)",2
9,0.105263,"(actions/checkout@v2, aws-actions/configure-aw...",2
10,0.473684,"(actions/checkout@v2, iterative/setup-cml@v1)",2
11,0.052632,"(actions/checkout@v2, iterative/setup-dvc@v1)",2
12,0.052632,"(actions/checkout@v2, ros-industrial/industria...",2
13,0.315789,"(iterative/setup-cml@v1, actions/setup-python@v2)",2
14,0.052632,"(iterative/setup-dvc@v1, actions/setup-python@v2)",2
15,0.052632,"(iterative/setup-cml@v1, iterative/setup-dvc@v1)",2


##### Without tags

In [83]:
frequent_actions_noTags_df.loc[
    (frequent_actions_noTags_df["length"] >= 2) & (frequent_actions_noTags_df["support"] > 0.07)
].sort_values(by="support", ascending=False)

Unnamed: 0,support,itemsets,length
10,0.473684,"(iterative/setup-cml, actions/checkout)",2
8,0.368421,"(actions/setup-python, actions/checkout)",2
13,0.342105,"(iterative/setup-cml, actions/setup-python)",2
16,0.342105,"(iterative/setup-cml, actions/setup-python, ac...",3
9,0.105263,"(aws-actions/configure-aws-credentials, action...",2
7,0.078947,"(actions/setup-go, actions/checkout)",2


In [84]:
frequent_actions_noTags_df.loc[
    (frequent_actions_noTags_df["length"] >= 2) & (frequent_actions_noTags_df["support"] > 0.07)
].sort_values(by="support", ascending=False)["itemsets"].tolist()

[frozenset({'actions/checkout', 'iterative/setup-cml'}),
 frozenset({'actions/checkout', 'actions/setup-python'}),
 frozenset({'actions/setup-python', 'iterative/setup-cml'}),
 frozenset({'actions/checkout',
            'actions/setup-python',
            'iterative/setup-cml'}),
 frozenset({'actions/checkout', 'aws-actions/configure-aws-credentials'}),
 frozenset({'actions/checkout', 'actions/setup-go'})]

#### Frequent 3-item sets of Actions

##### With tags

In [52]:
frequent_actions_df.loc[frequent_actions_df["length"] == 3]

Unnamed: 0,support,itemsets,length
14,0.050378,"(actions/checkout@v2, actions/setup-python@v2,...",3


Readable version

In [53]:
frequent_actions_df.loc[frequent_actions_df["length"] == 3]['itemsets'].tolist()

[frozenset({'actions/checkout@v2',
            'actions/setup-python@v2',
            'actions/upload-artifact@v2'})]

##### Without tags

In [54]:
frequent_actions_noTags_df.loc[frequent_actions_noTags_df["length"] == 3]

Unnamed: 0,support,itemsets,length
16,0.083123,"(actions/cache, actions/setup-java, actions/ch...",3
17,0.062972,"(actions/cache, actions/upload-artifact, actio...",3
18,0.062972,"(actions/setup-python, actions/upload-artifact...",3


Readable version

In [55]:
frequent_actions_noTags_df.loc[frequent_actions_noTags_df["length"] == 3]['itemsets'].tolist()

[frozenset({'actions/cache', 'actions/checkout', 'actions/setup-java'}),
 frozenset({'actions/cache', 'actions/checkout', 'actions/upload-artifact'}),
 frozenset({'actions/checkout',
            'actions/setup-python',
            'actions/upload-artifact'})]