In [7]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# Load the DataFrames that were pickled under ../dumps.
# NOTE: unpickling can execute arbitrary code, so only load dumps from a trusted source.
(
    workflows_df,
    actions_df,
    frequent_actions_df,
    frequent_actions_noTags_df,
    frequent_docker_commands_subsample_df,
    frequent_cml_commands_subsample_df,
) = (
    pd.read_pickle(f"../dumps/{dump_name}.pkl")
    for dump_name in (
        "workflows_df",
        "actions_df",
        "frequent_actions_df",
        "frequent_actions_noTags_df",
        "frequent_docker_commands_subsample_df",
        "frequent_cml_commands_subsample_df",
    )
)

In [9]:
def mine_frequent_patterns(
    transactions: list[list], support: float
) -> pd.DataFrame:
    """Mine frequent itemsets from a list of transactions with Apriori.

    Args:
        transactions: One list of items per transaction.
        support: Minimum support threshold, forwarded to ``apriori`` as
            ``min_support``.

    Returns:
        The DataFrame produced by ``apriori`` (itemsets labelled with the
        item names, since ``use_colnames=True``) plus a ``length`` column
        holding the number of items in each itemset.
    """
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    # Encode the transactions into a table whose columns are the fitted item names.
    one_hot_df = pd.DataFrame(
        encoder.transform(transactions), columns=encoder.columns_
    )
    itemsets_df = apriori(one_hot_df, min_support=support, use_colnames=True)
    itemsets_df["length"] = itemsets_df["itemsets"].apply(len)
    return itemsets_df

## RQ 1 – How common is workflow automation in ML-enabled systems hosted on GitHub?

Number of workflows per repository

In [6]:
# Distribution of the number of workflow files per repository.
# Count the `filename` column rather than `path`: `path` is only derived in a
# later cell, so counting it here fails on a fresh top-to-bottom run.
workflows_df.groupby("repository")["filename"].count().describe()

count    29.000000
mean      1.310345
std       0.806379
min       1.000000
25%       1.000000
50%       1.000000
75%       1.000000
max       4.000000
Name: path, dtype: float64

In [5]:
# Build an identifier for each workflow file: "<repository>/<filename>".
workflows_df["path"] = workflows_df["repository"] + "/" + workflows_df["filename"]
# Shape of the array of distinct paths, i.e. the number of distinct workflow files.
workflows_df["path"].unique().shape

(38,)

## RQ 2 – What type of events are used to trigger MLOps workflows?

Non è scontato che i workflow vengano attivati da un push, perché un workflow dovrebbe re-innescare l'intero processo di addestramento del modello.
Controllare i branch.
I due casi Educational: uno solo sul branch main, il blog sempre (attenzione se la pipeline di model building è time-consuming).

Sembra che la PR non sia usata come trigger per re-innescare il retraining del modello, il che è strano perché CML ha comandi specifici.

In [12]:
# Each workflow's list of trigger events is one transaction for the miner.
transactions = workflows_df["trigger_events"].to_list()
# Keep only event sets whose support is at least 0.05.
patterns_df = mine_frequent_patterns(transactions=transactions, support=0.05)
# Show the most frequent trigger combinations first.
patterns_df.sort_values("support", ascending=False)

Unnamed: 0,support,itemsets,length
2,0.789474,(push),1
1,0.184211,(pull_request),1
5,0.131579,"(push, pull_request)",2
0,0.078947,(issue_comment),1
3,0.052632,(release),1
4,0.052632,(schedule),1


In [21]:
# Preview the first rows only instead of rendering the whole frame.
workflows_df.head(10)

Unnamed: 0,repository,filename,name,trigger_events,n_of_actions,cml_related_actions,docker_related_actions,n_of_run_commands,cml_related_commands,cml_commands,docker_related_commands,docker_commands
0,AscendNTNU/perception_testing_21,mainci.yml,PerceptionCI,"[push, pull_request]",5,False,False,0,False,[],False,[]
1,AscendNTNU/perception_testing_21,cml.yml,cml,"[push, pull_request]",4,False,False,0,False,[],False,[]
2,YannickLecroart/pmr,cml.yml,model-CHD,[push],1,False,False,1,True,"[publish, send-comment]",False,[]
3,YannickLecroart/pmr_dvc_demo,cml.yml,model-CHD,[push],1,False,False,1,True,[send-comment],False,[]
4,YannickLecroart/pmr_cml_pipeline,cml.yml,PMR-CML-DVC-Pipeline,[push],1,False,False,1,True,"[publish, send-comment]",False,[]
5,2796gaurav/automate,docker_destroy.yml,Docker Image destroy,[issue_comment],2,False,False,4,False,[],False,[]
6,2796gaurav/automate,cml_report.yml,cml-report,[push],4,True,False,1,True,"[publish, send-comment]",False,[]
7,2796gaurav/automate,dockerize.yml,Docker Image CI,[issue_comment],3,False,False,5,False,[],True,"[build, push]"
8,2796gaurav/automate,deploy.yml,deploy to aws,[issue_comment],3,False,False,3,False,[],False,[]
9,akdsingh/cml,cml.yml,Model Selection,[push],3,True,False,1,True,"[publish, send-comment]",False,[]


In [22]:
# Workflows flagged by `cml_related_actions` or `cml_related_commands`.
uses_cml = workflows_df["cml_related_actions"] | workflows_df["cml_related_commands"]
# `trigger_events` holds Python lists, which are unhashable; convert each one to
# a tuple so the distinct event combinations can be counted reliably.
workflows_df.loc[uses_cml, "trigger_events"].map(tuple).value_counts()

[push]                                     24
[pull_request]                              2
[push, pull_request, workflow_dispatch]     1
[push, pull_request]                        1
Name: trigger_events, dtype: int64

## RQ 3 – What are the most frequently executed tasks?

(Are there any sets of tasks that typically co-occur in workflows?)


In [23]:
# Export the frequent-action itemsets to an Excel file in the dumps folder.
frequent_actions_noTags_df.to_excel("../dumps/frequent_actions_noTags_df.xlsx")
# Display the same table inline.
frequent_actions_noTags_df

Unnamed: 0,support,itemsets,length
0,1.0,(actions/checkout),1
1,0.078947,(actions/setup-go),1
2,0.368421,(actions/setup-python),1
3,0.105263,(aws-actions/configure-aws-credentials),1
4,0.473684,(iterative/setup-cml),1
5,0.052632,(iterative/setup-dvc),1
6,0.052632,(ros-industrial/industrial_ci),1
7,0.078947,"(actions/setup-go, actions/checkout)",2
8,0.368421,"(actions/checkout, actions/setup-python)",2
9,0.105263,"(aws-actions/configure-aws-credentials, action...",2


Gli unici comandi sono di aggiornamento delle PR/commit, per aggiungere testo o immagini.
Si nota dai comandi di CML che non esiste un comando nativo per il deploy di modelli o per l'archiviazione in un registry esterno, quale MLflow.
Per il secondo punto, non abbiamo trovato negli script riferimenti ad altre tecnologie esistenti.
Per il primo punto si usa Docker?


In [24]:
# Display the CML command itemsets loaded from
# ../dumps/frequent_cml_commands_subsample_df.pkl.
frequent_cml_commands_subsample_df

Unnamed: 0,support,itemsets,length
0,0.62963,(publish),1
1,1.0,(send-comment),1
2,0.62963,"(send-comment, publish)",2


Quanti sono i wf contenenti almeno un comando CML? 28

In [30]:
# Number of workflows whose `cml_related_commands` flag is True.
len(workflows_df.loc[workflows_df["cml_related_commands"]])

28

Quanti wf contengono almeno un comando docker? 1

In [29]:
# Number of workflows whose `docker_related_commands` flag is True.
len(workflows_df.loc[workflows_df["docker_related_commands"]])


1

In [28]:
# Display the object loaded from ../dumps/frequent_docker_commands_subsample_df.pkl.
# NOTE(review): the saved output of this cell shows workflow-level columns
# (repository, filename, ...) rather than a support/itemsets table like the
# other frequent_* frames -- confirm the pickle holds the intended data.
frequent_docker_commands_subsample_df

Unnamed: 0,repository,filename,name,trigger_events,n_of_actions,cml_related_actions,docker_related_actions,n_of_run_commands,cml_related_commands,cml_commands,docker_related_commands,docker_commands
0,AscendNTNU/perception_testing_21,mainci.yml,PerceptionCI,"[push, pull_request]",5,False,False,0,False,[],False,[]
1,AscendNTNU/perception_testing_21,cml.yml,cml,"[push, pull_request]",4,False,False,0,False,[],False,[]
2,YannickLecroart/pmr,cml.yml,model-CHD,[push],1,False,False,1,True,"[publish, send-comment]",False,[]
3,YannickLecroart/pmr_dvc_demo,cml.yml,model-CHD,[push],1,False,False,1,True,[send-comment],False,[]
4,YannickLecroart/pmr_cml_pipeline,cml.yml,PMR-CML-DVC-Pipeline,[push],1,False,False,1,True,"[publish, send-comment]",False,[]
5,2796gaurav/automate,docker_destroy.yml,Docker Image destroy,[issue_comment],2,False,False,4,False,[],False,[]
6,2796gaurav/automate,cml_report.yml,cml-report,[push],4,True,False,1,True,"[publish, send-comment]",False,[]
7,2796gaurav/automate,dockerize.yml,Docker Image CI,[issue_comment],3,False,False,5,False,[],True,"[build, push]"
8,2796gaurav/automate,deploy.yml,deploy to aws,[issue_comment],3,False,False,3,False,[],False,[]
9,akdsingh/cml,cml.yml,Model Selection,[push],3,True,False,1,True,"[publish, send-comment]",False,[]
