## Setup

In [None]:
import datetime
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [2]:
config = {
  'toImageButtonOptions': {
    'format': 'png', # one of png, svg, jpeg, webp
    'filename': 'noise_dist',
    'scale': 3
  }
}

def show_figure(fig, x_title, y_title, width:int=500, height:int=300, fontsize:int=15):
    fig.update_layout(
        xaxis_title=x_title,
        yaxis_title=y_title,
        width=width,
        height=height,
        font_family="Rockwell",
        font_size=fontsize,
        font_color="black",
        autosize=False,
        margin=dict(l=1, r=1, b=0, t=1, pad=0),
        template="plotly_white",
    )
    fig.update_xaxes(
        showline=True,
        linewidth=1,
        linecolor="black",
        mirror=True,
        ticks="outside",
        ticklabelmode="period",
        minor=dict(ticks="inside", showgrid=True),
    )
    fig.update_yaxes(
        showline=True,
        showgrid=True,
        linewidth=1,
        linecolor="black",
        mirror=True,
        ticks="outside",
        showticklabels=True,
    )
    fig.show()

In [3]:
intermittent_failures = pd.read_csv("../data/labeled_intermittent_failures.csv")
intermittent_failures["created_at"] = pd.to_datetime(intermittent_failures["created_at"], format="mixed", utc=True)

In [5]:
categories = pd.read_csv("./results/categories.csv")
categories.head(5)

Unnamed: 0,category,group,#,%,Proj.,Machine Cost,Diagnosis Cost,Cost,diagnosis_cost_proportion
0,container_already_exists,container_issues,2,0.04,1,0.9,0.54,1.44,37.5
1,container_not_found,container_issues,31,0.69,7,17.88,19686.51,19704.39,99.91
2,docker_daemon_connection_failure,container_issues,325,7.2,13,21.09,146619.61,146640.7,99.99
3,image_build_permission_denied,container_issues,8,0.18,1,0.75,700.84,701.59,99.89
4,image_build_read_error,container_issues,17,0.38,5,27.67,153653.07,153680.74,99.98


In [6]:
reference_date = datetime.date(2024, 7, 11) # date of collected data

We calculate the recency of each category in number of days

In [7]:
def recency(creation_dates):
    n = 3
    dates = [x.to_pydatetime().date() for x in creation_dates]
    dates.sort()
    last_n_dates = dates[-n:]
    recencies = [(reference_date - d).days for d in last_n_dates]
    return round(np.mean(recencies))


last_occurences = (
    intermittent_failures.groupby("subcategory")
    .agg({"created_at": recency})
    .reset_index()
)
last_occurences.columns = ["category", "recency"]
last_occurences.head()

Unnamed: 0,category,recency
0,api_gateway_deployment_error,29
1,apt_timezone_issue,964
2,broker_connection_failure,464
3,buggy_dependency,364
4,certificate_verification_failure,89


Then we add it to the categories dataset

In [8]:
categories = categories.drop(columns=["recency"], errors="ignore")
categories = categories.set_index("category").join(last_occurences[["category", "recency"]].set_index("category"))
categories = categories.reset_index()
categories.head()

Unnamed: 0,category,group,#,%,Proj.,Machine Cost,Diagnosis Cost,Cost,diagnosis_cost_proportion,recency
0,container_already_exists,container_issues,2,0.04,1,0.9,0.54,1.44,37.5,951
1,container_not_found,container_issues,31,0.69,7,17.88,19686.51,19704.39,99.91,70
2,docker_daemon_connection_failure,container_issues,325,7.2,13,21.09,146619.61,146640.7,99.99,915
3,image_build_permission_denied,container_issues,8,0.18,1,0.75,700.84,701.59,99.89,1415
4,image_build_read_error,container_issues,17,0.38,5,27.67,153653.07,153680.74,99.98,223


In [11]:
rfm = categories[["category", "recency", "#", "Cost"]]
rfm.columns = ["Category", "Recency", "Frequency", "Monetary"]
print(rfm.shape)
rfm

(46, 4)


Unnamed: 0,Category,Recency,Frequency,Monetary
0,container_already_exists,951,2,1.44
1,container_not_found,70,31,19704.39
2,docker_daemon_connection_failure,915,325,146640.7
3,image_build_permission_denied,1415,8,701.59
4,image_build_read_error,223,17,153680.74
5,image_push_write_error,983,1,0.12
6,image_security_scan_failure,182,8,65.41
7,db_table_undefined,304,16,11767.83
8,buggy_dependency,364,13,7781.37
9,dependencies_conflict_error,195,129,47211.21


In [12]:
rfm.to_csv("./results/rfm.csv", index=False)

In [13]:
rfm.describe()

Unnamed: 0,Recency,Frequency,Monetary
count,46.0,46.0,46.0
mean,288.304348,98.065217,69619.33913
std,350.896621,123.700607,101538.48691
min,8.0,1.0,0.12
25%,55.0,16.0,6727.935
50%,181.5,55.5,21706.595
75%,336.25,128.0,101813.4775
max,1415.0,673.0,444689.06


In [14]:
outliers = []

for i in range(100):
    IF = IsolationForest(n_estimators=500, contamination=0.1)
    IF.fit(rfm[["Recency", "Frequency", "Monetary"]])
    IF_anomalies = IF.predict(rfm[["Recency", "Frequency", "Monetary"]])
    outliers.append(set(rfm[IF_anomalies==-1]['Category'].to_list()))

set.union(*outliers)

{'docker_daemon_connection_failure',
 'image_build_permission_denied',
 'job_execution_timeout',
 'misconfigured_env_variable',
 'testing_device_oom_error'}

In [15]:
IF = IsolationForest(n_estimators=500, contamination=0.1)
IF.fit(rfm[["Recency", "Frequency", "Monetary"]])
IF_anomalies = IF.predict(rfm[["Recency", "Frequency", "Monetary"]])
outliers  = rfm[IF_anomalies==-1]
outliers

Unnamed: 0,Category,Recency,Frequency,Monetary
2,docker_daemon_connection_failure,915,325,146640.7
3,image_build_permission_denied,1415,8,701.59
11,misconfigured_env_variable,27,673,444689.06
24,job_execution_timeout,8,306,399576.8
30,testing_device_oom_error,1265,6,6376.79


We remove the outliers from our rfm dataset

In [16]:
rfm = rfm[~rfm['Category'].isin(outliers['Category'])]

In [17]:
rfm['R'] = pd.qcut(rfm['Recency'], q=5, labels=list(range(5, 0, -1)))
rfm['F'] = pd.qcut(rfm['Frequency'], q=5, labels=list(range(1, 6)))
rfm['M'] = pd.qcut(rfm['Monetary'], q=5, labels=list(range(1, 6)))
rfm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rfm['R'] = pd.qcut(rfm['Recency'], q=5, labels=list(range(5, 0, -1)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rfm['F'] = pd.qcut(rfm['Frequency'], q=5, labels=list(range(1, 6)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rfm['M'] = pd.qcut(rfm['Monetary'], q=5, labels=list(range(1, 6)))


Unnamed: 0,Category,Recency,Frequency,Monetary,R,F,M
0,container_already_exists,951,2,1.44,1,1,1
1,container_not_found,70,31,19704.39,4,2,3
4,image_build_read_error,223,17,153680.74,2,2,5
5,image_push_write_error,983,1,0.12,1,1,1
6,image_security_scan_failure,182,8,65.41,3,1,1
7,db_table_undefined,304,16,11767.83,2,2,3
8,buggy_dependency,364,13,7781.37,1,1,2
9,dependencies_conflict_error,195,129,47211.21,3,4,4
10,dependency_installation_failure,54,124,227253.11,4,4,5
12,flaky_test,16,179,52484.3,5,5,4


In [18]:
rfm['M'].value_counts()

M
1    9
2    8
3    8
4    8
5    8
Name: count, dtype: int64

In [19]:
rfm.shape

(41, 7)

In [20]:
rfm["RFM"] = rfm["R"].astype(str) + rfm["F"].astype(str) + rfm["M"].astype(str)
len(rfm["RFM"].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rfm["RFM"] = rfm["R"].astype(str) + rfm["F"].astype(str) + rfm["M"].astype(str)


29

In [21]:
scaler = StandardScaler()
pca = PCA(n_components=2)

cols = ["R", "F", "M"]
y = rfm["Category"]
X = rfm[cols].astype(int)
X_scaled = X.copy(deep=True)

X_pca = pca.fit_transform(X_scaled)
X_pca = pd.DataFrame(X_pca, columns=["Dim 1", "Dim 2"])

In [22]:
X['R'].value_counts()

R
5    9
1    8
4    8
2    8
3    8
Name: count, dtype: int64

# Clustering

We use k=8 and run the model 500 times and select the one with minimal SSE (inertia)

In [25]:
inertias = []
models = []
ks = []

for k in range(8, 9):
    for _ in range(500):
        kmeans = KMeans(n_clusters=k, init="k-means++")
        kmeans.fit(X_scaled)
        models.append(kmeans)
        ks.append(k)
        inertias.append(kmeans.inertia_)

i = np.argmin(inertias)
print(min(inertias))
print(f"k = {ks[i]}")

25.238095238095237
k = 8


In [26]:
i = np.argmin(inertias)
model = models[i]

df_cluster = pd.DataFrame(X).astype(int)
df_cluster['cluster'] = model.labels_
df_cluster['category'] = y
df_cluster['Recency'] = rfm['Recency']
df_cluster['Frequency'] = rfm['Frequency']
df_cluster['Monetary'] = rfm['Monetary']
df_cluster.head()

Unnamed: 0,R,F,M,cluster,category,Recency,Frequency,Monetary
0,1,1,1,0,container_already_exists,951,2,1.44
1,4,2,3,6,container_not_found,70,31,19704.39
4,2,2,5,2,image_build_read_error,223,17,153680.74
5,1,1,1,0,image_push_write_error,983,1,0.12
6,3,1,1,3,image_security_scan_failure,182,8,65.41


In [None]:
df_cluster.to_csv("./results/clusters.csv", index=False)

## Clustering Results Analysis

In [28]:
cluster_stats = df_cluster.drop(columns=["category"]).groupby("cluster").agg(["count", "mean"]).reset_index().round(2)
cluster_stats.columns = ["Cluster", "#", "R", "a", "F", "a", "M", "a", "Recency", "a", "Frequency", "a", "Monetary"]
cluster_stats = cluster_stats[["Cluster", "#", "R", "F", "M", "Recency", "Frequency", "Monetary"]]
cluster_stats

Unnamed: 0,Cluster,#,R,F,M,Recency,Frequency,Monetary
0,0,7,1.0,1.14,1.43,626.86,9.43,3735.86
1,1,6,4.5,3.83,3.17,55.83,88.17,29340.72
2,2,4,2.0,2.0,4.75,229.75,26.75,117990.57
3,3,4,3.75,1.5,1.0,120.5,16.0,743.18
4,4,5,2.8,4.8,4.2,162.2,204.4,107742.09
5,5,6,4.83,4.67,4.67,35.83,164.83,150700.09
6,6,5,3.6,2.6,2.4,119.4,47.2,10969.67
7,7,4,1.75,2.75,2.0,471.25,45.0,7403.75


In [29]:
r_mean = cluster_stats["R"].mean()
f_mean = cluster_stats["F"].mean()
m_mean = cluster_stats["M"].mean()

def rfm_pattern(row):
    pattern = ""
    pattern += "R+" if row["R"] > r_mean else "R-"
    pattern += "F+" if row["F"] > f_mean else "F-"
    pattern += "M+" if row["M"] > m_mean else "M-"
    return pattern

cluster_stats["Pattern"] = cluster_stats.apply(rfm_pattern, axis=1)
cluster_stats

Unnamed: 0,Cluster,#,R,F,M,Recency,Frequency,Monetary,Pattern
0,0,7,1.0,1.14,1.43,626.86,9.43,3735.86,R-F-M-
1,1,6,4.5,3.83,3.17,55.83,88.17,29340.72,R+F+M+
2,2,4,2.0,2.0,4.75,229.75,26.75,117990.57,R-F-M+
3,3,4,3.75,1.5,1.0,120.5,16.0,743.18,R+F-M-
4,4,5,2.8,4.8,4.2,162.2,204.4,107742.09,R-F+M+
5,5,6,4.83,4.67,4.67,35.83,164.83,150700.09,R+F+M+
6,6,5,3.6,2.6,2.4,119.4,47.2,10969.67,R+F-M-
7,7,4,1.75,2.75,2.0,471.25,45.0,7403.75,R-F-M-


In [31]:
df_cluster[df_cluster['cluster'] == 4]

Unnamed: 0,R,F,M,cluster,category,Recency,Frequency,Monetary
9,3,4,4,4,dependencies_conflict_error,195,129,47211.21
16,2,5,4,4,runner_pod_failure,269,154,93746.23
32,3,5,5,4,connection_closed_reset_broken,113,204,239901.85
40,3,5,4,4,image_not_found,106,221,81621.73
44,3,5,4,4,container_platform_auth_failure,128,314,76229.43


In [34]:
X_pca.columns

Index(['Dim 1', 'Dim 2'], dtype='object')

In [35]:
X_pca["cluster"] = [str(l) for l in model.labels_]
px.scatter(X_pca, x="Dim 1", y="Dim 2", color="cluster")

In [36]:
df_cluster[df_cluster['cluster'] == 0][['R', 'F', 'M']].std()

R    0.000000
F    0.377964
M    0.534522
dtype: float64

In [37]:
std = pd.DataFrame()
std['cluster'] = range(8)
std['std'] = [df_cluster[df_cluster['cluster'] == cluster][['R', 'F', 'M']].std().mean() for cluster in std['cluster']]


fig = px.line(std, x='cluster', y='std', color_discrete_sequence=px.colors.qualitative.G10, markers=True)
fig.update_layout(
    xaxis_title="Clusters",
    yaxis_title="Standard Deviation",
    width=350,
    height=230,
    font_family="Rockwell",
    font_size=14,
    font_color="black",
    autosize=False,
    margin=dict(l=1, r=1, b=0, t=10, pad=0),
    template="plotly_white",
)
fig.update_xaxes(
    showline=True,
    linewidth=1,
    linecolor="black",
    mirror=True,
    ticks="outside",
    tickmode = "array",
    tickvals = list(range(8)),
    ticktext = [f"C{i+1}" for i in list(range(8))],
)
fig.update_yaxes(
    showline=True,
    showgrid=True,
    linewidth=1,
    linecolor="black",
    mirror=True,
    ticks="outside",
    showticklabels=True,
    range=(0,1),
)
fig.show(config=config)

In [39]:
models[i]

In [40]:
import pickle

with open("./results/clustering_model.pickle", "wb") as f:
    pickle.dump(models[i], f)

In [41]:
with open('./results/clustering_model.pickle', 'rb') as f:
    km = pickle.load(f)
km.cluster_centers_

array([[1.        , 1.14285714, 1.42857143],
       [4.5       , 3.83333333, 3.16666667],
       [2.        , 2.        , 4.75      ],
       [3.75      , 1.5       , 1.        ],
       [2.8       , 4.8       , 4.2       ],
       [4.83333333, 4.66666667, 4.66666667],
       [3.6       , 2.6       , 2.4       ],
       [1.75      , 2.75      , 2.        ]])