# Analysis of OSM installations

In [None]:
import os
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from IPython.display import display, Markdown

In [None]:
# Bigger default canvas for every figure.
# A dpi of 100 gives great resolution and 200 gives optimal resolution, but rendering is much slower.
plt.rcParams.update({
    'figure.figsize': [12, 8],
    'figure.dpi': 80,
})
sns.set(rc={'figure.figsize': (12, 8)})

In [None]:
# Applies the 'fivethirtyeight' matplotlib style sheet to all subsequent plots
plt.style.use('fivethirtyeight')

## 1. Input parameters

In [None]:
# Location of the CSV install log read by `load_install_events_and_operations`
install_events_uri = 'https://osm.etsi.org/stats/install-log.csv'

# Samples with an earlier timestamp are filtered out right after loading
date_first_valid_sample = '2021-11-29'

In [None]:
# Releases for which a funnel chart is drawn (commented-out entries are ignored)
relevant_releases = [
    # '11.0.0rc1',
    'Release ELEVEN',
    'Release ELEVEN-daily',
    # 'release',
    'testing-daily'
]

In [None]:
# Known values of the `event` column, in the order used when the column is
# converted into an ordered categorical
sorted_types_of_install_events = [
    'start',
    'checks',
    'prereq',
    'docker_ce',
    'k8scluster',
    'juju',
    'docker_images',
    'osm_files',
    'deploy_osm',
    'osmclient',
    'healthchecks',
    'final_ops',
    'end'
]

# Rows whose `event` is one of these values are dropped while loading
events_to_discard = ['hola', 'hola2', 'my-event', 'my-second-event', 'test-event']

# Maps old `event` values to their current names; every target is one of the
# entries of `sorted_types_of_install_events`
events_renaming = {
    'add_local_k8scluster': 'final_ops',
    'after_healthcheck': 'healthchecks',
    'checkingroot': 'checks',
    'deploy_osm_pla': 'deploy_osm',
    'deploy_osm_services_k8s': 'deploy_osm',
    'docker_build': 'docker_images',
    'env_files': 'osm_files',
    'init_k8s': 'k8scluster',
    'install_helm': 'k8scluster',
    'install_k8s': 'k8scluster',
    'juju_controller': 'juju',
    'juju_install': 'juju',
    'k8s_metallb': 'k8scluster',
    'k8s_ready': 'k8scluster',
    'k8s_storageclass': 'k8scluster',
    'manifest_files': 'osm_files',
    'noroot': 'checks',
    'osm_unhealthy': 'healthchecks',
    'prereqok': 'prereq',
    'proceed': 'checks'
}

# Known values of the `operation` column (and of the derived `achievement`
# column), in the order used for the ordered categoricals
sorted_types_of_install_operations = [
    'start_ok',
    'release',
    'docker_tag',
    'installation_type',
    'checkingroot_ok',
    'noroot_ok',
    'proceed_ok',
    'prereqok_ok',
    'docker_ce_ok',
    'install_k8s_ok',
    'init_k8s_ok',
    'install_helm_ok',
    'k8s_storageclass_ok',
    'k8s_metallb_ok',
    'k8s_ready_ok',
    'k8scluster_ok',
    'juju_install_ok',
    'juju_controller_ok',
    'juju_ok',
    'docker_images_ok',
    'manifest_files_ok',
    'env_files_ok',
    'deploy_charmed_services_ok',
    'kube_secrets_ok',
    'update_manifest_files_ok',
    'namespace_vol_ok',
    'deploy_osm_pla_ok',
    'deploy_osm_services_k8s_ok',
    'osmclient_ok',
    'osm_unhealthy',
    'after_healthcheck_ok',
    'add_local_k8scluster_ok',
    'end',
    'fatal',
]

# Rows whose `operation` is one of these values are dropped while loading
operations_to_discard = ['start', 'my-op', 'op1', 'op2', 'fatal_my-event', 'test-event', 'fatal_test-event']

# Maps old `operation` values to their current names.
# NOTE(review): the target 'apt_proxy_configured_ok' is not listed in
# `sorted_types_of_install_operations`, so if it occurs in the data it is
# reported as an unknown category and sorted after all known ones - confirm
# whether it should be added to that list (and at which position).
operations_renaming = {
    'apt_proxy_configured': 'apt_proxy_configured_ok',
    'checkingroot': 'checkingroot_ok',
    'noroot': 'noroot_ok',
    'proceed': 'proceed_ok',
    'prereqok': 'prereqok_ok',
    'docker_ce': 'docker_ce_ok',
    'install_k8s': 'install_k8s_ok',
    'init_k8s': 'init_k8s_ok',
    'install_helm': 'install_helm_ok',
    'k8s_storageclass': 'k8s_storageclass_ok',
    'k8s_metallb': 'k8s_metallb_ok',
    'k8scluster': 'k8scluster_ok',
    'juju_controller': 'juju_controller_ok',
    'juju': 'juju_ok',
    'docker_build': 'docker_images_ok',
    'docker_build_ok': 'docker_images_ok',
    'manifest_files': 'manifest_files_ok',
    'env_files': 'env_files_ok',
    'deploy_osm_pla': 'deploy_osm_pla_ok',
    'deploy_osm_services_k8s': 'deploy_osm_services_k8s_ok',
    'osmclient': 'osmclient_ok',
    'after_healthcheck': 'after_healthcheck_ok',
    'add_local_k8scluster': 'add_local_k8scluster_ok'
}

## 2. Load raw installation events

In [None]:
column_names = ['timestamp', 'location', 'queries']

In [None]:
def convert_keyvalues_to_series(df_with_keys):
    """Turn columns of ``key=value`` strings into columns of values named after their key.

    Every column of `df_with_keys` is expected to hold strings of the form
    ``key=value``. The returned frame is a copy in which each cell keeps only
    the value part and each column is renamed after the first non-missing key
    found in it. Columns in which no cell has a value are filled with ``pd.NA``;
    columns without any key keep their original label.

    NOTE(review): assumes a given key always shows up at the same position in
    every row - confirm against the log format.

    Parameters
    ----------
    df_with_keys : pandas.DataFrame
        Frame whose columns all hold ``key=value`` strings (or missing values).

    Returns
    -------
    pandas.DataFrame
        New frame with the same index; the input is not modified.
    """
    df_with_series = df_with_keys.copy()
    new_names = {}

    for col in df_with_keys.columns:
        # n=1 splits only at the first '=', so values that contain '=' themselves
        # stay intact instead of spilling into extra columns
        parts = df_with_keys[col].str.split('=', n=1, expand=True)

        # If there are values, adds them; else, the column should be empty
        df_with_series[col] = parts[1] if parts.shape[1] == 2 else pd.NA

        # New name for the column, from the first key that is actually present
        # (the first row may be missing this field, or there may be no rows at all)
        if parts.shape[1] >= 1:
            keys = parts[0].dropna()
            if not keys.empty:
                new_names[col] = keys.iloc[0]

    return df_with_series.rename(columns=new_names)

In [None]:
def drop_undesired_categories(df, col_name, undesired_categories):
    """Return a copy of `df` without the rows whose `col_name` is in `undesired_categories`.

    Rows with a missing value in `col_name` are kept, and the original index
    labels are preserved.
    """
    is_undesired = df[col_name].isin(undesired_categories)
    return df[~is_undesired].copy()

In [None]:
def fix_categories(input_serie, known_categories):
    """Extend `known_categories` with the values of `input_serie` that are not in it.

    Missing values are never reported as a category: a null entry in the
    category list would make ``CategoricalDtype`` raise.

    Parameters
    ----------
    input_serie : pandas.Series
        Values to be checked against the known categories.
    known_categories : list-like
        Expected categories, in their intended order.

    Returns
    -------
    list
        ``[extended_categories, unknown_categories]``: the known categories
        followed by the unknown ones (in order of first appearance), and the
        unknown ones alone.
    """
    known = list(known_categories)
    unknown = (
        input_serie[~input_serie.isin(known)]
        .dropna()    # NaN/None/pd.NA are "not known" too, but they are not categories
        .unique()
        .tolist()
    )
    return [known + unknown, unknown]

def make_sorted_categorical(sr, known_categories):
    """Cast `sr` to an ordered categorical whose order follows `known_categories`.

    Values of `sr` that are not among `known_categories` are appended after
    the known ones (see `fix_categories`) and printed, so that they can be
    noticed and added to the reference list.
    """
    all_categories, extra_categories = fix_categories(sr, known_categories)
    ordered_dtype = CategoricalDtype(categories=all_categories, ordered=True)

    if extra_categories:
        print(f"Unknown categories: {extra_categories}")

    return sr.astype(ordered_dtype)

In [None]:
def load_install_events_and_operations():
    """Read the install log and return it as a cleaned, typed data frame.

    The log is read from `install_events_uri`. Its `queries` column
    (``&key=value&key=value...``) is expanded into one column per key, and
    the result is cleaned using the module-level discard and renaming tables.
    Only the last sample of each (`installation_id`, `operation`) pair is kept.

    Returns
    -------
    pandas.DataFrame
        `timestamp` is parsed as datetime, `location` is categorical, and
        `event` / `operation` are ordered categoricals.
    """
    # NOTE(review): the first 120 lines are skipped and the next one is consumed
    # as header; presumably those are early/test samples - confirm
    raw = pd.read_csv(install_events_uri, sep=';', header=0, names=column_names, skiprows=120)

    # One column per `key=value` pair of the query string
    query_fields = raw['queries'].str.split('&', expand=True)
    query_fields = query_fields.drop(columns=0)    # 1st column should be empty due to `&`
    query_fields = convert_keyvalues_to_series(query_fields)

    events = raw.drop(columns='queries').join(query_fields)

    # Empty strings should be NA
    events = events.replace("", pd.NA)

    # Removes malformed lines ('installation_id' will be missing, among others)
    events = events.dropna(subset=['installation_id'])

    # Removes lines with undesired 'event' or 'operation' categories
    events = drop_undesired_categories(events, 'event', events_to_discard)
    events = drop_undesired_categories(events, 'operation', operations_to_discard)

    # Replaces values of old 'event' or 'operation' categories by their new names
    events = events.assign(
        event=events['event'].replace(events_renaming),
        operation=events['operation'].replace(operations_renaming),
    )

    # If within the same installation attempt we have a duplicate 'operation', only last sample is kept
    events = events.drop_duplicates(subset=['installation_id', 'operation'], keep='last')

    # Fixes data types
    return events.assign(
        timestamp=pd.to_datetime(events['timestamp']),
        location=events['location'].astype('category'),
        event=make_sorted_categorical(events['event'], sorted_types_of_install_events),
        operation=make_sorted_categorical(events['operation'], sorted_types_of_install_operations),
    )

In [None]:
# Loads the install log and keeps only the samples taken from
# `date_first_valid_sample` onwards
df_install_events_and_operations = (
    load_install_events_and_operations()
    .query("timestamp >= @date_first_valid_sample")
)

In [None]:
# FIXME: For CSV debugging
!wget -N {install_events_uri}

In [None]:
# FIXME: To force removal of unknown categories if needed
# mask = df_install_events.event.isin(sorted_types_of_install_events) & df_install_events.operation.isin(sorted_types_of_install_operations)
# df_install_events = df_install_events.loc[mask]

In [None]:
df_install_events_and_operations.info()

In [None]:
df_install_events_and_operations

## 2. Organization in wide format

In [None]:
def get_achieved_operations(df):
    """Return one row per milestone reached in each installation.

    The `operation` column is replaced by an ordered categorical
    `achievement` column (unused categories removed) and the `value` column
    is dropped. Two kinds of rows are kept:

    * operations logged without a value, or with value ``True``;
    * one extra 'start_ok' row for every row that reports the release during
      the 'start' event, to flag the beginning of the installation.
    """
    # All operations that explicitly show progress in the installation
    progress = (
        df
        .assign(value=df['value'].fillna(True))
        .query("value == True")
    )

    # Extra rows to flag the beginning of the installation
    starts = (
        df
        .query("(event == 'start') & (operation == 'release')")
        .assign(operation='start_ok')
    )

    achieved = pd.concat(
        [
            part.rename(columns={'operation': 'achievement'}).drop(columns='value')
            for part in (progress, starts)
        ]
    ).sort_index()

    ordered = make_sorted_categorical(achieved['achievement'], sorted_types_of_install_operations)
    return achieved.assign(achievement=ordered.cat.remove_unused_categories())

In [None]:
df_achieved_operations = get_achieved_operations(df_install_events_and_operations)

In [None]:
df_achieved_operations

In [None]:
df_achieved_operations.info()

In [None]:
def cast_axis_to_str(df):
    """Convert the column labels of `df` to strings, in place, and return `df`.

    Returning the same frame lets the function be used with ``DataFrame.pipe``.
    """
    labels_as_text = df.columns.astype(str)
    df.columns = labels_as_text
    return df

def remove_column_axis_name(df):
    """Clear the name of the columns axis of `df`, in place, and return `df`.

    Returning the same frame lets the function be used with ``DataFrame.pipe``.
    """
    df.columns.name = None
    return df

def get_info_operations_wide(df):
    """Return one row per installation with its informative operations as columns.

    Only operations that carry an explicit value other than ``True`` are
    kept. Each of them becomes a column holding that value, next to an
    `installation_id` column.
    """
    samples = df.loc[:, ['installation_id', 'operation', 'value']]

    # Operations without a value are plain progress marks, not information
    informative = (
        samples
        .assign(value=samples['value'].fillna(True))
        .query("value != True")
    )

    wide = informative.pivot(index='installation_id', columns='operation', values='value')
    wide = cast_axis_to_str(wide)
    wide = remove_column_axis_name(wide)
    return wide.reset_index()

In [None]:
df_info_operations_wide = get_info_operations_wide(df_install_events_and_operations)

In [None]:
df_info_operations_wide

In [None]:
df_info_operations_wide.info()

In [None]:
# Every achieved operation, enriched with the per-installation information columns
df_installations_wide = pd.merge(
    df_achieved_operations,
    df_info_operations_wide,
    how='left',
    on='installation_id',
)

In [None]:
# Dumps the merged table to a spreadsheet in the working directory, presumably
# for manual inspection ('prueba' is Spanish for 'test')
df_installations_wide.to_excel('prueba.xlsx')

In [None]:
df_installations_wide.info()

## 3. Analysis of installations

In [None]:
df_installations_wide.tail()

In [None]:
# One bar per release; each installation is counted once, regardless of how
# many rows (achievements) it has
sns.countplot(
    data = df_installations_wide.drop_duplicates(subset='installation_id'),
    x = 'release',
)

plt.title('Number of installation attempts')
plt.ylabel(None)

plt.show()

In [None]:
# Number of installations per release, counting each installation once
(
    df_installations_wide
    .drop_duplicates(subset='installation_id')
    .release
    .value_counts()
    # Names the two output columns explicitly instead of renaming the defaults,
    # which differ between pandas < 2.0 ('index' / 'release') and >= 2.0 ('release' / 'count')
    .rename_axis('RELEASE')
    .reset_index(name='INSTALLATIONS')
)

In [None]:
# One panel per release (4 panels per row), counting the rows of each achievement
sns.catplot(
    data = df_installations_wide,
    y = 'achievement',
    col = 'release',
    col_wrap = 4,
    kind = 'count'
)

plt.show()

In [None]:
# Release names reported without a space are replaced by the spelling used in
# `relevant_releases`
name_mappings = {
    'ReleaseELEVEN': 'Release ELEVEN',
    'ReleaseELEVEN-daily': 'Release ELEVEN-daily'
}

In [None]:
def add_pct(df):
    """Return a copy of `df` with a `percentage` column added.

    `percentage` is each row's `count` divided by the largest `count` found
    within the same `release`.
    """
    share_of_peak = (
        df
        .groupby(['release'])['count']
        .transform(lambda counts: counts / counts.max())
    )
    return df.assign(percentage=share_of_peak)

In [None]:
df_installations_wide.tail()

In [None]:
def get_funnels_per_release(df_installations_wide):
    """Count how many installations reached each achievement, per release.

    Release names are normalised with `name_mappings` *before* grouping, so
    that installations reporting different spellings of the same release are
    summed into a single row per (achievement, release). Renaming after the
    grouping would leave duplicated rows for those releases.

    Parameters
    ----------
    df_installations_wide : pandas.DataFrame
        One row per achievement reached, with at least the columns
        `achievement`, `release` and `installation_id`.

    Returns
    -------
    pandas.DataFrame
        Columns `achievement`, `release`, `count` and `percentage` (see `add_pct`).
    """
    return (
        df_installations_wide
        .assign(release = lambda x: x.release.replace(name_mappings))
        # observed=False keeps achievements with a zero count in the funnel; it is
        # stated explicitly because the default of this parameter is changing in pandas
        .groupby(['achievement', 'release'], observed=False)
        .installation_id
        .count()
        .reset_index()
        .rename(columns={'installation_id': 'count'})
        .pipe(add_pct)
    )

In [None]:
df_funnels_per_release = get_funnels_per_release(df_installations_wide)

In [None]:
# df_funnels_per_release

In [None]:
def plot_funnel(df_funnel, x='count', title=None):
    """Show a plotly funnel chart with one stage per achievement.

    The 'osm_unhealthy' achievement is left out of the chart. The figure is
    displayed as a side effect; nothing is returned.

    Parameters
    ----------
    df_funnel : pandas.DataFrame
        Funnel data with an `achievement` column and the column named by `x`.
    x : str
        Column that gives the width of each stage.
    title : str, optional
        Title of the figure.
    """
    stages = df_funnel.query("achievement != 'osm_unhealthy'")
    fig = px.funnel(stages, x=x, y='achievement', title=title)

    layout_options = dict(width=800, height=1200, title_font_size=24)
    fig.update_layout(**layout_options)

    # Each stage is labelled with its value and its share of the first stage
    trace_options = dict(textinfo="value+percent initial", textposition="inside")
    fig.update_traces(**trace_options)

    fig.show()

In [None]:
# One funnel chart per relevant release
funnels_per_release = df_funnels_per_release.groupby('release')

for release in relevant_releases:
    # A release without any logged installation has no group: it is skipped
    # instead of aborting the whole cell with a KeyError
    if release not in funnels_per_release.groups:
        print(f'No installations found for {release}')
        continue

    df_funnel = funnels_per_release.get_group(release)

    # `plot_funnel` shows the figure itself and returns nothing, so there is
    # nothing to hand over to `display`
    plot_funnel(
        df_funnel,
        x = 'count',
        title = f'Funnel of {release} installations'
    )

In [None]:
# Success ratio per release: installations that reached 'end' divided by
# installations that reached 'start_ok'
(
    df_funnels_per_release
    .query("achievement == 'end' or achievement == 'start_ok'")
    .drop(columns='percentage')
    # One row per release, with the 'start_ok' and 'end' counts side by side
    .pivot(
        index = 'release',
        values = 'count',
        columns = 'achievement'
    )
    .pipe(cast_axis_to_str)
    .reset_index()
    .pipe(remove_column_axis_name)
    .rename(columns = {'release': 'RELEASE', 'start_ok': 'TOTAL_INSTALLS', 'end': 'SUCCESSFUL_INSTALLS'})
    .assign(SUCCESS_RATIO = lambda x: x.SUCCESSFUL_INSTALLS / x.TOTAL_INSTALLS)
)