# Analysis of OSM events in Bugzilla

In [None]:
import os
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import display, Markdown

In [None]:
# Sets a bigger default size for figures
# (a dpi of 100 gives great resolution and 200 gives optimal resolution, but much slower)
plt.rcParams.update({
    'figure.figsize': [12, 8],
    'figure.dpi': 80,
})
sns.set(rc={'figure.figsize': (12, 8)})

In [None]:
# Applies the 'fivethirtyeight' matplotlib style sheet to the figures that follow
plt.style.use('fivethirtyeight')

## 0. Input parameters

In [None]:
# Folders used to read input files and to write the generated outputs
inputs_folder = 'inputs'
outputs_folder = 'outputs'

# Source of the Bugzilla events
# (swap with the commented line to read a local copy from the inputs folder instead)
#bugzilla_csv = os.path.join(inputs_folder, 'bugs.csv')
bugzilla_csv = 'https://osm.etsi.org/stats/bugs.csv'

# Modules shown in the "Selected Modules" charts
most_relevant_modules = [
    'IM-NBI',
    'NBI',
    'RO',
    'DEVOPS',
    'Robot-tests',
    'Descriptor-packages',
    'N2VC',
    'OSMClient',
    'LCM',
    'common',
    'MON',
    'POL',
    'PLA',
    'NG-UI',
    'Any',
    'Unknown',
    'Other',
    'Documentation / Wiki',
]

# If it does not exist, it will be ignored
former_mdl_assessments_file = 'former_mdl_assessments.xlsx'

In [None]:
# Events with a timestamp before this date are classified as 'OLD'
date_for_bug_deprecation = '2020-07-01'

# Number of days before today from which events are classified as 'RECENT'.
# Comment out this assignment to set the start of the recent interval manually instead.
days_4_recent_bugs = 21

In [None]:
# Uncomment to set the interval of recent bugs manually
#
# date_for_very_recent_bug = '2021-04-15'

In [None]:
# Timestamp of the report; `today` is the same instant formatted as 'YYYY-MM-DD'
today_as_datetime = pd.to_datetime("today")
today = today_as_datetime.strftime('%Y-%m-%d')

display(Markdown(f'**Date and time of the report:** {today_as_datetime}'))

In [None]:
last_date = today

# When `days_4_recent_bugs` is defined, derives the start of the "recent" interval from it
# (as a 'YYYY-MM-DD' string). Otherwise `date_for_very_recent_bug` is left untouched,
# so it must have been set manually beforehand.
if 'days_4_recent_bugs' in locals():
    date_for_very_recent_bug = today_as_datetime - dt.timedelta(days=days_4_recent_bugs)
    date_for_very_recent_bug = date_for_very_recent_bug.strftime('%Y-%m-%d')

In [None]:
# Shows the analysed period and the date from which bugs are considered recent
display(
    Markdown(
        f'''**Analysed period:** {date_for_bug_deprecation} to {today}.

Bugs considered **_recent_**: Since {date_for_very_recent_bug}.'''
    )
)

## 1. Imports and cleans raw source data with all Bugzilla events

In [None]:
# Reference instant used to cap the 'MONTH' column so it never lies in the future
now = pd.to_datetime("now")
# Column names assigned to the CSV (it is read without a header row)
initial_header_list = ["BUG_ID", "BUG_DESCRIPTION", "RELEASE", "MODULE", "OPERATION", "VALUE", "TIMESTAMP"]
# Column order used after loading ('ROW_NUMBER' holds the original row position)
new_column_order = ['TIMESTAMP', 'BUG_ID', 'OPERATION', 'VALUE', 'RELEASE', 'MODULE', 'BUG_DESCRIPTION', 'ROW_NUMBER']

def classify_by_age(df):
    '''Returns a copy of `df` with an ordered categorical column `AGE`.

    Each row is labelled from its `TIMESTAMP`:

    - 'OLD': before `date_for_bug_deprecation`.
    - 'RECENT': after `date_for_very_recent_bug` (takes precedence over 'OLD').
    - 'CURRENT': everything else.
    '''
    ordered_ages = ['OLD', 'CURRENT', 'RECENT']

    is_old = df['TIMESTAMP'] < date_for_bug_deprecation
    is_recent = df['TIMESTAMP'] > date_for_very_recent_bug

    labelled = df.copy()
    labelled["AGE"] = "CURRENT"  # By default, they should be relevant
    labelled.loc[is_old, "AGE"] = "OLD"
    labelled.loc[is_recent, "AGE"] = "RECENT"  # Applied last, so it wins over 'OLD'

    labelled["AGE"] = pd.Categorical(labelled.AGE, categories=ordered_ages, ordered=True)
    return labelled

def load_bug_full():
    '''Loads the raw Bugzilla events from `bugzilla_csv` and prepares them for analysis.

    The CSV is read without header, using `initial_header_list` as column names.
    The result is sorted by `TIMESTAMP` and `BUG_ID`, keeps the original row position
    in `ROW_NUMBER`, and gains two derived columns:

    - `MONTH`: last second of the month of the event, capped to `now`.
    - `AGE`: 'OLD', 'CURRENT' or 'RECENT' (see `classify_by_age`).
    '''

    return (
        pd.read_csv(bugzilla_csv, encoding='utf-8', header=None, names=initial_header_list)
        .astype(
            {
                # The unit must be explicit: pandas 2.x rejects the unit-less 'datetime64'
                'TIMESTAMP': 'datetime64[ns]',
                'OPERATION': 'category',
                'RELEASE': 'category',
                'MODULE': 'category'
            }
        )
        .sort_values(by=['TIMESTAMP', 'BUG_ID'])

        # Saves the original index as 'ROW_NUMBER' (useful for tracing back to initial data)
        .reset_index()
        .rename(columns={'index': 'ROW_NUMBER'})

        # Reorders the columns to ease inspection
        .reindex(columns=new_column_order)

        # Adds column that allows to aggregate per months
        # - Gets the last day (and second) of the month, so it is assigned as representative of the month
        .assign(MONTH = lambda x: pd.to_datetime(x.TIMESTAMP.dt.to_period('M').astype(str)) + pd.tseries.offsets.MonthEnd(1) + dt.timedelta(hours=23, minutes=59, seconds=59) )

        # Prevents dates in the future in "MONTH"
        .assign(MONTH = lambda x: (x.MONTH.where(x.MONTH < now, now)))

        # Classifies events by age: 'OLD', 'CURRENT', or 'RECENT'
        .pipe(classify_by_age)
    )

In [None]:
# Imports and cleans raw source data with all the events
df_bug_full = load_bug_full()

In [None]:
# Signals that the raw data has been loaded
print('DONE')

## 2. Current state of open bugs

In [None]:
############### Analysis of the lifecycle of a bug ###############

In [None]:
#--- Changes of status per bug -----------------------------------

In [None]:
# Utility function to create a sorted category for bug states
def bug_states_as_category(df):
    '''Returns a copy of `df` whose `VALUE` column is an ordered categorical of bug states.

    Values that are not one of the known states become missing.
    '''
    # From just opened to fully closed
    lifecycle = (
        'OPEN-UNCONFIRMED',
        'UNCONFIRMED',
        'CONFIRMED',
        'IN_PROGRESS',
        'RESOLVED',
        'VERIFIED',
    )

    return df.assign(
        VALUE=pd.Categorical(df.VALUE, categories=list(lifecycle), ordered=True)
    )

# Extracts the events where bugs change of state and concatenates them with the bug openings
def get_status_changes_by_bug(df):
    '''Builds the table of state changes per bug, including a synthetic opening event.

    The opening of each bug is derived from its first 'comment' event: it is relabelled
    as a 'bug_status' event with value 'OPEN-UNCONFIRMED', and the original value of
    the comment is kept in the new column `ISSUER`.

    The result is sorted by `TIMESTAMP` and `BUG_ID`, has `VALUE` as an ordered
    categorical, and a boolean `SOLVED` (state is 'RESOLVED' or 'VERIFIED').
    '''

    first_comment_per_bug = (
        df
        .query("OPERATION=='comment'")
        .groupby(['BUG_ID']).first()
        .reset_index()
    )

    df_bug_openings = (
        first_comment_per_bug
        .assign(
            OPERATION='bug_status',
            ISSUER=lambda x: x.VALUE,   # Evaluated before 'VALUE' is overwritten below
            VALUE='OPEN-UNCONFIRMED',
        )
        .sort_values(by=['TIMESTAMP', 'BUG_ID'])
        .reset_index(drop=True)
    )

    explicit_status_changes = df.query("OPERATION=='bug_status'")
    all_changes = pd.concat([explicit_status_changes, df_bug_openings]).sort_index()

    # Makes 'VALUE' categorical and sorts its categories properly
    all_changes = bug_states_as_category(all_changes)

    all_changes = (
        all_changes
        .sort_values(by=['TIMESTAMP', 'BUG_ID'])
        .reset_index(drop=True)
    )

    # Boolean to know if the bug is solved or remains open at that time
    return all_changes.assign(SOLVED=lambda x: x.VALUE.isin(["RESOLVED", "VERIFIED"]))

In [None]:
# Obtains all the changes of state per bug
df_status_changes_by_bug = get_status_changes_by_bug(df_bug_full)

In [None]:
# Uncomment to interpret results per bug
# df_status_changes_by_bug.query("BUG_ID==1647")

In [None]:
#--- Latest status of each bug -----------------------------------

In [None]:
def remove_index_name(df):
    '''Sets the name of the index of `df` to an empty string and returns `df`.

    The change is made in place: the dataframe passed in is the one modified and returned.
    '''
    df.index.name = ''
    return df

def get_current_bug_state(df):
    '''Returns one row per bug, with the last value of each column for that bug.

    Rows are indexed by the value that the original index of `df` had for the
    last row of the bug, and the index is left with an empty name.
    '''
    with_original_index = df.reset_index()
    last_per_bug = with_original_index.groupby('BUG_ID').last().reset_index()
    return remove_index_name(last_per_bug.set_index('index'))

In [None]:
# Latest known status of each bug
df_current_bug_state = get_current_bug_state(df_status_changes_by_bug)

In [None]:
############### Summary table per bug ############################

#--- Extracts data from the list of events to build the main series of the summary ---

In [None]:
# Auxiliary functions to easily find the first and last occurrences of a certain type of event per bug, or to compute basic counts:

In [None]:
# Gets all basic bug details + bug creation details
def get_basic_bug_details(df):
    '''Returns, per bug, the description, issuer and details of its first event.

    The result is indexed by `BUG_ID` with columns `BUG_DESCRIPTION`, `ISSUER`,
    `CREATION_TIME`, `CREATION_AGE` and `MONTH`.
    '''
    kept_columns = ['BUG_DESCRIPTION', 'ISSUER', 'TIMESTAMP', 'AGE', 'MONTH']
    creation_names = {
        'TIMESTAMP': 'CREATION_TIME',
        'AGE': 'CREATION_AGE',
    }

    first_per_bug = df.groupby('BUG_ID').first()
    first_per_bug = first_per_bug.drop(columns=['OPERATION', 'VALUE', 'ROW_NUMBER'])

    return first_per_bug.reindex(columns=kept_columns).rename(columns=creation_names)

In [None]:
# Gets the last known state of the bug
def get_last_known_state(df):
    '''Returns, per bug, the state of its last event and when that event happened.

    The result is indexed by `BUG_ID` with columns `STATE`, `STATE_UPDATE_TIME`,
    `STATE_UPDATE_AGE` and `STATE_UPDATE_MONTH`.
    '''
    state_names = {
        'TIMESTAMP': 'STATE_UPDATE_TIME',
        'VALUE': 'STATE',
        'AGE': 'STATE_UPDATE_AGE',
        'MONTH': 'STATE_UPDATE_MONTH',
    }
    output_columns = ['STATE', 'STATE_UPDATE_TIME', 'STATE_UPDATE_AGE', 'STATE_UPDATE_MONTH']

    last_per_bug = df.groupby('BUG_ID').last()
    last_per_bug = last_per_bug.drop(columns=['OPERATION', 'BUG_DESCRIPTION', 'ROW_NUMBER', 'ISSUER'])

    return last_per_bug.rename(columns=state_names).reindex(columns=output_columns)

In [None]:
# Gets the total number of changes of state
def get_number_changes_of_state(df):
    '''Counts the 'bug_status' events of each bug.

    Returns a dataframe indexed by `BUG_ID` with a single column `STATE_CHANGES`.
    Bugs without any 'bug_status' event are absent from the result.
    '''
    status_events = df.query("OPERATION=='bug_status'").reset_index()
    counts = status_events.groupby('BUG_ID')['index'].count()
    return counts.to_frame('STATE_CHANGES')

In [None]:
# Gets how many times it has changed to `RESOLVED` (useful to detect ineffective resolutions)
def get_number_changes_to_resolved(df):
    '''Counts, per bug, the 'bug_status' events whose value is 'RESOLVED'.

    Returns a dataframe indexed by `BUG_ID` with a single integer column
    `CHANGES_TO_RESOLVED`. Bugs that were never resolved are absent from the result.
    '''
    resolutions = df.query("(OPERATION=='bug_status') & (VALUE=='RESOLVED')").reset_index()
    counts = resolutions.groupby('BUG_ID')['index'].count().to_frame('CHANGES_TO_RESOLVED')
    return counts.fillna(0).astype({'CHANGES_TO_RESOLVED': 'int'})

In [None]:
# Gets how many times the Release assignment has changed (useful to detect situations that persist or revive across releases)
def get_number_release_assignments(df):
    '''Counts the 'version' events of each bug.

    Returns a dataframe indexed by `BUG_ID` with a single integer column
    `RELEASE_CHANGES`. Bugs without any 'version' event are absent from the result.
    '''
    version_events = df.query("OPERATION=='version'").reset_index()
    counts = version_events.groupby('BUG_ID')['index'].count().to_frame('RELEASE_CHANGES')
    return counts.fillna(0).astype({'RELEASE_CHANGES': 'int'})

In [None]:
# Gets how many times the MDG assignment has changed (useful to detect "hot potato" situations)
def get_number_module_assignments(df):
    '''Counts the 'component' events of each bug.

    Returns a dataframe indexed by `BUG_ID` with a single integer column
    `MODULE_CHANGES`. Bugs without any 'component' event are absent from the result.
    '''
    component_events = df.query("OPERATION=='component'").reset_index()
    counts = component_events.groupby('BUG_ID')['index'].count().to_frame('MODULE_CHANGES')
    return counts.fillna(0).astype({'MODULE_CHANGES': 'int'})

In [None]:
# Gets last assigned owner
def get_last_owner(df):
    '''Returns, per bug, the value and time details of its last 'assigned_to' event.

    The result is indexed by `BUG_ID`; the value of the event is exposed as `OWNER`
    and its timestamp, age and month as `LAST_ASSIGNMENT_TIME`, `LAST_ASSIGNMENT_AGE`
    and `LAST_ASSIGNMENT_MONTH`. Bugs never assigned are absent from the result.
    '''
    unused_columns = ['OPERATION', 'BUG_DESCRIPTION', 'ROW_NUMBER', 'RELEASE', 'MODULE']
    owner_names = {
        'VALUE': 'OWNER',
        'TIMESTAMP': 'LAST_ASSIGNMENT_TIME',
        'AGE': 'LAST_ASSIGNMENT_AGE',
        'MONTH': 'LAST_ASSIGNMENT_MONTH',
    }

    last_assignment = df.query("OPERATION=='assigned_to'").groupby('BUG_ID').last()
    return last_assignment.drop(columns=unused_columns).rename(columns=owner_names)

In [None]:
# Gets how many times the owner assignment has changed (useful to detect "hot potato" situations)
def get_number_owner_assignments(df):
    '''Counts the 'assigned_to' events of each bug.

    Returns a dataframe indexed by `BUG_ID` with a single integer column
    `OWNER_CHANGES`. Bugs without any 'assigned_to' event are absent from the result.
    '''
    assignment_events = df.query("OPERATION=='assigned_to'").reset_index()
    counts = assignment_events.groupby('BUG_ID')['index'].count().to_frame('OWNER_CHANGES')
    return counts.fillna(0).astype({'OWNER_CHANGES': 'int'})

In [None]:
# Gets last event that has happened to the bug so far
def get_last_event(df):
    '''Returns, per bug, the operation of its last event and when it happened.

    The result is indexed by `BUG_ID` with columns `LAST_EVENT`, `LAST_EVENT_TIME`,
    `LAST_EVENT_AGE` and `LAST_EVENT_MONTH`.
    '''
    event_names = {
        'OPERATION': 'LAST_EVENT',
        'TIMESTAMP': 'LAST_EVENT_TIME',
        'AGE': 'LAST_EVENT_AGE',
        'MONTH': 'LAST_EVENT_MONTH',
    }
    output_columns = ['LAST_EVENT', 'LAST_EVENT_TIME', 'LAST_EVENT_AGE', 'LAST_EVENT_MONTH']

    last_per_bug = df.groupby('BUG_ID').last()
    last_per_bug = last_per_bug.drop(columns=['BUG_DESCRIPTION', 'VALUE', 'ROW_NUMBER'])

    return last_per_bug.rename(columns=event_names).reindex(columns=output_columns)

In [None]:
# Creates bug summary

In [None]:
def get_bug_summary(df_status_changes_by_bug, df_bug_full):
    '''Builds a summary table with one row per bug, indexed by `BUG_ID`.

    Combines, through left merges on the bug identifier:

    - Basic and creation details, last known state and number of state changes
      (taken from `df_status_changes_by_bug`).
    - Last known release and module, last owner, last event, and the number of
      release, module and owner changes (taken from `df_bug_full`).

    Bugs with no event of a given kind get a count of 0 for that kind. Three derived
    columns are added: `SOLVED`, `BUG_RESOLUTION_TIME` and `TIME_SINCE_CREATION`
    (both times in days).
    '''

    # Counters that are missing after the merge when the bug has no event of that kind
    optional_counters = ['CHANGES_TO_RESOLVED', 'RELEASE_CHANGES', 'MODULE_CHANGES', 'OWNER_CHANGES']

    return (
        # All basic bug details + bug creation details
        get_basic_bug_details(df_status_changes_by_bug)

        # Adds last known state of the bug
        .merge(
            get_last_known_state(df_status_changes_by_bug),
            how='left',
            left_index=True, right_index=True
        )

        # Adds the total number of changes of state
        .merge(
            get_number_changes_of_state(df_status_changes_by_bug),
            how='left',
            left_index=True, right_index=True
        )

        # Adds how many times it has changed to `RESOLVED` (useful to detect ineffective resolutions)
        .merge(
            get_number_changes_to_resolved(df_status_changes_by_bug),
            how='left',
            left_index=True, right_index=True
        )

        # Adds last known Release and Module for each bug
        .merge(
            (
                df_bug_full
                .groupby('BUG_ID').last()
                .loc[:, ['RELEASE', 'MODULE']]
            ),
            how='left', # how='outer'
            left_index=True, right_index=True
        )

        # Adds how many times the Release assignment has changed (useful to detect situations that persist or revive across releases)
        .merge(
            get_number_release_assignments(df_bug_full),
            how='left',
            left_index=True, right_index=True
        )

        # Adds how many times the module assignment has changed (useful to detect "hot potato" situations)
        .merge(
            get_number_module_assignments(df_bug_full),
            how='left',
            left_index=True, right_index=True
        )

        # Adds last assigned owner
        .merge(
            get_last_owner(df_bug_full),
            how='left',
            left_index=True, right_index=True
        )

        # Adds how many times the owner has changed (useful to detect "hot potato" situations)
        .merge(
            get_number_owner_assignments(df_bug_full),
            how='left',
            left_index=True, right_index=True
        )

        # Adds the last event that has happened to the bug so far
        .merge(
            get_last_event(df_bug_full),
            how='left',
            left_index=True, right_index=True
        )

        # The left merges leave N/A where a bug has no event of a given kind: those are counts of 0
        .fillna({counter: 0 for counter in optional_counters})
        .astype({counter: 'int' for counter in optional_counters})

        # Boolean to know if the bug was solved
        .assign(SOLVED = lambda x: x.STATE.isin(["RESOLVED", "VERIFIED"]))

        # Calculates the bug resolution time (when applicable)
        # NOTE(review): computed for every bug as the time until its last state update, also for bugs not solved
        .assign(BUG_RESOLUTION_TIME = lambda x: (x.STATE_UPDATE_TIME - x.CREATION_TIME).dt.days)

        # Calculates the time since the bug was open
        .assign(TIME_SINCE_CREATION = lambda x: (pd.to_datetime("now") - x.CREATION_TIME).dt.days)
    )

In [None]:
# Summary table with one row per bug
df_bug_summary = get_bug_summary(df_status_changes_by_bug, df_bug_full)

### 2.1 Number of open bugs

In [None]:
# Open bugs: those whose latest state is neither 'RESOLVED' nor 'VERIFIED'
df_open_bugs = df_current_bug_state.loc[
    lambda bugs: ~bugs['VALUE'].isin(['RESOLVED', 'VERIFIED'])
]

In [None]:
# df_open_bugs.tail()

In [None]:
# df_open_bugs.info()

#### Total number of open bugs:

In [None]:
# Number of rows, i.e. one per open bug
df_open_bugs.shape[0]

In [None]:
# display(
#     Markdown(
#         f'''#### Total number of open bugs:

# {df_open_bugs.shape[0]}'''
#     )
# )

#### Number of bugs per module

In [None]:
# Number of open bugs per module (rows) and age (columns).
# No margins are requested here, so `margins_name` has no effect on this table.
ct_open_bugs = pd.crosstab(
    index=df_open_bugs.MODULE,
    columns=df_open_bugs.AGE,
    margins_name='Total',
)

In [None]:
# Keeps only the rows of the most relevant modules
ct_open_bugs_selected = ct_open_bugs.loc[
    lambda table: table.index.isin(most_relevant_modules)
]

In [None]:
# Stacked horizontal bars: one bar per selected module, split by age
ct_open_bugs_selected.plot(kind='barh', stacked=True)

plt.title(f'Open Bugs per Module - Selected Modules ({today})', fontsize=20)
plt.ylabel('')

# Total of open bugs per module (sum across ages), used to place one tick per bug
# NOTE(review): ticks start at the smallest total rather than at 0 — confirm this is intended
width = ct_open_bugs_selected.sum(axis=1)
plt.xticks(
    np.arange(min(width), max(width) + 1, 1.0)
)

plt.tight_layout()

# Saves the figure as PNG and SVG in the outputs folder
filename = os.path.join(outputs_folder, 'n_bugs_open_per_key_module')
plt.savefig(filename + '.png', dpi=300)
plt.savefig(filename + '.svg')

plt.show()

In [None]:
# plt.figure(figsize=(12, 20))

# Stacked horizontal bars: one bar per module, split by age
ct_open_bugs.plot(kind='barh', stacked=True)

plt.title(f'Open Bugs per Module - All Modules ({today})', fontsize=20)
plt.ylabel('')

# Total of open bugs per module (sum across ages), used to place one tick per bug
# NOTE(review): ticks start at the smallest total rather than at 0 — confirm this is intended
width = ct_open_bugs.sum(axis=1)
plt.xticks(
    np.arange(min(width), max(width) + 1, 1.0)
)

plt.tight_layout()

# Saves the figure as PNG and SVG in the outputs folder
filename = os.path.join(outputs_folder, 'n_bugs_open_per_module')
plt.savefig(filename + '.png', dpi=300)
plt.savefig(filename + '.svg')

plt.show()

In [None]:
# Same table of open bugs per module and age, with 'Total' margins for display
pd.crosstab(
    index=df_open_bugs.MODULE,
    columns=df_open_bugs.AGE,
    margins=True,
    margins_name='Total',
)

### 2.2 Age of open bugs

#### Age vs. state of open bugs (global)

In [None]:
# pd.crosstab(
#     df_current_bug_state.VALUE,
#     df_current_bug_state.AGE,
#     margins=True,
#     margins_name='Total'
# )

In [None]:
# Number of open bugs per state (rows) and age (columns), with 'Total' margins
ct_open_bugs_age_vs_state = pd.crosstab(
    index=df_open_bugs.VALUE,
    columns=df_open_bugs.AGE,
    margins=True,
    margins_name='Total',
)

In [None]:
# Stacked horizontal bars per state, split by age (the 'Total' margins are left out of the chart)
(
    ct_open_bugs_age_vs_state
    .query('VALUE != "Total"')
    .drop(columns=['Total'])
    .plot(kind='barh', stacked=True)
)


# NOTE(review): `n` is only referenced by the commented-out code below
n = 10

# ax = locks.plot(kind='bar', y='SUM')
# ticks = ax.xaxis.get_ticklocs()
# ticklabels = [l.get_text() for l in ax.xaxis.get_ticklabels()]
# ax.xaxis.set_ticks(ticks[::n])
# ax.xaxis.set_ticklabels(ticklabels[::n])

# ax.figure.show()


plt.title(f'Global Number of bugs per State and Age ({today})', fontsize=20)
plt.ylabel('')

# One tick every 25 bugs, from 0 up to the largest value of the 'Total' column
# width = ct_open_bugs_age_vs_state.sum(axis=1)
width = ct_open_bugs_age_vs_state.Total
plt.xticks(
    # np.arange(min(width), max(width) + 1, 23)
    np.arange(0, max(width) + 1, 25)
)

plt.tight_layout()

# Saves the figure as PNG and SVG in the outputs folder
filename = os.path.join(outputs_folder, 'n_bugs_open_per_state_and_age')
plt.savefig(filename + '.png', dpi=300)
plt.savefig(filename + '.svg')

plt.show()

In [None]:
ct_open_bugs_age_vs_state

#### Age distribution per module

In [None]:
# Function to remove categories not in use in a given column
def clean_categorical_columns(col):
    '''Returns `col` without its unused categories when it is categorical.

    Columns of any other type are returned verbatim.
    '''
    is_categorical = str(col.dtype) == 'category'
    return col.cat.remove_unused_categories() if is_categorical else col

In [None]:
# Style 'fivethirtyeight' creates thick boxplots
# To avoid it, we may change to 'default' style temporarily
#plt.style.use('default')

In [None]:
fig, ax = plt.subplots(figsize = (10, 10))

# Unsolved bugs of the most relevant modules
# (unused module categories are removed from 'MODULE' before plotting)
df_open_bugs_selected = (
    df_bug_summary
    .loc[df_bug_summary.MODULE.isin(most_relevant_modules)]
    .assign(MODULE = lambda x: clean_categorical_columns(x.MODULE))
    .query('SOLVED==False')
)

# One box per module with the distribution of days since creation
sns.boxplot(
    data = df_open_bugs_selected,
    y = 'MODULE',
    x = 'TIME_SINCE_CREATION',
    ax=ax
)

ax.set_title(f'Age of Open Bugs - Selected Modules ({today})\n', fontsize=20)
ax.set_xlabel('Time since creation (days)', fontsize=14)
ax.set_ylabel(None)

fig.tight_layout()

# Saves the figure as PNG and SVG in the outputs folder
filename = os.path.join(outputs_folder, 'time_bugs_open_per_key_module')
fig.savefig(filename + '.png', dpi=300)
fig.savefig(filename + '.svg')

plt.show()

In [None]:
fig, ax = plt.subplots(figsize = (10, 20))

# One box per module with the distribution of days since creation of the unsolved bugs
sns.boxplot(
    data = df_bug_summary.query('SOLVED==False'),
    y = 'MODULE',
    x = 'TIME_SINCE_CREATION',
    ax=ax
)

ax.set_title(f'Age of Open Bugs - All Modules ({today})\n', fontsize=20)
ax.set_xlabel('Time since creation (days)', fontsize=14)
ax.set_ylabel(None)

fig.tight_layout()

# Saves the figure as PNG and SVG in the outputs folder
filename = os.path.join(outputs_folder, 'time_bugs_open_per_module')
fig.savefig(filename + '.png', dpi=300)
fig.savefig(filename + '.svg')

plt.show()

**NOTE:** To see how a boxplot representation works, you may refer [to this article](https://towardsdatascience.com/understanding-boxplots-5e2df7bcbd51).

In [None]:
print('Table with distributions of ages of currently open bugs:')
display(
    # Exploratory table - Time open bugs
    # (descriptive statistics of the days since creation, per module with at least one open bug)
    df_bug_summary
    .query('(SOLVED==False)')
    .groupby('MODULE')
    .TIME_SINCE_CREATION
    .describe()
    .query('count>0')
    .reset_index()
)

In [None]:
# (Re)applies the 'fivethirtyeight' style, in case it was changed temporarily for the boxplots
plt.style.use('fivethirtyeight')


### 2.3 Outliers


Four types of bug age outliers are identified:

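1. Too old bugs: created before the deprecation date and still open.
2. Pure outliers for boxplot: beyond Q3+1.5·IQR for the MDG (module) and release.
3. Beyond the 3rd quartile of ages, without being pure outliers.
4. Age beyond 2 months (60 days), without being beyond the 3rd quartile.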

In [None]:
def _fill_na_with_unknown(col):
    '''Fills the missing values of `col` with 'Unknown'.

    Categorical columns only accept values that are known categories, so the
    category is registered first, unless the column already has it.
    '''
    if str(col.dtype) == 'category' and 'Unknown' not in col.cat.categories:
        col = col.cat.add_categories('Unknown')
    return col.fillna('Unknown')

# Days open, release and module of each unsolved bug that was not created in the 'OLD' period
bug_open_times = (
    df_bug_summary
    .query('(~SOLVED) & (CREATION_AGE!="OLD")')
    .loc[:, ["CREATION_AGE", "RELEASE", "MODULE", "LAST_EVENT_TIME", "TIME_SINCE_CREATION"]]
    .assign(CREATION_AGE = lambda x: clean_categorical_columns(x.CREATION_AGE))

    # Bugs without module or release are grouped under 'Unknown'
    .assign(MODULE = lambda x: _fill_na_with_unknown(x.MODULE))
    .assign(RELEASE = lambda x: _fill_na_with_unknown(x.RELEASE))

    .rename(
        columns={
            'CREATION_AGE': 'Age',
            'RELEASE': 'OSM Release',
            'MODULE': 'Module',
            'TIME_SINCE_CREATION': 'Time bug open (days)'
        }
    )
)

In [None]:
def flatten_multilevel_columns(df):
    '''Replaces the two-level column labels of `df` with flat ones, in place.

    Each column keeps its second-level label, or its first-level label when the
    second one is empty. Returns the same dataframe that was passed in.
    '''
    df.columns = [label[1] or label[0] for label in df.columns]
    return df

# Descriptive statistics of the days open per module and release,
# plus the interquartile range and the boxplot threshold for outliers (Q3 + 1.5*IQR)
quantiles_bug_open_times = (
    bug_open_times
    .groupby(['Module', 'OSM Release'])[['Time bug open (days)']]
    .agg('describe')
    .reset_index()
    .pipe(flatten_multilevel_columns)
    .assign(
        IQR=lambda stats: stats['75%'] - stats['25%'],
        outliers_th=lambda stats: stats['75%'] + 1.5*stats['IQR'],
    )
)

In [None]:
# Open bugs with the statistics of their module and release, and all their summary details
bug_open_times_with_thresholds = (
    bug_open_times
    .reset_index()

    # Statistics and thresholds of the module and release of each bug
    .merge(quantiles_bug_open_times, how='left', on=['Module', 'OSM Release'])

    # Full summary of each bug
    .merge(df_bug_summary.reset_index(), how='left', on=['BUG_ID'])

    # 'LAST_EVENT_TIME' comes from both sides of the last merge: keeps a single copy
    .rename(columns={'LAST_EVENT_TIME_x': 'LAST_EVENT_TIME'})
    .drop(columns=['Age', 'OSM Release', 'Module', 'LAST_EVENT_TIME_y'])
)

In [None]:
#### 1. Too old bugs still open

# Bugs created in the 'OLD' period whose state is neither 'RESOLVED' nor 'VERIFIED'
df_old_still_open = (
    df_bug_summary
    .loc[lambda bugs: (bugs.CREATION_AGE=='OLD') & (~bugs.STATE.isin(['RESOLVED', 'VERIFIED']))]
    .reset_index()
    .sort_values(by=['MODULE', 'BUG_ID'])
)

In [None]:
#### 2. Pure outliers:

# Bugs open for longer than the outlier threshold of their module and release
bug_open_outliers = bug_open_times_with_thresholds.loc[
    lambda bugs: bugs['Time bug open (days)'] > bugs['outliers_th']
]

In [None]:
#### 3. Beyond Q3:

# Bugs open for longer than the 3rd quartile, up to (and including) the outlier threshold
bug_open_q3 = bug_open_times_with_thresholds.loc[
    lambda bugs: (
        (bugs['Time bug open (days)'] > bugs['75%'])
        & (bugs['Time bug open (days)'] <= bugs['outliers_th'])
    )
]

In [None]:
#### 4. Beyond 2 months:

# Bugs open for more than 60 days, up to (and including) the 3rd quartile
bug_open_2_months = bug_open_times_with_thresholds.loc[
    lambda bugs: (
        (bugs['Time bug open (days)'] > 60)
        & (bugs['Time bug open (days)'] <= bugs['75%'])
    )
]

In [None]:
######################## Saves bug lists in spreadsheet ########################

In [None]:
# Names of the sheets read from the spreadsheet of former assessments (one per type of outlier)
bugs_sheet_names = ['0. Too old bugs still open',
                    '1. Open outliers',
                    '2. Open beyond q3',
                    '3. Open beyond 2 months']

In [None]:
# Imports latest MDLs recommendations, if they exist:

# Columns read from each sheet of the file of former assessments
former_mdl_assessments_cols = ['RECOMMENDATION', 'BUG_ID']
f_assess = os.path.join(inputs_folder, former_mdl_assessments_file)

df_list = []

# Reads the recommendations of every sheet and stacks them in a single table.
# If the file does not exist, an empty table with the same columns is used instead.
try:
    for sheet in bugs_sheet_names:
        df = pd.read_excel(f_assess, sheet_name=sheet, usecols=former_mdl_assessments_cols)
        df_list.append(df)
    df_recommendations = pd.concat(df_list, ignore_index=True)
except FileNotFoundError:
    print('Recommendations file does not exist.')
    df_recommendations = pd.DataFrame(columns=former_mdl_assessments_cols)

In [None]:
# Adds the recommendation to existing tables:
# (right merges, so every bug of each table is kept, with or without a former recommendation)

df_old_still_open = df_recommendations.merge(df_old_still_open, on='BUG_ID', how='right')
bug_open_outliers = df_recommendations.merge(bug_open_outliers, on='BUG_ID', how='right')
bug_open_q3 = df_recommendations.merge(bug_open_q3, on='BUG_ID', how='right')
bug_open_2_months = df_recommendations.merge(bug_open_2_months, on='BUG_ID', how='right')

In [None]:
# Exports to spreadsheet:

# Columns exported for every bug, in this order
columns = [
    'RECOMMENDATION', 'MODULE', 'BUG_ID', 'BUG_DESCRIPTION', 'RELEASE', 'STATE', 'ISSUER', 'OWNER',
    'CREATION_TIME', 'CREATION_AGE',
    'STATE_UPDATE_TIME', 'STATE_UPDATE_AGE',
    'LAST_ASSIGNMENT_TIME', 'LAST_EVENT', 'LAST_EVENT_TIME', 'LAST_EVENT_AGE',
    'STATE_CHANGES', 'CHANGES_TO_RESOLVED', 'RELEASE_CHANGES', 'MODULE_CHANGES', 'OWNER_CHANGES',
]

timestamp = today_as_datetime.strftime('%Y%m%d')

# One sheet per type of outlier, in a file prefixed with the date of the report
filename = os.path.join(outputs_folder, timestamp + '_bugs_for_mdl_review.xlsx')
with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
    for _sheet_name, _table in [
        ('0. Too old bugs still open', df_old_still_open),
        ('1. Open outliers', bug_open_outliers),
        ('2. Open beyond q3', bug_open_q3),
        ('3. Open beyond 2 months', bug_open_2_months),
    ]:
        _table[columns].to_excel(writer, index=False, sheet_name=_sheet_name)

#### Summary table of outliers:

In [None]:
# Number of bugs of each type of outlier per module (only modules with at least one)
keys = [
    '0. Too old bugs still open',
    '1. Open outliers',
    '2. Open beyond q3',
    '3. Open beyond 2 months',
]
tables = [df_old_still_open, bug_open_outliers, bug_open_q3, bug_open_2_months]
summary_table_outliers = (
    pd.concat(tables, keys=keys)
    .reset_index()

    # The first level of the index holds the key of the table each row comes from
    .rename(columns={'level_0': 'AGE'})

    .pivot_table(
        index='MODULE',
        columns='AGE',
        values='BUG_ID',
        aggfunc='count',
        margins=True,
    )
    .fillna(0)
    .query('All>0')
)

filename = os.path.join(outputs_folder, timestamp + '_SUMMARY_bug_outliers.xlsx')
summary_table_outliers.to_excel(filename)

In [None]:
summary_table_outliers

#### Outlier details

Too old bugs:

In [None]:
df_old_still_open

Pure outliers:

In [None]:
bug_open_outliers

Beyond Q3:

In [None]:
bug_open_q3

Beyond 2 months:

In [None]:
bug_open_2_months

## 3. Temporal evolution

In [None]:
# Uncomment to see example of data per bug:
#
# df_status_changes_by_bug.query("BUG_ID==1433")

In [None]:
# Auxiliary function to add extra columns with effective timestamps of the event
def add_effective_timestamps(df):
    '''Function to add extra columns with effective timestamps of the event:

    - `TIMESTAMP_OPENING`: timestamp of the first event of the bug.
    - `TIMESTAMP_4_EVENT`: `MONTH` (end of current month) if the event opens the bug,
      otherwise `TIMESTAMP` if the event closes it; never later than `today_as_datetime`.
    - `TIME`: difference between `TIMESTAMP_4_EVENT` and `TIMESTAMP_OPENING`.

    Rows of events that do not open/close any bug are dropped.
    '''

    # Timestamp of opening of each bug
    opening_time = df.groupby('BUG_ID')['TIMESTAMP'].transform('first')

    # Effective timestamp of each event (N/A when it neither opens nor closes the bug)
    closing_time = df.TIMESTAMP.where(df.CLOSED)
    effective_time = df.MONTH.where(df.OPENED).fillna(closing_time)

    open_close_events = (
        df
        .assign(TIMESTAMP_OPENING=opening_time, TIMESTAMP_4_EVENT=effective_time)
        .dropna(subset=['TIMESTAMP_4_EVENT'])   # Drops rows that do not open or close any bug
    )

    # Fixes attributed timestamps that are "in the future"
    not_in_future = open_close_events.TIMESTAMP_4_EVENT <= today_as_datetime
    capped_time = (
        open_close_events.TIMESTAMP_4_EVENT
        .where(not_in_future)
        .fillna(today_as_datetime)
    )

    return (
        open_close_events
        .assign(TIMESTAMP_4_EVENT=capped_time)

        # Adds column with the difference
        .assign(TIME=lambda x: x.TIMESTAMP_4_EVENT - x.TIMESTAMP_OPENING)
    )

# MAIN FUNCTION:
# Function to add extra columns to status changes that allow to determine if the event results in a bug opening or a bug closing
def get_status_changes_by_bug_extended(df):
    '''Columns added by this stage:

    - `WAS_SOLVED`: Right before this event, was the bug considered solved?
      For the first event of a bug there is no previous state, so it takes the
      value of `SOLVED` (the first event is then neither a reopening nor a closing).
    - `JUST_OPENED`: Has the bug been just opened for the first time in this event?
    - `REOPENED`: Is this event a reopening?
    - `OPENED`: Is this event opening or reopening the bug? (`JUST_OPENED` or `REOPENED`)
    - `CLOSED`: Is the bug closed by this event?

    `SOLVED` (is the bug considered solved after this event?) is expected in `df`.
    The columns with effective timestamps are also added (see `add_effective_timestamps`),
    which drops the events that neither open nor close a bug.
    '''

    # State of the same bug right before each event (N/A for the first event of each bug)
    previous_solved = df.groupby('BUG_ID')['SOLVED'].shift()

    # The gap must be filled per bug: rows are not grouped by bug, so a frame-wide
    # backfill would take the value from an event of a different bug
    was_solved = (
        previous_solved
        .where(previous_solved.notna(), df['SOLVED'])
        .astype('bool')
    )

    return (
        df
        .assign(WAS_SOLVED = was_solved)
        .assign(JUST_OPENED = lambda x: (x.VALUE=='OPEN-UNCONFIRMED'))
        .assign(REOPENED = lambda x: (x.WAS_SOLVED & ~x.SOLVED))
        .assign(OPENED = lambda x: (x.JUST_OPENED | x.REOPENED))
        .assign(CLOSED = lambda x: (~x.WAS_SOLVED & x.SOLVED))

        # Adds columns with effective timestamps of open/close events
        .pipe(add_effective_timestamps)
    )

In [None]:
# Status changes with added information to track the dynamics per bug
df_status_changes_by_bug_extended = get_status_changes_by_bug_extended(df_status_changes_by_bug)

In [None]:
# Uncomment to see example of data per bug:
#
# df_status_changes_by_bug_extended.query("BUG_ID==1433")

In [None]:
# Oversamples the dataframe of bug events to add to each month
# rows that represent the bugs that remain open by that time
def get_monthly_time_samples_per_bug(df):
    '''Returns one sample per bug and month while the bug exists and is relevant.

    For each month there is a row per bug that had an opening/closing event that
    month, or that remained open during it. Samples of months before the creation
    of the bug, and of months after it was solved, are dropped.

    `SOLVED`, `VALUE`, `TIMESTAMP_OPENING`, `RELEASE` and `MODULE` are carried
    forward from the last event of the bug. `TIME` is the time in days between
    the opening of the bug and the effective timestamp of the sample.
    '''

    # Keeps only the last relevant event of each month per bug
    last_event_per_month = df.groupby(['BUG_ID', 'MONTH']).last()

    # All combinations of known bugs and months
    known_bugs_and_months = last_event_per_month.index.remove_unused_levels().levels
    all_samples = pd.MultiIndex.from_product(known_bugs_and_months, names=['BUG_ID', 'MONTH'])

    return (
        last_event_per_month

        # Forces to include one sample per month of each known bug
        # (months without events for the bug get a row of N/A values)
        .reindex(all_samples)

        # Extends the values of key columns, per bug
        .assign(
            SOLVED = lambda x: (
                x.groupby('BUG_ID').SOLVED.ffill().astype('bool')
            ),
            # The state of the bug is kept as it is (it is not a boolean)
            VALUE = lambda x: (
                x.groupby('BUG_ID').VALUE.ffill()
            ),
            TIMESTAMP_OPENING = lambda x: (
                x.groupby('BUG_ID').TIMESTAMP_OPENING.ffill()
            ),
            RELEASE = lambda x: (
                x.groupby('BUG_ID').RELEASE.ffill()
            ),
            MODULE = lambda x: (
                x.groupby('BUG_ID').MODULE.ffill()
            )
        )

        # Drops rows in months before the bug was created
        # (we have filled NA forward, not backwards)
        .dropna(subset=['TIMESTAMP_OPENING'])

        # If effective timestamp is NA and the bug is still open,
        # it should be assigned the end of the corresponding month
        .reset_index()
        .assign(
            TIMESTAMP_4_EVENT = lambda x: (
                x.TIMESTAMP_4_EVENT
                .fillna(
                    x.MONTH.where(~ x.SOLVED)
                )
            )
        )

        # Drops samples of bugs already solved in prior months (not current)
        # since they do not add relevant information
        # (they are the ones where 'TIMESTAMP_4_EVENT' is still NA)
        .dropna(subset=['TIMESTAMP_4_EVENT'])

        # Recalculates times since opening, to fill N/A
        .assign(TIME = lambda x: x.TIMESTAMP_4_EVENT - x.TIMESTAMP_OPENING)

        # Expresses TIME in days
        .assign(TIME = lambda x: (x.TIME / pd.Timedelta(days=1)))
    )

In [None]:
# Table with all bugs still open each month
df_monthly_time_samples_per_bug = get_monthly_time_samples_per_bug(df_status_changes_by_bug_extended)

In [None]:
# Uncomment to see example of data per bug:
#
# df_monthly_time_samples_per_bug.query("BUG_ID==1433")

In [None]:
# Uncomment to see another example of data per bug:
#
# df_monthly_time_samples_per_bug.query("BUG_ID==1598")

In [None]:
# Descriptive statistics, per month, of the time (in days) since the opening of the sampled bugs
df_stats_monthly_time_samples_per_bug = (
    df_monthly_time_samples_per_bug
    .groupby(['MONTH'])['TIME']
    .describe()
)

In [None]:
# df_stats_monthly_time_samples_per_bug.tail()

In [None]:
########################### Net variation of open bugs ##########################

In [None]:
#----------------------- Quantification of bug reopenings -----------------------

In [None]:
def agg_reopenings(df):
    '''Returns a copy of `df` with the net accounting of closings and reopenings.

    From the counts in `OPENED`, `REOPENED` and `CLOSED`, it adds:

    - `REAL_CLOSED`: closings minus reopenings, never below 0.
    - `FALSE_CLOSED`: closings that are not counted in `REAL_CLOSED`.
    - `BUG_VARIATION`: openings minus closings (net contribution to the open bugs).
    '''
    result = df.copy()

    closed_minus_reopened = result.CLOSED - result.REOPENED
    result['REAL_CLOSED'] = closed_minus_reopened.clip(lower=0)
    result['FALSE_CLOSED'] = result.CLOSED - result.REAL_CLOSED

    # Net contribution
    result['BUG_VARIATION'] = result.OPENED - result.CLOSED

    return result

def get_open_reopen_closed(df):
    '''Counts, per month and bug, the opening, reopening and closing events.

    The counts are extended with the accounting of real and false closings and
    the net variation of open bugs (see `agg_reopenings`). `MONTH` and `BUG_ID`
    are returned as regular columns.
    '''
    event_flags = ['OPENED', 'JUST_OPENED', 'REOPENED', 'CLOSED']

    counts = df.groupby(['MONTH', 'BUG_ID'])[event_flags].sum()
    return agg_reopenings(counts).reset_index()

def get_bug_summary_per_month(df):
    '''Aggregates opening/closing events per month per bug

    Each row has the details of the last event of the bug in that month, with
    the per-event flags replaced by the monthly counts of `get_open_reopen_closed`.
    '''
    event_flags = ['OPENED', 'JUST_OPENED', 'REOPENED', 'CLOSED']

    last_event_of_month = (
        df
        .groupby(['MONTH', 'BUG_ID'])
        .last()
        .drop(columns=event_flags)
        .reset_index()
    )

    return last_event_of_month.merge(
        get_open_reopen_closed(df),
        how='left',
        on=['MONTH', 'BUG_ID'],
    )

In [None]:
# Obtains, per month and per bug, how many times it has been opened, reopened and closed (for real or false)
df_bug_summary_per_month = get_bug_summary_per_month(df_status_changes_by_bug_extended)

In [None]:
# Uncomment to see example of data per bug:
#
# df_bug_summary_per_month.query("BUG_ID==1433")

In [None]:
############### Calculates no. open bugs per module each month ###############

In [None]:
# Calculates cumulative sum of open bugs per module

def get_cummulative_bug_summary_per_month(df):
    '''Computes the running number of open bugs per module.

    Sums BUG_VARIATION per (MODULE, MONTH) and adds the column OPEN_BUGS with
    the cumulative sum of those monthly totals within each module, following
    the sorted order produced by the first grouping.
    '''

    monthly_variation = (
        df
        .groupby(['MODULE', 'MONTH'])['BUG_VARIATION']
        .sum()
        .reset_index()
    )

    # Running total of the monthly variations, restarted for every module
    open_bugs = monthly_variation.groupby(['MODULE'])['BUG_VARIATION'].cumsum()

    return monthly_variation.assign(OPEN_BUGS = open_bugs)

In [None]:
# Dataframe with the temporal evolution of the open bugs per module:
# one row per module and month, with the monthly BUG_VARIATION and its
# running total (OPEN_BUGS)
df_cummulative_bug_summary_per_month = get_cummulative_bug_summary_per_month(df_bug_summary_per_month)

In [None]:
# Example to inspect the temporal evolution in a module
#
# df_cummulative_bug_summary_per_month.query('MODULE == "RO"').tail()

### 3.1 Number of open bugs

In [None]:
# Total number of open bugs per month. The data holds one row per module and
# month, so the `sum` estimator adds up the selected modules of each month
sns.lineplot(
    data = (
        df_cummulative_bug_summary_per_month
        .loc[df_cummulative_bug_summary_per_month.MODULE.isin(most_relevant_modules)]
    ),
    x = 'MONTH',
    y = 'OPEN_BUGS',
    estimator = sum,
    # `None` is the value that turns the error band off; `False` is taken as a
    # 0% interval, so the bootstrap would still run
    ci = None,
    linewidth = 3
)

plt.title(f'Evolution of Number of Open Bugs ({today})', fontsize=20)
plt.ylabel('No. Net Bugs')
plt.xlabel('Time')

plt.tight_layout()

# Saves the figure both as a bitmap and as a vector image
filename = os.path.join(outputs_folder, 'open_bugs_per_month')
plt.savefig(filename + '.png', dpi=300)
plt.savefig(filename + '.svg')

plt.show()

In [None]:
# Evolution of the number of open bugs, with one panel per module
g = sns.relplot(
    data = (
        df_cummulative_bug_summary_per_month
        .loc[df_cummulative_bug_summary_per_month.MODULE.isin(most_relevant_modules)]
        .assign(MODULE = lambda x: clean_categorical_columns(x.MODULE))
    ),
    x = 'MONTH',
    y = 'OPEN_BUGS',
    col = "MODULE",
    col_wrap = 4,
    kind = "line",
    estimator = sum,
    # `None` is the value that turns the error band off; `False` is taken as a
    # 0% interval, so the bootstrap would still run
    ci = None,
    facet_kws={'sharey': True, 'sharex': False}
)

# Rotates the labels. `tick_params` only changes their style, whereas
# `set_xticklabels(get_xticklabels())` would freeze the current label texts
# without fixing the tick positions
for ax in g.axes.ravel():
    ax.tick_params(axis='x', labelrotation=30)

g.fig.suptitle(f'Open Bugs per Month and Module ({today})\n\n', fontsize=20)

# To avoid overlaps
g.fig.tight_layout()
plt.tight_layout()

# Saves the figure both as a bitmap and as a vector image
filename = os.path.join(outputs_folder, 'open_bugs_per_month_and_module')
plt.savefig(filename + '.png', dpi=300)
plt.savefig(filename + '.svg')

plt.show()

In [None]:
# FIXME:

# Evolution of the number of open bugs per module in the last 6 months

# for module, df_module in df_cummulative_bug_summary_per_month.groupby(['MODULE']):
#     display(Markdown(f'**{module}:**'))
#     display(df_module.tail())

### 3.2 Age of open bugs

In [None]:
# Evolution of average age of open bugs
#
# Seaborn's default aggregation and error band are used here
# (pass `ci = None` to hide the band)
ax = sns.lineplot(data=df_monthly_time_samples_per_bug, x='MONTH', y='TIME', linewidth=3)

ax.set_title(f'Evolution of Age of Open Bugs ({today})', fontsize=20)
ax.set_xlabel('Time')
ax.set_ylabel('Age (days)')

ax.figure.tight_layout()

# Saves the figure both as a bitmap and as a vector image
filename = os.path.join(outputs_folder, 'age_open_bugs_per_month')
ax.figure.savefig(f'{filename}.png', dpi=300)
ax.figure.savefig(f'{filename}.svg')

plt.show()

In [None]:
# # FIXME: Exploratory figure

# sns.relplot(
#     data=df_monthly_time_samples_per_bug,
#     x = 'MONTH',
#     y = 'TIME',
#     col = 'MODULE',
#     col_wrap = 4,
#     kind = 'line',
#     # ci = None
#     facet_kws={'sharey': True, 'sharex': False}
# )

In [None]:
# Evolution of average age of open bugs, per module (one panel per module)

g = sns.relplot(
    data = (
            df_monthly_time_samples_per_bug
            .loc[df_monthly_time_samples_per_bug.MODULE.isin(most_relevant_modules)]
            .assign(MODULE = lambda x: clean_categorical_columns(x.MODULE))
    ),
    x = 'MONTH',
    y = 'TIME',
    col = 'MODULE',
    col_wrap = 4,
    kind = 'line',
    # ci = None
    facet_kws={'sharey': True, 'sharex': False}
)

# Rotates the labels. `tick_params` only changes their style, whereas
# `set_xticklabels(get_xticklabels())` would freeze the current label texts
# without fixing the tick positions
for ax in g.axes.ravel():
    ax.tick_params(axis='x', labelrotation=30)

g.fig.suptitle(f'Evolution of Age of Open Bugs, per Module ({today})\n\n', fontsize=20)

# To avoid overlaps
g.fig.tight_layout()
plt.tight_layout()

# Saves the figure both as a bitmap and as a vector image
filename = os.path.join(outputs_folder, 'age_open_bugs_per_month_and_module')
plt.savefig(filename + '.png', dpi=300)
plt.savefig(filename + '.svg')

plt.show()

### 3.3 Monthly activity

In [None]:
# Net contributions per month: the per-bug BUG_VARIATION values of each month
# are added up by the `sum` estimator
sns.lineplot(
    data = df_bug_summary_per_month,
    x = 'MONTH',
    y = 'BUG_VARIATION',
    #hue = 'MODULE',
    estimator = sum,
    # `None` is the value that turns the error band off; `False` is taken as a
    # 0% interval, so the bootstrap would still run
    ci = None,
    linewidth = 3
)

plt.title(f'Net Variation Open Bugs per Month ({today})', fontsize=20)
plt.ylabel('No. Net Bugs')
plt.xlabel('Time')

plt.tight_layout()

# Saves the figure both as a bitmap and as a vector image
filename = os.path.join(outputs_folder, 'net_variation_open_bugs_per_month')
plt.savefig(filename + '.png', dpi=300)
plt.savefig(filename + '.svg')

plt.show()

In [None]:
# Adapts series for easier visualization:
# - New + reopened bugs are positive
# - Closed bugs (real or false) are made negative (by convention)
# - Keeps the "false" changes ('REOPENED' and 'FALSE_CLOSED') to compare them with the totals
# - 'JUST_OPENED' and 'REAL_CLOSED' are dropped for being redundant
# - 'TOTAL_OPEN' (if present) is dropped as well: `melt` only keeps the columns
#   listed in `id_vars` and `value_vars`

temporal_bug_data_for_plots = (
    df_bug_summary_per_month

    .assign(
        CLOSED = lambda x: (- x.CLOSED),
        FALSE_CLOSED = lambda x: (- x.FALSE_CLOSED),
    )
    .drop(columns=['JUST_OPENED', 'REAL_CLOSED'])
    # Long format: one row per month, bug and kind of event ('variable'),
    # with its signed count in 'value'
    .melt(
        id_vars=['MONTH', 'BUG_ID', 'MODULE'],
        value_vars=['OPENED', 'REOPENED', 'FALSE_CLOSED', 'CLOSED']
    )
)

# One line per kind of event, adding up all the bugs of each month
sns.lineplot(
    data = temporal_bug_data_for_plots,
    x = 'MONTH',
    y = 'value',
    hue = 'variable',
    estimator = sum,
    # `None` is the value that turns the error band off; `False` is taken as a
    # 0% interval, so the bootstrap would still run
    ci = None,
    linewidth = 3
)

plt.title(f'Contributions to Variation of Open Bugs per Month ({today})', fontsize=20)
plt.ylabel('No. Net Bugs')
plt.xlabel('Time')

plt.tight_layout()

# Saves the figure both as a bitmap and as a vector image
filename = os.path.join(outputs_folder, 'contribs_variation_open_bugs_per_month')
plt.savefig(filename + '.png', dpi=300)
plt.savefig(filename + '.svg')

plt.show()

In [None]:
# Contributions to the variation of open bugs, with one panel per module and
# one line per kind of event
g = sns.relplot(
    data = (
        temporal_bug_data_for_plots
        .loc[temporal_bug_data_for_plots.MODULE.isin(most_relevant_modules)]
        .assign(MODULE = lambda x: clean_categorical_columns(x.MODULE))
    ),
    x = 'MONTH',
    y = 'value',
    hue = 'variable',
    col = "MODULE",
    col_wrap = 4,
    kind="line",
    estimator = sum,
    # `None` is the value that turns the error band off; `False` is taken as a
    # 0% interval, so the bootstrap would still run
    ci = None,
    facet_kws={'sharey': True, 'sharex': False}
)

# Rotates the labels. `tick_params` only changes their style, whereas
# `set_xticklabels(get_xticklabels())` would freeze the current label texts
# without fixing the tick positions
for ax in g.axes.ravel():
    ax.tick_params(axis='x', labelrotation=30)

g.fig.suptitle(f'Contributions to Variation of Open Bugs per Month and Module ({today})\n\n', fontsize=20)

# To avoid overlaps
g.fig.tight_layout()
plt.tight_layout()

# Saves the figure both as a bitmap and as a vector image
filename = os.path.join(outputs_folder, 'contribs_variation_open_bugs_per_month_and_module')
plt.savefig(filename + '.png', dpi=300)
plt.savefig(filename + '.svg')

plt.show()

TODO: (table) Evolution of the number of open bugs per module in the last 6 months

## 4. Top contributors

### 4.1 Top reporters of bugs

Individual contributors:

In [None]:
# Number of bugs per issuer, sorted from most to fewest (order of `value_counts`).
#
# `rename_axis` + `reset_index(name=...)` name both columns explicitly instead
# of renaming the labels produced by `value_counts`, which changed in
# pandas 2.0 (the counts became 'count' and the index took the column name)
df_bug_reporters = (
    df_current_bug_state.ISSUER
    .value_counts()
    .rename_axis('issuer')
    .reset_index(name='reported_bugs')
    # Company = text between the first and the second '@' of the issuer
    # (presumably the e-mail domain); it is left missing when there is no '@',
    # instead of failing when no issuer at all contains one
    .assign(company = lambda x: x.issuer.str.split('@').str[1])
)

In [None]:
topN = 25

# First `topN` rows of the reporters table, followed by a single 'Other' row
# (labelled `topN`) that adds up the bugs of all the remaining rows
df_top_bug_reporters = pd.concat(
    [
        df_bug_reporters.iloc[:topN],
        pd.DataFrame(
            {
                'issuer': ['Other'],
                'reported_bugs': [df_bug_reporters.iloc[topN:].reported_bugs.sum()],
                'company': ['Other'],
            },
            index = [topN],
        ),
    ]
)

In [None]:
fig, ax = plt.subplots(figsize=(10, 15))

# One bar per row of the table, with the issuers along the y axis
sns.barplot(data=df_top_bug_reporters, x='reported_bugs', y='issuer', ax=ax)

ax.set_ylabel(None)
ax.set_xlabel('No. Reported Bugs', fontsize=14)
ax.set_title(f'Top Individual Bug Reporters ({today})\n', fontsize=20)

fig.tight_layout()

# Saves the figure both as a bitmap and as a vector image
filename = os.path.join(outputs_folder, 'top_bug_reporters_individual')
fig.savefig(f'{filename}.png', dpi=300)
fig.savefig(f'{filename}.svg')

plt.show()

In [None]:
# Shows the table with the top individual reporters plus the 'Other' row
df_top_bug_reporters

Contributions per company:

In [None]:
# Reported bugs added up per company, sorted from most to fewest.
# NOTE(review): reporters with a missing company are left out by `groupby` - confirm that is intended
df_bug_reporting_companies = (
    df_bug_reporters.groupby('company')['reported_bugs'].sum().sort_values(ascending=False)
)

In [None]:
topN = 15

# First `topN` companies as a two-column table, followed by a single 'Other'
# row (labelled `topN`) that adds up the bugs of all the remaining companies
df_top_bug_reporting_companies = pd.concat(
    [
        df_bug_reporting_companies.iloc[:topN].reset_index(),
        pd.DataFrame(
            {
                'company': ['Other'],
                'reported_bugs': [df_bug_reporting_companies.iloc[topN:].sum()],
            },
            index = [topN],
        ),
    ]
)

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))

# One bar per row of the table, with the companies along the y axis
sns.barplot(data=df_top_bug_reporting_companies, x='reported_bugs', y='company', ax=ax)

ax.set_ylabel(None)
ax.set_xlabel('No. Reported Bugs', fontsize=14)
ax.set_title(f'Top Bug Reporters per Organization ({today})\n', fontsize=20)

fig.tight_layout()

# Saves the figure both as a bitmap and as a vector image
filename = os.path.join(outputs_folder, 'top_bug_reporters_per_organization')
fig.savefig(f'{filename}.png', dpi=300)
fig.savefig(f'{filename}.svg')

plt.show()

In [None]:
# Shows the table with the top reporting companies plus the 'Other' row
df_top_bug_reporting_companies

### 4.2 Bug assignments

In [None]:
# Keeps only the events whose OPERATION is 'assigned_to'
df_owner_events_by_bug = df_bug_full.loc[df_bug_full.OPERATION.eq('assigned_to')]

In [None]:
def get_current_owner_bug(df):
    '''Keeps the last event of every bug, preserving the original row labels.

    The row labels are moved into a column before grouping by BUG_ID, so that
    `last()` carries them along and they can be set as the index again.
    Assumes the index of `df` is unnamed, so that `reset_index` calls that
    column 'index' - TODO confirm.
    '''

    # Last value of every column (row label included) for each bug
    latest_per_bug = df.reset_index().groupby('BUG_ID').last()

    # BUG_ID goes back to a regular column and the row labels to the index
    relabelled = latest_per_bug.reset_index().set_index('index')

    return remove_index_name(relabelled)

In [None]:
# Last 'assigned_to' event of each bug (one row per bug with such an event)
df_current_owner_bug = get_current_owner_bug(df_owner_events_by_bug)

In [None]:
# Bugs with at least one 'assigned_to' event, as a percentage of all the distinct bugs
print(f'Share of bugs with owner: {len(df_current_owner_bug) / df_bug_full.BUG_ID.nunique() * 100:.2f}%')

TODO: Top assignees of bugs (needs to be filtered by bugs still open)

### 4.3 TODO: Top closers of bugs

## 5. Last significant events

### 5.1 Last reported bugs

In [None]:
# Shows the 10 last rows once sorted by TIMESTAMP (most recent at the bottom)
df_current_bug_state.sort_values('TIMESTAMP').tail(10)

### 5.2 Last bug assignments

In [None]:
# Shows the 10 last rows once sorted by TIMESTAMP (most recent at the bottom)
df_current_owner_bug.sort_values('TIMESTAMP', ascending=True).tail(10)

### 5.3 TODO: Last resolved bugs

## 6. Saving the notebook as a webpage

In [None]:
# Exports bugzilla_analysis.ipynb to HTML, leaving the input (code) cells out.
# NOTE: the output path is hard-coded here and does not follow `outputs_folder`.
# NOTE: nbconvert works on the .ipynb file on disk, so the notebook should be saved first.
!jupyter nbconvert --to html --output outputs/bugzilla_analysis.html --TemplateExporter.exclude_input=True bugzilla_analysis.ipynb

In [None]:
# Marks the end of the notebook run
print('DONE')

---
