# Analysis of OSM events in Bugzilla

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Default figure geometry for the whole notebook: 12x8 inches at 80 dpi
# (100 dpi gives great resolution and 200 gives optimal resolution, but much slower)
plt.rcParams.update({
    'figure.figsize': [12, 8],
    'figure.dpi': 80,
})
sns.set(rc={'figure.figsize': (12, 8)})

## 0. Input parameters

In [None]:
# Folders for local inputs and for generated artefacts (figures are saved into `outputs_folder`)
inputs_folder = 'inputs'
outputs_folder = 'outputs'
# Source of the Bugzilla event log; swap the two lines below to read a local copy instead of the URL
#bugzilla_csv = os.path.join(inputs_folder, 'bugs.csv')
bugzilla_csv = 'https://osm.etsi.org/stats/bugs.csv'

# Cut-off dates for the qualitative AGE tag:
# events before `date_for_bug_deprecation` are tagged "OLD",
# events after `date_for_very_recent_bug` are tagged "RECENT", anything in between is "CURRENT"
date_for_bug_deprecation = '2020-07-01'
date_for_very_recent_bug = '2021-04-15'

former_mdl_assessments_file = 'former_mdl_assessments.xlsx' # If it does not exist, it will be ignored

## 1. Imports and cleans source data

In [None]:
# Column names for the CSV export, in file order (the file is read without a header row)
header_list = [
    "BUG_ID",
    "BUG_DESCRIPTION",
    "RELEASE",
    "MODULE",
    "OPERATION",
    "VALUE",
    "TIMESTAMP",
]
df_bug_full = pd.read_csv(
    bugzilla_csv,
    names=header_list,
    header=None,
    encoding='utf-8',
)
df_bug_full

In [None]:
df_bug_full.info()

Fixes the timestamp type:

In [None]:
# Parses the TIMESTAMP strings into real datetimes.
# `pd.to_datetime` replaces `astype('datetime64')`: the unit-less 'datetime64'
# dtype is rejected by pandas >= 2.0, whereas `to_datetime` works on old and new versions.
df_bug_full["TIMESTAMP"] = pd.to_datetime(df_bug_full["TIMESTAMP"])
df_bug_full.info()

Fixes the `OPERATION` field to make it a categorical type.

First, we check which operations are possible (and whether any of them contains malformed data):

In [None]:
types_of_operations = df_bug_full.OPERATION.unique().tolist()
list(enumerate(types_of_operations))

In [None]:
df_bug_full.OPERATION.value_counts()

In [None]:
# Stores the OPERATION, RELEASE and MODULE columns as categoricals
for column in ("OPERATION", "RELEASE", "MODULE"):
    df_bug_full[column] = df_bug_full[column].astype('category')
df_bug_full.info()

Checks the different types of values allowed for some of these operations:

In [None]:
df_bug_full.loc[ (df_bug_full.OPERATION=='resolution'), "VALUE" ].value_counts()

In [None]:
df_bug_full.loc[ (df_bug_full.OPERATION=='bug_status'), "VALUE" ].value_counts()

In [None]:
df_bug_full.RELEASE.value_counts()

In [None]:
df_bug_full.loc[ (df_bug_full.OPERATION=='version'), "VALUE" ].value_counts()

In [None]:
df_bug_full.MODULE.value_counts()

Summary of the most relevant operations:

- `assigned_to`: (Re)assignment of the bug to a **person**.
- `version`: Explicit assignment to OSM Release: `master`, `v9.0`, etc.
- `component`: (Re)assignment of the bug to an **OSM module**.
- `bug_status`: Change of bug state. Possible values:
  - `RESOLVED`
  - `CONFIRMED`
  - `IN_PROGRESS`
  - `VERIFIED`
  - `UNCONFIRMED`
- `resolution`: Change of resolution state of the bug (related to `bug_status` when `RESOLVED`). Possible values:
  - `FIXED`
  - `WONTFIX`
  - `INVALID`
  - `DUPLICATE`
  - `WORKSFORME`
- `comment`: New comment to the bug. The first one is the event that **opens** the bug.
- `priority`: Change of bug priority.
- `bug_severity`: Change of bug severity.

Characterizes malformed types of `operations`, if any:

In [None]:
#i = 15
#print(types_of_operations[i])
#wrong1 = df_bug_full[ df_bug_full["OPERATION"]==types_of_operations[i] ]
#wrong1

In [None]:
#i = 20
#print(types_of_operations[i])
#wrong2 = df_bug_full[ df_bug_full["OPERATION"]==types_of_operations[i] ]
#wrong2

In [None]:
## Extracts the line numbers where there are errors
#wrong_lines = pd.concat([wrong1, wrong2, wrong3])
#wrong_lines_num = wrong_lines.index.to_list()
#print(wrong_lines_num)
#wrong_lines

In [None]:
## Saves the wrong lines
#wrong_lines.reset_index(inplace=True)
#wrong_lines = wrong_lines.rename(columns = {'index':'Row no. in original file'})
#wrong_lines.to_excel("wrong_lines.xlsx", encoding="utf-8", index=False)
#
#with open(os.path.join(inputs_folder, 'bugs.csv'), 'r', encoding='utf-8') as fin:
#    with open('original_wrong_lines.csv', 'w', encoding='utf-8') as fout:
#        for n, line in enumerate(fin):
#            if n in wrong_lines_num:
#                print(line.strip(), file=fout)

Creates a fixed CSV file from a manually created XLSX

In [None]:
#df = pd.read_excel('20210507 Importación de CSV de bugzilla.xlsx', sheet_name='fixed_wrong_lines')
#df.to_csv('fixed_wrong_lines.csv', encoding='utf-8', index=False, header=None)

Processes a fixed CSV file (escaping the comma):

In [None]:
#bugzilla_csv = os.path.join(inputs_folder, 'bugs.csv')
#df_bug_full = pd.read_csv(bugzilla_csv, encoding='utf-8', names=header_list, escapechar='\\')

In [None]:
#types_of_operations = df_bug_full.OPERATION.unique().tolist()
#list(enumerate(types_of_operations))

### 1.1 Sorts by timestamp, then by bug id

In [None]:
df_bug_full.sort_values(by=['TIMESTAMP', 'BUG_ID'], inplace=True)
df_bug_full.head()

Resets the index and adds the former index as a new column (`ROW_NUMBER`):

In [None]:
# The index label each row had before the reset is kept as ROW_NUMBER;
# the new index simply numbers the rows in their current (sorted) order.
df_bug_full = df_bug_full.reset_index().rename(columns={'index': 'ROW_NUMBER'})
df_bug_full.head()

### 1.2 Reorders the columns to make them more convenient

In [None]:
# Event data first, bug-level attributes next, bookkeeping columns last
new_column_order = [
    'TIMESTAMP', 'BUG_ID', 'OPERATION', 'VALUE',
    'RELEASE', 'MODULE', 'BUG_DESCRIPTION', 'ROW_NUMBER',
]
df_bug_full = df_bug_full.reindex(new_column_order, axis='columns')
df_bug_full.head()

### 1.3 Tags events by age

To the list of events related to status changes, we add a new column with a qualitative tag about the age of the event:

In [None]:
# Tags every event with a qualitative age, relative to the two cut-off dates
# defined in the input parameters.
df_bug_full["AGE"] = "CURRENT"  # By default, they should be relevant
df_bug_full.loc[ df_bug_full['TIMESTAMP']<date_for_bug_deprecation, "AGE" ] = "OLD"
df_bug_full.loc[ df_bug_full['TIMESTAMP']>date_for_very_recent_bug, "AGE" ] = "RECENT"

# Ordered categorical: saves RAM and gives the tags a meaningful order.
# The ordering is now assigned back; the previous `reorder_categories(...)` call
# returned a new Series that was thrown away, so it never took effect.
# `set_categories` is used because it also works when one of the three tags has no events.
df_bug_full["AGE"] = (
    df_bug_full["AGE"]
    .astype('category')
    .cat.set_categories(['RECENT', 'CURRENT', 'OLD'], ordered=True)
)
df_bug_full.tail()

## 2. Analysis of the lifecycle of a bug

Example of the typical lifecycle of a bug:

In [None]:
df_bug_full[ df_bug_full.BUG_ID==1513 ]

Examples of bugs that changed release:

In [None]:
df_bug_full.loc[ (df_bug_full.OPERATION=='version'), ["BUG_ID", "VALUE"] ].tail()

In [None]:
# Looks for bugs whose `version` events disagree with the RELEASE column
is_version_event = df_bug_full.OPERATION == 'version'
df_op_version_changes = df_bug_full.loc[is_version_event, ["BUG_ID", "VALUE"]]
bug_ids_with_change_of_version = df_op_version_changes.BUG_ID.unique()

# First (non-null) RELEASE recorded for each of those bugs
rows_of_those_bugs = df_bug_full.loc[df_bug_full.BUG_ID.isin(bug_ids_with_change_of_version)]
df_initial_version = (
    rows_of_those_bugs
    .groupby(by='BUG_ID')
    .first()
    .reset_index()
    .loc[:, ['BUG_ID', 'RELEASE']]
)

# One row per `version` event, next to the RELEASE found above; keeps only the mismatches
df_bugs_with_version_changes = pd.merge(df_op_version_changes, df_initial_version, on='BUG_ID')
release_differs = df_bugs_with_version_changes.VALUE != df_bugs_with_version_changes.RELEASE
df_bugs_with_version_changes[release_differs]

In [None]:
# Example of lifecycle
df_bug_full[ df_bug_full.BUG_ID==510 ]

Interpretation:

- `RELEASE` reflects the **latest** state **even from the first row**.
- `VALUE` in a `version` operation reflects **the new state at that time**. It may not be the latest state.

### 2.1 Comments per bug

In [None]:
df_comments_by_bug = df_bug_full[ df_bug_full.OPERATION=='comment' ]
df_comments_by_bug.head()

Obtains the first comment of each bug, i.e. the event by which it was opened:

In [None]:
# For every bug, picks the comment row with the lowest index label:
#
# 1. Exposes the index as a regular column named 'index'
# 2. Groups by BUG_ID and takes the minimum 'index' of each group
# 3. Uses the resulting list of labels to select those rows
comment_rows = df_comments_by_bug.reset_index()
bug_openings_index = comment_rows.groupby('BUG_ID')['index'].min().tolist()
df_bug_openings = df_comments_by_bug.loc[bug_openings_index]
df_bug_openings

### 2.2 Bug status

#### 2.2.1 Changes of status of each bug

In [None]:
df_status_changes_by_bug = df_bug_full[ df_bug_full.OPERATION=='bug_status' ]
df_status_changes_by_bug.head(20)

Since there is no explicit "bug creation" event, we need to add it using the first comment received.

First, we isolate the line of the first comment and reformat it to become a sort of change of bug status:

In [None]:
# Recasts each opening comment as a synthetic status-change event:
# the original VALUE is moved to ISSUER and the status becomes "OPEN-UNCONFIRMED".
df_bug_openings_events = df_bug_openings.assign(
    OPERATION="bug_status",
    ISSUER=df_bug_openings["VALUE"],
    VALUE="OPEN-UNCONFIRMED",
)
df_bug_openings_events


Then, we concatenate it with the rest of bug status changes:

In [None]:
# Status-change events plus the synthetic opening events, in index order.
# The status events are taken again from `df_bug_full` rather than from the previous
# value of `df_status_changes_by_bug`, so re-running this cell does not append the
# opening events a second time.
df_status_changes_by_bug = pd.concat([
    df_bug_full[ df_bug_full.OPERATION=='bug_status' ],
    df_bug_openings_events,
]).sort_index()
df_status_changes_by_bug

In [None]:
df_status_changes_by_bug.VALUE.value_counts()

Now we can make the field `VALUE` of type category and even set the expected ordering for that category:

In [None]:
# Expected progression of a bug, from least to most mature
expected_status_order = ['OPEN-UNCONFIRMED', 'UNCONFIRMED', 'CONFIRMED', 'IN_PROGRESS', 'VERIFIED', 'RESOLVED']

# Any status outside the expected list is kept (appended after the known ones)
# instead of being silently turned into NaN.
observed_statuses = df_status_changes_by_bug['VALUE'].dropna().unique()
unexpected_statuses = [status for status in observed_statuses if status not in expected_status_order]

# The ordered categorical is assigned back to the column; the previous
# `reorder_categories(...)` call discarded its result, so the order was never applied.
df_status_changes_by_bug['VALUE'] = pd.Categorical(
    df_status_changes_by_bug['VALUE'],
    categories=expected_status_order + unexpected_statuses,
    ordered=True,
)
df_status_changes_by_bug.head()

In [None]:
# Example
df_status_changes_by_bug[ df_status_changes_by_bug.BUG_ID==1513 ]

#### 2.2.2 Tags each bug event by its age

In [None]:
df_status_changes_by_bug["AGE"].value_counts()

#### 2.2.3 Finds the current state of each bug

For each bug, we find the latest known change of state, which should be the current state of the bug:

In [None]:
# Highest index label among the status events of each bug
status_rows = df_status_changes_by_bug.reset_index()
current_bug_state_index = status_rows.groupby('BUG_ID')['index'].max().tolist()

df_current_bug_state = df_status_changes_by_bug.loc[current_bug_state_index].sort_index()
df_current_bug_state

In [None]:
df_current_bug_state.VALUE.value_counts()

Finds the list of unresolved bugs:

In [None]:
# A bug counts as closed when its current status is RESOLVED or VERIFIED
is_closed = df_current_bug_state['VALUE'].isin(['RESOLVED', 'VERIFIED'])
df_unresolved_bugs = df_current_bug_state[~is_closed]
df_unresolved_bugs

Too old bugs that remain open (candidates for administrative closing):

In [None]:
df_deprecated_bugs = df_unresolved_bugs[ df_unresolved_bugs['AGE']=='OLD' ]
df_deprecated_bugs.tail()

In [None]:
deprecated_bugs_id = df_deprecated_bugs.BUG_ID.sort_values().to_list()
print(deprecated_bugs_id)

In [None]:
df_deprecated_bugs.VALUE.value_counts()

#### 2.2.4 Current bug status vs. age

In [None]:
pd.crosstab(df_current_bug_state.VALUE, df_current_bug_state.AGE, margins=True, margins_name='Total')

With a more convenient rearrangement for rows and columns of the crosstab, and addition of "totals" per row and column:

In [None]:
# Ordered crosstab (absolute figures)
by_maturity = ['OPEN-UNCONFIRMED', 'UNCONFIRMED', 'CONFIRMED', 'IN_PROGRESS', 'VERIFIED', 'RESOLVED', 'Total']
by_age = ['RECENT', 'CURRENT', 'OLD', 'Total']
df_status_vs_age = pd.crosstab(df_current_bug_state.VALUE, df_current_bug_state.AGE, margins=True, margins_name='Total')
df_status_vs_age.index.rename('STATUS', inplace=True)

# `reindex` rather than `.loc[...]`: a status or age with no bugs would be missing
# from the crosstab and `.loc` raises KeyError for missing labels; here it shows as 0.
df_status_vs_age.reindex(index=by_maturity, columns=by_age, fill_value=0)

In [None]:
# Ordered crosstab (fractions of the grand total, since `normalize=True`)
by_maturity = ['OPEN-UNCONFIRMED', 'UNCONFIRMED', 'CONFIRMED', 'IN_PROGRESS', 'VERIFIED', 'RESOLVED', 'Total']
by_age = ['RECENT', 'CURRENT', 'OLD', 'Total']
df_status_vs_age_pct = pd.crosstab(df_current_bug_state.VALUE, df_current_bug_state.AGE, margins=True, margins_name='Total', normalize=True)
df_status_vs_age_pct.index.rename('STATUS', inplace=True)

# `reindex` rather than `.loc[...]`: a status or age with no bugs would be missing
# from the crosstab and `.loc` raises KeyError for missing labels; here it shows as 0.0.
df_status_vs_age_pct.reindex(index=by_maturity, columns=by_age, fill_value=0.0)

### 2.3 OSM Release (`version`) referred in the bug

Sequence of events where a release was assigned to a bug:

In [None]:
df_release_events_by_bug = df_bug_full[ df_bug_full.OPERATION=='version' ]
df_release_events_by_bug

Finds which bugs have been reassigned to more than one release during their lifetime:

In [None]:
# Number of `version` events recorded for each bug
df_releases_per_bug = df_release_events_by_bug.loc[:, ['BUG_ID', 'VALUE']]
n_changes_of_release_per_bug = df_releases_per_bug.groupby('BUG_ID').count()

# Bugs with more than one such event
has_several = n_changes_of_release_per_bug.VALUE > 1
bugs_reassigned_to_several_releases = n_changes_of_release_per_bug.index[has_several].tolist()

# All the `version` events of those bugs
df_release_events_by_bug[ df_release_events_by_bug.BUG_ID.isin(bugs_reassigned_to_several_releases) ]

In [None]:
len(bugs_reassigned_to_several_releases)

Determines which bugs experienced at least one explicit event of change of release:

In [None]:
# Bugs assigned to a release
bugs_with_release = df_release_events_by_bug.BUG_ID.unique()
bugs_with_release

_**FIXME:**_ This is no longer true. The release should be taken into account.

Bugs which do not have a release explicitly associated to them (**we should assume they refer to the latest known stable release**):

In [None]:
# # All bug IDs
# all_bug_ids = df_bug_full.BUG_ID.unique()
# all_bug_ids

In [None]:
# # Bug not assigned to any release
# bugs_without_release = np.delete(all_bug_ids, bugs_with_release)
# bugs_without_release

List of bugs with no specific release assigned:

In [None]:
# df_bugs_without_release = df_bug_full[ df_bug_full.BUG_ID.isin(bugs_without_release) ]
# df_bugs_without_release.tail()

In [None]:
# df_bug_full[ df_bug_full.BUG_ID==1522 ]

Determines the last release to which the bug has been explicitly assigned:

In [None]:
# current_bug_release_index = df_release_events_by_bug.reset_index()[['BUG_ID', 'index']].groupby('BUG_ID').max()['index'].tolist()
# df_current_bug_release = df_release_events_by_bug.loc[current_bug_release_index].sort_index()
# df_current_bug_release.tail(20)

### 2.4 MDG (`component`) associated to each bug

In [None]:
# df_mdg_events_by_bug = df_bug_full[ df_bug_full.OPERATION=='component' ]
# df_mdg_events_by_bug

In [None]:
# df_mdg_events_by_bug.VALUE.value_counts()

Determines the last MDG to which the bug has been explicitly assigned:

In [None]:
# current_bug_mdg_index = df_mdg_events_by_bug.reset_index()[['BUG_ID', 'index']].groupby('BUG_ID').max()['index'].tolist()
# df_current_bug_mdg = df_mdg_events_by_bug.loc[current_bug_mdg_index].sort_index()
# df_current_bug_mdg.tail(20)

### 2.5 Bug owner (`assigned_to`)

In [None]:
df_owner_events_by_bug = df_bug_full[ df_bug_full.OPERATION=='assigned_to' ]
df_owner_events_by_bug

In [None]:
df_owner_events_by_bug.VALUE.value_counts()

Determines the last owner to which the bug has been explicitly assigned:

In [None]:
# Highest index label among the `assigned_to` events of each bug
owner_rows = df_owner_events_by_bug.reset_index()
current_owner_bug_index = owner_rows.groupby('BUG_ID')['index'].max().tolist()

df_current_owner_bug = df_owner_events_by_bug.loc[current_owner_bug_index].sort_index()
df_current_owner_bug.tail(20)

## 3. Summary table per bug

The summary table (`df_bug_summary`) should collect, at least:

- Basic bug details:
  - Bug id
  - Bug description
  - Issuer (reporter of the bug)
- Latest states of the bug:
  - Currently assigned status
  - Currently assigned Release
  - Currently assigned MDG
  - Currently assigned owner
- Relevant timestamps:
  - Date of creation.
  - Date of latest change of state.
  - Date of latest event.
- Age tags:
  - Date of creation
  - Date of last event.
- Other relevant summary statistics, such as:
  - No. Release reassignments.
  - No. State reassignments.
  - No. State reassignments to `RESOLVED`.

### 3.1 Extracts data from the list of events to build the main series of the summary

Auxiliary functions to easily find the first and last occurrences of a certain type of event per bug, or to compute basic counts:

In [None]:
def first_event_by_bug(df_filtered):
    """Returns, for each BUG_ID, the row of `df_filtered` with the lowest index label.

    The frame must have a `BUG_ID` column and an unnamed index (so that
    `reset_index()` exposes it as a column called 'index'). The result is
    sorted by index.
    """
    rows = df_filtered.reset_index()
    lowest_label_per_bug = rows.groupby('BUG_ID')['index'].min()
    selected = df_filtered.loc[lowest_label_per_bug.tolist()]
    return selected.sort_index()

def last_event_by_bug(df_filtered):
    """Returns, for each BUG_ID, the row of `df_filtered` with the highest index label.

    The frame must have a `BUG_ID` column and an unnamed index (so that
    `reset_index()` exposes it as a column called 'index'). The result is
    sorted by index.
    """
    rows = df_filtered.reset_index()
    highest_label_per_bug = rows.groupby('BUG_ID')['index'].max()
    selected = df_filtered.loc[highest_label_per_bug.tolist()]
    return selected.sort_index()

def count_event_by_bug(df_filtered):
    """Counts the rows of `df_filtered` per BUG_ID.

    Returns a DataFrame indexed by `BUG_ID` with a single column `COUNT`.
    The frame must have a `BUG_ID` column and an unnamed index.
    """
    rows = df_filtered.reset_index()
    rows_per_bug = rows.groupby('BUG_ID')[['index']].count()
    return rows_per_bug.rename(columns={'index': 'COUNT'})

# Some handy shortcuts

def first_event_by_bug_and_type(df, event_type):
    """Shortcut: `first_event_by_bug` applied to the rows whose OPERATION equals `event_type`."""
    matching_events = df.loc[df['OPERATION'] == event_type]
    return first_event_by_bug(matching_events)

def last_event_by_bug_and_type(df, event_type):
    """Shortcut: `last_event_by_bug` applied to the rows whose OPERATION equals `event_type`."""
    matching_events = df.loc[df['OPERATION'] == event_type]
    return last_event_by_bug(matching_events)

def count_event_by_bug_and_type(df, event_type):
    """Shortcut: `count_event_by_bug` applied to the rows whose OPERATION equals `event_type`."""
    matching_events = df.loc[df['OPERATION'] == event_type]
    return count_event_by_bug(matching_events)

In [None]:
# Testing:
last_event_by_bug_and_type(df_status_changes_by_bug, 'bug_status').VALUE.value_counts()

In [None]:
# Starts the summary from the first status event of each bug:
# basic bug details plus the time and age tag of that event
df_bug_summary = (
    first_event_by_bug(df_status_changes_by_bug)
    .set_index('BUG_ID')
    .drop(columns=['OPERATION', 'VALUE', 'ROW_NUMBER'])
    .reindex(columns=['BUG_DESCRIPTION', 'ISSUER', 'TIMESTAMP', 'AGE'])
    .rename(columns={'TIMESTAMP': 'CREATION_TIME', 'AGE': 'CREATION_AGE'})
)
df_bug_summary

In [None]:
# Last status event of each bug: the state itself, when it was set and its age tag
last_status = (
    last_event_by_bug(df_status_changes_by_bug)
    .set_index('BUG_ID')
    .drop(columns=['OPERATION', 'BUG_DESCRIPTION', 'ROW_NUMBER', 'ISSUER'])
    .rename(columns={'TIMESTAMP': 'STATE_UPDATE_TIME', 'VALUE': 'STATE', 'AGE': 'STATE_UPDATE_AGE'})
    .reindex(columns=['STATE', 'STATE_UPDATE_TIME', 'STATE_UPDATE_AGE'])
)
df_bug_summary = df_bug_summary.merge(last_status, left_index=True, right_index=True, how='left')
df_bug_summary

In [None]:
# Number of status events per bug (the synthetic opening event included)
n_changes_of_state = (
    count_event_by_bug_and_type(df_status_changes_by_bug, 'bug_status')
    .rename(columns={'COUNT': 'STATE_CHANGES'})
)
df_bug_summary = df_bug_summary.merge(n_changes_of_state, left_index=True, right_index=True, how='left')
df_bug_summary

In [None]:
# Counts how often each bug was moved to `RESOLVED` (more than once hints at an ineffective resolution)
df = df_status_changes_by_bug

# Only strict changes to `RESOLVED` are counted (`VERIFIED` would be redundant)
resolved_events = df.loc[(df.OPERATION == 'bug_status') & (df.VALUE == 'RESOLVED')]
n_changes_to_resolved = count_event_by_bug(resolved_events).rename(columns={'COUNT': 'CHANGES_TO_RESOLVED'})

df_bug_summary = df_bug_summary.merge(n_changes_to_resolved, left_index=True, right_index=True, how='left')
df_bug_summary

In [None]:
# Bugs never moved to `RESOLVED` have no count after the left merge: use 0 and store as integer
df_bug_summary['CHANGES_TO_RESOLVED'] = (
    df_bug_summary['CHANGES_TO_RESOLVED']
    .fillna(0)
    .astype('int64')
)
df_bug_summary['CHANGES_TO_RESOLVED']

In [None]:
# Last non-null RELEASE and MODULE recorded for each bug (outer merge on BUG_ID)
latest_release_and_module = df_bug_full.groupby('BUG_ID')[['RELEASE', 'MODULE']].last()
df_bug_summary = df_bug_summary.merge(
    latest_release_and_module,
    how='outer',
    left_index=True,
    right_index=True,
)
df_bug_summary

In [None]:
# # Obtains last known Release of the bug
# last_release = last_event_by_bug_and_type(df_bug_full, 'version').set_index('BUG_ID')
# last_release.drop(columns=['TIMESTAMP', 'OPERATION', 'BUG_DESCRIPTION', 'ROW_NUMBER', 'AGE'], inplace=True)
# last_release.rename(columns={'VALUE': 'RELEASE'}, inplace=True)
# df_bug_summmary = pd.merge(df_bug_summmary, last_release, left_index=True, right_index=True, how='left')
# df_bug_summmary

In [None]:
# Number of `version` events per bug (useful to detect situations that persist or revive across releases)
n_release_assignment = (
    count_event_by_bug_and_type(df_bug_full, 'version')
    .rename(columns={'COUNT': 'RELEASE_CHANGES'})
)
df_bug_summary = df_bug_summary.merge(n_release_assignment, left_index=True, right_index=True, how='left')
df_bug_summary

In [None]:
# Bugs without any `version` event have no count after the left merge: use 0 and store as integer.
# The cast is applied to RELEASE_CHANGES itself; it previously read from
# CHANGES_TO_RESOLVED (copy-paste slip), which overwrote this column with the wrong counts.
df_bug_summary['RELEASE_CHANGES'] = df_bug_summary['RELEASE_CHANGES'].fillna(0).astype('int64')
df_bug_summary['RELEASE_CHANGES']

In [None]:
# # Obtains last assigned MDG
# last_mdg = last_event_by_bug_and_type(df_bug_full, 'component').set_index('BUG_ID')
# last_mdg.drop(columns=['TIMESTAMP', 'OPERATION', 'BUG_DESCRIPTION', 'ROW_NUMBER', 'AGE'], inplace=True)
# last_mdg.rename(columns={'VALUE': 'MODULE'}, inplace=True)
# df_bug_summmary = pd.merge(df_bug_summmary, last_mdg, left_index=True, right_index=True, how='left')
# df_bug_summmary

In [None]:
# bugs_ids_with_missing_data = df_bug_summmary.loc[df_bug_summmary.MODULE.isna() | df_bug_summmary.RELEASE.isna()].index.values

# # bugs_with_missing_data = df_bug_full.loc[df_bug_full.BUG_ID.isin(bugs_ids_with_missing_data.index.values), ['BUG_ID', 'BUG_DESCRIPTION', 'ROW_NUMBER', 'TIMESTAMP']].set_index('ROW_NUMBER')
# bugs_with_missing_data = df_bug_full.loc[df_bug_full.BUG_ID.isin(bugs_ids_with_missing_data), ['BUG_ID', 'BUG_DESCRIPTION']].groupby('BUG_ID').first()
# bugs_with_missing_data

In [None]:
# # Saves bugs with missing data
# bugs_with_missing_data.to_excel(os.path.join(outputs_folder, 'bugs_with_missing_events.xlsx'))

---

In [None]:
# # Replaces NA in MDG assignment to "Not assigned"
# df_bug_summmary['MODULE'] = df_bug_summmary['MODULE'].fillna('Not assigned')
# df_bug_summmary['MODULE']

In [None]:
df_bug_summary.MODULE.value_counts(dropna=False)

In [None]:
# Number of `component` events per bug (useful to detect "hot potato" situations)
n_mdg_assignment = (
    count_event_by_bug_and_type(df_bug_full, 'component')
    .rename(columns={'COUNT': 'MODULE_CHANGES'})
)
df_bug_summary = df_bug_summary.merge(n_mdg_assignment, left_index=True, right_index=True, how='left')
df_bug_summary

In [None]:
# Bugs without any `component` event have no count after the left merge: use 0 and store as integer.
# The cast is applied to MODULE_CHANGES itself; it previously read from
# CHANGES_TO_RESOLVED (copy-paste slip), which overwrote this column with the wrong counts.
df_bug_summary['MODULE_CHANGES'] = df_bug_summary['MODULE_CHANGES'].fillna(0).astype('int64')
df_bug_summary['MODULE_CHANGES']

In [None]:
# Last `assigned_to` event of each bug: who owns it and when the assignment happened
last_owner = (
    last_event_by_bug_and_type(df_bug_full, 'assigned_to')
    .set_index('BUG_ID')
    .drop(columns=['OPERATION', 'BUG_DESCRIPTION', 'ROW_NUMBER', 'AGE', 'RELEASE', 'MODULE'])
    .rename(columns={'VALUE': 'OWNER', 'TIMESTAMP': 'LAST_ASSIGMENT_TIME'})
)
df_bug_summary = df_bug_summary.merge(last_owner, left_index=True, right_index=True, how='left')
df_bug_summary

In [None]:
# Number of `assigned_to` events per bug (useful to detect "hot potato" situations)
n_owner_assignment = (
    count_event_by_bug_and_type(df_bug_full, 'assigned_to')
    .rename(columns={'COUNT': 'OWNER_CHANGES'})
)
df_bug_summary = df_bug_summary.merge(n_owner_assignment, left_index=True, right_index=True, how='left')
df_bug_summary

In [None]:
# Bugs without any `assigned_to` event have no count after the left merge: use 0 and store as integer.
# The cast is applied to OWNER_CHANGES itself; it previously read from
# CHANGES_TO_RESOLVED (copy-paste slip), which overwrote this column with the wrong counts.
df_bug_summary['OWNER_CHANGES'] = df_bug_summary['OWNER_CHANGES'].fillna(0).astype('int64')
df_bug_summary['OWNER_CHANGES']

In [None]:
# Most recent event of any type for each bug: its operation, time and age tag
last_event = (
    last_event_by_bug(df_bug_full)
    .set_index('BUG_ID')
    .drop(columns=['BUG_DESCRIPTION', 'VALUE', 'ROW_NUMBER'])
    .rename(columns={'OPERATION': 'LAST_EVENT', 'TIMESTAMP': 'LAST_EVENT_TIME', 'AGE': 'LAST_EVENT_AGE'})
    .reindex(columns=['LAST_EVENT', 'LAST_EVENT_TIME', 'LAST_EVENT_AGE'])
)
df_bug_summary = df_bug_summary.merge(last_event, left_index=True, right_index=True, how='left')
df_bug_summary

In [None]:
df_bug_summary.info()

In [None]:
df_bug_summary.STATE.value_counts()

### 3.2 Addition of series based on calculations

In [None]:
# Boolean flag: True when the current state is RESOLVED or VERIFIED, False otherwise
df_bug_summary["SOLVED"] = df_bug_summary.STATE.isin(["RESOLVED", "VERIFIED"])
df_bug_summary

In [None]:
# Resolution time = time of the last state update minus creation time (solved bugs only)
solved = df_bug_summary.SOLVED
df_bug_summary.loc[solved, "BUG_RESOLUTION_TIME"] = (
    df_bug_summary.loc[solved, "STATE_UPDATE_TIME"]
    - df_bug_summary.loc[solved, "CREATION_TIME"]
)
df_bug_summary.loc[solved, "BUG_RESOLUTION_TIME"]

In [None]:
df_bug_summary

## 4. Reports and graphical representations

### 4.1 Analysis of state of bugs, per module

Prepares a dataframe slice, better suited for representation:

In [None]:
# Non-deprecated bugs only, with just the columns needed for the charts.
# `.copy()` makes this an independent frame, so the column edits below do not
# operate on a slice of `df_bug_summary`.
bug_solved_or_open = df_bug_summary.loc[df_bug_summary.CREATION_AGE!="OLD", ["SOLVED", "CREATION_AGE", "RELEASE", "MODULE"]].copy()

# Replaces N/A by "Unknown" in "MODULE" column.
# Cast to object first: `fillna` on a categorical rejects a label that is not one of its categories.
bug_solved_or_open['MODULE'] = bug_solved_or_open.MODULE.astype(object).fillna("Unknown")

# Replaces N/A by "Unknown" in "RELEASE" column.
# The fill must happen BEFORE the cast to str: casting first turns missing values into
# the literal text 'nan', which leaves nothing for `fillna` to replace.
bug_solved_or_open['RELEASE'] = (
    bug_solved_or_open.RELEASE
    .astype(object)
    .fillna("Unknown")
    .astype('str')
    .astype('category')
)

# Renames the columns to presentation-friendly names
bug_solved_or_open.rename(columns={'CREATION_AGE': 'Age', 'RELEASE': 'OSM Release', 'MODULE': 'Module', 'SOLVED': 'Solved'}, inplace=True)

bug_solved_or_open

In [None]:
bug_solved_or_open.info()

In [None]:
# Number of open and solved bugs per module
bugs_per_module = (
    pd.crosstab(bug_solved_or_open.Module, bug_solved_or_open.Solved)
    .rename(columns={True: 'Solved', False: 'Open'})
    .rename_axis(columns=None)
)
bugs_per_module

In [None]:
# Number of bugs per module, split by solved/open and by age tag
bugs_per_module_detailed = pd.crosstab(
    bug_solved_or_open.Module,
    [bug_solved_or_open.Solved, bug_solved_or_open.Age],
)

# Flattens the two-level columns into "<Solved>_<Age>" names, then gives them readable labels
bugs_per_module_detailed.columns = [
    f"{solved}_{age}" for solved, age in bugs_per_module_detailed.columns
]
readable_names = {
    'False_CURRENT': 'Open',
    'False_RECENT': 'Open last cycle',
    'True_CURRENT': 'Solved',
    'True_RECENT': 'Solved last cycle',
}
bugs_per_module_detailed.rename(columns=readable_names, inplace=True)
bugs_per_module_detailed

In [None]:
# Order of the modules on the y axis: as they first appear, with 'Unknown' moved to the end
cat_order = bug_solved_or_open.Module.unique()
cat_order = np.delete(cat_order, np.where(cat_order=='Unknown'))  # drops 'Unknown' if present
cat_order = np.append(cat_order, 'Unknown')                       # always appended, even when no bug is 'Unknown'

# One pair of bars per module: solved vs. open (non-deprecated bugs only)
ax = sns.countplot(data=bug_solved_or_open,
                   y="Module",
                   hue="Solved",
                   order=cat_order)
#plt.xticks(rotation=45)
plt.show()

In [None]:
# Stacked horizontal bars: solved and open bugs per module
labels = bugs_per_module.reset_index().Module

solved_bugs = bugs_per_module.Solved
open_bugs = bugs_per_module.Open
width = 0.35       # thickness of each bar (passed as `height` to `barh`)

fig, ax = plt.subplots()

ax.barh(labels, solved_bugs, width, label='Solved')
ax.barh(labels, open_bugs, width, left=solved_bugs, label='Open')

# With horizontal bars the counts run along the x axis and the modules along the y axis
ax.set_xlabel('Number of bugs')
ax.set_ylabel('Module')
ax.set_title('Status of non-deprecated bugs per module')
ax.legend()

# Makes sure the output folder exists before saving
os.makedirs(outputs_folder, exist_ok=True)
filename = os.path.join(outputs_folder, 'open_bugs_per_module')
fig.savefig(filename + '.png', dpi=300)
fig.savefig(filename + '.svg')

plt.show()

In [None]:
# 'Open', 'Open last cycle', 'Solved', 'Solved last cycle'}
#bugs_per_module_detailed

In [None]:
# labels = bugs_per_module_detailed.reset_index().Module

# solved_bugs = bugs_per_module_detailed.Solved
# solved_bugs_cycle = bugs_per_module_detailed['Solved last cycle']
# open_bugs = bugs_per_module_detailed.Open
# open_bugs_cycle = bugs_per_module_detailed['Open last cycle']
# #width = 0.35       # the width of the bars: can also be len(x) sequence

# fig, ax = plt.subplots()

# ax.barh(labels, solved_bugs, width, label='Solved', color='royalblue')
# ax.barh(labels, solved_bugs_cycle, width, left=solved_bugs, label='Solved last cycle', color='cornflowerblue')
# ax.barh(labels, open_bugs, width, left=solved_bugs+solved_bugs_cycle, label='Open', color='firebrick')
# ax.barh(labels, open_bugs_cycle, width, left=solved_bugs+solved_bugs_cycle+open_bugs, label='Open last cycle', color='lightcoral')

# ax.set_ylabel('Number of bugs')
# ax.set_title('Status of non-deprecated bugs per module')
# ax.legend()

# filename = os.path.join(outputs_folder, 'open_bugs_per_module_detailed')
# fig.savefig(filename + '.png', dpi=300)
# fig.savefig(filename + '.svg')

# plt.show()

In [None]:
# Stacked horizontal bars per module, split by status and by age; modules sorted by total open bugs
bugs_per_module_detailed['total_open'] = bugs_per_module_detailed.Open + bugs_per_module_detailed['Open last cycle']
bugs_per_module_detailed.sort_values(by='total_open', inplace=True)

labels = bugs_per_module_detailed.reset_index().Module

solved_bugs = bugs_per_module_detailed.Solved
solved_bugs_cycle = bugs_per_module_detailed['Solved last cycle']
open_bugs = bugs_per_module_detailed.Open
open_bugs_cycle = bugs_per_module_detailed['Open last cycle']
# Defined here so the cell does not depend on `width` leaking from an earlier plotting cell
width = 0.35       # thickness of each bar (passed as `height` to `barh`)

fig, ax = plt.subplots(figsize = (12,16))

ax.barh(labels, solved_bugs, width, label='Solved', color='royalblue')
ax.barh(labels, solved_bugs_cycle, width, left=solved_bugs, label='Solved last cycle', color='cornflowerblue')
ax.barh(labels, open_bugs, width, left=solved_bugs+solved_bugs_cycle, label='Open', color='firebrick')
ax.barh(labels, open_bugs_cycle, width, left=solved_bugs+solved_bugs_cycle+open_bugs, label='Open last cycle', color='lightcoral')

# With horizontal bars the counts run along the x axis and the modules along the y axis
ax.set_xlabel('Number of bugs')
ax.set_ylabel('Module')
ax.set_title('Status of non-deprecated bugs per module', fontsize=20)
ax.legend()

# Makes sure the output folder exists before saving
os.makedirs(outputs_folder, exist_ok=True)
filename = os.path.join(outputs_folder, 'open_bugs_per_module_detailed')
fig.savefig(filename + '.png', dpi=300)
fig.savefig(filename + '.svg')

plt.show()

### 4.2 Analysis of bug resolution times

Prepares a dataframe slice, better suited for representation:

In [None]:
# Solved bugs only, with just the columns needed for the charts.
# `.copy()` makes this an independent frame, so the column edits below do not
# operate on a slice of `df_bug_summary`.
bug_resolution_times = df_bug_summary.loc[df_bug_summary.SOLVED, ["CREATION_AGE", "RELEASE", "MODULE"]].copy()

# Converts the timediff to whole days
bug_resolution_times["Bug resolution time (days)"] = df_bug_summary.loc[df_bug_summary.SOLVED, "BUG_RESOLUTION_TIME"].dt.days

# Replaces N/A by "Unknown" in "MODULE" column.
# Cast to object first: `fillna` on a categorical rejects a label that is not one of its categories.
bug_resolution_times['MODULE'] = bug_resolution_times.MODULE.astype(object).fillna("Unknown")

# Replaces N/A by "Unknown" in "RELEASE" column.
# The fill must happen BEFORE the cast to str: casting first turns missing values into
# the literal text 'nan', which leaves nothing for `fillna` to replace.
bug_resolution_times['RELEASE'] = (
    bug_resolution_times.RELEASE
    .astype(object)
    .fillna("Unknown")
    .astype('str')
    .astype('category')
)

# Renames the columns to presentation-friendly names
bug_resolution_times.rename(columns={'CREATION_AGE': 'Age', 'RELEASE': 'OSM Release', 'MODULE': 'Module'}, inplace=True)
bug_resolution_times

In [None]:
bug_resolution_times.describe()

In [None]:
# Histogram (all)
g = sns.displot(data=bug_resolution_times, x="Bug resolution time (days)")
g.ax.set_title("Distribution of time to solve a bug")
g.ax.set_ylabel('No. resolved bugs')
plt.show()

#### 4.2.1 Time to resolve a bug, by age

In [None]:
# Statistics open by age
group = bug_resolution_times.groupby('Age')
group.agg('describe')

In [None]:
# Violin plot of resolution time for each age class.
# cut=0 keeps each violin within the range of the observed data.
ax = sns.violinplot(
    x="Age",
    y="Bug resolution time (days)",
    data=bug_resolution_times,
    order=['OLD', 'CURRENT', 'RECENT'],
    cut=0,
)
ax.set(title="Time for bug resolution, per age")
plt.show()

In [None]:
# Box plot of resolution time for each age class
ax = sns.boxplot(
    x="Age",
    y="Bug resolution time (days)",
    data=bug_resolution_times,
    order=['OLD', 'CURRENT', 'RECENT'],
)
ax.set(title="Time for bug resolution, per age")
plt.show()

---

**IMPORTANT:** From here, we will **remove the OLD samples**, since they have numerous and huge outliers which skew the analysis.

In [None]:
# Removes old samples.
# `.copy()` makes the filtered result an independent frame, so the assignment
# below does not act on a slice of the previous one.
bug_resolution_times = bug_resolution_times.loc[ bug_resolution_times['Age']!='OLD' ].copy()

# Rebuilds the categorical so that it only holds the ages still present.
# Plain column assignment replaces the column outright; `.loc[:, 'Age'] = ...`
# writes into the existing column and may therefore keep its previous dtype.
bug_resolution_times['Age'] = bug_resolution_times.Age.astype('string').astype('category')
bug_resolution_times.head()

In [None]:
# Summary statistics again, now that the 'OLD' samples have been removed
bug_resolution_times.describe()

In [None]:
# Density histogram with KDE overlay of the resolution time (OLD bugs excluded)
g = sns.displot(
    data=bug_resolution_times,
    x="Bug resolution time (days)",
    kde=True,
    stat='density',
    common_norm=False,
)
g.ax.set(title="Distribution of time to solve a bug (excluding OLD)")
plt.show()

In [None]:
# Density histogram with KDE overlay, one colour per age class.
# common_norm=False normalises each age class on its own.
g = sns.displot(
    data=bug_resolution_times,
    x="Bug resolution time (days)",
    hue='Age',
    kde=True,
    stat='density',
    common_norm=False,
)
g.ax.set(title="Distribution of time to solve a bug")
plt.show()

In [None]:
# Violin plot of resolution time for the remaining age classes (OLD excluded)
ax = sns.violinplot(
    x="Age",
    y="Bug resolution time (days)",
    data=bug_resolution_times,
    order=['CURRENT', 'RECENT'],
    cut=0,
)
ax.set(title="Time for bug resolution, per age")
plt.show()

In [None]:
# Box plot of resolution time for the remaining age classes (OLD excluded)
ax = sns.boxplot(
    x="Age",
    y="Bug resolution time (days)",
    data=bug_resolution_times,
    order=['CURRENT', 'RECENT'],
)
ax.set(title="Time for bug resolution, per age")
plt.show()

Here we find the outliers in the bugs of "Current" age to try to resolve them as soon as possible:

In [None]:
# Shortcut for the series of resolution times (in days) of the solved bugs of 'CURRENT' age
times = bug_resolution_times.loc[bug_resolution_times.Age=='CURRENT', "Bug resolution time (days)"]

# Stats, including main quantiles
times.describe()

In [None]:
# First and third quartiles of the 'CURRENT' resolution times
q025, q075 = times.quantile([0.25, 0.75]).to_numpy()

# Upper fence of the box plot: Q3 + 1.5 * IQR
iqr = q075 - q025
threshold_outliers = q075 + 1.5*iqr
threshold_outliers

In [None]:
# Solved bugs of 'CURRENT' age whose resolution time lies above the upper fence
bug_resolution_outliers = bug_resolution_times.loc[
    lambda d: (d['Age'] == 'CURRENT') & (d["Bug resolution time (days)"] > threshold_outliers)
]
bug_resolution_outliers

In [None]:
# Density histogram with KDE overlay, restricted to bugs of 'CURRENT' age
g = sns.displot(
    data=bug_resolution_times.loc[lambda d: d.Age == 'CURRENT'],
    x="Bug resolution time (days)",
    kde=True,
    stat='density',
    common_norm=False,
)
g.ax.set(title="Distribution of time to solve a bug (Age=CURRENT)")
plt.show()

In [None]:
# Density histogram with KDE overlay, restricted to bugs of 'RECENT' age
g = sns.displot(
    data=bug_resolution_times.loc[lambda d: d.Age == 'RECENT'],
    x="Bug resolution time (days)",
    kde=True,
    stat='density',
    common_norm=False,
)
g.ax.set(title="Distribution of time to solve a bug (Age=RECENT)")
plt.show()

#### 4.2.2 Time to resolve a bug, by module

In [None]:
# Descriptive statistics of the solved bugs, grouped by module
group = bug_resolution_times.groupby('Module')
group.agg('describe')

In [None]:
# Violin plot of times to resolve a bug (per module)

# Fixes the order of the categories to put 'Unknown' at the end.
# Note that 'Unknown' is appended unconditionally, even when no row carries it.
cat_order = bug_resolution_times.Module.unique()
cat_order = np.delete(cat_order, np.where(cat_order=='Unknown'))
cat_order = np.append(cat_order, 'Unknown')

# Horizontal layout (modules on the y axis); cut=0 keeps each violin within
# the range of the observed data.
ax = sns.violinplot(data=bug_resolution_times,
                   y="Module",
                   x="Bug resolution time (days)",
                   cut=0,
                   order=cat_order)
ax.set_title("Time for bug resolution, per module")

plt.show()

In [None]:
# # Boxplot of times to resolve a bug (per module, excluding OLD)

# # Fixes the order of the categories to put 'Unknown' at the end
# cat_order = bug_resolution_times.Module.unique()
# cat_order = np.delete(cat_order, np.where(cat_order=='Unknown'))
# cat_order = np.append(cat_order, 'Unknown')

# ax = sns.boxplot(data=bug_resolution_times,
#                  y="Module",
#                  x="Bug resolution time (days)",
#                  order=cat_order)
# ax.set_title("Time for bug resolution, per module")

# filename = os.path.join(outputs_folder, 'time_bug_resolution_per_module')
# fig.savefig(filename + '.png', dpi=300)
# fig.savefig(filename + '.svg')

# plt.show()

In [None]:
# Boxplot of times to resolve a bug (per module, excluding OLD)

# Orders the modules by decreasing median resolution time.
# The median is taken on the numeric column only: the frame also holds
# categorical columns, on which a frame-wide `.median()` fails in pandas >= 2.0.
cat_order = (
    bug_resolution_times
    .groupby('Module')[['Bug resolution time (days)']]
    .median()
    .dropna()
    .sort_values('Bug resolution time (days)', ascending=False)
    .reset_index()
    .Module
    .unique()
)

# Taller-than-default figure so that every module gets a readable row
fig, ax = plt.subplots(figsize = (12,16))

sns.boxplot(data=bug_resolution_times,
            y="Module",
            x="Bug resolution time (days)",
            order=cat_order,
            ax=ax)
ax.set_title("Time for bug resolution, per module", fontsize=20)

# Makes sure the destination folder exists before saving the figure
os.makedirs(outputs_folder, exist_ok=True)
filename = os.path.join(outputs_folder, 'time_bug_resolution_per_module')
fig.savefig(filename + '.png', dpi=300)
fig.savefig(filename + '.svg')

plt.show()

#### 4.2.3 Time of open bugs, by age

First, creates a new dataframe more suitable for this analysis and its visualisation:

In [None]:
# Selects the right data: unresolved bugs and not too old.
# `.copy()` makes it an independent frame, so the column assignments below
# never operate on (or warn about) a slice of `df_bug_summary`.
bug_open_times = df_bug_summary.loc[(~df_bug_summary.SOLVED) & (df_bug_summary.CREATION_AGE!="OLD"), ["CREATION_AGE", "RELEASE", "MODULE", "LAST_EVENT_TIME"]].copy()

# Removes "OLD" from the "CREATION_AGE" category
bug_open_times['CREATION_AGE'] = bug_open_times.CREATION_AGE.astype('string').astype('category')

# Calculates timediff between creation time and today, in days (for Seaborn to work smoothly).
# `time_open` covers every unsolved bug; the assignment aligns on the index,
# so only the rows present in `bug_open_times` are kept.
time_open = np.datetime64('now') - df_bug_summary.loc[~df_bug_summary.SOLVED, "CREATION_TIME"]
bug_open_times["Time bug open (days)"] = time_open.dt.days

# Replaces N/A by "Unknown" in "MODULE" column
bug_open_times.loc[:, 'MODULE'] = bug_open_times.MODULE.fillna("Unknown")

# Replaces N/A by "Unknown" in "RELEASE" column.
# The missing values are filled BEFORE the cast to string: `astype('str')`
# turns NaN into the literal text 'nan', which `fillna` would no longer detect.
bug_open_times['RELEASE'] = (
    bug_open_times.RELEASE
    .astype(object)
    .fillna("Unknown")
    .astype('str')
    .astype('category')
)

# Renames the columns to human-friendly labels (they end up as axis labels in the plots)
bug_open_times = bug_open_times.rename(columns={'CREATION_AGE': 'Age', 'RELEASE': 'OSM Release', 'MODULE': 'Module'})
bug_open_times

In [None]:
# Aggregated statistics of the bugs still open (OLD ones excluded)
bug_open_times.describe()

In [None]:
# Descriptive statistics of the bugs still open, grouped by age
group = bug_open_times.groupby('Age')
group.agg('describe')

#### 4.2.4 Time of open bugs, by module

In [None]:
# Violin plot of times of bugs still open (per module)

# Fixes the order of the categories to put 'Unknown' at the end.
# Note that 'Unknown' is appended unconditionally, even when no row carries it.
cat_order = bug_open_times.Module.unique()
cat_order = np.delete(cat_order, np.where(cat_order=='Unknown'))
cat_order = np.append(cat_order, 'Unknown')

# Horizontal layout (modules on the y axis); cut=0 keeps each violin within
# the range of the observed data.
ax = sns.violinplot(data=bug_open_times,
                   y="Module",
                   x="Time bug open (days)",
                   cut=0,
                   order=cat_order)
ax.set_title("Duration of bugs still open, per module")
plt.show()

In [None]:
# # Box plot of times of bugs still open (per module)

# # Fixes the order of the categories to put 'Unknown' at the end
# cat_order = bug_open_times.Module.unique()
# cat_order = np.delete(cat_order, np.where(cat_order=='Unknown'))
# cat_order = np.append(cat_order, 'Unknown')

# # ax = sns.boxplot(data=bug_open_times,
# #                  x="Module",
# #                  y="Time bug open (days)",
# #                  order=cat_order)
# ax = sns.boxplot(data=bug_open_times,
#                  y="Module",
#                  x="Time bug open (days)",
#                  order=cat_order)
# ax.set_title("Duration of bugs still open, per module")

# filename = os.path.join(outputs_folder, 'time_bugs_open_per_module')
# fig.savefig(filename + '.png', dpi=300)
# fig.savefig(filename + '.svg')

# plt.show()

In [None]:
# Box plot of times of bugs still open (per module)

# Orders the modules by decreasing median time open.
# The median is taken on the numeric column only: the frame also holds
# categorical and datetime columns, on which a frame-wide `.median()` is not
# reliable (it fails on categoricals in pandas >= 2.0).
cat_order = (
    bug_open_times
    .groupby('Module')[['Time bug open (days)']]
    .median()
    .dropna()
    .sort_values('Time bug open (days)', ascending=False)
    .reset_index()
    .Module
    .unique()
)

# Taller-than-default figure so that every module gets a readable row
fig, ax = plt.subplots(figsize = (12,16))

sns.boxplot(data=bug_open_times,
            y="Module",
            x="Time bug open (days)",
            ax=ax,
            order=cat_order)
ax.set_title("Duration of bugs still open, per module", fontsize=20)

# Makes sure the destination folder exists before saving the figure
os.makedirs(outputs_folder, exist_ok=True)
filename = os.path.join(outputs_folder, 'time_bugs_open_per_module')
fig.savefig(filename + '.png', dpi=300)
fig.savefig(filename + '.svg')

plt.show()

## 5. Bugs per module worth a review

### 5.1 Old bugs still open _(prior to Rel NINE cycle)_

**PROPOSAL:** MDL to confirm if the bug can be safely closed or it is still relevant for any special reason (or if it should be re-created referring to newer releases).

In [None]:
# Quick look at the first rows of the per-bug summary table used in this section
df_bug_summary.head()

In [None]:
# Column dtypes and non-null counts of the summary table
df_bug_summary.info()

In [None]:
# Names of the columns available in the summary table
df_bug_summary.columns

In [None]:
# Bugs of 'OLD' age whose state is neither RESOLVED nor VERIFIED, sorted by module and bug id.
# NOTE(review): other cells select open bugs with `~df_bug_summary.SOLVED` instead of
# the STATE filter used here; confirm both criteria are meant to be equivalent.
df_old_still_open = df_bug_summary.loc[ (df_bug_summary.CREATION_AGE=='OLD') & (~df_bug_summary.STATE.isin(['RESOLVED', 'VERIFIED'])) ].reset_index().sort_values(by=['MODULE', 'BUG_ID'])
df_old_still_open.head()

In [None]:
# Number of old bugs still open, per module
df_old_still_open.MODULE.value_counts()

### 5.2 Bugs open for too much time _(only Rel NINE cycle or later)_

Three types:

1. Outliers: Beyond Q3+1.5IQR for the MDG.
2. Beyond Q3
3. Beyond 2 months

![boxplot clarification](https://miro.medium.com/max/18000/1*2c21SkzJMf3frPXPAR_gZA.png)

MDL to confirm their status and if:

- The bug still exists (if not, it should be closed).
- If the bug is still valid, determine its severity and the complexity of addressing it.

In [None]:
# Column dtypes and non-null counts of the open-bugs table
bug_open_times.info()

In [None]:
# Descriptive statistics of the time open, per (module, release) group
quantiles_bug_open_times = (
    bug_open_times
    .groupby(['Module', 'OSM Release'])[['Time bug open (days)']]
    .agg('describe')
    .reset_index()
)

# Flattens the two-level column labels: keeps the statistic name when there
# is one, and falls back to the first level otherwise (the group keys)
quantiles_bug_open_times.columns = [
    stat if stat else key for key, stat in quantiles_bug_open_times.columns
]

# Inter-quartile range and upper fence (Q3 + 1.5 * IQR) of each group
quantiles_bug_open_times = quantiles_bug_open_times.assign(
    IQR=lambda d: d['75%'] - d['25%'],
    outliers_th=lambda d: d['75%'] + 1.5*d['IQR'],
)
quantiles_bug_open_times.tail()

In [None]:
# Attaches to every open bug the statistics/thresholds of its (module, release) group
bug_open_times_with_thresholds = pd.merge(bug_open_times.reset_index(), quantiles_bug_open_times, how='left', left_on=['Module', 'OSM Release'], right_on=['Module', 'OSM Release'])

# Brings back the full set of summary columns for those bugs
# (right join: keeps exactly the bugs present in the open-bugs table)
bug_open_times_with_thresholds = pd.merge(df_bug_summary.reset_index(),
                                          bug_open_times_with_thresholds,
                                          how='right',
                                          left_on=['BUG_ID'],
                                          right_on=['BUG_ID'])

# 'LAST_EVENT_TIME' exists on both sides of the merge: keeps the summary one
# (suffix _x) and drops the duplicate together with the renamed helper columns
bug_open_times_with_thresholds = (
    bug_open_times_with_thresholds
    .rename(columns={'LAST_EVENT_TIME_x': 'LAST_EVENT_TIME'})
    .drop(columns=['Age', 'OSM Release', 'Module', 'LAST_EVENT_TIME_y'])
    # `sort_values` returns a new frame; the result must be kept for the sort to take effect
    .sort_values(by=['MODULE', 'BUG_ID'])
)
bug_open_times_with_thresholds.tail()

In [None]:
# Column dtypes and non-null counts of the merged table
bug_open_times_with_thresholds.info()

#### 5.2.1 Outliers

In [None]:
# Bugs open for longer than the outlier threshold of their (module, release) group
bug_open_outliers = bug_open_times_with_thresholds.loc[
    lambda d: d['Time bug open (days)'] > d['outliers_th']
]
bug_open_outliers

#### 5.2.2 Beyond Q3

In [None]:
# Bugs open for longer than the third quartile of their group, but not outliers
bug_open_q3 = bug_open_times_with_thresholds.loc[
    lambda d: (d['Time bug open (days)'] > d['75%'])
              & (d['Time bug open (days)'] <= d['outliers_th'])
]
bug_open_q3

#### 5.2.3 Beyond 2 months

In [None]:
# Bugs open for more than 60 days that do not exceed the third quartile of their group
bug_open_2_months = bug_open_times_with_thresholds.loc[
    lambda d: (d['Time bug open (days)'] > 60)
              & (d['Time bug open (days)'] <= d['75%'])
]
bug_open_2_months

### 5.3 Saves bug lists in spreadsheet

In [None]:
# Names of the spreadsheet tabs, one per bug list (same order as the lists in section 5)
bugs_sheet_names = ['0. Too old bugs still open',
                    '1. Open outliers',
                    '2. Open beyond q3',
                    '3. Open beyond 2 months']

Imports the latest MDL recommendations, if they exist:

In [None]:
# Only these two columns are read from each tab of the former assessments file
former_mdl_assessments_cols = ['RECOMMENDATION', 'BUG_ID']
f_assess = os.path.join(inputs_folder, former_mdl_assessments_file)

df_list = []

try:
    # One frame per tab
    df_list = [
        pd.read_excel(f_assess, sheet_name=tab, usecols=former_mdl_assessments_cols)
        for tab in bugs_sheet_names
    ]
except FileNotFoundError:
    # No former assessments available: falls back to an empty table with the same columns
    df_recommendations = pd.DataFrame(columns=former_mdl_assessments_cols)
else:
    # Stacks all the tabs into a single table
    df_recommendations = pd.concat(df_list, ignore_index=True)

df_recommendations.head()

Adds the recommendation to existing tables:

In [None]:
# Prepends the former recommendation (when there is one) to each bug list.
# Right join: every bug of the list is kept, with or without a recommendation.
df_old_still_open, bug_open_outliers, bug_open_q3, bug_open_2_months = (
    pd.merge(df_recommendations, bug_list, on='BUG_ID', how='right')
    for bug_list in (df_old_still_open, bug_open_outliers, bug_open_q3, bug_open_2_months)
)

Exports to spreadsheet:

In [None]:
# Columns written to every sheet of the exported spreadsheet, in this order.
# NOTE(review): 'LAST_ASSIGMENT_TIME' presumably matches the spelling used in
# df_bug_summary; verify against that table before correcting the name.
columns = ['RECOMMENDATION', 'MODULE', 'BUG_ID', 'BUG_DESCRIPTION', 'RELEASE', 'STATE', 'ISSUER', 'OWNER', 'CREATION_TIME', 'CREATION_AGE',
           'STATE_UPDATE_TIME', 'STATE_UPDATE_AGE', 'LAST_ASSIGMENT_TIME', 'LAST_EVENT', 'LAST_EVENT_TIME', 'LAST_EVENT_AGE',
           'STATE_CHANGES', 'CHANGES_TO_RESOLVED', 'RELEASE_CHANGES', 'MODULE_CHANGES', 'OWNER_CHANGES']

In [None]:
# Timestamp of this run; used to date-stamp the name of the exported file
today = pd.to_datetime("today")

In [None]:
# Makes sure the destination folder exists before writing the spreadsheet
os.makedirs(outputs_folder, exist_ok=True)
filename = os.path.join(outputs_folder, today.strftime('%Y%m%d') + '_bugs_for_mdl_review.xlsx')

# Bug lists in the same order as `bugs_sheet_names`, so that the tab names
# written here are the ones looked up when former assessments are imported
bug_lists_to_export = [df_old_still_open, bug_open_outliers, bug_open_q3, bug_open_2_months]

with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
    for sheet_name, bug_list in zip(bugs_sheet_names, bug_lists_to_export):
        bug_list.to_excel(writer, columns=columns, index=False, sheet_name=sheet_name)

In [None]:
# Shows the outliers list restricted to the exported columns
bug_open_outliers[columns]