# Data Preparation

In [42]:
import re

import numpy  as np
import pandas as pd

from google.colab import  drive
# Mount Google Drive at '/drive'; the cells below read and write their data
# files under '/drive/My Drive/data'.
drive.mount('/drive')

Mounted at /drive


In [43]:
def clean_data_fn(text):
    """Clean and convert a text to store only alphabetical characters
       in lower case.

    Args:
        text (str): a text string.

    Returns:
        text (str): the converted text string: lower-cased, without
            @mentions, with every run of non-alphabetical characters
            replaced by a single space, and without leading or trailing
            spaces. A text that contains no letter yields ''.
    """
    text = text.lower()
    # Mentions have to be removed while the '@' is still present. Doing it
    # after the alphabetical filter (as before) could never match anything.
    text = re.sub(r'@\w+', ' ', text)
    # Spaces, digits and punctuation are all non-alphabetical, so this single
    # pass also collapses repeated whitespace.
    text = re.sub(r'[^a-z]+', ' ', text)
    # Strip so that a text without any letter becomes '' instead of ' ',
    # which lets callers detect blank results with a plain '' comparison.
    return text.strip()

def load_data_fn(file_path):
    """Read a bug report data set and keep its cleaned text and severity.

    Args:
        file_path (str): a complete filename path of a CSV file that has, at
            least, the 'long_description' and 'severity_category' columns.

    Returns:
        result (dataframe): a bug report dataframe with the columns
            'long_description' (cleaned by clean_data_fn) and
            'severity_category', without missing or blank descriptions and
            with a fresh 0..n-1 index.

    """
    reports = pd.read_csv(file_path, encoding='utf8', sep=',', parse_dates=True,
                          low_memory=False)

    # Rows with a missing value in ANY column are discarded, as before.
    reports = reports.dropna()

    descriptions = reports['long_description'].map(clean_data_fn)
    # A description without any letter is cleaned to whitespace (or ''), so
    # blank detection must ignore surrounding spaces; comparing against ''
    # alone let those rows through.
    is_blank = descriptions.map(str.strip) == ''
    descriptions = descriptions.mask(is_blank, np.nan)

    # Work on an explicit copy so the following steps never write into a view
    # of `reports` (avoids SettingWithCopyWarning).
    result = reports[['long_description', 'severity_category']].copy()
    result['long_description'] = descriptions
    result = result.dropna().reset_index(drop=True)

    return result

def convert_to_ordinal_fn(severity):
    """Convert severity category to ordinal.

    Args:
        severity (str): a severity category; one of 'trivial', 'minor',
            'major', 'critical' or 'blocker'.

    Returns:
        severity (int): ordinal value of severity, from 0 ('trivial') to
            4 ('blocker').

    Raises:
        ValueError: if severity is not one of the known categories.
    """
    categories = {'trivial': 0,
                  'minor': 1,
                  'major': 2,
                  'critical': 3,
                  'blocker': 4}
    if severity not in categories:
        # Fail loudly: returning a message string here would silently mix
        # text into what must be an integer column.
        raise ValueError(f'Invalid severity category! Got {severity!r}')
    return categories[severity]

    

In [48]:
# Raw Mozilla bug report CSV hosted on GitHub.
# NOTE(review): the URL carries a `token=` query parameter, which looks like
# an access token committed in clear text — confirm it is expired/revoked and
# read it from configuration instead of hard-coding it.
reports_input_url = 'https://raw.githubusercontent.com/gomesluiz/bug-severity-predictor/main/data/raw/mozilla_bug_report_data.csv?token=AAGZN3E7SWOKPRFI5HBOW2TABL7O6'
# Create the destination folder on the mounted drive (no error if it exists).
!mkdir -p '/drive/My Drive/data/raw'
# Download the CSV to the drive, overwriting any previous copy at that path.
!wget -O '/drive/My Drive/data/raw/mozilla_bug_report_data.csv' {reports_input_url}
# Disabled preview of the downloaded file; `reports_input_path` is only
# assigned in a later cell, so this line cannot be enabled here as written.
#!head -2 {reports_input_path}

--2021-01-22 16:36:43--  https://raw.githubusercontent.com/gomesluiz/bug-severity-predictor/main/data/raw/mozilla_bug_report_data.csv?token=AAGZN3E7SWOKPRFI5HBOW2TABL7O6
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2077348 (2.0M) [text/plain]
Saving to: ‘/drive/My Drive/data/raw/mozilla_bug_report_data.csv’


2021-01-22 16:36:43 (9.20 MB/s) - ‘/drive/My Drive/data/raw/mozilla_bug_report_data.csv’ saved [2077348/2077348]



In [49]:
# Location of the raw CSV written by the download cell.
reports_input_path =  '/drive/My Drive/data/raw/mozilla_bug_report_data.csv'
# Read the raw reports, clean the descriptions and keep only the
# 'long_description' and 'severity_category' columns.
reports_data = load_data_fn(reports_input_path)

In [50]:
# Preview the first rows of the cleaned reports.
reports_data.head()

Unnamed: 0,long_description,severity_category
0,is broken many users can t enter bugs on it p...,blocker
1,adding support for custom headers and cookie n...,blocker
2,the patch in bug regressed the fix from bug th...,major
3,from bugzilla helper user agent mozilla x u li...,major
4,i found it odd that relogin cgi didn t clear o...,minor


In [51]:
# Class balance: number of reports per severity category.
reports_data['severity_category'].value_counts()

major       737
critical    605
minor       540
trivial     302
blocker     204
Name: severity_category, dtype: int64

In [52]:
# Add the ordinal encoding of each severity category as a new column
# (this mutates `reports_data` in place).
reports_data['severity_code'] = reports_data['severity_category'].apply(convert_to_ordinal_fn) 

In [53]:
# Check that each severity_code lines up with its severity_category.
reports_data.head()

Unnamed: 0,long_description,severity_category,severity_code
0,is broken many users can t enter bugs on it p...,blocker,4
1,adding support for custom headers and cookie n...,blocker,4
2,the patch in bug regressed the fix from bug th...,major,2
3,from bugzilla helper user agent mozilla x u li...,major,2
4,i found it odd that relogin cgi didn t clear o...,minor,1


In [54]:
# Number of reports per ordinal code; should match the per-category counts.
reports_data['severity_code'].value_counts()

2    737
3    605
1    540
0    302
4    204
Name: severity_code, dtype: int64

In [57]:
# Create the output folder on the mounted drive (no error if it exists).
!mkdir -p '/drive/My Drive/data/clean'
# Destination of the cleaned data set.
reports_output_path = '/drive/My Drive/data/clean/mozilla_bug_report_data.csv'
# Persist only the cleaned text and its ordinal label; index=False leaves the
# row index out of the file.
reports_data[['long_description', 'severity_code']].to_csv(reports_output_path, index=False)