### Initialize

In [8]:
import math
import pandas as pd
import re
import html2text as h2t
import os
from sklearn.model_selection import train_test_split
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.stats import zscore
from datasets import Dataset

# Fetch the NLTK resources used by preprocess_text below: the tokenizer
# models, the stop-word lists and the WordNet data for the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Directory holding the per-project source CSV files listed in choet_filenames.
dataset_path = "datasets/Choet_Dataset"
# Output directories for the split datasets. Only the Case N and Case M
# directories are written by the cells visible in this notebook.
dataset_splitted_path_raw = "datasets/Choet_Dataset_Raw"
dataset_splitted_path_N = "datasets/Case_N"
dataset_splitted_path_M = "datasets/Case_M"

# One CSV per project. The part before the first '_' is used as the dataset
# (output folder) name by the processing cells.
choet_filenames = [
    'APSTUD_deep-se.csv',
    'BAM_deep-se.csv',
    'CLOV_deep-se.csv',
    'DM_deep-se.csv',
    'DURACLOUD_deep-se.csv',
    'JRESERVER_deep-se.csv',
    'MDL_deep-se.csv',
    'MESOS_deep-se.csv',
    'MULE_deep-se.csv',
    'MULESTUDIO_deep-se.csv',
    'TIMOB_deep-se.csv',
    'TISTUD_deep-se.csv',
    'USERGRID_deep-se.csv',
    'XD_deep-se.csv'
]

# Previous full list of dataset names, kept for reference:
# dataset_names = [
#     'APSTUD', 'BAM', 'CLOV', 'DM', 'DURACLOUD', 'JRESERVER', 'MDL', 'MESOS', 'MULE', 'MULESTUDIO', 'TIMOB', 'TISTUD', 'USERGRID', 'XD'
# ]

# Dataset names grouped into batches.
batch_1 = ['APSTUD', 'BAM', 'CLOV', 'DM']
batch_2 = ['DURACLOUD', 'JRESERVER', 'MDL', 'MESOS']
batch_3 = ['MULE', 'MULESTUDIO', 'TIMOB', 'USERGRID']
batch_4 = ['TISTUD', 'XD']

# NOTE(review): dataset_names is not referenced by the cells visible here;
# presumably it selects the batches for a later step - confirm.
dataset_names = batch_1 + batch_2

# Short project codes written to records.csv. The list is index-aligned with
# choet_filenames (entry i describes choet_filenames[i]), so keep both lists
# in the same order.
project_names = ['AS', 'BB', 'CV', 'DM', 'DC', 'JS', 'MD', 'ME', 'MU', 'MS', 'AP', 'TS', 'UG', 'XD']

def extract_markdown(text):
    """Run ``text`` through html2text and flatten the result to one line.

    The converter is configured with ``ignore_links = True``. Every newline
    in the converted output is replaced by a single space.

    Args:
        text: The markup string to convert.

    Returns:
        The converted text with newlines replaced by spaces.
    """
    converter = h2t.HTML2Text()
    converter.ignore_links = True
    return converter.handle(text).replace('\n', ' ')

def remove_urls(text):
    """Delete URLs from ``text``.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character. The whitespace around a
    removed URL is left untouched.

    Args:
        text: The string to strip URLs from.

    Returns:
        ``text`` with every matched URL removed.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def preprocess_text(text):
    """Normalize a string: strip punctuation, drop stop words, lemmatize.

    Steps, in order:
      1. remove every character that is neither a word character nor
         whitespace;
      2. tokenize with ``nltk.word_tokenize``;
      3. discard tokens found in NLTK's English stop-word list (the text is
         not lowercased first, so the comparison is case-sensitive);
      4. lemmatize each remaining token with ``WordNetLemmatizer``;
      5. join the tokens back together with single spaces.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    tokens = nltk.word_tokenize(without_punctuation)

    english_stop_words = set(stopwords.words('english'))
    content_tokens = [token for token in tokens if token not in english_stop_words]

    wordnet = WordNetLemmatizer()
    return ' '.join([wordnet.lemmatize(token) for token in content_tokens])

def clean_text(text):
    """Clean every entry of a text column.

    Missing values are replaced by the empty string, then each entry is
    passed through ``extract_markdown``, ``remove_urls`` and
    ``preprocess_text``, in that order.

    Args:
        text: A pandas Series of strings (it must support ``fillna`` and
            ``apply``).

    Returns:
        A new Series holding the cleaned strings.
    """
    return (
        text.fillna('')
        .apply(extract_markdown)
        .apply(remove_urls)
        .apply(preprocess_text)
    )

def load_split_file(dataset_name, type='train', base_path=None):
    """Load one split of a dataset from ``<base_path>/<dataset_name>/<type>.csv``.

    Args:
        dataset_name: Name of the dataset sub-directory.
        type: Split to load; used as the CSV file stem (e.g. ``'train'``).
            The name shadows the ``type`` builtin but is kept because callers
            may pass it by keyword.
        base_path: Directory containing the per-dataset folders. When omitted,
            ``dataset_splitted_path_raw`` is used.

    Returns:
        The CSV contents as a pandas DataFrame.
    """
    if base_path is None:
        # NOTE(review): this function used to read from
        # `dataset_splitted_path_A`, a name that is not defined anywhere in
        # the visible notebook, so every call raised NameError. The raw split
        # directory is assumed to be the intended location because it is the
        # only defined split path the visible cells do not fill with JSON
        # files - confirm.
        base_path = dataset_splitted_path_raw

    return pd.read_csv(os.path.join(base_path, dataset_name, f'{type}.csv'))

def load_data(path):
    """Read a project CSV and return it as ``issuekey`` / ``text`` / ``storypoint``.

    ``text`` is ``title + '. ' + description``. A missing title or description
    is replaced by a single space before concatenation.

    Only the two text columns are filled: filling the whole frame would also
    turn a missing ``storypoint`` into the string ``' '`` and make the label
    column non-numeric. The result is built directly with the wanted column
    order instead of being concatenated onto an empty frame, so the column
    dtypes come straight from the CSV and do not depend on how the installed
    pandas version treats empty frames in ``concat``.

    Args:
        path: Path of a CSV file with the columns ``issuekey``, ``title``,
            ``description`` and ``storypoint``.

    Returns:
        A new DataFrame with the columns ``issuekey``, ``text`` and
        ``storypoint`` (in that order), indexed like the CSV rows.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = pd.read_csv(path)

    text_columns = ['title', 'description']
    df[text_columns] = df[text_columns].fillna(' ')

    return pd.DataFrame({
        'issuekey': df['issuekey'],
        'text': df['title'] + '. ' + df['description'],
        'storypoint': df['storypoint'],
    })

def load_clean_data(path):
    """Read a CSV, clean its text columns and add a combined ``text`` column.

    ``title`` and ``description`` are each replaced by their ``clean_text``
    version. ``text`` is then the cleaned title and the cleaned description
    joined with ``'. '``.

    Args:
        path: Path of a CSV file with ``title`` and ``description`` columns.

    Returns:
        The loaded DataFrame with cleaned ``title`` / ``description`` and the
        additional ``text`` column.
    """
    frame = pd.read_csv(path)

    for column in ('title', 'description'):
        frame[column] = clean_text(frame[column])

    def join_row(row):
        return '. '.join(row)

    frame['text'] = frame[['title', 'description']].apply(join_row, axis=1)
    return frame

def split_data_ordered(data):
    """Split ``data`` into train / validation / test parts, keeping row order.

    The nominal shares are 60 % / 20 % / 20 %, each rounded down. The split
    points are shifted one row towards the start, so the training part holds
    ``floor(0.6 * n) - 1`` rows, the validation part ``floor(0.2 * n)`` rows
    and the test part everything that remains.

    Args:
        data: A pandas object that supports ``len`` and ``.iloc`` slicing.

    Returns:
        A ``(train, validation, test)`` tuple of ``.iloc`` slices of ``data``.
    """
    total = len(data)
    train_share = (60 * total) // 100
    val_share = (20 * total) // 100

    # Both boundaries carry the one-row shift described in the docstring.
    train_end = train_share - 1
    val_end = train_end + val_share

    return data.iloc[:train_end], data.iloc[train_end:val_end], data.iloc[val_end:total]

def split_data(data, labels, test_size=0.2, random_state=42):
    """Randomly split data and labels into train, validation and test parts.

    ``train_test_split`` is applied twice with the same ``random_state``:
    first to hold out a ``test_size`` share of the samples, then to cut that
    held-out share in half for validation and test.

    Args:
        data: The samples to split.
        labels: The labels matching ``data``.
        test_size: ``test_size`` passed to the first ``train_test_split``
            call, i.e. the combined validation + test portion.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``(train_data, val_data, test_data, train_labels, val_labels,
        test_labels)``.
    """
    train_data, held_out_data, train_labels, held_out_labels = train_test_split(
        data, labels, test_size=test_size, random_state=random_state
    )
    val_data, test_data, val_labels, test_labels = train_test_split(
        held_out_data, held_out_labels, test_size=0.5, random_state=random_state
    )

    return train_data, val_data, test_data, train_labels, val_labels, test_labels

# def drop_outliers(df, field_name):
#     iqr = 1.5 * (np.percentile(df[field_name], 75) - np.percentile(df[field_name], 25))
#     df.drop(df[df[field_name] > (iqr + np.percentile(df[field_name], 75))].index, inplace=True)
#     df.drop(df[df[field_name] < (np.percentile(df[field_name], 25) - iqr)].index, inplace=True)

#     return df

from scipy.stats import iqr

def drop_outliers(df, field_name):
    """Return the rows of ``df`` whose ``field_name`` lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. Only values
    strictly beyond a fence count as outliers, so a value lying exactly on a
    fence is kept. This matters when the IQR is 0 (Q1 == Q3): both fences then
    collapse onto that single value, and a strict comparison would discard
    every row, including the ones carrying the dominant value.

    Rows whose ``field_name`` is NaN are dropped, because NaN fails both
    comparisons.

    Args:
        df: The DataFrame to filter.
        field_name: Name of the numeric column to test.

    Returns:
        The filtered rows of ``df``, selected with ``df.loc``.
    """
    values = df[field_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence_width = 1.5 * (q3 - q1)

    lower_bound = q1 - fence_width
    upper_bound = q3 + fence_width

    # Inclusive on both sides: only points beyond the fences are outliers.
    return df.loc[(values >= lower_bound) & (values <= upper_bound)]


def drop_outliers_modified_z_score(df, field_name, threshold=3.5):
    """Keep the rows whose modified z-score is within ``threshold``.

    The modified z-score of a value ``x`` is
    ``0.6745 * (x - median) / MAD``, where MAD is the median of the absolute
    deviations from the median. A MAD of exactly 0 is replaced by ``1e-10``
    so the division stays defined.

    Args:
        df: The DataFrame to filter.
        field_name: Name of the numeric column to test.
        threshold: Largest absolute modified z-score that is still kept.

    Returns:
        The rows of ``df`` with ``|score| <= threshold``, selected with
        ``df.loc``.
    """
    deviations = df[field_name] - df[field_name].median()
    mad = np.median(deviations.abs())

    # Guard against division by zero when at least half the values coincide.
    if mad == 0:
        mad = 1e-10

    scores = 0.6745 * deviations / mad
    return df.loc[scores.abs() <= threshold]

def cut_of90(labels):
    """Cap the labels at the value that covers 90 % of the samples.

    This method is copied from the original code of the paper "A Deep
    Learning Model for Estimating Story Points" by Choetkiertikul et al.
    (2019), to investigate and compare the performance of the proposed model
    with state-of-the-art models.

    The distinct label values are visited in ascending order while their
    occurrence counts are accumulated. The first value at which the running
    count reaches 90 % of all samples becomes the threshold, and every label
    above it is lowered to it. Occurrences are counted per ``int(label)``.

    Args:
        labels: Array-like of labels that supports item assignment by
            position number and ``astype`` (e.g. a NumPy array). It is
            modified IN PLACE.

    Returns:
        ``labels.astype('float32')``, i.e. the capped labels as float32.
    """
    distinct_values = sorted(set(labels))
    slot_of = {int(value): slot for slot, value in enumerate(distinct_values)}

    occurrences = [0] * len(distinct_values)
    for label in labels:
        occurrences[slot_of[int(label)]] += 1

    total = len(labels)
    threshold = 0
    running = 0
    for slot, count in enumerate(occurrences):
        running += count
        # Integer form of running / total >= 0.9.
        if running * 10 >= total * 9:
            threshold = distinct_values[slot]
            break

    # Write the capped values back into the caller's object.
    for i, current in enumerate(labels):
        labels[i] = min(threshold, current)

    return labels.astype('float32')

print('done')

done


[nltk_data] Downloading package punkt to /Users/zuhaimi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zuhaimi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zuhaimi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Case N: Preprocessing & Remove outliers

In [6]:
# Case N pipeline: for every project, split the data in order, remove
# story-point outliers from the training split, clean the text of all three
# splits, then write the splits as JSON lines plus one summary row per project.
records = []
for file_index, filename in enumerate(choet_filenames):
    dataset = load_data(f'{dataset_path}/{filename}')

    dataset_name = filename.split('_')[0]
    # project_names is index-aligned with choet_filenames.
    project_name = project_names[file_index]

    train_data, val_data, test_data = split_data_ordered(dataset)

    # Remove outliers on the train data only. This is done before cleaning so
    # that no text processing is spent on rows that are discarded anyway.
    train_data = drop_outliers(train_data, 'storypoint')

    # Clean the text. assign() returns a new frame, which avoids writing into
    # the slices handed out by split_data_ordered / drop_outliers.
    train_data = train_data.assign(text=clean_text(train_data['text']))
    val_data = val_data.assign(text=clean_text(val_data['text']))
    test_data = test_data.assign(text=clean_text(test_data['text']))

    # Summary statistics of the training split after outlier removal.
    record = {
        'dataset': project_name,
        'num_records': len(train_data),
        'min': train_data['storypoint'].min(),
        'max': train_data['storypoint'].max(),
        'mean': train_data['storypoint'].mean(),
        'median': train_data['storypoint'].median()
    }
    records.append(record)

    # Make sure the output folder exists, then save the datasets to json.
    os.makedirs(f'{dataset_splitted_path_N}/{dataset_name}', exist_ok=True)
    train_data.to_json(f'{dataset_splitted_path_N}/{dataset_name}/train.json', orient='records', lines=True)
    val_data.to_json(f'{dataset_splitted_path_N}/{dataset_name}/val.json', orient='records', lines=True)
    test_data.to_json(f'{dataset_splitted_path_N}/{dataset_name}/test.json', orient='records', lines=True)

    # Print the length of each split.
    print(f"Train/Validation/Test data for {dataset_name}: {len(train_data)} - {len(val_data)} - {len(test_data)} ({len(train_data) + len(val_data) + len(test_data)})")

# Save the per-project summary records to csv.
records_df = pd.DataFrame(records)
records_df.to_csv(f'{dataset_splitted_path_N}/records.csv', index=False)

print("Done!")

Train/Validation/Test data for APSTUD: 491 - 165 - 168 (824)
Train/Validation/Test data for BAM: 294 - 104 - 106 (504)
Train/Validation/Test data for CLOV: 197 - 76 - 79 (352)
Train/Validation/Test data for DM: 2480 - 933 - 935 (4348)
Train/Validation/Test data for DURACLOUD: 378 - 133 - 135 (646)
Train/Validation/Test data for JRESERVER: 204 - 70 - 72 (346)
Train/Validation/Test data for MDL: 657 - 233 - 235 (1125)
Train/Validation/Test data for MESOS: 978 - 336 - 337 (1651)
Train/Validation/Test data for MULE: 525 - 177 - 180 (882)
Train/Validation/Test data for MULESTUDIO: 431 - 146 - 148 (725)
Train/Validation/Test data for TIMOB: 1238 - 450 - 452 (2140)
Train/Validation/Test data for TISTUD: 1719 - 583 - 586 (2888)
Train/Validation/Test data for USERGRID: 250 - 96 - 98 (444)
Train/Validation/Test data for XD: 2011 - 705 - 707 (3423)
Done!


### Case M: Preprocessing & Transform outliers

In [9]:
import math

# Case M pipeline: for every project, cap the story points with cut_of90
# (instead of removing outliers), split the data in order, clean the text of
# all three splits, then write the splits as JSON lines plus one summary row
# per project.
records = []

for file_index, filename in enumerate(choet_filenames):
    dataset = load_data(f'{dataset_path}/{filename}')

    dataset_name = filename.split('_')[0]
    # project_names is index-aligned with choet_filenames.
    project_name = project_names[file_index]

    # 90% transformation. cut_of90 caps the labels in place, so it is run on
    # an explicit copy that is then stored back into the frame. Mutating the
    # object returned by dataset['storypoint'] directly is not guaranteed to
    # reach the frame (with pandas Copy-on-Write it never does), which would
    # silently skip the transformation. The float32 return value is ignored
    # on purpose so the column keeps its original dtype.
    capped_storypoints = dataset['storypoint'].copy()
    cut_of90(capped_storypoints)
    dataset['storypoint'] = capped_storypoints

    train_data, val_data, test_data = split_data_ordered(dataset)

    # Clean the text. assign() returns a new frame, which avoids writing into
    # the slices handed out by split_data_ordered.
    train_data = train_data.assign(text=clean_text(train_data['text']))
    val_data = val_data.assign(text=clean_text(val_data['text']))
    test_data = test_data.assign(text=clean_text(test_data['text']))

    # Summary statistics of the training split after the transformation.
    record = {
        'dataset': project_name,
        'num_records': len(train_data),
        'min': train_data['storypoint'].min(),
        'max': train_data['storypoint'].max(),
        'mean': train_data['storypoint'].mean(),
        'median': train_data['storypoint'].median()
    }
    records.append(record)

    # Make sure the output folder exists, then save the datasets to json.
    os.makedirs(f'{dataset_splitted_path_M}/{dataset_name}', exist_ok=True)
    train_data.to_json(f'{dataset_splitted_path_M}/{dataset_name}/train.json', orient='records', lines=True)
    val_data.to_json(f'{dataset_splitted_path_M}/{dataset_name}/val.json', orient='records', lines=True)
    test_data.to_json(f'{dataset_splitted_path_M}/{dataset_name}/test.json', orient='records', lines=True)

    # Print the length of each split.
    print(f"Train/Validation/Test data for {dataset_name}: {len(train_data)} - {len(val_data)} - {len(test_data)} ({len(train_data) + len(val_data) + len(test_data)})")

# Save the per-project summary records to csv.
records_df = pd.DataFrame(records)
records_df.to_csv(f'{dataset_splitted_path_M}/records.csv', index=False)

print("Done!")

Train/Validation/Test data for APSTUD: 496 - 165 - 168 (829)
Train/Validation/Test data for BAM: 311 - 104 - 106 (521)
Train/Validation/Test data for CLOV: 229 - 76 - 79 (384)
Train/Validation/Test data for DM: 2799 - 933 - 935 (4667)
Train/Validation/Test data for DURACLOUD: 398 - 133 - 135 (666)
Train/Validation/Test data for JRESERVER: 210 - 70 - 72 (352)
Train/Validation/Test data for MDL: 698 - 233 - 235 (1166)
Train/Validation/Test data for MESOS: 1007 - 336 - 337 (1680)
Train/Validation/Test data for MULE: 532 - 177 - 180 (889)
Train/Validation/Test data for MULESTUDIO: 438 - 146 - 148 (732)
Train/Validation/Test data for TIMOB: 1349 - 450 - 452 (2251)
Train/Validation/Test data for TISTUD: 1750 - 583 - 586 (2919)
Train/Validation/Test data for USERGRID: 288 - 96 - 98 (482)
Train/Validation/Test data for XD: 2114 - 705 - 707 (3526)
Done!
