In [15]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import collections

# Ensure NLTK data is downloaded
nltk.download('stopwords')
nltk.download('wordnet')

def nlpsteps(text):
    """
    Preprocesses text by handling negation, tokenizing, removing stopwords, and lemmatizing.

    Args:
        text (str): The text to be processed.

    Returns:
        str: The processed text after tokenizing, removing stopwords, and lemmatizing.
    """
    # Remove punctuation
    removed_punctuation = re.sub('[^a-zA-Z]', ' ', str(text))
    removed_punctuation = removed_punctuation.lower()
    tokens = removed_punctuation.split()

    # Remove stopwords and 'not' is preserved
    all_stopwords = set(stopwords.words('english'))
    all_stopwords.remove('not')
    filtered_tokens = [token for token in tokens if token.lower() not in all_stopwords]

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token, pos='v') for token in filtered_tokens]  # Use 'v' for verbs
    
    return ' '.join(lemmatized_tokens)

def convert(corpus_trainingdata):
    """
    Data after preprocessing splitting into separate words

    Args:
        corpus_trainingdata: Preprocessed data of the training dataset
     
    Returns: Splitted words
    """
    return [i for item in corpus_trainingdata for i in item.split()]

def getwordcounts(splittedWords):
    occurrences = collections.Counter(splittedWords)
    return occurrences

def get_r1(ns, s):
    """
    Calculate ratio for severe

    Args:
        ns:Number of counts for a word as nonsevere
        s: Number of counts for a word as severe
      
    Returns: severe ratio for a given word
    """
    return s / (s + ns)

def get_r2(ns, s):
    """
    Calculate ratio for nonsevere

    Args:
        ns:Number of counts for a word as nonsevere
        s: Number of counts for a word as severe
      
    Returns: non-severe ratio for a given word
    """
    return ns / (s + ns)

def get_distribution(training_data_df):
    """
    Collects word counts for Severe and NonSevere categories.

    Args:
        training_data_df: DataFrame containing the training dataset with 'Summary' and 'Severity' columns.

    Returns:
        dict, dict: Two dictionaries containing word counts for Severe and NonSevere categories.
    """
    # Preprocess the summary text in the dataset using .loc
    training_data_df.loc[:, 'Summary'] = training_data_df['Summary'].apply(lambda x: nlpsteps(x))
    
    # Separate the dataset into Severe and NonSevere
    severe_df = training_data_df[training_data_df['Severity'] == 'Severe']
    nonsevere_df = training_data_df[training_data_df['Severity'] == 'NonSevere']
    
    # Convert summaries into lists of words
    severe_words = convert(severe_df['Summary'])
    nonsevere_words = convert(nonsevere_df['Summary'])
    
    # Get word counts
    severe_word_counts = getwordcounts(severe_words)
    nonsevere_word_counts = getwordcounts(nonsevere_words)
    
    return severe_word_counts, nonsevere_word_counts

def calculate_ratios(severe_word_counts, nonsevere_word_counts):
    """
    Calculates the ratios for Severe and NonSevere categories.

    Args:
        severe_word_counts: Dictionary containing word counts for Severe category.
        nonsevere_word_counts: Dictionary containing word counts for NonSevere category.

    Returns:
        DataFrame: Contains words with their counts and ratios for Severe and NonSevere categories.
    """
    # Combine word counts into a single dictionary
    all_words = set(severe_word_counts.keys()).union(set(nonsevere_word_counts.keys()))
    all_data = {word: {'Severe': severe_word_counts.get(word, 0), 'NonSevere': nonsevere_word_counts.get(word, 0)} for word in all_words}

    # Calculate ratios and prepare payload
    payload_train = {}
    for word, counts in all_data.items():
        ns = counts.get('NonSevere', 0)
        s = counts.get('Severe', 0)
        r1 = get_r1(ns, s)
        r2 = get_r2(ns, s)
        payload_train[word] = {'r1': r1, 'r2': r2}
    
    payload_train_df = pd.DataFrame(payload_train).T
    return payload_train_df

# Example usage
if __name__ == "__main__":
    # Assuming your data is in a CSV file named 'data.csv'
    bugs_eclipse = pd.read_csv('bugs_eclipse.csv')
    
    bugs_eclipse['Type'] = np.where(bugs_eclipse['Severity'] == 'enhancement', "enhancement", "defect")
    bugs_df = pd.concat([bugs_eclipse])

    # Dropped rows with severity level '--'
    bugs_df = bugs_df[bugs_df["Severity"].str.contains("--") == False].reset_index()

    #Dropped rows with Type "Enhancement" and "Task" because they are not a bug but a new feature
    indexSevere = bugs_df[ (bugs_df['Type'] == 'enhancement') & (bugs_df['Type'] == 'enhancement') ].index
    bugs_df.drop(indexSevere , inplace=True)

    indexSevere = bugs_df[ (bugs_df['Type'] == 'task') & (bugs_df['Type'] == 'task') ].index
    bugs_df.drop(indexSevere , inplace=True)

    # Catagorize the severity level into Severe and NonSevere to make it a binary problem
    bugs_df.loc[bugs_df["Severity"] == "blocker", "Severity"] = 'Severe'
    bugs_df.loc[bugs_df["Severity"] == "critical", "Severity"] = 'Severe'
    bugs_df.loc[bugs_df["Severity"] == "major", "Severity"] = 'Severe'
    bugs_df.loc[bugs_df["Severity"] == "S1", "Severity"] = 'Severe'
    bugs_df.loc[bugs_df["Severity"] == "S2", "Severity"] = 'Severe'
    bugs_df.loc[bugs_df["Severity"] == "S3", "Severity"]= 'NonSevere'
    bugs_df.loc[bugs_df["Severity"] == "normal", "Severity"] = 'NonSevere'
    bugs_df.loc[bugs_df["Severity"] == "minor", "Severity"] = 'NonSevere'
    bugs_df.loc[bugs_df["Severity"] == "trivial", "Severity"] = 'NonSevere'
    bugs_df.loc[bugs_df["Severity"] == "S4", "Severity"] = 'NonSevere'

    bugs_df = bugs_df.head(500)
    # Ensure the dataset only contains the 'Summary' and 'Severity' columns
    df = bugs_df[['Summary', 'Severity']]
    print(df)
    
    # Get word counts
    severe_word_counts, nonsevere_word_counts = get_distribution(df)
    
    # Calculate ratios
    payload_train_df = calculate_ratios(severe_word_counts, nonsevere_word_counts)
    print("Wordlist with Ratios:")
    pd.set_option('display.max_columns', None)
    print(payload_train_df)


                                               Summary   Severity
0    Cannot create Java EE artifacts: servlet, bean...     Severe
1                         Discovery plugin regressions     Severe
2    P2 doesn't install all features, compared to U...     Severe
3                 Versioning issues on latet 3.1 build     Severe
4    [doc] "New and Noteworthy" document in JSF Use...     Severe
..                                                 ...        ...
495             a number of bundles need version bumps  NonSevere
496  Run on Server a JSP  fails on web 2.5 modules ...  NonSevere
497  CHKJ3027E:  Invalid Exception Type java.lang.T...  NonSevere
498      small legal related fix needed for JPA schema  NonSevere
499  [EclipseLink] eclipselink 2.3 schemas not up t...  NonSevere

[500 rows x 2 columns]
Wordlist with Ratios:
                                     r1        r2
j                              0.785714  0.214286
pane                           0.000000  1.000000
attach      

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fatimaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/fatimaa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import json 

In [None]:
bugs_eclipse = pd.read_csv("bugs_eclipse.csv")

In [None]:
bugs_eclipse['Type'] = np.where(bugs_eclipse['Severity'] == 'enhancement', "enhancement", "defect")
bugs_df = pd.concat([bugs_eclipse])

# Dropped rows with severity level '--'
bugs_df = bugs_df[bugs_df["Severity"].str.contains("--")==False].reset_index()

#Dropped rows with Type "Enhancement" and "Task" because they are not a bug but a new feature
indexSevere = bugs_df[ (bugs_df['Type'] == 'enhancement') & (bugs_df['Type'] == 'enhancement') ].index
bugs_df.drop(indexSevere , inplace=True)

indexSevere = bugs_df[ (bugs_df['Type'] == 'task') & (bugs_df['Type'] == 'task') ].index
bugs_df.drop(indexSevere , inplace=True)



#Catagorise the severity level into a Severe and Non Severe to make it a binary problem
bugs_df.loc[bugs_df["Severity"] == "blocker", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "critical", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "major", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "S1", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "S2", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "S3", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "normal", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "minor", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "trivial", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "S4", "Severity"] = 'NonSevere'

bugs_df = bugs_df.tail(50)
# print(bugs_df)
print("total bugs", len(bugs_df))
severerity = bugs_df['Severity'].value_counts()
print(severerity)

In [None]:
bugs_df.head()

In [None]:
bugs_eclipse_length = len(bugs_df)

In [None]:
bugs_eclipse_length

In [None]:
def nlpsteps(x):
    """
    Tokenizes and preprocesses a summary of a bug.

    Args:
        x (str): The summary text to be processed.

    Returns:
        str: The processed text after removing non-alphabetic characters, converting to lowercase,
             lemmatizing words, and removing stopwords.
    """
    
    # Remove non-alphabetic characters
    review = re.sub('[^a-zA-Z]', ' ', str(x))
    review = review.lower()
    review = review.split()

    # Initialize WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    # Remove stopwords and lemmatize words
    all_stopwords = set(stopwords.words('english'))
    all_stopwords.remove('not')
    review = [lemmatizer.lemmatize(word) for word in review if word not in all_stopwords]

    # Join the processed words back into a sentence
    review = ' '.join(review)
    return review


In [None]:
def convert(corpus_trainingdata):
    """
    Data after preprocessing splitting into separate words

    Args:
        corpus_trainingdata: Preprocessed data of the training dataset
     
    Returns: Splitted words
    """
#     print("DEMO--------------------------Corpus---------------------------")
#     print(corpus_trainingdata)
    return ([i for item in corpus_trainingdata for i in item.split()])

In [None]:
# Counts of each words in the corpus
def getwordcounts(splittedWords):
    occurrences = collections.Counter(splittedWords)
    return occurrences

In [None]:

# Preprocess the summary text in the dataset
bugs_df['Summary'] = bugs_df['Summary'].apply(lambda x: nlpsteps(x))

# Separate the dataset into Severe and NonSevere
severe_df = bugs_df[bugs_df['Severity'] == 'Severe']
nonsevere_df = bugs_df[bugs_df['Severity'] == 'NonSevere']

# Convert summaries into lists of words
severe_words = convert(severe_df['Summary'])
nonsevere_words = convert(nonsevere_df['Summary'])

# Get word counts
severe_word_counts = getwordcounts(severe_words)
nonsevere_word_counts = getwordcounts(nonsevere_words)

# Convert Counter objects to lists of tuples
severe_list = list(severe_word_counts.items())
nonsevere_list = list(nonsevere_word_counts.items())




In [None]:
severe_list, nonsevere_list = get_distribution(bugs_df)

In [None]:
def get_distribution(val,bugs_eclipse):
    """
    Data after preprocessing splitting into separate words

    Args:
        val: Preprocessed data of the training dataset
        training_data_df: training dataset dataframe
      
    Returns: Splitted words
    """
    bugs_eclipse['Summary'] = bugs_eclipse['Summary'].apply(lambda x: nlpsteps(x))
    records = bugs_eclipse[
        bugs_eclipse["Summary"].str.contains(val)
    ]
    
    if len(records) > 0:
        res = bugs_eclipse[
            training_data_df["Summary"].str.contains(val)
        ]["Severity"].value_counts(dropna=False)
        return dict(res)
    return None

In [None]:
def lexicon_preprocess(trainingdataset_length,training_data_df):
    """
    Create wordlists for severe and non severe from the preprocessed training dataset 

    Args:
        trainingdataset_length: size of training dataset
        training_data_df: training dataset dataframe
      
    Returns: a wordlist that has words from training dataset with its counts for severe and nonsevere
    """
    # print("trainingdataset_length",trainingdataset_length)
    # print("training_data_df",training_data_df)
    corpus_trainingdata = []
    all_data_df_ = []
       
    for i in range(0,trainingdataset_length):
        review = nlpsteps(str(training_data_df['Summary'][i]))
        corpus_trainingdata.append(review)
   

#     #Split words from the corpus
#     splittedWords = convert(corpus_trainingdata)
# #     print("splittedWords---------------", splittedWords)
    
#     splitted_words=getwordcounts(splittedWords)

#     #Converted collection.counter into dictionary
#     splitted_words_dict = dict(splitted_words)

#     keys = splitted_words_dict.keys()
    
#     all_data = {}
#     for key in keys:
#         res = get_distribution(key,training_data_df)
#         if res:
#             all_data[key] = res
#             all_data
#             all_data_df = pd.DataFrame(all_data)
       
#             print("--------------wordlists for severe and non-severe------------------------")
         
#             pd.set_option('display.max_columns', None)
#             print(all_data_df)
        

    return all_data_df 

In [None]:
payload_train = lexicon_preprocess(bugs_eclipse_length, bugs_df)

In [None]:
payload_train