In [55]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import collections

# Ensure NLTK data is downloaded
nltk.download('stopwords')
nltk.download('wordnet')

# Define your nlpsteps function
def nlpsteps(x):
    """
    Tokenizes and preprocesses a summary of a bug.

    Args:
        x (str): The summary text to be processed.

    Returns:
        str: The processed text after removing non-alphabetic characters, converting to lowercase,
             lemmatizing words, and removing stopwords.
    """
    # Remove non-alphabetic characters
    review = re.sub('[^a-zA-Z]', ' ', str(x))
    review = review.lower()
    review = review.split()

    # Initialize WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    # Remove stopwords and lemmatize words
    all_stopwords = set(stopwords.words('english'))
    all_stopwords.remove('not')
    review = [lemmatizer.lemmatize(word) for word in review if word not in all_stopwords]

    # Join the processed words back into a sentence
    review = ' '.join(review)
    return review

# Define your convert function
def convert(corpus_trainingdata):
    """
    Data after preprocessing splitting into separate words

    Args:
        corpus_trainingdata: Preprocessed data of the training dataset
     
    Returns: Splitted words
    """
    return [i for item in corpus_trainingdata for i in item.split()]

# Counts of each words in the corpus
def getwordcounts(splittedWords):
    occurrences = collections.Counter(splittedWords)
    return occurrences

# Function that returns the counts for each words that falls in Severe or NonSevere category
def get_distribution(training_data_df):
    """
    Returns two lists of frequent words with their counts for Severe and NonSevere categories.

    Args:
        training_data_df: DataFrame containing the training dataset with 'Summary' and 'Severity' columns.

    Returns:
        (list, list): Two lists of tuples, each containing a word and its count, for Severe and NonSevere categories respectively.
    """
    # Preprocess the summary text in the dataset
    training_data_df['Summary'] = training_data_df['Summary'].apply(lambda x: nlpsteps(x))
    
    # Separate the dataset into Severe and NonSevere
    severe_df = training_data_df[training_data_df['Severity'] == 'Severe']
    nonsevere_df = training_data_df[training_data_df['Severity'] == 'NonSevere']
    
    # Convert summaries into lists of words
    severe_words = convert(severe_df['Summary'])
    nonsevere_words = convert(nonsevere_df['Summary'])
    
    # Get word counts
    severe_word_counts = getwordcounts(severe_words)
    nonsevere_word_counts = getwordcounts(nonsevere_words)
    
    # Convert Counter objects to lists of tuples
    severe_list = list(severe_word_counts.items())
    nonsevere_list = list(nonsevere_word_counts.items())
    
    return severe_list, nonsevere_list

# Example usage
if __name__ == "__main__":
    # Assuming your data is in a CSV file named 'data.csv'
    df = pd.read_csv('bugs_eclipse.csv')

    # Ensure the dataset only contains the 'Summary' and 'Severity' columns
    df = df[['Summary', 'Severity']]
    
    severe_list, nonsevere_list = get_distribution(df)
    print("Severe words and counts:", severe_list)
    print("NonSevere words and counts:", nonsevere_list)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fatimaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/fatimaa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Severe words and counts: []
NonSevere words and counts: []


In [40]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import json 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fatimaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
bugs_eclipse = pd.read_csv("bugs_eclipse.csv")

In [42]:
bugs_eclipse['Type'] = np.where(bugs_eclipse['Severity'] == 'enhancement', "enhancement", "defect")
bugs_df = pd.concat([bugs_eclipse])

# Dropped rows with severity level '--'
bugs_df = bugs_df[bugs_df["Severity"].str.contains("--")==False].reset_index()

#Dropped rows with Type "Enhancement" and "Task" because they are not a bug but a new feature
indexSevere = bugs_df[ (bugs_df['Type'] == 'enhancement') & (bugs_df['Type'] == 'enhancement') ].index
bugs_df.drop(indexSevere , inplace=True)

indexSevere = bugs_df[ (bugs_df['Type'] == 'task') & (bugs_df['Type'] == 'task') ].index
bugs_df.drop(indexSevere , inplace=True)



#Catagorise the severity level into a Severe and Non Severe to make it a binary problem
bugs_df.loc[bugs_df["Severity"] == "blocker", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "critical", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "major", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "S1", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "S2", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "S3", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "normal", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "minor", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "trivial", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "S4", "Severity"] = 'NonSevere'

bugs_df = bugs_df.tail(50)
# print(bugs_df)
print("total bugs", len(bugs_df))
severerity = bugs_df['Severity'].value_counts()
print(severerity)

total bugs 50
Severity
NonSevere    49
Severe        1
Name: count, dtype: int64


In [54]:
bugs_df.head()

Unnamed: 0,index,Bug ID,Product,Component,Assignee,Status,Resolution,Summary,Changed,Priority,Severity,Type
31028,31028,148084,WTP Source Editing,wst.xsd,keith.chong.ca@gmail.com,CLOSED,WORKSFORME,Context menu contents while editing names,21/01/2008 16:42,P4,NonSevere,defect
31029,31029,240170,WTP Source Editing,wst.xpath,d_a_carver@yahoo.com,RESOLVED,FIXED,[xslt][editor] XPath content assist does not n...,11/08/2010 17:08,P4,NonSevere,defect
31030,31030,129714,WTP Source Editing,jst.jsp,sarika.sinha@in.ibm.com,RESOLVED,FIXED,[formatting] Cleanup does not affect directives,07/12/2010 16:02,P4,NonSevere,defect
31031,31031,103180,WTP Webservices,jst.ws,pmoogk@ca.ibm.com,CLOSED,WORKSFORME,Excessive WTP plugins activated on startup in ...,12/09/2005 13:51,P4,NonSevere,defect
31032,31032,86408,WTP Source Editing,wst.xml,david_williams@acm.org,RESOLVED,WORKSFORME,source validation disappears for element betwe...,30/04/2010 14:32,P4,NonSevere,defect


In [44]:
bugs_eclipse_length = len(bugs_df)

In [45]:
bugs_eclipse_length

50

In [46]:
def nlpsteps(x):
    """
    Tokenizes and preprocesses a summary of a bug.

    Args:
        x (str): The summary text to be processed.

    Returns:
        str: The processed text after removing non-alphabetic characters, converting to lowercase,
             lemmatizing words, and removing stopwords.
    """
    
    # Remove non-alphabetic characters
    review = re.sub('[^a-zA-Z]', ' ', str(x))
    review = review.lower()
    review = review.split()

    # Initialize WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    # Remove stopwords and lemmatize words
    all_stopwords = set(stopwords.words('english'))
    all_stopwords.remove('not')
    review = [lemmatizer.lemmatize(word) for word in review if word not in all_stopwords]

    # Join the processed words back into a sentence
    review = ' '.join(review)
    return review


In [47]:
def convert(corpus_trainingdata):
    """
    Data after preprocessing splitting into separate words

    Args:
        corpus_trainingdata: Preprocessed data of the training dataset
     
    Returns: Splitted words
    """
#     print("DEMO--------------------------Corpus---------------------------")
#     print(corpus_trainingdata)
    return ([i for item in corpus_trainingdata for i in item.split()])

In [48]:
# Counts of each words in the corpus
def getwordcounts(splittedWords):
    occurrences = collections.Counter(splittedWords)
    return occurrences

In [53]:

# Preprocess the summary text in the dataset
bugs_df['Summary'] = bugs_df['Summary'].apply(lambda x: nlpsteps(x))

# Separate the dataset into Severe and NonSevere
severe_df = bugs_df[bugs_df['Severity'] == 'Severe']
nonsevere_df = bugs_df[bugs_df['Severity'] == 'NonSevere']

# Convert summaries into lists of words
severe_words = convert(severe_df['Summary'])
nonsevere_words = convert(nonsevere_df['Summary'])

# Get word counts
severe_word_counts = getwordcounts(severe_words)
nonsevere_word_counts = getwordcounts(nonsevere_words)

# Convert Counter objects to lists of tuples
severe_list = list(severe_word_counts.items())
nonsevere_list = list(nonsevere_word_counts.items())




NameError: name 'WordNetLemmatizer' is not defined

In [51]:
severe_list, nonsevere_list = get_distribution(bugs_df)

NameError: name 'WordNetLemmatizer' is not defined

In [37]:
def get_distribution(val,bugs_eclipse):
    """
    Data after preprocessing splitting into separate words

    Args:
        val: Preprocessed data of the training dataset
        training_data_df: training dataset dataframe
      
    Returns: Splitted words
    """
    bugs_eclipse['Summary'] = bugs_eclipse['Summary'].apply(lambda x: nlpsteps(x))
    records = bugs_eclipse[
        bugs_eclipse["Summary"].str.contains(val)
    ]
    
    if len(records) > 0:
        res = bugs_eclipse[
            training_data_df["Summary"].str.contains(val)
        ]["Severity"].value_counts(dropna=False)
        return dict(res)
    return None

In [38]:
def lexicon_preprocess(trainingdataset_length,training_data_df):
    """
    Create wordlists for severe and non severe from the preprocessed training dataset 

    Args:
        trainingdataset_length: size of training dataset
        training_data_df: training dataset dataframe
      
    Returns: a wordlist that has words from training dataset with its counts for severe and nonsevere
    """
    # print("trainingdataset_length",trainingdataset_length)
    # print("training_data_df",training_data_df)
    corpus_trainingdata = []
    all_data_df_ = []
       
    for i in range(0,trainingdataset_length):
        review = nlpsteps(str(training_data_df['Summary'][i]))
        corpus_trainingdata.append(review)
   

#     #Split words from the corpus
#     splittedWords = convert(corpus_trainingdata)
# #     print("splittedWords---------------", splittedWords)
    
#     splitted_words=getwordcounts(splittedWords)

#     #Converted collection.counter into dictionary
#     splitted_words_dict = dict(splitted_words)

#     keys = splitted_words_dict.keys()
    
#     all_data = {}
#     for key in keys:
#         res = get_distribution(key,training_data_df)
#         if res:
#             all_data[key] = res
#             all_data
#             all_data_df = pd.DataFrame(all_data)
       
#             print("--------------wordlists for severe and non-severe------------------------")
         
#             pd.set_option('display.max_columns', None)
#             print(all_data_df)
        

    return all_data_df 

In [39]:
payload_train = lexicon_preprocess(bugs_eclipse_length, bugs_df)

KeyError: 0

In [None]:
payload_train