In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sample data
categories = ['Severe', 'Non-severe']
eclipse_bugs = [30, 70]  # Example data for Eclipse
firefox_bugs = [45, 55]  # Example data for Firefox

x = np.arange(len(categories))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
bars1 = ax.bar(x - width/2, eclipse_bugs, width, label='Eclipse')
bars2 = ax.bar(x + width/2, firefox_bugs, width, label='Firefox')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_xlabel('Bug Type')
ax.set_ylabel('Number of Bugs')
ax.set_title('Data Distribution in Eclipse and Firefox')
ax.set_xticks(x)
ax.set_xticklabels(categories)
ax.legend()

plt.show()


In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import helper

[nltk_data] Downloading package wordnet to /Users/abyte/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/abyte/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/abyte/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/abyte/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [10]:
bugs_df= pd.read_csv("bugs_eclipse.csv")



# Dropped rows with severity level '--'
bugs_df = bugs_df[bugs_df["Severity"].str.contains("--")==False].reset_index()

# #Dropped rows with Type "Enhancement" and "Task" because they are not a bug but a new feature
# indexSevere = bugs_df[ (bugs_df['Type'] == 'enhancement') & (bugs_df['Type'] == 'enhancement') ].index
# bugs_df.drop(indexSevere , inplace=True)

# indexSevere = bugs_df[(bugs_df['Type'] == 'task') & (bugs_df['Type'] == 'task') ].index
# bugs_df.drop(indexSevere , inplace=True)


#Catagorise the severity level into a Severe and Non Severe to make it a binary problem
bugs_df.loc[bugs_df["Severity"] == "blocker", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "critical", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "major", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "S1", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "S2", "Severity"] = 'Severe'
bugs_df.loc[bugs_df["Severity"] == "S3", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "normal", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "minor", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "trivial", "Severity"] = 'NonSevere'
bugs_df.loc[bugs_df["Severity"] == "S4", "Severity"] = 'NonSevere'

bugs_df = bugs_df.tail(200)
print(bugs_df)

       index  Bug ID             Product           Component  \
30954  30954  100010   WTP Java EE Tools            jst.j2ee   
30955  30955  159541          WTP Releng              releng   
30956  30956  352018  WTP Source Editing            wst.html   
30957  30957   95328     WTP ServerTools          wst.server   
30958  30958  103072     WTP Webservices              wst.ws   
...      ...     ...                 ...                 ...   
31149  31149  196418  WTP Source Editing             jst.jsp   
31150  31150  102610           Web Tools  Web Standard Tools   
31151  31151   80876  WTP Source Editing            wst.html   
31152  31152  102552           Web Tools  Web Standard Tools   
31153  31153  297974  WTP Source Editing             wst.xsd   

                                Assignee    Status  Resolution  \
30954                nagrawal@us.ibm.com    CLOSED  WORKSFORME   
30955  webtools.releng-inbox@eclipse.org       NEW         ---   
30956         wst.html-inbox@ecli

In [11]:
# Apply the preprocessing function to the 'Summary' column
bugs_df['Processed_Summary'] = bugs_df['Summary'].apply(lambda x: helper.nlpsteps(x))


In [12]:
bugs_df['Lowered_Summary'] = bugs_df['Processed_Summary'].apply(lambda x: x.lower())

In [13]:
print(bugs_df['Lowered_Summary'])

30954    new module project wizard use configured serve...
30955                               test plugins qualifier
30956    jsf facelet code completion suggest html tag e...
30957    performance lauching server wizard activates m...
30958      wsexplorer request response envelope borderless
                               ...                        
31149                   tag library editor feature request
31150                                      jaxb code assit
31151    unit test comparison based workspace not_files...
31152                       rsp rich server side plateform
31153                    allow xsd editor used outside ide
Name: Lowered_Summary, Length: 200, dtype: object


In [15]:
# Assuming you've already imported necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

# Initialize CountVectorizer
cv = CountVectorizer()
cv.fit(bugs_df['Lowered_Summary'])

# Transform the processed summaries
X_train = cv.transform(bugs_df['Lowered_Summary'])

print(bugs_df)
# Assuming 'target' is correctly defined
target = bugs_df.iloc[:, -3].values
print("------target-----", target)


# Initialize LinearSVC
svm = LinearSVC()

# Fit the model
svm.fit(X_train, target)

# Get the coefficients from the trained SVM model
coef = svm.coef_.ravel()

# Get the feature names from CountVectorizer
feature_names = cv.get_feature_names()

# Create a dictionary mapping feature names to coefficients
word_coefficients = {feature_names[i]: coef[i] for i in range(len(feature_names))}

# print(word_coefficients)

# # Print the word list and their coefficients
# for word, coefficient in word_coefficients.items():
# #     print(f"{word}: {coefficient:.4f}")

#     if coefficient < 0:
#         severe_lexicons[word] = {"ratio": coefficient}
#     else:
#         non_severe_lexicons[word] = {"ratio": coefficient}

# result = {
#     "Severe Lexicons": severe_lexicons,
#     "NonSevere Lexicon": non_severe_lexicons
# }

# print(result)


# severe_lexicons = {}
# non_severe_lexicons = {}

# for word, coefficient in word_coefficients.items():
#     if coefficient < 0:
#         severe_lexicons[word] = {"ratio": coefficient}
#     else:
#         non_severe_lexicons[word] = {"ratio": coefficient}

# result = {
#     "Severe Lexicons": severe_lexicons,
#     "NonSevere Lexicon": non_severe_lexicons
# }

# print(result)


# print(word_coefficients)

       index  Bug ID             Product           Component  \
30954  30954  100010   WTP Java EE Tools            jst.j2ee   
30955  30955  159541          WTP Releng              releng   
30956  30956  352018  WTP Source Editing            wst.html   
30957  30957   95328     WTP ServerTools          wst.server   
30958  30958  103072     WTP Webservices              wst.ws   
...      ...     ...                 ...                 ...   
31149  31149  196418  WTP Source Editing             jst.jsp   
31150  31150  102610           Web Tools  Web Standard Tools   
31151  31151   80876  WTP Source Editing            wst.html   
31152  31152  102552           Web Tools  Web Standard Tools   
31153  31153  297974  WTP Source Editing             wst.xsd   

                                Assignee    Status  Resolution  \
30954                nagrawal@us.ibm.com    CLOSED  WORKSFORME   
30955  webtools.releng-inbox@eclipse.org       NEW         ---   
30956         wst.html-inbox@ecli



In [None]:
# def plot_coefficients(classifier, feature_names, top_features=20):
#     coef = classifier.coef_.ravel()
#     top_positive_coefficients = np.argsort(coef)[-top_features:]
#     top_negative_coefficients = np.argsort(coef)[:top_features]
#     top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

#     # create plot
#     plt.figure(figsize=(15, 5))
#     colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
#     plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
#     feature_names = np.array(feature_names)
#     plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
#     plt.show()

# cv = CountVectorizer()
# cv.fit(bugs_df['Summary'])
# print("length", len(cv.vocabulary_))
# print("Feature Names", cv.get_feature_names())
# X_train = cv.transform(bugs_df['Summary'])
# target = bugs_df.iloc[:, -2].values
# print("target",target)

# svm = LinearSVC()
# svm.fit(X_train, target)
# plot_coefficients(svm, cv.get_feature_names())

# #Get the coefficients from the trained SVM model
# coef = svm.coef_.ravel()

# # Get the feature names from the CountVectorizer
# feature_names = cv.get_feature_names()

# # Create a dictionary mapping feature names to coefficients
# word_coefficients = {feature_names[i]: coef[i] for i in range(len(feature_names))}

# # Print the word list and their coefficients
# for word, coefficient in word_coefficients.items():
#     print(f"{word}: {coefficient:.4f}")


In [None]:
# # [ 1  2  8  9 11 12 16 17 18 19]
# def plot_coefficients(classifier, feature_names, top_features=30):
#     coef = classifier.coef_.ravel()
#     top_positive_coefficients = np.argsort(coef)[-top_features:]
#     top_negative_coefficients = np.argsort(coef)[:top_features]
#     top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

#     # create plot
#     plt.figure(figsize=(15, 5))
#     colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
#     plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
#     feature_names = np.array(feature_names)
#     plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
#     plt.show()

# cv = CountVectorizer()
# cv.fit(bugs_df['Summary'])
# X_train = cv.transform(bugs_df['Summary'])
# # print(bugs_df['Summary'])
# target = bugs_df.iloc[:, -2].values
# # print(target)

# # Separate the data into severe and non-severe classes
# severe_indices = np.where(target == 'Severe')[0]
# non_severe_indices = np.where(target == 'NonSevere')[0]

# print("severe_indices", severe_indices)
# print("non_severe_indices", non_severe_indices)

# # Train separate SVM models for severe and non-severe classes
# svm_severe = LinearSVC()
# svm_severe.fit(X_train[severe_indices], target[severe_indices])
# print(svm_severe)

# svm_non_severe = LinearSVC()
# svm_non_severe.fit(X_train[non_severe_indices], target[non_severe_indices])

# # Get the coefficients from the trained SVM models
# coef_severe = svm_severe.coef_.ravel()
# coef_non_severe = svm_non_severe.coef_.ravel()

# # Create dictionaries mapping feature names to coefficients for severe and non-severe classes
# word_coefficients_severe = {feature_names[i]: coef_severe[i] for i in range(len(feature_names))}
# word_coefficients_non_severe = {feature_names[i]: coef_non_severe[i] for i in range(len(feature_names))}

# # Print the word list and their coefficients for both classes
# for word in feature_names:
#     print(f"{word}: Severe={word_coefficients_severe.get(word, 0):.4f}, Non-Severe={word_coefficients_non_severe.get(word, 0):.4f}")
