In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Load datasets
df = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Encode the severity labels as integers; label_encoder is kept so that
# predictions can be mapped back to severity names with inverse_transform.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['severity'])

# Split the raw summaries into train, validation, and held-out sets BEFORE
# vectorizing, so the TF-IDF vocabulary and IDF weights are learned from the
# training rows only and nothing leaks from the validation/held-out rows.
temp_text, test_text, temp_y, test_y = train_test_split(
    df['summary'], y, test_size=0.3, random_state=42, stratify=y
)
train_text, val_text, train_y, val_y = train_test_split(
    temp_text, temp_y, test_size=0.2, random_state=42, stratify=temp_y
)

# Preprocess the data: fit on the training split, then only transform the rest.
# NOTE: despite the *_df suffix these are sparse TF-IDF matrices, not DataFrames;
# the names are kept because the following cell refers to them.
vectorizer = TfidfVectorizer(max_features=1000)
train_df = vectorizer.fit_transform(train_text)
val_df = vectorizer.transform(val_text)
temp_df = vectorizer.transform(temp_text)
X_test_df = vectorizer.transform(test_text)
X = vectorizer.transform(df['summary'])
X_test = vectorizer.transform(test_data['summary'])  # unlabeled bugs to predict

# Define the AdaBoost classifier with a DecisionTree base estimator.
# random_state makes the boosting (and therefore the grid search) reproducible.
ada_boost = AdaBoostClassifier(estimator=DecisionTreeClassifier(), random_state=42)

# Define the hyperparameters grid
param_grid = {
    'estimator__max_depth': [1, 2],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1]
}


In [None]:
# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=ada_boost, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(train_df, train_y)
print("Best parameters:", grid_search.best_params_)

# Get the best estimator (refitted on the whole training split by GridSearchCV)
best_ada_boost = grid_search.best_estimator_

# Evaluate on the validation set
val_pred = best_ada_boost.predict(val_df)
print('Validation Classification Report:')
print(classification_report(val_y, val_pred))

# Predict the severities for the test data
test_pred = best_ada_boost.predict(X_test)

# Correctly map predicted labels back using the label encoder
predicted_labels_mapped = label_encoder.inverse_transform(test_pred)

# Save the predictions
pred_df = pd.DataFrame({
    'bug_id': test_data['bug_id'],
    'severity': predicted_labels_mapped
})
# Single source of truth for the file name so the message below cannot drift
# away from the file that is actually written.
output_path = "adaboost_predicted_bugs2.csv"
pred_df.to_csv(output_path, index=False)

print(f"Predictions saved to '{output_path}'")

hello1
hello2
hello3
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        98
           1       0.78      0.58      0.67      2612
           2       0.00      0.00      0.00       620
           3       0.00      0.00      0.00       847
           4       0.00      0.00      0.00       434
           5       0.84      0.98      0.91     17620
           6       0.00      0.00      0.00       169

    accuracy                           0.84     22400
   macro avg       0.23      0.22      0.23     22400
weighted avg       0.76      0.84      0.79     22400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Predictions saved to 'adaboost_predicted_bugs.csv'


In [None]:
import regex
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Fetch the NLTK resources used below: the stop-word lists and the WordNet data.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared text-cleaning helpers: English stop words and a WordNet lemmatizer.
stop = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()

def filter_text(text, stop_words):
    """Normalise a bug summary into a space-separated string of lemmatised tokens.

    The text is lower-cased, tokenised with ``WordPunctTokenizer``, stripped of
    every character other than ``a-z``, filtered, and lemmatised as verbs with
    the module-level ``wordnet_lemmatizer``.

    Args:
        text: Raw summary text. Non-string values (for example the NaN pandas
            produces for an empty CSV field) are treated as empty text.
        stop_words: Collection of lower-case words to discard.

    Returns:
        The surviving lemmas joined by single spaces; ``""`` when nothing
        survives or ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing summaries arrive as float NaN; .lower() would raise on them.
        return ""

    word_tokens = WordPunctTokenizer().tokenize(text.lower())
    # One pass is enough: removing everything outside [a-z ] already removes
    # digits, and WordPunctTokenizer tokens never contain whitespace, so the
    # former "collapse spaces" and "strip digits" passes could not match.
    cleaned_tokens = [regex.sub('[^a-z ]+', '', token) for token in word_tokens]
    # Keep tokens of 3..49 characters that are not stop words, then lemmatise.
    lemmas = [
        wordnet_lemmatizer.lemmatize(token, 'v')
        for token in cleaned_tokens
        if token not in stop_words and 2 < len(token) < 50
    ]
    return " ".join(lemmas)

# Load the labelled bugs. 'trivial' is not one of the target classes, so those
# rows are removed; .copy() gives an independent frame so adding columns below
# does not write into a slice of the original.
df = pd.read_csv("bugs-train.csv")
df = df[df['severity'] != "trivial"].copy()  # not in the given classes
df["filtered_text"] = df["summary"].apply(lambda x: filter_text(x, stop))

# Integer class id for every severity that is modelled. Severities missing
# from this mapping become NaN and are removed by dropna() below.
severity_mapping = {'enhancement': 1, 'minor': 2, 'normal': 3, 'major': 4, 'blocker': 5, 'critical': 6}
df['classification'] = df['severity'].map(severity_mapping)

# Treat infinities as missing, drop incomplete rows, and only then cast the
# class ids to int (the cast is assigned back; NaN cannot be cast to int).
df = (
    df.replace([np.inf, -np.inf], np.nan)
      .dropna()
      .astype({'classification': int})
)

# 70/30 split into working/held-out data, then 80/20 into train/validation,
# both stratified on the class id.
temp_df, test_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['classification'])
train_df, val_df = train_test_split(temp_df, test_size=0.2, random_state=42, stratify=temp_df['classification'])

from sklearn.feature_extraction.text import TfidfVectorizer

# The TF-IDF vocabulary and weights are learned from the training split only;
# the validation and held-out splits are projected into that same feature space.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_features=17673,
    use_idf=True,
)
x_train = tfidf.fit_transform(train_df['filtered_text'])
x_val, x_test = (tfidf.transform(split['filtered_text']) for split in (val_df, test_df))

# Matching integer class labels for each split.
y_train, y_val, y_test = (split['classification'] for split in (train_df, val_df, test_df))

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def evaluate_split(model, features, labels, split_name):
    """Print accuracy and a classification report and plot the confusion matrix.

    Args:
        model: Fitted classifier exposing ``predict`` and ``classes_``.
        features: Feature matrix of the split to evaluate.
        labels: True class labels of that split.
        split_name: Human-readable split name used in the printed headers and
            in the figure title (e.g. "Validation").

    Returns:
        Tuple ``(predictions, confusion_matrix_array, display)``.
    """
    predicted = model.predict(features)
    print(f"{split_name} Accuracy: ", accuracy_score(labels, predicted))
    print(f"{split_name} Classification Report:\n", classification_report(labels, predicted))

    # Pin the row/column order to model.classes_ so the matrix always lines up
    # with display_labels, even if a class is absent from this split.
    matrix = confusion_matrix(labels, predicted, labels=model.classes_)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                        display_labels=model.classes_)
    cm_display.plot()
    cm_display.ax_.set_title(f"{split_name} confusion matrix")
    plt.show()
    return predicted, matrix, cm_display


# Initialize and fit the AdaBoost model
adaboost_model = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost_model.fit(x_train, y_train)

# Same evaluation for both splits; cm/disp end up holding the test-split results.
y_val_pred, cm, disp = evaluate_split(adaboost_model, x_val, y_val, "Validation")
y_test_pred, cm, disp = evaluate_split(adaboost_model, x_test, y_test, "Test")

# Load the unlabeled bugs and clean their summaries exactly like the training data.
pred_df = pd.read_csv("bugs-test.csv")
pred_df['filtered_text'] = pred_df["summary"].apply(lambda x: filter_text(x, stop))

# Reuse the vectorizer that was fitted on the training split. Fitting a fresh
# TfidfVectorizer here would build a different vocabulary, so its columns would
# not correspond to the features adaboost_model was trained on.
# A separate name is used so the held-out x_test matrix is not overwritten.
x_pred = tfidf.transform(pred_df['filtered_text'])

predictions = adaboost_model.predict(x_pred)

# Map the integer class ids back to severity names.
severity_mapping = {'enhancement': 1, 'minor': 2, 'normal': 3, 'major': 4, 'blocker': 5, 'critical': 6}
inverse_severity_mapping = {v: k for k, v in severity_mapping.items()}
predictions_labels = [inverse_severity_mapping[pred] for pred in predictions]

# Keep only the id and the predicted severity name in the output file.
pred_df = pd.DataFrame({
    'bug_id': pred_df['bug_id'],
    'prediction': predictions_labels
})
pred_df.to_csv("predicted_bugs.csv", index=False)

print("Predictions saved to 'predicted_bugs.csv'")
