In [6]:
#!pip install regex
!pip install nltk
#!pip install wordcloud

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
                                              0.0/1.5 MB ? eta -:--:--
                                              0.0/1.5 MB 1.4 MB/s eta 0:00:02
     --                                       0.1/1.5 MB 1.3 MB/s eta 0:00:02
     -----                                    0.2/1.5 MB 1.5 MB/s eta 0:00:01
     ------                                   0.3/1.5 MB 1.8 MB/s eta 0:00:01
     ------                                   0.3/1.5 MB 1.8 MB/s eta 0:00:01
     ------                                   0.3/1.5 MB 1.8 MB/s eta 0:00:01
     ------                                   0.3/1.5 MB 1.8 MB/s eta 0:00:01
     --------------------                     0.8/1.5 MB 2.1 MB/s eta 0:00:01
     ---------------------                    0.8/1.5 MB 1.9 MB/s eta 0:00:01
     ----------------------                   0.9/1.5 MB 1.8 MB/s eta 0:00:01
     --------------------------               1.0/1.5 MB 1.9 MB/s eta 0:00:01
    


[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import os

import nltk
import numpy as np
import pandas as pd
import regex
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from wordcloud import WordCloud

# Fetch the NLTK resources used by the stop-word list and the lemmatizer
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Shared objects consumed by filter_text and the cells that call it
stop = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()

def filter_text(text, stop_words):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    The text is lower-cased and tokenized, every character outside ``a-z`` is
    stripped from each token, tokens that are stop words or whose length is
    not strictly between 2 and 50 are dropped, and the survivors are
    lemmatized as verbs with the module-level ``wordnet_lemmatizer``.

    Args:
        text: The raw text to clean. Non-string input (e.g. the NaN produced
            for an empty CSV cell) is treated as empty text.
        stop_words: Iterable of lower-case words to discard.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` if nothing survives.
    """
    # Non-string values have no .lower(); return empty text instead of raising.
    if not isinstance(text, str):
        return ''

    # Membership is tested once per token, so use a set rather than scanning a list.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)

    word_tokens = WordPunctTokenizer().tokenize(text.lower())

    # One pass is sufficient: removing everything outside [a-z ] already removes
    # digits, and the tokenizer splits on whitespace so a token cannot contain
    # a run of spaces. The former '[ ][ ]+' and '[0-9]' passes could never match.
    cleaned = (regex.sub('[^a-z ]+', '', w) for w in word_tokens)

    return ' '.join(
        wordnet_lemmatizer.lemmatize(w, 'v')
        for w in cleaned
        if w not in stop_words and 2 < len(w) < 50
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\90553\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\90553\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\90553\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [16]:
# Read the training and test CSV files
train_data = pd.read_csv('../datasets/bugs-train.csv')
test_data = pd.read_csv('../datasets/bugs-test.csv')

# Model input is the 'summary' column; the target is the 'severity' column
X, y = (train_data[column].values for column in ('summary', 'severity'))


In [17]:
# Run filter_text over the training summaries and the test summaries.
# The tuple of inputs is built before X is rebound, so the first pass still
# reads the original training texts.
X, test_summaries = (
    np.array([filter_text(document, stop) for document in documents])
    for documents in (X, test_data['summary'].values)
)


In [18]:
# Turn the severity labels into integer ids, then into one-hot rows
label_encoder = LabelEncoder()
y = to_categorical(label_encoder.fit_transform(y))

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the vocabulary on the training split only
tokenizer = Tokenizer(num_words=5000, lower=True, oov_token='UNK')
tokenizer.fit_on_texts(X_train)

# Convert every split to integer sequences and bring them to a common length.
# The tuple of inputs is built before the names are rebound, so each split is
# transformed from its text form exactly once.
max_seq_length = 100
X_train, X_val, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_seq_length)
    for texts in (X_train, X_val, test_summaries)
)

In [19]:
# Build the model with L2 regularization and adjusted dropout rates.
# Layer sizes are derived from the preprocessing objects so they cannot drift
# out of sync with the tokenizer vocabulary or the number of encoded classes.
vocab_size = tokenizer.num_words
num_classes = y_train.shape[1]

model = Sequential()
# `input_length` is intentionally not passed: the LSTM does not need a static
# sequence length and the argument is deprecated in Keras 3.
model.add(Embedding(input_dim=vocab_size, output_dim=128))
model.add(SpatialDropout1D(0.3))  # Adjusted dropout rate
model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.3, kernel_regularizer=l2(0.01)))  # Added L2 regularization and adjusted dropout rates
model.add(Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.01)))  # Added L2 regularization to the dense layer

model.compile(loss='categorical_crossentropy', optimizer=AdamW(learning_rate=0.001, weight_decay=1e-4), metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=7, batch_size=2048, validation_data=(X_val, y_val), verbose=1)

# Evaluate the model on the held-out validation split
results = model.evaluate(X_val, y_val, verbose=1)
print(f"Validation Loss: {results[0]}")
print(f"Validation Accuracy: {results[1]}")

# NOTE(review): these two mappings are not used anywhere in this cell; the
# predictions below are decoded with label_encoder instead. Confirm no later
# cell relies on them before removing.
severity_mapping = {'enhancement': 1, 'minor': 2, 'normal': 3, 'major': 4, 'blocker': 5, 'critical': 6, 'trivial': 7}
inverse_severity_mapping = {v: k for k, v in severity_mapping.items()}

# Make predictions: take the most probable class index for each test row
predictions = model.predict(X_test)
predicted_labels = predictions.argmax(axis=1)

# Map predicted class indices back to the original severity labels
predicted_labels_mapped = label_encoder.inverse_transform(predicted_labels)

# Save the predictions
pred_df = pd.DataFrame({
    'bug_id': test_data['bug_id'],
    'severity': predicted_labels_mapped
})
output_path = "./predictions/predicted2_bugs.csv"
# Create the target directory first so a missing folder does not discard the
# results of the training run above.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pred_df.to_csv(output_path, index=False)

# Report the path that was actually written
print(f"Predictions saved to '{output_path}'")



Epoch 1/7
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 2s/step - accuracy: 0.7402 - loss: 2.5260 - val_accuracy: 0.7912 - val_loss: 1.0797
Epoch 2/7
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 2s/step - accuracy: 0.7888 - loss: 0.9780 - val_accuracy: 0.8532 - val_loss: 0.6928
Epoch 3/7
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 2s/step - accuracy: 0.8468 - loss: 0.6899 - val_accuracy: 0.8570 - val_loss: 0.6351
Epoch 4/7
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 1s/step - accuracy: 0.8522 - loss: 0.6406 - val_accuracy: 0.8585 - val_loss: 0.6108
Epoch 5/7
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 1s/step - accuracy: 0.8515 - loss: 0.6269 - val_accuracy: 0.8579 - val_loss: 0.6081
Epoch 6/7
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 1s/step - accuracy: 0.8528 - loss: 0.6184 - val_accuracy: 0.8591 - val_loss: 0.5926
Epoch 7/7
[1m63/63[0m [32m━━━━━━━━━━━━━━━━