# Data Processing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import WeightedRandomSampler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import re
import numpy as np

## Necessary Functions

In [None]:
# Fetch every NLTK resource requested by this notebook, one download call each
for resource_name in ('stopwords', 'wordnet', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(resource_name)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a treebank POS tag into the corresponding wordnet constant.

    Tags starting with 'J', 'V', 'N' or 'R' give wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag gives None.
    """
    # Leading tag letter -> name of the attribute to read from `wordnet`
    attribute_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    for prefix, attribute in attribute_by_prefix.items():
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None  # No wordnet equivalent for this tag

# The stop-word set and the lemmatizer are the same for every document, so they
# are created on first use and reused instead of being rebuilt for each row.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def process_text(text):
    """Tokenize `text`, drop English stop words and lemmatize what remains.

    Parameters
    ----------
    text : str
        The string to process (this notebook passes the output of `clean_text`).

    Returns
    -------
    list of str
        Lower-cased lemmas, in their original order, for every token that is
        not an English stop word.
    """
    stop_words, lemmatizer = _get_text_resources()

    # POS-tag the tokens in their original casing
    tagged_tokens = pos_tag(nltk.word_tokenize(text))

    lemmatized_tokens = []
    for word, tag in tagged_tokens:
        lowered = word.lower()
        if lowered in stop_words:
            continue
        pos = get_wordnet_pos(tag)
        # Lemmatize the lower-cased form. Passing the original casing left
        # capitalised words untouched ("Lines" came out as "lines", not "line").
        if pos:
            lemmatized_word = lemmatizer.lemmatize(lowered, pos)
        else:
            lemmatized_word = lemmatizer.lemmatize(lowered)
        lemmatized_tokens.append(lemmatized_word.lower())

    return lemmatized_tokens

def clean_text(text):
    """Normalise a raw bug summary into plain ASCII words separated by single spaces.

    Steps, in order: expand the stand-alone abbreviations js / os / css (any
    casing), drop non-ASCII characters, replace every character that is not a
    letter, digit, apostrophe or whitespace with a space, and collapse runs of
    whitespace.

    Parameters
    ----------
    text : str, None or float NaN
        The raw summary. Missing values (None / NaN) are treated as empty text;
        any other non-string value is converted with `str`.

    Returns
    -------
    str
        The cleaned summary (original letter casing is preserved).
    """
    # pandas represents a missing cell as None or float NaN; re.sub would raise on it
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Explicit character classes keep the match ASCII-only while covering every
    # casing (the previous alternations missed e.g. "Css", "jS" and "oS").
    text = re.sub(r'\b[Jj][Ss]\b', 'javascript', text)
    text = re.sub(r'\b[Oo][Ss]\b', 'osx', text)
    text = re.sub(r'\b[Cc][Ss][Ss]\b', 'cascadingstylesheet', text)
    # Silently drop anything that cannot be represented in ASCII
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r"[^a-zA-Z0-9'\s]", " ", text)
    text = " ".join(text.split())
    return text




## Preprocessing Step

In [None]:

# Read the training set, then derive three columns from `summary`:
# cleaned string -> lemmatized token list -> tokens joined back with spaces
bugs_data = pd.read_csv('bugs-train.csv')
bugs_data['clean_summary'] = bugs_data['summary'].map(clean_text)
bugs_data['lemmatized_tokens'] = bugs_data['clean_summary'].map(process_text)
bugs_data['processed_text'] = bugs_data['lemmatized_tokens'].map(' '.join)


In [None]:
# Fit the encoder on the severity labels, then store their integer codes as a new column
label_encoder = LabelEncoder().fit(bugs_data['severity'])
bugs_data['encoded_severity'] = label_encoder.transform(bugs_data['severity'])

In [None]:
# Preview the first rows, including the derived text columns and the encoded severity
bugs_data.head()

Unnamed: 0,bug_id,summary,severity,clean_summary,lemmatized_tokens,processed_text,encoded_severity
0,365569,Remove workaround from bug 297227,normal,Remove workaround from bug 297227,"[remove, workaround, bug, 297227]",remove workaround bug 297227,5
1,365578,Print Preview crashes on any URL in gtk2 builds,critical,Print Preview crashes on any URL in gtk2 builds,"[print, preview, crash, url, gtk2, build]",print preview crash url gtk2 build,1
2,365582,Lines are not showing in table,major,Lines are not showing in table,"[lines, show, table]",lines show table,3
3,365584,Firefox render ÛÏsimplified ArabicÛ font fa...,normal,Firefox render simplified Arabic font face inc...,"[firefox, render, simplify, arabic, font, face...",firefox render simplify arabic font face incor...,5
4,365597,Crash [@ nsINodeInfo::NodeInfoManager],critical,Crash nsINodeInfo NodeInfoManager,"[crash, nsinodeinfo, nodeinfomanager]",crash nsinodeinfo nodeinfomanager,1


In [None]:
# Count how many bug reports fall into each severity class
bugs_data['severity'].value_counts()

severity
normal         125854
critical        18658
major            6053
enhancement      4426
minor            3102
trivial          1204
blocker           701
Name: count, dtype: int64

# Try 1 - Neural Network

In [None]:
# Seed torch so weight initialisation and batch shuffling are repeatable
torch.manual_seed(42)

texts = bugs_data['processed_text']
labels = bugs_data['severity']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF weights
# are learned from the training rows only (fitting on all rows leaks hold-out
# statistics into the features).
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(texts_train).toarray()
X_test = vectorizer.transform(texts_test).toarray()

X_train_torch = torch.tensor(X_train, dtype=torch.float32)
X_test_torch = torch.tensor(X_test, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.long)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

train_data = TensorDataset(X_train_torch, y_train_torch)
# Reshuffle every epoch so mini-batches are not presented in the same order each time
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

In [None]:
import torch.nn as nn

class TextClassifier(nn.Module):
    """Fully connected classifier: three Linear/ReLU/Dropout stages, then a log-softmax output.

    Hidden widths are 512, 256 and 128; every dropout uses p=0.5. `forward`
    returns log-probabilities over `num_classes` (LogSoftmax along dim 1).
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        # Stage 1: num_features -> 512
        self.fc1 = nn.Linear(num_features, 512)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)
        # Stage 2: 512 -> 256
        self.fc2 = nn.Linear(512, 256)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.5)
        # Stage 3: 256 -> 128
        self.fc3 = nn.Linear(256, 128)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.5)
        # Output: 128 -> num_classes, normalised to log-probabilities
        self.fc4 = nn.Linear(128, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Run `x` through the three hidden stages and return log-probabilities."""
        hidden_stages = (
            (self.fc1, self.relu1, self.dropout1),
            (self.fc2, self.relu2, self.dropout2),
            (self.fc3, self.relu3, self.dropout3),
        )
        for linear, activation, dropout in hidden_stages:
            x = dropout(activation(linear(x)))
        return self.softmax(self.fc4(x))

# Size the input layer from the actual feature matrix instead of repeating the
# vectorizer's max_features by hand (the two would silently disagree if the
# vocabulary ends up smaller or max_features is changed).
model = TextClassifier(num_features=X_train_torch.shape[1], num_classes=len(label_encoder.classes_))


In [None]:
import torch.optim as optim

# Loss and optimizer
# NLLLoss pairs with the LogSoftmax output produced by TextClassifier.forward
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
# Put the model in training mode explicitly: a later cell calls model.eval(),
# so re-running this cell would otherwise train with dropout disabled.
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    samples_seen = 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # criterion averages over the batch, so weight by batch size to get a true epoch mean
        batch_size = targets.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size
    # Report the mean loss over the whole epoch rather than the last mini-batch only
    print(f'Epoch {epoch+1}, Loss: {running_loss / max(samples_seen, 1)}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score

# Switch to evaluation mode before scoring the hold-out split
model.eval()

# Prediction: take the highest-scoring class for every hold-out row
with torch.no_grad():
    outputs = model(X_test_torch)
    predicted = outputs.argmax(dim=1)

precision = precision_score(y_test_torch.numpy(), predicted.numpy(), average='macro')
print(f'Macro Precision: {precision}')

cm = confusion_matrix(y_test_torch.numpy(), predicted.numpy())
print("Confusion Matrix:")
print(cm)

# Heatmap of the confusion matrix, labelled with the original severity names
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

# Try 1 - Predictions

## Cleaning the test data

In [None]:
# Read the test set and derive the same three text columns as for the training data
df_test = pd.read_csv('bugs-test.csv')
df_test['clean_summary'] = df_test['summary'].map(clean_text)
df_test['lemmatized_tokens'] = df_test['clean_summary'].map(process_text)
df_test['processed_text'] = df_test['lemmatized_tokens'].map(' '.join)

## Make predictions

In [None]:
# Vectorize the test text with the already-fitted `vectorizer` (transform only, no refit)
tfidf_test = vectorizer.transform(df_test['processed_text'].fillna(" "))  # Replace NaN with a single-space string (note: a space, not an empty string)

In [None]:
# Densify the TF-IDF matrix and convert it to a float32 tensor for the network
test_tensor = torch.from_numpy(tfidf_test.toarray()).float()

model.eval()
with torch.no_grad():
    outputs = model(test_tensor)
    # Index of the largest log-probability per row = predicted class id
    predicted = outputs.argmax(dim=1)


In [None]:
# Map predicted class ids back to severity names and attach them to the test frame
predicted_labels = label_encoder.inverse_transform(predicted.numpy())
df_test['severity'] = predicted_labels


In [None]:
# Remove the raw and intermediate text columns before exporting
df_test = df_test.drop(columns=['summary', 'clean_summary', 'lemmatized_tokens', 'processed_text'])

In [None]:
# Write the remaining columns (including the predicted `severity`) to CSV without the index
df_test.to_csv('updated_bugs_test.csv', index=False)

# Try 2 - Stack Ensemble

In [None]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, classification_report
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split


# NLTK resources requested for this attempt
for resource_name in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource_name)

# Reload the training data from disk for this attempt
file_path = 'bugs-train.csv'
bugs_data = pd.read_csv(file_path)

# Shared by clean_text below: built once at module level
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Lower-case `text`, strip disallowed characters, drop stop words and lemmatize.

    Characters other than letters, digits and / : _ - [ ] are replaced by
    spaces before tokenizing. Relies on the module-level `stop_words` set and
    `lemmatizer`. Returns the surviving lemmas joined by single spaces.

    NOTE: this redefines the `clean_text` declared earlier in the notebook;
    cells run after this one get this version.
    """
    normalized = re.sub(r'[^a-zA-Z0-9/:_\-\[\]]', ' ', text.lower())

    kept_lemmas = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_lemmas)

# Text -> cleaned text
bugs_data['cleaned_summary'] = bugs_data['summary'].map(clean_text)

# Severity names -> integer targets
label_encoder = LabelEncoder()
bugs_data['severity_encoded'] = label_encoder.fit_transform(bugs_data['severity'])
y = bugs_data['severity_encoded']

# Cleaned text -> uni/bi-gram TF-IDF (at most 5000 features) -> scaling without centering
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
preprocessor = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('scaler', StandardScaler(with_mean=False)),
])
X_transformed = preprocessor.fit_transform(bugs_data['cleaned_summary'])

# Level-0 learners: random forest, gradient-boosted trees and a one-hidden-layer MLP
base_models = [
    (
        'rf',
        RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42, n_jobs=-1),
    ),
    (
        'xgb',
        XGBClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=5,
            use_label_encoder=False,
            eval_metric='mlogloss',
            random_state=42,
            n_jobs=-1,
        ),
    ),
    (
        'mlp',
        MLPClassifier(hidden_layer_sizes=(100,), alpha=0.001, max_iter=300, random_state=42),
    ),
]

# Level-1 learner: logistic regression over the base models' outputs
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(max_iter=1000, n_jobs=-1),
)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate on the cleaned text with preprocessing inside the pipeline, so
# the TF-IDF vocabulary/IDF weights and the scaler are re-fitted on each
# training fold only. Scoring `X_transformed` directly would evaluate on
# features whose statistics were learned from the validation rows as well.
# cross_val_score works on clones, so `preprocessor` and `stacking_model`
# themselves are left untouched by this step.
cv_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('stack', stacking_model),
])
cross_val_scores = cross_val_score(
    cv_pipeline,
    bugs_data['cleaned_summary'],
    y,
    cv=kf,
    scoring='precision_macro',
    n_jobs=-1,
)

print(f'Cross-validation scores (Macro Precision): {cross_val_scores}')
print(f'Average cross-validation score (Macro Precision): {cross_val_scores.mean()}')

# Final model used for the test-set predictions: trained on all rows
stacking_model.fit(X_transformed, y)

# Try 2 - Predictions


## Cleaning the test data



In [None]:
# Read the test set and clean its summaries with the same `clean_text`
test_file_path = 'bugs-test.csv'
bugs_test_data = pd.read_csv(test_file_path)
bugs_test_data['cleaned_summary'] = bugs_test_data['summary'].map(clean_text)

## Make Predictions


In [None]:
# Apply the fitted preprocessing, predict class ids, then decode them to severity names
X_test_data_transformed = preprocessor.transform(bugs_test_data['cleaned_summary'])
predicted_severity_encoded = stacking_model.predict(X_test_data_transformed)
predicted_severity = label_encoder.inverse_transform(predicted_severity_encoded)

# Two-column submission frame: bug_id plus predicted severity
output_data = bugs_test_data[['bug_id']].assign(severity=predicted_severity)

output_file_path = 'bugs-test-predicted-10.csv'
output_data.to_csv(output_file_path, index=False)

print(f"Output saved to: {output_file_path}")

# Try 3 - Stacking with LGBM and Random Forest

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.stats import randint as sp_randint

# NLTK resources requested for this attempt
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name)

# English stop words as a set, used by `preprocess` for membership tests
english_stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Lower-case `text`, expand js/os abbreviations, strip punctuation and stop words.

    Parameters
    ----------
    text : str
        Raw bug summary.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Relies on the
        module-level `english_stop_words` set.
    """
    text = text.lower()
    text = re.sub(r'\bjs\b', ' javascript ', text)
    # The two-word form must be handled before the bare "os" rule: otherwise
    # "os x" is first turned into "osx x" and this pattern can never match.
    text = re.sub(r'\bos x\b', ' osx ', text)
    text = re.sub(r'\bos\b', ' osx ', text)
    text = re.sub(r'js_', ' javascript ', text)
    text = re.sub(r'@ js', ' javascript ', text)
    # Delete (not space out) everything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in english_stop_words]
    return ' '.join(filtered_tokens)

data = pd.read_csv('bugs-train.csv')

data['summary_preprocessed'] = data['summary'].apply(preprocess)

# Severity names -> integer class ids
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(data['severity'])

vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(data['summary_preprocessed'])

# Stratify on the target: the severity classes are heavily imbalanced, and an
# unstratified split can leave the rare classes under-represented in either part.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Base learners for the stack: a random forest and a LightGBM classifier
estimators = [
    ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('lgbm', lgb.LGBMClassifier(random_state=42)),
]

# Logistic regression combines the base learners' predict_proba outputs (5-fold internal CV)
stack_model = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(), stack_method='predict_proba', cv=5
)

# Search space for RandomizedSearchCV; keys use the `<estimator name>__<param>`
# form to reach the base learners inside the stack.
param_dist = {
    'random_forest__n_estimators': sp_randint(50, 200),
    # 'auto' is intentionally omitted: current scikit-learn releases reject it
    # for RandomForestClassifier, so any candidate that sampled it would fail to
    # fit. For classifiers it was an alias of 'sqrt', which is still listed.
    'random_forest__max_features': ['sqrt', 'log2'],
    'random_forest__max_depth': sp_randint(3, 20),
    'random_forest__min_samples_split': sp_randint(2, 11),
    'random_forest__min_samples_leaf': sp_randint(1, 11),
    'lgbm__num_leaves': sp_randint(20, 40),
    'lgbm__max_depth': sp_randint(3, 15),
    'lgbm__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'lgbm__n_estimators': sp_randint(50, 200)
}

# Randomized hyper-parameter search over the stack: 3 sampled candidates, 3-fold CV, accuracy
search_settings = dict(
    n_iter=3,
    scoring='accuracy',
    cv=3,
    random_state=42,
    verbose=10,
    n_jobs=-1,
)
random_search = RandomizedSearchCV(stack_model, param_dist, **search_settings)

random_search.fit(X_train, y_train)

# Score the best stack found by the search on the hold-out split
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

best_accuracy = accuracy_score(y_test, y_pred)
best_conf_matrix = confusion_matrix(y_test, y_pred)
best_class_report = classification_report(y_test, y_pred)

print(f'Best Accuracy: {best_accuracy}')
for heading, value in (
    ('Best Confusion Matrix:', best_conf_matrix),
    ('Best Classification Report:', best_class_report),
):
    print(heading)
    print(value)

# Try 3 - Predictions

## Cleaning the test data

In [None]:
import pandas as pd

# Read the test set and run every summary through the same `preprocess` function
test_data = pd.read_csv('bugs-test.csv')
test_data['summary_preprocessed'] = test_data['summary'].map(preprocess)

## Make predictions

In [None]:
# Vectorize with the fitted TF-IDF, predict class ids, then decode them to severity names
X_test_new = vectorizer.transform(test_data['summary_preprocessed'])
predicted_severity_encoded = best_model.predict(X_test_new)
predicted_severity = encoder.inverse_transform(predicted_severity_encoded)

# Two-column submission frame: bug_id plus predicted severity
result_df = test_data[['bug_id']].assign(severity=predicted_severity)
result_df.to_csv('predicted_severity.csv', index=False)

print("Predictions saved to 'predicted_severity.csv'.")
