In [7]:

import os
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [8]:
# Project root that contains the `data/` folder. It can be overridden with the
# BUGS_PROJECT_DIR environment variable so the notebook is not tied to a single
# machine; the default keeps the original location.
PROJECT_DIR = os.environ.get('BUGS_PROJECT_DIR', '/Users/hayrettinsendag/Documents/GitHub/412proj')

# The working directory is changed (as before) because relative paths used by
# this notebook are resolved against the project root.
os.chdir(PROJECT_DIR)

# Load the training and test sets from the project's data folder.
train_data = pd.read_csv(os.path.join('data', 'bugs-train.csv'))
test_data = pd.read_csv(os.path.join('data', 'bugs-test.csv'))

In [9]:

# Integer encoding of the severity labels used as classification targets.
severity_mapping = {
    'enhancement': 0,
    'trivial': 1,
    'minor': 2,
    'normal': 3,
    'major': 4,
    'blocker': 5,
    'critical': 6
}

# Reverse lookup (integer code -> label), used to decode model predictions.
inverse_severity_mapping = {v: k for k, v in severity_mapping.items()}

# Encode the target column. The guard makes the cell safe to re-run: mapping an
# already-encoded column a second time would turn every value into NaN.
severity_raw = train_data['severity']
if severity_raw.isin(list(severity_mapping)).all():
    train_data['severity'] = severity_raw.map(severity_mapping)
elif not severity_raw.isin(list(inverse_severity_mapping)).all():
    # Neither all known labels nor all known codes: fail loudly instead of
    # letting unmapped values silently become NaN targets.
    unexpected = severity_raw[~severity_raw.isin(list(severity_mapping))].unique().tolist()
    raise ValueError(f"Unexpected severity values in training data: {unexpected}")

# Missing summaries are replaced with an empty string because TfidfVectorizer
# rejects NaN documents; rows that already hold text are unaffected.
X_train = train_data['summary'].fillna('')
y_train = train_data['severity']
X_test = test_data['summary'].fillna('')

# Fit the vocabulary (unigrams and bigrams, capped at 1000 features) on the
# training summaries only, then reuse it to transform the test summaries.
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [10]:

# Fit a multi-class XGBoost classifier on the TF-IDF features. The class count is
# derived from severity_mapping so it stays in sync with the label encoding
# instead of being a separate magic number.
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(severity_mapping))
xgb_model.fit(X_train_tfidf, y_train)

# NOTE: the predictions below are made on the same data the model was fitted on,
# so these metrics describe fit to the training set, not generalization.
y_pred_xgb = xgb_model.predict(X_train_tfidf)
print("XGBoost Classification Report:\n", classification_report(y_train, y_pred_xgb))
print("Macro Precision XGBoost:", precision_score(y_train, y_pred_xgb, average='macro'))


XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.03      0.05      4426
           1       1.00      0.02      0.03      1204
           2       0.97      0.01      0.02      3102
           3       0.87      0.98      0.92    125854
           4       0.82      0.09      0.15      6053
           5       1.00      0.13      0.23       701
           6       0.83      0.79      0.81     18658

    accuracy                           0.87    159998
   macro avg       0.92      0.29      0.32    159998
weighted avg       0.87      0.87      0.83    159998

Macro Precision XGBoost: 0.9196060414978662


In [11]:
# Train LightGBM model. The class count is derived from severity_mapping so it
# stays in sync with the label encoding instead of being a separate magic number.
lgb_model = lgb.LGBMClassifier(objective='multiclass', num_class=len(severity_mapping))
lgb_model.fit(X_train_tfidf, y_train)

# Predict and evaluate LightGBM model on the training data.
# NOTE: because the model is scored on the data it was fitted on, these metrics
# describe fit to the training set, not generalization.
y_pred_lgb = lgb_model.predict(X_train_tfidf)
print("LightGBM Classification Report:\n", classification_report(y_train, y_pred_lgb))
print("Macro Precision LightGBM:", precision_score(y_train, y_pred_lgb, average='macro'))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.127967 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 176138
[LightGBM] [Info] Number of data points in the train set: 159998, number of used features: 1000
[LightGBM] [Info] Start training from score -3.587665
[LightGBM] [Info] Start training from score -4.889512
[LightGBM] [Info] Start training from score -3.943114
[LightGBM] [Info] Start training from score -0.240039
[LightGBM] [Info] Start training from score -3.274607
[LightGBM] [Info] Start training from score -5.430409
[LightGBM] [Info] Start training from score -2.148886
LightGBM Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.03      0.05      4426
           1       0.92      0.04      0.08      1204
           2       0.78      0.01      0.03      3102
           3       0.87      0.98      0.92    125854
           4       0.75 

In [13]:

# Number of rows the submission must contain. Checked before predicting so a
# wrong or truncated test file fails immediately instead of after inference.
EXPECTED_TEST_ROWS = 86094
assert len(test_data) == EXPECTED_TEST_ROWS, f"Test data should have {EXPECTED_TEST_ROWS} rows, but has {len(test_data)} rows."

# Predict encoded severities for the test set with the LightGBM model.
y_test_pred = lgb_model.predict(X_test_tfidf)

# Decode the integer predictions back to severity labels. A prediction that is
# not a known code raises KeyError rather than producing a blank label.
y_test_pred_labels = [inverse_severity_mapping[pred] for pred in y_test_pred]

# Write one row per test bug: its id and the predicted severity label.
submission = pd.DataFrame({'bug_id': test_data['bug_id'], 'severity': y_test_pred_labels})
submission.to_csv('submission.csv', index=False)