In [98]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [99]:
df = pd.read_csv('data/bug-severity-cleaned.csv')
df.head()

Unnamed: 0,product_area,customer_type,priority,reporter_experience,reproduction_steps,num_attachments,description_length,affected_users_count,prev_similar_tickets,module_version,ticket_type,auto_tag_count,sla_breached,assigned_team,severity,day_of_week,hour_of_day,month_year,days_to_resolve
0,Auth,Internal,Medium,Senior,True,1,574.0,13.0,0.0,5.2.11,Bug,5,No,Qa,Medium,Friday,13,2023-09,1
1,Devops,Internal,High,Junior,True,0,299.0,16.0,2.0,4.2.4,Question,3,Yes,Dev,Medium,Wednesday,6,2025-03,1
2,Ml,Smb,Low,Junior,False,2,526.0,13.0,0.0,3.2.11,Bug,1,No,Qa,Low,Tuesday,18,2023-07,1
3,Auth,Free-Tier,Low,Mid,True,2,734.0,7.0,0.0,4.8.4,Bug,2,No,Qa,Low,Wednesday,22,2023-10,1
4,Backend,Enterprise,Medium,Mid,True,1,374.0,7.0,3.0,2.9.15,Bug,3,No,Security,Medium,Friday,0,2025-06,0


In [100]:
X = df.drop(columns=['severity'],axis=1)
X.head()

Unnamed: 0,product_area,customer_type,priority,reporter_experience,reproduction_steps,num_attachments,description_length,affected_users_count,prev_similar_tickets,module_version,ticket_type,auto_tag_count,sla_breached,assigned_team,day_of_week,hour_of_day,month_year,days_to_resolve
0,Auth,Internal,Medium,Senior,True,1,574.0,13.0,0.0,5.2.11,Bug,5,No,Qa,Friday,13,2023-09,1
1,Devops,Internal,High,Junior,True,0,299.0,16.0,2.0,4.2.4,Question,3,Yes,Dev,Wednesday,6,2025-03,1
2,Ml,Smb,Low,Junior,False,2,526.0,13.0,0.0,3.2.11,Bug,1,No,Qa,Tuesday,18,2023-07,1
3,Auth,Free-Tier,Low,Mid,True,2,734.0,7.0,0.0,4.8.4,Bug,2,No,Qa,Wednesday,22,2023-10,1
4,Backend,Enterprise,Medium,Mid,True,1,374.0,7.0,3.0,2.9.15,Bug,3,No,Security,Friday,0,2025-06,0


In [101]:
print("Categories in 'product area' variable:     ",end=" " )
print(df['product_area'].unique())

print("Categories in 'customer type' variable:  ",end=" ")
print(df['customer_type'].unique())

print("Categories in'priority' variable:",end=" " )
print(df['priority'].unique())

print("Categories in 'reporter experience' variable:     ",end=" " )
print(df['reporter_experience'].unique())

print("Categories in 'reproduction steps' variable:     ",end=" " )
print(df['reproduction_steps'].unique())

print("Categories in 'sla breached' variable:     ",end=" " )
print(df['sla_breached'].unique())

print("Categories in 'ticket type' variable:     ",end=" " )
print(df['ticket_type'].unique())

print("Categories in 'assigned team' variable:     ",end=" " )
print(df['assigned_team'].unique())

print("Categories in 'day of the week' variable:     ",end=" " )
print(df['day_of_week'].unique())

print("Categories in 'severity' variable:     ",end=" " )
print(df['severity'].unique())

Categories in 'product area' variable:      ['Auth' 'Devops' 'Ml' 'Backend' 'Security' 'Api' 'Database' 'Ui']
Categories in 'customer type' variable:   ['Internal' 'Smb' 'Free-Tier' 'Enterprise']
Categories in'priority' variable: ['Medium' 'High' 'Low' 'Urgent']
Categories in 'reporter experience' variable:      ['Senior' 'Junior' 'Mid']
Categories in 'reproduction steps' variable:      [ True False]
Categories in 'sla breached' variable:      ['No' 'Yes']
Categories in 'ticket type' variable:      ['Bug' 'Question' 'Crash' 'Performance' 'Feature Request']
Categories in 'assigned team' variable:      ['Qa' 'Dev' 'Security' 'Sre' 'Support']
Categories in 'day of the week' variable:      ['Friday' 'Wednesday' 'Tuesday' 'Sunday' 'Monday' 'Saturday' 'Thursday']
Categories in 'severity' variable:      ['Medium' 'Low' 'High' 'Critical']


In [102]:

y = df['severity']

In [103]:
# Encode target variable
from sklearn.preprocessing import LabelEncoder
severity_order = ['Low', 'Medium', 'High', 'Critical']
severity_encoder = OrdinalEncoder(categories=[severity_order])
y_encoded = severity_encoder.fit_transform(y.values.reshape(-1,1)).ravel()

In [104]:
# Create a column transformer for preprocessing
ordinal_features = ['priority', 'reporter_experience']
binary_features = ['sla_breached']


numeric_features = X.select_dtypes(exclude=['object']).columns
categorical_features = [
    col for col in X.select_dtypes(include=['object']).columns
    if col not in ordinal_features + binary_features
]

ordinal_encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High', 'Urgent'], ['Junior', 'Mid', 'Senior']])
binary_encoder = OrdinalEncoder(categories=[['No', 'Yes']])

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numeric_transformer, numeric_features),
        ('ordinal', ordinal_encoder, ordinal_features),
        ('binary', binary_encoder, binary_features),
        ('categorical', categorical_transformer, categorical_features)
    ]
)

In [105]:

X = preprocessor.fit_transform(X)

In [106]:
X.shape

(5000, 1067)

In [107]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((4000, 1067), (1000, 1067))

In [109]:
# Define models with class_weight='balanced' where applicable
models = {
    "Logistic Regression": LogisticRegression(
        class_weight='balanced', 
        max_iter=1000,
        random_state=42
        ),
    "Random Forest": RandomForestClassifier(
        class_weight='balanced',
        n_estimators=100,
        random_state=42
        ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42
        ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight='balanced',
        random_state=42
        )       
   
} 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score



pipelines = {}
for name, model in models.items():
    pipelines[name] = Pipeline(steps=[
        ('model', model)
    ])

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = 'f1_weighted'

results = {}

for name, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    results[name] = {
        'mean_f1': scores.mean(),
        'std_f1': scores.std()
    }

for model, metrics in results.items():
         print(f"{model}: F1 = {metrics['mean_f1']:.3f} ± {metrics['std_f1']:.3f}")

Logistic Regression: F1 = 0.655 ± 0.022
Logistic Regression: F1 = 0.655 ± 0.022
Random Forest: F1 = 0.632 ± 0.007
Logistic Regression: F1 = 0.655 ± 0.022
Random Forest: F1 = 0.632 ± 0.007
Gradient Boosting: F1 = 0.698 ± 0.010
Logistic Regression: F1 = 0.655 ± 0.022
Random Forest: F1 = 0.632 ± 0.007
Gradient Boosting: F1 = 0.698 ± 0.010
Decision Tree: F1 = 0.642 ± 0.014
