In [222]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [223]:
df = pd.read_csv(r'data\feature_engineered_data.csv')
df.head()

Unnamed: 0,product_area,customer_type,priority,reporter_experience,reproduction_steps,num_attachments,description_length,affected_users_count,prev_similar_tickets,module_version,...,auto_tag_count_is_outlier,auto_tag_count_is_extreme,description_length_is_outlier,description_length_is_extreme,prev_similar_tickets_is_outlier,prev_similar_tickets_is_extreme,severity_score,is_critical_incident,documentation_score,impact_per_doc
0,Auth,Internal,Medium,Senior,1,1,574.0,13.0,0.0,5.2.11,...,0,0,0,0,0,0,2.342252,0,1.0,1.55867
1,Devops,Internal,High,Junior,1,0,299.0,16.0,2.0,4.2.4,...,0,0,0,0,0,0,2.131285,0,0.2,2.833213
2,Ml,Smb,Low,Junior,0,2,526.0,13.0,0.0,3.2.11,...,0,0,0,0,0,0,2.327345,0,0.8,1.257525
3,Auth,Free-Tier,Low,Mid,1,2,734.0,7.0,0.0,4.8.4,...,0,0,0,0,0,0,2.519499,0,1.0,0.990865
4,Backend,Enterprise,Medium,Mid,1,1,374.0,7.0,3.0,2.9.15,...,0,0,0,0,0,0,2.318406,0,0.6,1.228152


In [224]:
# Separate features from target variable
X = df.drop('severity', axis=1)
y = df['severity']

In [225]:
# Identify numeric and categorical columns
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print("\nCategorical/Object columns:")
print(df.select_dtypes(include=['object']).columns.tolist())

binary_cols = [col for col in numeric_columns if set(df[col].dropna().unique()).issubset({0, 1})]

# Continous columns
continuous_cols = [col for col in numeric_columns if col not in binary_cols and col != 'severity']


Categorical/Object columns:
['product_area', 'customer_type', 'priority', 'reporter_experience', 'module_version', 'ticket_type', 'assigned_team', 'severity', 'impact_level', 'attachment_category', 'tag_complexity']


In [226]:
cols_to_encode = ['priority', 'reporter_experience', 'impact_level', 'attachment_category', 'tag_complexity']
onehot_features = ['product_area', 'customer_type', 'ticket_type', 'assigned_team']

# Ordinal Encoding
ordinal_categories = [
['Low', 'Medium', 'High', 'Urgent'], # priority levels
['Junior', 'Mid', 'Senior'], # reporter experience levels
['individual','small_team', 'department', 'enterprise'], # impact levels
['none', 'few', 'moderate', 'many'], # attachment categories
['minimal', 'standard', 'complex', 'very_complex'] # tag complexity levels
]

encoder = OrdinalEncoder(categories = ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1)
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('ord', encoder, cols_to_encode),
        ('onehot', onehot_encoder, onehot_features),
        ('scale', StandardScaler(), continuous_cols)
    ], remainder='passthrough'
)

In [227]:
# Encode target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [228]:
X= preprocessor.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y  # Maintain class distribution
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train class distribution:\n{pd.Series(y_train).value_counts().sort_index()}")
print(f"Test class distribution:\n{pd.Series(y_test).value_counts().sort_index()}")

Train: (3980, 58), Test: (995, 58)
Train class distribution:
0     435
1     639
2    1290
3    1616
Name: count, dtype: int64
Test class distribution:
0    108
1    160
2    323
3    404
Name: count, dtype: int64


In [None]:
%pip install imblearn
# Apply SMOTE to training data only
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(f"Resampled Train class distribution:\n{pd.Series(y_train_resampled).value_counts().sort_index()}")
# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}
# Evaluate models using Stratified K-Fold Cross-Validation
results = {}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=skf, scoring='f1_weighted')
    results[model_name] = cv_scores
    print(f"{model_name} - F1 Weighted CV Scores: {cv_scores}, Mean: {cv_scores.mean():.4f}, Std: {cv_scores.std():.4f}")

ModuleNotFoundError: No module named 'imblearn'