In [21]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
import json

In [40]:
# Read the feature-engineered dataset (path is relative to the working directory)
df = pd.read_csv('data/feature_engineered_data.csv')

# Read the encoding configuration (JSON) into a plain Python object
with open('encoding_config.json', 'r') as config_file:
    encoding_config = json.load(config_file)

# Report (rows, columns) as a quick sanity check on the load
print(f"Loaded data: {df.shape}")

Loaded data: (4975, 49)


#### Prepare features and target

In [42]:
# Target variable: the encoded severity column.
# The feature matrix X is built in the next cell, which drops this column
# together with the other non-feature columns, so no X is assigned here
# (the previous assignment was overwritten before it was ever read).
y = df['severity_encoded']

In [43]:
# Columns kept out of the feature matrix. 'severity_encoded' is the target
# (y is read from it); 'severity' is presumably its un-encoded form - confirm.
drop_columns = [
    'ticket_id',
    'submitted_at',
    'resolved_at',
    'severity',
    'severity_encoded',
    'module_version',
]

# errors='ignore' makes drop() skip any listed column that df does not have
X = df.drop(columns=drop_columns, errors='ignore')

# Sanity checks: feature/target sizes and the set of class labels
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target classes: {y.unique()}")

Features shape: (4975, 43)
Target shape: (4975,)
Target classes: [3 2 1 0]


In [44]:
# Build the preprocessing ColumnTransformer, split the data, then fit the
# transformer on the training rows only.
#
# The split happens BEFORE fitting so that the scaler's mean/variance and the
# encoder's category list are learned from training rows alone; fitting on the
# full matrix first would leak test-set statistics into the training features.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Column groups by dtype: object columns are one-hot encoded, all other
# columns are standardised.
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
# handle_unknown="ignore": a category seen only in held-out rows is encoded as
# all zeros instead of raising at transform time (the default is to raise).
oh_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

# Stratified 80/20 split on the raw (un-encoded) features
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Maintain class distribution
)

# Fit on the training rows only, then apply the fitted transformer to the test rows
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# NOTE(review): X is still rebound to the encoded full matrix, as before, so
# any later cell that expects that keeps working. It is now produced by the
# train-fitted transformer. Use X_train / X_test for model fitting and
# evaluation. Because X is rebound, this cell cannot be re-run without first
# re-running the cell that builds X from df.
X = preprocessor.transform(X)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train class distribution:\n{y_train.value_counts().sort_index()}")
print(f"Test class distribution:\n{y_test.value_counts().sort_index()}")

Train: (3980, 43), Test: (995, 43)
Train class distribution:
severity_encoded
0     435
1     639
2    1290
3    1616
Name: count, dtype: int64
Test class distribution:
severity_encoded
0    108
1    160
2    323
3    404
Name: count, dtype: int64


In [49]:
# Define the candidate classifiers, keyed by display name.
# class_weight='balanced' is set on the estimators that accept it
# (LogisticRegression, RandomForestClassifier); GradientBoostingClassifier and
# XGBClassifier take no class_weight constructor argument.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

models = {
    "Logistic Regression": LogisticRegression(
        # No multi_class argument: it was deprecated in scikit-learn 1.5 and
        # newer releases reject it with a TypeError. With the default lbfgs
        # solver a multiclass target is fitted with the multinomial loss
        # anyway, so dropping the argument keeps the intended behaviour.
        max_iter=1000,
        random_state=42,
        class_weight='balanced',
    ),
    "Random Forest": RandomForestClassifier(
        class_weight='balanced',
        n_estimators=100,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
    ),
    "XGBoost": XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        eval_metric='mlogloss',
        n_jobs=-1,
    ),
}