In [12]:
# Import Libraries
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [13]:
# Read the feature-engineered dataset from CSV
df = pd.read_csv('data/feature_engineered_data.csv')

# Parse the JSON encoding configuration into a Python object
with open('encoding_config.json', 'r') as config_file:
    encoding_config = json.loads(config_file.read())

# Report (rows, columns) of the loaded frame
print(f"Loaded data: {df.shape}")

Loaded data: (4975, 49)


#### Prepare features and target

In [14]:
# Target variable: the 'severity_encoded' column of the loaded frame
y = df.loc[:, 'severity_encoded']

In [15]:
# Columns kept out of the feature matrix; the list includes the target
# column ('severity_encoded') so it cannot leak into X.
drop_columns = [
    'ticket_id',
    'submitted_at',
    'resolved_at',
    'severity',
    'severity_encoded',
    'module_version',
]

# errors='ignore' skips any listed column that is absent from df
X = df.drop(drop_columns, axis='columns', errors='ignore')

# Summarise the shapes and the distinct target values
print(
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"Target classes: {y.unique()}",
    sep="\n",
)

Features shape: (4975, 43)
Target shape: (4975,)
Target classes: [3 2 1 0]


In [11]:
# Train-test split with stratification
# This cell previously appeared twice with identical code and output; a
# single copy is kept so the notebook reads (and re-runs) linearly.
# test_size=0.2 holds out 20% of the rows, and the fixed random_state makes
# the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Maintain class distribution
)

# Show partition sizes and per-class counts, ordered by class label
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train class distribution:\n{y_train.value_counts().sort_index()}")
print(f"Test class distribution:\n{y_test.value_counts().sort_index()}")

Train: (3980, 43), Test: (995, 43)
Train class distribution:
severity_encoded
0     435
1     639
2    1290
3    1616
Name: count, dtype: int64
Test class distribution:
severity_encoded
0    108
1    160
2    323
3    404
Name: count, dtype: int64


In [17]:
# Features to one-hot encode
onehot_features = encoding_config['to_onehot_in_modeling']

# Check which columns actually exist
onehot_features = [col for col in onehot_features if col in X_train.columns]

print(f"One-hot encoding: {onehot_features}")

# One-hot encode train set.
# drop_first=True removes the first level of each feature *as observed in
# the training data*; that level becomes the all-zeros baseline.
X_train_encoded = pd.get_dummies(
    X_train,
    columns=onehot_features,
    prefix=onehot_features,
    drop_first=True  # Avoid multicollinearity
)

# One-hot encode test set.
# Keep every level here (drop_first=False). With drop_first=True the test
# frame would drop the first level present in the *test* data, which may be
# a different level than the training baseline; the dropped column would
# then be re-added as all zeros during column alignment and those rows
# would be silently encoded as the baseline. Which dummy columns survive is
# decided solely by aligning the test columns to the training columns.
X_test_encoded = pd.get_dummies(
    X_test,
    columns=onehot_features,
    prefix=onehot_features,
    drop_first=False
)

One-hot encoding: ['product_area', 'customer_type', 'ticket_type', 'assigned_team']


In [18]:
# Give the test frame exactly the training columns, in training order:
#   - columns present only in train are created and filled with 0
#   - columns present only in test are discarded
X_test_encoded = X_test_encoded.reindex(
    columns=X_train_encoded.columns,
    fill_value=0,
)

# Number of columns added by one-hot encoding relative to the raw features
n_created = X_train_encoded.shape[1] - X_train.shape[1]

print(" After one-hot encoding:")
print(f" Train shape: {X_train_encoded.shape}")
print(f" Test shape: {X_test_encoded.shape}")
print(f" Features created: {n_created}")

 After one-hot encoding:
 Train shape: (3980, 57)
 Test shape: (995, 57)
 Features created: 14


In [143]:
# Define models with class_weight='balanced' where applicable.
# Maps a display name to an unfitted scikit-learn classifier. The estimator
# classes come from sklearn.linear_model, sklearn.ensemble and sklearn.tree
# (imported in the imports cell at the top of the notebook).
# Every estimator uses random_state=42 so results are reproducible.
models = {
    "Logistic Regression": LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        class_weight='balanced',
        n_estimators=100,
        random_state=42
    ),
    # GradientBoostingClassifier takes no class_weight argument, so it is
    # the one model here that is not class-balanced.
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight='balanced',
        random_state=42
    ),
}