**Split the data into train and test set**

In [None]:
# Integer-encode the class labels (label_encoder is expected to be fitted in an earlier cell)
y_balanced_encoded = label_encoder.transform(y_balanced)

# Hold out 20% of the rows for testing; stratify on the encoded labels so both
# splits keep the same class proportions, and fix the seed for repeatability.
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(
    X_balanced,
    y_balanced_encoded,
    test_size=0.2,
    stratify=y_balanced_encoded,
    random_state=42,
)

In [49]:
# Registry of results: model display name -> classification-report dict
model_metrics = dict()

**1. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Logistic Regression: training + evaluation helper
def train_evaluate_logistic(X_train, X_test, y_train, y_test):
    """Fit a scaled logistic-regression pipeline and report its test-set metrics.

    Features are standardised inside the pipeline, so the scaler statistics come
    from ``X_train`` only. Integer-encoded labels are mapped back to their string
    names through the module-level ``label_encoder`` before reporting.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Integer-encoded training labels.
        y_test: Integer-encoded test labels.

    Returns:
        dict: ``classification_report(..., output_dict=True)`` for the test set.
    """
    scaling_step = StandardScaler()
    classifier_step = LogisticRegression(
        class_weight='balanced',
        random_state=42,
        max_iter=1000,
    )
    pipeline = make_pipeline(scaling_step, classifier_step)
    pipeline.fit(X_train, y_train)
    predicted_codes = pipeline.predict(X_test)

    # Report against the human-readable class names rather than integer codes
    true_names = label_encoder.inverse_transform(y_test)
    predicted_names = label_encoder.inverse_transform(predicted_codes)

    print("Logistic Regression Classification Report:")
    text_report = classification_report(true_names, predicted_names)
    print(text_report)

    report_dict = classification_report(true_names, predicted_names, output_dict=True)
    return report_dict

# Fit Logistic Regression on the training split and score it on the held-out split
metric_lr = train_evaluate_logistic(
    X_train=X_train_bal,
    X_test=X_test_bal,
    y_train=y_train_bal,
    y_test=y_test_bal,
)

# Register the report under the model's display name
model_metrics.update({'Logistic Regression': metric_lr})

Logistic Regression Classification Report:
                                                 precision    recall  f1-score   support

                                          OTHER       0.83      0.54      0.66      5000
                  add_license OR remove_license       0.54      0.85      0.66       586
                        add_user OR remove_user       0.67      0.81      0.74      1647
add_user_to_channel OR remove_user_from_channel       0.72      0.92      0.81       762
                                      os_update       0.77      0.90      0.83      1632
                                 password_reset       0.86      0.86      0.86      3068
                                      reset_mfa       0.56      0.90      0.69       365
                               shipping_request       0.64      0.92      0.76       940

                                       accuracy                           0.75     14000
                                      macro avg       0.70      0

**2. Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np

# Downcast both feature matrices to 32-bit floats
X_train_bal, X_test_bal = (
    features.astype(np.float32) for features in (X_train_bal, X_test_bal)
)

# Hyperparameter values explored by the Random Forest grid search
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

# Train and evaluate the Random Forest model
def train_evaluate_rf(X_train, X_test, y_train, y_test):
    """Grid-search a Random Forest and report the best estimator's test-set metrics.

    Every combination in the module-level ``param_grid_rf`` is scored with
    2-fold cross-validation on weighted F1. The search's ``best_estimator_`` is
    then used to predict ``X_test``. Labels are decoded with the module-level
    ``label_encoder`` before reporting.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Integer-encoded training labels.
        y_test: Integer-encoded test labels.

    Returns:
        dict: ``classification_report(..., output_dict=True)`` for the test set.
    """
    forest = RandomForestClassifier(n_jobs=-1, random_state=42)  # n_jobs=-1: all cores
    search = GridSearchCV(
        estimator=forest,
        param_grid=param_grid_rf,
        scoring='f1_weighted',
        cv=2,        # use 2-fold cross-validation
        n_jobs=-1,   # all cores for the search as well
    )
    search.fit(X_train, y_train)

    # Show which configuration won the search
    print("Best Parameters found:")
    print(search.best_params_)

    tuned_forest = search.best_estimator_
    predicted_codes = tuned_forest.predict(X_test)

    # Decode integer classes into their original string labels
    true_names = label_encoder.inverse_transform(y_test)
    predicted_names = label_encoder.inverse_transform(predicted_codes)

    print("Random Forest Classification Report:")
    print(classification_report(true_names, predicted_names))

    # Same report again, this time as a nested dict for later comparison
    report_dict = classification_report(true_names, predicted_names, output_dict=True)
    return report_dict

# Tune and score the Random Forest on the same train/test split
metric_rf = train_evaluate_rf(
    X_train=X_train_bal,
    X_test=X_test_bal,
    y_train=y_train_bal,
    y_test=y_test_bal,
)

# Register the report under the model's display name
model_metrics.update({'RandomForest': metric_rf})

**3. XGBoost**

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import xgboost as xgb

# Hyperparameter values explored by the XGBoost grid search
param_grid_xgb = dict(
    n_estimators=[100, 200],
    max_depth=[3, 6],
    learning_rate=[0.1, 0.01],
    subsample=[0.8],
    colsample_bytree=[0.8],
    gamma=[0, 1],
    min_child_weight=[1, 5],
)

def train_evaluate_xgb(X_train, X_test, y_train, y_test):
    """Grid-search an XGBoost classifier and report the best estimator's test-set metrics.

    The base estimator is configured for the ``hist`` tree method on a CUDA
    device. Every combination in the module-level ``param_grid_xgb`` is scored
    with 2-fold cross-validation on weighted F1. Labels are decoded with the
    module-level ``label_encoder`` before reporting.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Integer-encoded training labels.
        y_test: Integer-encoded test labels.

    Returns:
        dict: ``classification_report(..., output_dict=True)`` for the test set.
    """
    booster = XGBClassifier(
        tree_method='hist',      # histogram-based tree construction
        device='cuda',           # run on the GPU
        max_bin=256,
        eval_metric='mlogloss',
        n_jobs=1,                # keep CPU threading minimal
        random_state=42,
    )
    search = GridSearchCV(
        estimator=booster,
        param_grid=param_grid_xgb,
        scoring='f1_weighted',
        cv=2,
        n_jobs=1,                # run candidates one at a time
    )
    search.fit(X_train, y_train)

    # Show which configuration won the search
    print("Best Parameters found:")
    print(search.best_params_)

    tuned_booster = search.best_estimator_
    predicted_codes = tuned_booster.predict(X_test)

    # Decode integer classes into their original string labels
    true_names = label_encoder.inverse_transform(y_test)
    predicted_names = label_encoder.inverse_transform(predicted_codes)

    print("XGBoost Classification Report:")
    print(classification_report(true_names, predicted_names))

    report_dict = classification_report(true_names, predicted_names, output_dict=True)
    return report_dict

# Tune and score XGBoost on the same train/test split
metric_xgb = train_evaluate_xgb(
    X_train=X_train_bal,
    X_test=X_test_bal,
    y_train=y_train_bal,
    y_test=y_test_bal,
)

# Register the report under the model's display name
model_metrics.update({'XGBoost': metric_xgb})

**4. LightGBM**

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np

# Keep the feature matrices as float32 to reduce memory usage
X_train_bal, X_test_bal = (
    features.astype(np.float32) for features in (X_train_bal, X_test_bal)
)

# No PCA is applied: the "_reduced" names are plain aliases of the full matrices
X_train_bal_reduced, X_test_bal_reduced = X_train_bal, X_test_bal

# Hyperparameter values explored by the LightGBM grid search
param_grid_lgb = dict(
    n_estimators=[100, 200],        # number of trees
    max_depth=[3, 5, 7],            # maximum tree depth
    learning_rate=[0.1, 0.05],      # shrinkage rate
    num_leaves=[31, 63],            # leaves per tree
    subsample=[0.8],                # held constant
    colsample_bytree=[0.8],         # held constant
    reg_alpha=[0],                  # held constant
    reg_lambda=[0],                 # held constant
    min_child_samples=[20, 40],     # minimum samples per leaf
)

def train_evaluate_lgb(X_train, X_test, y_train, y_test):
    """Grid-search a LightGBM classifier and report the best estimator's test-set metrics.

    The base estimator is configured to run on GPU device 0. Every combination
    in the module-level ``param_grid_lgb`` is scored with 2-fold
    cross-validation on weighted F1. Labels are decoded with the module-level
    ``label_encoder`` before reporting.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Integer-encoded training labels.
        y_test: Integer-encoded test labels.

    Returns:
        dict: ``classification_report(..., output_dict=True)`` for the test set.
    """
    booster = LGBMClassifier(
        device='gpu',        # run on the GPU ...
        gpu_device_id=0,     # ... specifically device 0
        n_jobs=1,            # keep CPU usage low
        verbose=-1,          # silence LightGBM's log output
        random_state=42,
    )
    search = GridSearchCV(
        estimator=booster,
        param_grid=param_grid_lgb,
        scoring='f1_weighted',
        cv=2,
        n_jobs=1,            # run candidates one at a time
    )
    search.fit(X_train, y_train)

    # Show which configuration won the search
    print("Best Parameters found:")
    print(search.best_params_)

    tuned_booster = search.best_estimator_
    predicted_codes = tuned_booster.predict(X_test)

    # Decode integer classes into their original string labels
    true_names = label_encoder.inverse_transform(y_test)
    predicted_names = label_encoder.inverse_transform(predicted_codes)

    print("LightGBM Classification Report:")
    print(classification_report(true_names, predicted_names))

    # Same report again, this time as a nested dict for later comparison
    report_dict = classification_report(true_names, predicted_names, output_dict=True)
    return report_dict

# Tune and score LightGBM on the (un-reduced) train/test split
metric_lgb = train_evaluate_lgb(
    X_train=X_train_bal_reduced,
    X_test=X_test_bal_reduced,
    y_train=y_train_bal,
    y_test=y_test_bal,
)

# Register the report under the model's display name
model_metrics.update({'LightGBM': metric_lgb})

**5. MLP**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Standardise the features; the scaler statistics come from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_bal)
X_test_scaled = scaler.transform(X_test_bal)

# Build the tensors directly on the chosen device: float32 features, int64 labels
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train_bal, dtype=torch.long, device=device)
y_test_tensor = torch.tensor(y_test_bal, dtype=torch.long, device=device)

# Define the MLP model
class MLP(nn.Module):
    """Fully connected feed-forward classifier that outputs raw class logits.

    The network is ``Linear -> activation`` repeated once per entry of
    ``hidden_layer_sizes``, followed by a final ``Linear`` layer with
    ``num_classes`` outputs. No softmax is applied.

    Args:
        input_size: Number of input features.
        hidden_layer_sizes: Iterable of hidden-layer widths; may be empty, in
            which case the model is a single linear layer.
        num_classes: Number of output logits.
        activation: Hidden-layer non-linearity, ``'relu'`` or ``'tanh'``.

    Raises:
        ValueError: If ``activation`` is not a supported name. Previously an
            unknown name silently produced a network with no non-linearity.
    """

    # Supported hidden-layer non-linearities, keyed by the accepted `activation` name
    _ACTIVATIONS = {'relu': nn.ReLU, 'tanh': nn.Tanh}

    def __init__(self, input_size, hidden_layer_sizes, num_classes, activation='relu'):
        super().__init__()
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}"
            )
        activation_cls = self._ACTIVATIONS[activation]

        layers = []
        in_features = input_size
        for hidden_size in hidden_layer_sizes:
            layers.append(nn.Linear(in_features, hidden_size))
            layers.append(activation_cls())
            in_features = hidden_size  # the next layer consumes this layer's output
        layers.append(nn.Linear(in_features, num_classes))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by the layer stack for input ``x``."""
        return self.net(x)

# Hyperparameter values explored for the MLP
param_grid_mlp = dict(
    hidden_layer_sizes=[[100], [200]],
    activation=['relu', 'tanh'],
    learning_rate_init=[0.001, 0.01],
    max_iter=[20],  # number of epochs; kept small for faster training
)

# Helpers and entry point to train and evaluate the MLP model
def _fit_mlp(X, y, input_size, num_classes, hidden_sizes, activation, lr, epochs, batch_size=64):
    """Train a fresh MLP on (X, y) with Adam and cross-entropy loss; return the fitted model."""
    model = MLP(input_size, hidden_sizes, num_classes, activation).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    n_rows = X.size(0)
    model.train()
    for _ in range(epochs):
        # Reshuffle every epoch so mini-batches differ between epochs
        order = torch.randperm(n_rows, device=X.device)
        for start in range(0, n_rows, batch_size):
            batch_idx = order[start:start + batch_size]
            optimizer.zero_grad()
            loss = criterion(model(X[batch_idx]), y[batch_idx])
            loss.backward()
            optimizer.step()
    return model


def _predict_mlp(model, X):
    """Return the arg-max class index for every row of X as a NumPy array."""
    model.eval()
    with torch.no_grad():
        return model(X).argmax(dim=1).cpu().numpy()


def train_evaluate_mlp(X_train, X_test, y_train, y_test, val_fraction=0.2, seed=42):
    """Select MLP hyperparameters on a validation split, then report test-set metrics.

    Every combination in the module-level ``param_grid_mlp`` is trained on part
    of the training data and scored (weighted F1) on a validation split held out
    from the training data. The best configuration is then retrained on all of
    the training data and evaluated exactly once on the test set. The test set
    is never used for model selection, so the reported metrics are not biased by
    the hyperparameter search.

    Args:
        X_train: Float tensor of training features, indexed ``[row, feature]``.
        X_test: Float tensor of test features.
        y_train: Long tensor of integer-encoded training labels.
        y_test: Long tensor of integer-encoded test labels.
        val_fraction: Fraction of the training rows held out for validation;
            must be strictly between 0 and 1.
        seed: Seed passed to ``torch.manual_seed`` so the split, the weight
            initialisation and the batch shuffling are repeatable. Note that
            this resets PyTorch's global RNG.

    Returns:
        dict: ``classification_report(..., output_dict=True)`` for the test set,
        keyed by the string labels produced by the module-level ``label_encoder``.

    Raises:
        ValueError: If ``val_fraction`` is out of range, fewer than two training
            rows are supplied, or ``param_grid_mlp`` yields no combinations.
    """
    from itertools import product

    if not 0.0 < val_fraction < 1.0:
        raise ValueError("val_fraction must be strictly between 0 and 1.")
    n_samples = X_train.size(0)
    if n_samples < 2:
        raise ValueError("Need at least two training samples to hold out a validation split.")

    torch.manual_seed(seed)

    # Derive the class count from the labels actually passed in (not from a global)
    num_classes = int(y_train.max().item()) + 1
    input_size = X_train.shape[1]

    # Carve a validation split out of the training data; keep at least one row on each side
    n_val = min(max(1, int(n_samples * val_fraction)), n_samples - 1)
    shuffled = torch.randperm(n_samples, device=X_train.device)
    val_idx, fit_idx = shuffled[:n_val], shuffled[n_val:]
    X_fit, y_fit = X_train[fit_idx], y_train[fit_idx]
    X_val, y_val_np = X_train[val_idx], y_train[val_idx].cpu().numpy()

    best_f1 = -1.0  # below any attainable F1, so the first candidate is always accepted
    best_params = None

    # Iterate over all combinations of hyperparameters
    for hidden_sizes, activation, lr, epochs in product(
        param_grid_mlp['hidden_layer_sizes'],
        param_grid_mlp['activation'],
        param_grid_mlp['learning_rate_init'],
        param_grid_mlp['max_iter']
    ):
        candidate = _fit_mlp(X_fit, y_fit, input_size, num_classes,
                             hidden_sizes, activation, lr, epochs)
        val_report = classification_report(
            y_val_np, _predict_mlp(candidate, X_val), output_dict=True
        )
        f1_weighted = val_report['weighted avg']['f1-score']
        if f1_weighted > best_f1:
            best_f1 = f1_weighted
            best_params = {
                'hidden_layer_sizes': hidden_sizes,
                'activation': activation,
                'learning_rate_init': lr,
                'max_iter': epochs
            }

    if best_params is None:
        raise ValueError("param_grid_mlp produced no hyperparameter combinations.")

    # Print best parameters
    print("Best Parameters:", best_params)

    # Retrain the winning configuration on the full training set
    best_model = _fit_mlp(
        X_train, y_train, input_size, num_classes,
        best_params['hidden_layer_sizes'],
        best_params['activation'],
        best_params['learning_rate_init'],
        best_params['max_iter'],
    )

    # Final (and only) evaluation on the test set, using the original string labels
    y_pred = _predict_mlp(best_model, X_test)
    y_true = y_test.cpu().numpy()
    y_test_labels = label_encoder.inverse_transform(y_true)
    y_pred_labels = label_encoder.inverse_transform(y_pred)
    print("MLP Classification Report:")
    print(classification_report(y_test_labels, y_pred_labels))
    return classification_report(y_test_labels, y_pred_labels, output_dict=True)

# Tune and score the MLP using the pre-built device tensors
metric_mlp = train_evaluate_mlp(
    X_train=X_train_tensor,
    X_test=X_test_tensor,
    y_train=y_train_tensor,
    y_test=y_test_tensor,
)

# Register the report under the model's display name
model_metrics.update({'MLP': metric_mlp})

**Compare Results**

In [None]:
## snowflake embedding
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Build a table of the collected reports, transposed so that each model is a row
metrics_df = pd.DataFrame(model_metrics).transpose()

# Helper that pulls the F1-score out of a single classification-report cell
def extract_f1_scores(value):
    """Return the F1-score held in one cell of the metrics table.

    Args:
        value: Either a per-class report dict containing an ``'f1-score'`` key,
            or a bare number (such as the ``accuracy`` entry).

    Returns:
        The ``'f1-score'`` entry for a dict that has one, the value itself for an
        ``int``/``float``, and ``None`` for anything else.
    """
    is_report_row = isinstance(value, dict) and 'f1-score' in value
    if is_report_row:
        return value['f1-score']
    return value if isinstance(value, (int, float)) else None

# Reduce every report cell to its F1-score. DataFrame.applymap was renamed to
# DataFrame.map in pandas 2.1, so use the new name when this pandas provides it.
if hasattr(pd.DataFrame, 'map'):
    f1_scores_df = metrics_df.map(extract_f1_scores)
else:
    f1_scores_df = metrics_df.applymap(extract_f1_scores)

# Keep only the per-category columns (drop the macro avg / accuracy / weighted avg summaries)
f1_scores_df = f1_scores_df.drop(columns=['macro avg', 'accuracy', 'weighted avg'])

# The cells are Python objects at this point; make them numeric so mean/sort behave
f1_scores_df = f1_scores_df.astype(float)

# Calculate the average F1-score for each category and sort categories in descending order
average_f1_scores = f1_scores_df.mean(axis=0)
categories_ordered = average_f1_scores.sort_values(ascending=False).index.tolist()

# Update the column order of f1_scores_df
f1_scores_df = f1_scores_df[categories_ordered]

# Set the base font size to 23 and use a white grid background
sns.set_style("whitegrid")
plt.rcParams.update({'font.size': 23})

# Model order for the bars: preferred display order first, then any other model
# that was stored. Every model in model_metrics must appear here, because names
# missing from the Categorical below become NaN and vanish from the chart
# (this is how 'RandomForest' was previously dropped).
preferred_order = ['Logistic Regression', 'RandomForest', 'XGBoost', 'LightGBM', 'MLP']
trained_models = list(f1_scores_df.index)
model_order = [name for name in preferred_order if name in trained_models]
model_order += [name for name in trained_models if name not in preferred_order]

# One shade of the sequential "Oranges" palette per model
palette = sns.color_palette("Oranges", n_colors=len(model_order))

# Reset index and convert to long format
f1_scores_melted = f1_scores_df.reset_index().melt(id_vars='index', var_name='Category', value_name='F1-score')
f1_scores_melted['index'] = pd.Categorical(f1_scores_melted['index'], categories=model_order, ordered=True)

# Plot the bar chart
plt.figure(figsize=(22, 6))
ax = sns.barplot(data=f1_scores_melted, x='Category', y='F1-score', hue='index', palette=palette, alpha=0.8)

# Set Y-axis and X-axis labels and increase font size
plt.ylabel('F1-score', fontsize=30)
plt.xlabel('SIMPLIFIED_CATEGORY', fontsize=30)
plt.title("This Week's Result", fontsize=28)
plt.xticks(rotation=0, ha='center', fontsize=22)

# Set Y-axis range from 0.5 to 1
plt.ylim(0.5, 1)

# Lay the legend out horizontally along the top of the axes, one column per model
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=len(model_order), fontsize=24, frameon=False)

# Add value labels above each bar (font size 20); zero-height patches are skipped
for p in ax.patches:
    if p.get_height() > 0:
        ax.annotate(f'{p.get_height():.2f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center', xytext=(0, 9), textcoords='offset points', fontsize=20)

plt.tight_layout()
plt.show()