**GRADIENT BOOSTING (GB)**

**DATA LOADING AND PREPARATION**

In [1]:
import pandas as pd

In [2]:
# Read the CSV
tfidf_df = pd.read_csv("tfidf_sncb.csv", sep='\,', engine='python')

tfidf_df['incident_type'] = tfidf_df['incident_type'].astype('string') 

tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1011 entries, 0 to 1010
Columns: 828 entries, incident_id to 998
dtypes: float64(826), int64(1), string(1)
memory usage: 6.4 MB


In [3]:
from sklearn.model_selection import train_test_split

# Filter in the Features (the values acquired from the events sequence after TF-IDF)
X = tfidf_df.drop(['incident_type', 'incident_id'], axis=1) 

# Filter in the Target variable (labels / incident types)
y = tfidf_df['incident_type']  

# setting random_state constant to be used in the whole pipeline and guarantee reproducibility
r_state = 123

# Split data into training+validation and testing sets
train_val_X, test_X, train_val_y, test_y = train_test_split(X, 
                                                            y, 
                                                            train_size = 0.8, 
                                                            random_state = r_state, # setting random_state for reproducibility
                                                            stratify = y) # to respect class imbalance in the label column

print(f"The train_val_X pandas df has {len(train_val_X)} rows and {len(train_val_X.columns)} columns.")
print(f"The test_y pandas series has {len(test_y)} rows and 1 column.")

The train_val_X pandas df has 808 rows and 826 columns.
The test_y pandas series has 203 rows and 1 column.


In [5]:
from collections import Counter

# get the size of the smallest incident type class
value_counts = Counter(train_val_y)
min_class_setsize = min(value_counts.values())

print(f"In RepeatedStratifiedKFold() function, the parameter n_splits has to be set atmost to {min_class_setsize}, due to class imbalance in the label column.")

In RepeatedStratifiedKFold() function, the parameter n_splits has to be set atmost to 3, due to class imbalance in the label column.


In [6]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from datetime import datetime
import numpy as np

# Define the base model
model_gb = GradientBoostingClassifier(random_state = r_state,
                                      loss = "log_loss", # exponential works only for binary classification
                                      criterion = "friedman_mse", # tested ["squared_error"]
                                      max_features = 0.12 # tested [100, 0.15, 0.20]
                                     )

# Set up cross-validation
rskf = RepeatedStratifiedKFold(n_splits = min_class_setsize, 
                               n_repeats = 34, 
                               random_state = r_state)

**MODEL TRAINING AND VALIDATION**

**TEST SET RESULTS**

In [7]:
from sklearn.metrics import classification_report, confusion_matrix

# Train the base model
model_gb.fit(X, y)

"""
# Predict on the test set
test_pred_y = model_gb.predict(test_X)

# Compute and display metrics
print(f"The model classified correctly {sum(test_y == test_pred_y)} entries from a total of {len(test_X)}.\n")

print(f"Accuracy on test set:          {accuracy_score(test_y, test_pred_y)}")
print(f"Weighted F1-Score on test set: {f1_score(test_y, test_pred_y, average='weighted')}\n")

print("F1-Score per class\n")

# Generate classification report
report = classification_report(test_y, test_pred_y, output_dict=True, zero_division=0)

# Display F1-score per class
for class_label, metrics in report.items():
    if isinstance(metrics, dict) and 'f1-score' in metrics:
        print(f"Class {class_label}: F1-Score = {metrics['f1-score']:.6f}")

print("\nAccuracy per class\n")

# Display F1-score per class
for class_label, metrics in report.items():
    if isinstance(metrics, dict) and 'recall' in metrics:
        print(f"Class {class_label}: Recall = {metrics['recall']:.6f}") # Recall is equivalent to per-class accuracy
"""

'\n# Predict on the test set\ntest_pred_y = model_gb.predict(test_X)\n\n# Compute and display metrics\nprint(f"The model classified correctly {sum(test_y == test_pred_y)} entries from a total of {len(test_X)}.\n")\n\nprint(f"Accuracy on test set:          {accuracy_score(test_y, test_pred_y)}")\nprint(f"Weighted F1-Score on test set: {f1_score(test_y, test_pred_y, average=\'weighted\')}\n")\n\nprint("F1-Score per class\n")\n\n# Generate classification report\nreport = classification_report(test_y, test_pred_y, output_dict=True, zero_division=0)\n\n# Display F1-score per class\nfor class_label, metrics in report.items():\n    if isinstance(metrics, dict) and \'f1-score\' in metrics:\n        print(f"Class {class_label}: F1-Score = {metrics[\'f1-score\']:.6f}")\n\nprint("\nAccuracy per class\n")\n\n# Display F1-score per class\nfor class_label, metrics in report.items():\n    if isinstance(metrics, dict) and \'recall\' in metrics:\n        print(f"Class {class_label}: Recall = {metrics[\

**SAVE AND EXPORT RESULTS**

In [14]:
gb_importance_df = pd.DataFrame({'importance': model_gb.feature_importances_,
                                 'feature': model_gb.feature_names_in_
                                })

gb_importance_df.sort_values('importance', ascending=False, inplace=True)

gb_importance_df

Unnamed: 0,importance,feature
652,0.043043,4080
534,0.027375,3548
527,0.022573,3528
811,0.022431,942
276,0.021125,2492
...,...,...
807,0.000000,920
806,0.000000,906
771,0.000000,66
775,0.000000,678


In [15]:
# Save DataFrame of Feature Importance
gb_importance_df.to_csv('gb_importance.csv', index=False)