In [1]:
import keras
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from cvf_da_model import encode_categorical_features, CATEGORICAL_FEATURES, NUMERICAL_FEATURES
from data_preprocessing import process_data_for_training
from sklearn.utils import shuffle

2023-08-08 12:17:27.527138: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Compute CVF-DA Model's Permutation Importance

In [20]:
# Load the trained model
cvf_da_model = keras.models.load_model('out/models/cvf_da_fully_trained.keras')

In [22]:
# Prepare data that we are evaluating with
prepared_data = process_data_for_training('test_data/recommender_testing_data2.csv', 'test_data/continuous_learning_claims.csv')
prepared_data = encode_categorical_features(prepared_data, LabelEncoder())
prepared_data_input = [prepared_data[feature].values for feature in CATEGORICAL_FEATURES] + \
                      [prepared_data[NUMERICAL_FEATURES].values]

test_data/recommender_testing_data2.csv - Data import complete

test_data/recommender_testing_data2.csv - Diagnostic consultation initiated

test_data/recommender_testing_data2.csv - Vehicle state data derived

test_data/recommender_testing_data2.csv - Diagnostic and warranty data merged

test_data/recommender_testing_data2.csv - Temporal features derived

There are no missing values in the DataFrame.
Total number of records in the DataFrame: 17311
test_data/recommender_testing_data2.csv - Missing values addressed

Number of duplicate records removed: 0
test_data/recommender_testing_data2.csv - Duplicates removed

test_data/recommender_testing_data2.csv - Numerical data normalised



In [23]:
# Get the original predictions and compute original performance
original_predictions = cvf_da_model.predict(prepared_data_input)
original_class_predictions = np.argmax(original_predictions, axis=1)
original_f1 = f1_score(original_class_predictions, np.argmax(original_predictions, axis=1), average='weighted')

original_predictions



array([[2.74642080e-04, 3.33144931e-06, 6.48242831e-02, ...,
        5.96736527e-08, 3.97920758e-06, 7.12803505e-09],
       [4.14901551e-05, 1.77633821e-03, 4.62857857e-02, ...,
        6.09815800e-07, 2.40562577e-03, 8.23548180e-06],
       [3.46953639e-05, 1.56476090e-04, 1.68816432e-01, ...,
        3.16750146e-07, 2.83305912e-04, 1.39949054e-06],
       ...,
       [1.01659188e-08, 2.80881753e-08, 3.91312460e-06, ...,
        1.06174635e-11, 2.46548325e-06, 1.31194166e-12],
       [3.24131179e-05, 1.39412470e-04, 3.36720841e-05, ...,
        2.71309887e-06, 1.40130534e-04, 1.39366122e-07],
       [2.05328862e-11, 9.38525126e-12, 2.96967007e-09, ...,
        1.02661781e-12, 8.03336286e-11, 5.16221722e-13]], dtype=float32)

In [24]:
# For storing the drop in performance for each feature
importance_scores = {}

# Compute Permutation Importance for each feature
for feature in CATEGORICAL_FEATURES + NUMERICAL_FEATURES:
    shuffled_data = prepared_data.copy()
    shuffled_data[feature] = shuffle(shuffled_data[feature].values)

    shuffled_data_input = [shuffled_data[f].values for f in CATEGORICAL_FEATURES] + \
                          [shuffled_data[NUMERICAL_FEATURES].values]

    # Get predictions on shuffled data
    shuffled_predictions = cvf_da_model.predict(shuffled_data_input)
    shuffled_class_predictions = np.argmax(shuffled_predictions, axis=1)

    # Compute drop in performance using F1 score with weighted average
    shuffled_f1 = f1_score(shuffled_class_predictions, np.argmax(original_predictions, axis=1), average='weighted')
    drop_in_f1 = original_f1 - shuffled_f1
    print(f"Feature: {feature}, Shuffled F1: {shuffled_f1}")
    importance_scores[feature] = drop_in_f1

Feature: model, Shuffled F1: 0.8134043570914804
Feature: modelyear, Shuffled F1: 0.7221024231303808
Feature: driver, Shuffled F1: 0.8255924158520221
Feature: plant, Shuffled F1: 0.705882053423774
Feature: engine, Shuffled F1: 0.6370478821583078
Feature: transmission, Shuffled F1: 0.9878959324479679
Feature: module, Shuffled F1: 0.8176737309572158
Feature: dtcbase, Shuffled F1: 0.692011194696601
Feature: faulttype, Shuffled F1: 0.8849174182362952
Feature: dtcfull, Shuffled F1: 0.7766497859143233
Feature: year, Shuffled F1: 0.8246419447402729
Feature: month, Shuffled F1: 0.7340541138506639
Feature: dayOfWeek, Shuffled F1: 0.6102759309505019
Feature: weekOfYear, Shuffled F1: 0.564972850868265
Feature: season, Shuffled F1: 0.8848122410652328
Feature: i_original_vfg_code, Shuffled F1: 1.0
Feature: softwarepartnumber, Shuffled F1: 0.6263708306062661
Feature: hardwarepartnumber, Shuffled F1: 0.7839126596644349
Feature: i_p_css_code, Shuffled F1: 1.0
Feature: i_original_ccc_code, Shuffled F1: 

In [25]:
# Print out importance scores
sorted_importances = sorted(importance_scores.items(), key=lambda x: x[1], reverse=True)
for feature, score in sorted_importances:
    print(f"{feature}: {score}")

weekOfYear: 0.435027149131735
dayOfWeek: 0.3897240690494981
softwarepartnumber: 0.3736291693937339
engine: 0.36295211784169223
vehicleAgeAtSession: 0.33549567368808997
dtcbase: 0.307988805303399
plant: 0.29411794657622603
daysSinceWarrantyStart: 0.28819619484300085
modelyear: 0.27789757686961924
month: 0.2659458861493361
i_months_in_service: 0.23913993170315484
dtcfull: 0.22335021408567668
hardwarepartnumber: 0.2160873403355651
i_time_in_service: 0.21576063208487262
model: 0.1865956429085196
module: 0.18232626904278415
year: 0.17535805525972714
driver: 0.17440758414797786
season: 0.11518775893476718
faulttype: 0.11508258176370478
i_mileage: 0.0830970017628545
elapsedTimeSec: 0.08223639267130722
odomiles: 0.06454829981794519
transmission: 0.012104067552032083
timeSinceLastActivitySec: 0.006496281098339662
i_original_vfg_code: 0.0
i_p_css_code: 0.0
i_original_ccc_code: 0.0
i_original_function_code: 0.0
i_original_vrt_code: 0.0
i_current_vfg_code: 0.0
i_current_function_code: 0.0
i_curren