<a href="https://colab.research.google.com/github/fitriaprasari/ProgramAnalysis/blob/main/Model_Path_Coverage_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ── 1. Sensor ────────────────────────────────────────────────────────────
# Taken from your notebook
class TransactionSensor:
    """Data source that loads a transactions CSV file into a DataFrame."""

    def __init__(self, path: str):
        # Location of the CSV file; it is only read when stream() is called.
        self.path = path

    def stream(self) -> pd.DataFrame:
        """Read the CSV at ``self.path``, log its dimensions, and return it."""
        print(f"[Sensor] Reading file {self.path} …")
        frame = pd.read_csv(self.path)
        n_rows, n_cols = frame.shape
        print(f"[Sensor] Loaded {n_rows} rows × {n_cols} cols\n")
        return frame

# ── 2. Perception ────────────────────────────────────────────────────────
# Taken from your notebook
class PerceptionModule:
    """Feature pipeline: standardise numerics, one-hot encode categoricals, apply PCA.

    The numeric columns named in ``num_raw`` are standardised with a
    ``StandardScaler`` and exposed to the pipeline as ``<column>_z``.  The
    ``ColumnTransformer`` passes those ``_z`` columns through unchanged,
    one-hot encodes ``cat_cols``, and the combined matrix is fed to PCA.

    Neither ``fit`` nor ``transform`` modifies the DataFrame passed in; the
    temporary ``_z`` columns are added to an internal copy only.
    """

    def __init__(self, num_raw, cat_cols):
        """Build the (unfitted) scaler and preprocessing + PCA pipeline.

        Args:
            num_raw: Names of the raw numeric columns to standardise.
            cat_cols: Names of the categorical columns to one-hot encode.
        """
        self.num_raw  = num_raw
        self.cat_cols = cat_cols
        self.scaler   = StandardScaler()
        print("[Perception] Initializing StandardScaler and OneHotEncoder+PCA pipeline")

        self.prep = ColumnTransformer([
            # The '_z' columns are produced by _with_scaled_columns().
            ('num', 'passthrough', [c + '_z' for c in num_raw]),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
        ])
        # A float n_components asks PCA to choose the component count from
        # the explained-variance ratio (reported by fit()).
        self.pca  = PCA(n_components=0.95, random_state=42)
        self.pipe = Pipeline([('prep', self.prep), ('pca', self.pca)])

    def _with_scaled_columns(self, X: pd.DataFrame, X_z) -> pd.DataFrame:
        """Return a copy of ``X`` with one ``<column>_z`` column per numeric column.

        Working on a copy keeps the caller's DataFrame untouched and avoids
        chained-assignment warnings when ``X`` is a slice of another frame.
        """
        augmented = X.copy()
        for i, c in enumerate(self.num_raw):
            augmented[c + '_z'] = X_z[:, i]
        return augmented

    def fit(self, X: pd.DataFrame):
        """Fit the scaler on the numeric columns, then the encoder + PCA pipeline.

        Args:
            X: Training frame containing every column in ``num_raw`` and
                ``cat_cols``.  It is not modified.
        """
        print("[Perception] Fitting scaler on numeric columns:")
        X_z = self.scaler.fit_transform(X[self.num_raw])
        X_aug = self._with_scaled_columns(X, X_z)
        print("\n[Perception] Fitting PCA on preprocessed data…")
        self.pipe.fit(X_aug)
        cumvar = np.cumsum(self.pca.explained_variance_ratio_)
        print(f"  • PCA kept {self.pca.n_components_} components")
        print(f"  • Cumulative variance explained: {cumvar[-1]:.3f}\n")

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """Project ``X`` through the fitted scaler and PCA pipeline.

        Args:
            X: Frame with the same columns used in ``fit``.  It is not modified.

        Returns:
            The output of the fitted pipeline's ``transform`` for ``X``.
        """
        print("[Perception] Transforming data through PCA pipeline…")
        X_z = self.scaler.transform(X[self.num_raw])
        X_aug = self._with_scaled_columns(X, X_z)
        X_pca = self.pipe.transform(X_aug)
        print(f"  • Output shape: {X_pca.shape}")
        return X_pca

# ── 3. CognitiveCore – Decision Tree ────────────────────────────────────
# Taken from your notebook
class TreeCore:
    """Thin logging wrapper around a scikit-learn ``DecisionTreeClassifier``."""

    def __init__(self):
        print("[Cognition:Tree] Initializing Decision Tree model")
        # IMPORTANT: no max_depth is set, so the tree can grow every path it needs.
        self.clf = DecisionTreeClassifier(class_weight='balanced', random_state=42)

    def fit(self, X, y):
        """Train the wrapped classifier on features ``X`` and labels ``y``."""
        print("[Cognition:Tree] Training Decision Tree…")
        self.clf.fit(X, y)

    def predict(self, X):
        """Return the wrapped classifier's predictions for ``X``."""
        print("[Cognition:Tree] Predicting with Decision Tree…")
        predictions = self.clf.predict(X)
        return predictions

    def score(self, X, y):
        """Predict on ``X``, print the accuracy against ``y``, and return it."""
        predictions = self.predict(X)
        acc = accuracy_score(y, predictions)
        print(f"[Cognition:Tree] Accuracy = {acc:.3f}\n")
        return acc

# ── 4. Main Flow (Adapted for Analysis) ──────────────────────────────
if __name__ == "__main__":

    # 4.1 Setup: Sensor, Split, Perception (as in notebook)
    sensor = TransactionSensor("data_labeled.csv")
    df     = sensor.stream()

    y     = df['Class']
    Xraw  = df.drop(columns='Class')

    num_raw  = ['TransactionAmount','AccountBalance','TransactionDuration','CustomerAge','LoginAttempts']
    cat_cols = ['Location','Channel']

    X_train, X_test, y_train, y_test = train_test_split(
        Xraw, y, test_size=0.2, stratify=y, random_state=42)

    # The perception pipeline is fitted on the training split only and then
    # applied to both splits.
    vision = PerceptionModule(num_raw, cat_cols)
    vision.fit(X_train)
    X_train_pca = vision.transform(X_train)
    X_test_pca  = vision.transform(X_test)

    # 4.2 Train Model (as in notebook)
    tree_core = TreeCore()
    tree_core.fit(X_train_pca, y_train)

    # 4.3 PREDICT (for later validation)
    y_pred_dt = tree_core.predict(X_test_pca)
    print("\n[Main] Model standard prediction complete.\n")


    # ----------------------------------------------------------------------
    # NEW SECTION: SCOPE 1 - MODEL LOGIC VERIFICATION (MODEL COVERAGE)
    # ----------------------------------------------------------------------
    print("="*60)
    print("SCOPE 1: MODEL LOGIC VERIFICATION (MODEL COVERAGE)")
    print("Focus: Decision Tree internal paths (rules)")
    print("="*60)

    fitted_model = tree_core.clf
    tree_structure = fitted_model.tree_

    # Step 1: Find all possible paths (rules) within the model
    # A "path" is represented by a "leaf node".
    # A node is a leaf if `children_left` and `children_right` == -1.

    children_left = tree_structure.children_left
    children_right = tree_structure.children_right

    # Vectorised leaf scan; .tolist() yields plain Python ints so the sets
    # below hold one consistent element type.
    is_leaf = (children_left == -1) & (children_right == -1)
    total_leaf_nodes = set(np.flatnonzero(is_leaf).tolist())

    total_paths_in_model = len(total_leaf_nodes)

    # Bound up front so later cells can read it even on the error path.
    uncovered_leaf_nodes = set()

    if total_paths_in_model == 0:
        print("Error: Decision Tree model not trained or has no leaves.")
    else:
        print(f"Model Analysis: The trained Decision Tree has {total_paths_in_model} unique 'rules' (paths to leaves).")

        # Step 2: Find all paths EXECUTED by the test set (X_test_pca)
        # The .apply() method returns the ID of the leaf node for each sample.

        covered_leaf_ids_array = fitted_model.apply(X_test_pca)
        covered_leaf_nodes = set(covered_leaf_ids_array.tolist())

        covered_paths_count = len(covered_leaf_nodes)

        print(f"Test Set Analysis: The test set (X_test_pca) executed {covered_paths_count} out of {total_paths_in_model} of those rules.")

        # Step 3: Calculate Model Path Coverage
        coverage_percentage = (covered_paths_count / total_paths_in_model) * 100

        print("\n--- Model Path Coverage Results ---")
        print(f"  Total Rules (Paths) in Model: {total_paths_in_model}")
        print(f"  Covered Rules (by Test Set): {covered_paths_count}")
        print(f"  Model Path Coverage: {coverage_percentage:.2f}%")

        # Step 4: Identify UNCOVERED Rules (Paths)
        # This is the main goal: identifying untried paths
        uncovered_leaf_nodes = total_leaf_nodes - covered_leaf_nodes
        uncovered_count = len(uncovered_leaf_nodes)

        print(f"\nObjective Identification: Found {uncovered_count} untested paths/rules.")

        if uncovered_count > 0:
            print("  The following are the leaf node IDs (representing rules) that were NOT executed by the test set:")
            # Display only the first 20 (in ascending ID order) for brevity;
            # the ellipsis is shown only when the list was actually truncated.
            preview_limit = 20
            preview = sorted(uncovered_leaf_nodes)[:preview_limit]
            ellipsis = "..." if uncovered_count > preview_limit else ""
            print(f"  {preview}{ellipsis}")
        else:
            print("  CONGRATULATIONS! All paths in the Decision Tree have been successfully covered by the test set.")

    print("="*60)


[Sensor] Reading file data_labeled.csv …
[Sensor] Loaded 2512 rows × 17 cols

[Perception] Initializing StandardScaler and OneHotEncoder+PCA pipeline
[Perception] Fitting scaler on numeric columns:

[Perception] Fitting PCA on preprocessed data…
  • PCA kept 49 components
  • Cumulative variance explained: 0.950

[Perception] Transforming data through PCA pipeline…
  • Output shape: (2009, 49)
[Perception] Transforming data through PCA pipeline…
  • Output shape: (503, 49)
[Cognition:Tree] Initializing Decision Tree model
[Cognition:Tree] Training Decision Tree…
[Cognition:Tree] Predicting with Decision Tree…

[Main] Model standard prediction complete.

SCOPE 1: MODEL LOGIC VERIFICATION (MODEL COVERAGE)
Focus: Decision Tree internal paths (rules)
Model Analysis: The trained Decision Tree has 64 unique 'rules' (paths to leaves).
Test Set Analysis: The test set (X_test_pca) executed 36 out of 64 of those rules.

--- Model Path Coverage Results ---
  Total Rules (Paths) in Model: 64
  Cov

In [7]:
import numpy as np

def _format_leaf_value(v):
    """Format one per-class leaf value: integral values as ints, others to 4 decimals."""
    v = float(v)
    return str(int(v)) if v.is_integer() else f"{v:.4f}"


def get_rule_for_leaf(tree_structure, leaf_id, feature_names):
    """
    Reconstructs the rule leading to a specific leaf node in a Decision Tree.

    The root-to-leaf path is recovered iteratively from a child -> parent
    map, so arbitrarily deep trees do not hit Python's recursion limit.

    Args:
        tree_structure: The sklearn.tree._tree.Tree object (e.g., fitted_model.tree_).
            Only its ``children_left``, ``children_right``, ``feature``,
            ``threshold``, ``value`` and ``n_node_samples`` arrays are read.
        leaf_id: The ID of the leaf node.
        feature_names: A list of feature names corresponding to the feature indices.

    Returns:
        A string of the form ``"<cond> AND <cond> -> Predict Class k (samples: n,
        Class 0: ..., Class 1: ...)"``, or an error message if the node is not
        a leaf or not found (including an out-of-range ``leaf_id``).

        The per-class numbers are whatever ``tree_structure.value`` stores for
        the leaf (one entry per class).  NOTE(review): depending on the
        scikit-learn version and ``class_weight`` these may be weighted counts
        or fractions rather than raw sample counts; non-integral values are
        shown with 4 decimals instead of being truncated.
    """
    children_left = np.asarray(tree_structure.children_left)
    children_right = np.asarray(tree_structure.children_right)
    feature = tree_structure.feature
    threshold = tree_structure.threshold
    value = tree_structure.value
    n_node_samples = tree_structure.n_node_samples

    not_found = f"Could not find path to node {leaf_id}."
    n_nodes = len(children_left)

    # Reject IDs outside the node array instead of raising IndexError (or
    # silently wrapping around for negative values).
    if not 0 <= leaf_id < n_nodes:
        return not_found

    if children_left[leaf_id] != -1 or children_right[leaf_id] != -1:
        return f"Node {leaf_id} is not a leaf node."

    # Build a child -> parent lookup in one vectorised pass per side.
    node_ids = np.arange(n_nodes)
    parent = np.full(n_nodes, -1, dtype=np.intp)
    for children in (children_left, children_right):
        has_child = children != -1
        parent[children[has_child]] = node_ids[has_child]

    # Walk from the leaf up to the root (node 0), then reverse.
    path = [int(leaf_id)]
    while path[-1] != 0:
        up = int(parent[path[-1]])
        # No parent means the node is unreachable from the root; a path as
        # long as the node count that still has not reached the root means
        # the arrays are malformed (cycle).
        if up < 0 or len(path) >= n_nodes:
            return not_found
        path.append(up)
    path.reverse()

    rules = []
    for node, child in zip(path, path[1:]):
        feat_idx = feature[node]
        if feat_idx == -2: # Should not happen for internal nodes
            continue

        feat_name = feature_names[feat_idx]
        thresh = threshold[node]

        # Left branch is taken when the feature is <= threshold, right otherwise.
        if children_left[node] == child:
            rules.append(f"{feat_name} <= {thresh:.4f}")
        elif children_right[node] == child:
            rules.append(f"{feat_name} > {thresh:.4f}")

    # Add leaf node prediction (index of the largest per-class value).
    values_at_leaf = value[leaf_id][0]
    predicted_class = np.argmax(values_at_leaf)
    class_counts = ", ".join(
        f"Class {k}: {_format_leaf_value(v)}" for k, v in enumerate(values_at_leaf)
    )

    rule_string = " AND ".join(rules)
    rule_string += f" -> Predict Class {predicted_class} (samples: {n_node_samples[leaf_id]}, {class_counts})"

    return rule_string

# Generate feature names for PCA components (assuming X_train_pca is available)
pca_feature_names = [f"component_{i}" for i in range(X_train_pca.shape[1])]

# Pick the uncovered leaf with the smallest node ID (assuming
# 'uncovered_leaf_nodes' is available). A set has no defined order, so taking
# the minimum makes the reported node reproducible across runs.
first_uncovered_node_id = min(uncovered_leaf_nodes, default=None)

if first_uncovered_node_id is not None:
    print(f"The rule for uncovered leaf node {first_uncovered_node_id} is:")
    rule_for_node = get_rule_for_leaf(fitted_model.tree_, first_uncovered_node_id, pca_feature_names)
    print(rule_for_node)
else:
    print("No uncovered leaf nodes were found to display rules for.")


The rule for uncovered leaf node 5 is:
component_0 <= -0.2570 AND component_2 <= -0.4838 AND component_0 <= -1.7783 AND component_4 <= 0.1056 AND component_31 <= 0.0054 -> Predict Class 1 (samples: 3, Class 0: 0, Class 1: 1)
