In [2]:
import pickle
import numpy as np
import pandas as pd

def load_data(trees_path: str, features_path: str):
    """Read the tree-dump CSV and the feature CSV.

    Returns a ``(trees_df, features_df)`` tuple of DataFrames, in that order.
    """
    frames = [pd.read_csv(source) for source in (trees_path, features_path)]
    return frames[0], frames[1]


def build_tree_structures(trees_df: pd.DataFrame):
    """
    Index the tree table by tree and node for direct lookup.

    Reads the columns Tree, Node, Feature, Split, Yes, No, Missing and Gain
    and returns a nested dict:
      { tree_index: { node_id: {feature, threshold, yes, no, missing, gain} } }
    """
    def _child_node(ref):
        # String references are split on '-' and the second piece is used as
        # the node id; any non-string value (e.g. NaN) means "no child".
        if not isinstance(ref, str):
            return None
        return int(ref.split('-')[1])

    by_tree = {}
    for _, rec in trees_df.iterrows():
        tree_nodes = by_tree.setdefault(int(rec['Tree']), {})
        tree_nodes[int(rec['Node'])] = {
            'feature': rec['Feature'],
            'threshold': rec['Split'],
            'yes': _child_node(rec['Yes']),
            'no': _child_node(rec['No']),
            'missing': _child_node(rec['Missing']),
            'gain': rec['Gain'],
        }
    return by_tree


def generate_report_for_sample(tree_structures, sample: pd.Series):
    """
    Trace one sample through every tree and tabulate the nodes it visits.

    Parameters:
      tree_structures: nested dict as produced by ``build_tree_structures``:
        { tree_index: { node_id: {feature, threshold, yes, no, missing, gain} } }
      sample: one row of feature values, indexed by feature name.

    Returns a DataFrame with the columns Tree, Node, Feature, Threshold,
    SampleValue and Result, sorted by (Tree, Node). For a decision node,
    Result is the branch taken ('yes', 'no' or 'missing'); for a leaf node
    (feature == 'Leaf') it is the value stored under 'gain'. An empty
    ``tree_structures`` yields an empty DataFrame with those same columns.

    Raises KeyError if a tree lacks node 0, if a child reference points to a
    node that is not in the tree, or if ``sample`` lacks a split feature.
    """
    columns = ['Tree', 'Node', 'Feature', 'Threshold', 'SampleValue', 'Result']
    rows = []
    for tree_idx, nodes in tree_structures.items():
        node_id = 0  # traversal always starts at node 0
        while True:
            node = nodes[node_id]
            feat = node['feature']
            if feat == 'Leaf':  # leaf node: record its value and stop
                rows.append({
                    'Tree': tree_idx,
                    'Node': node_id,
                    'Feature': 'Leaf',
                    'Threshold': None,
                    'SampleValue': None,
                    'Result': node['gain']
                })
                break

            # decision node: missing values follow the 'missing' child,
            # values strictly below the threshold follow 'yes', others 'no'
            val = sample[feat]
            if pd.isna(val):
                branch, next_id = 'missing', node['missing']
            elif val < node['threshold']:
                branch, next_id = 'yes', node['yes']
            else:
                branch, next_id = 'no', node['no']

            rows.append({
                'Tree': tree_idx,
                'Node': node_id,
                'Feature': feat,
                'Threshold': node['threshold'],
                'SampleValue': val,
                'Result': branch
            })
            node_id = next_id

    # Passing the columns explicitly keeps the sort keys present even when no
    # rows were collected, so an empty model gives an empty report, not KeyError.
    report = pd.DataFrame(rows, columns=columns)
    return report.sort_values(['Tree', 'Node']).reset_index(drop=True)


def main():
    """
    Build a node-by-node decision report for one sample and save it.

    Steps: load the pickled label encoder, the tree CSV and the feature CSV;
    trace the sample at ``sample_idx`` through every tree; write the trace to
    'detailed_report_sample_<idx>.csv'; then sum the leaf values per class,
    turn the sums into softmax probabilities, and append that summary plus
    the predicted class to the same file as plain text.
    """
    # File paths
    trees_csv = 'xgboost_trees.csv'
    features_csv = 'ns3_features_48Mbps.csv'
    labelenc_pkl = 'label_encoder.pkl'

    # Load LabelEncoder
    # NOTE(security): pickle.load can execute arbitrary code stored in the
    # file, so this must only ever be pointed at a trusted, locally made pickle.
    with open(labelenc_pkl, 'rb') as f:
        le_obj = pickle.load(f)
    # Accept either an object exposing classes_ or a plain sequence of labels.
    classes = le_obj.classes_ if hasattr(le_obj, 'classes_') else np.array(le_obj)
    num_classes = len(classes)

    # Load data
    trees_df, features_df = load_data(trees_csv, features_csv)
    tree_structures = build_tree_structures(trees_df)

    # Select a sample
    sample_idx = 1
    sample = features_df.iloc[sample_idx]

    # Generate node-by-node report
    report_df = generate_report_for_sample(tree_structures, sample)
    # Tree i is attributed to class (i % num_classes).
    # NOTE(review): assumes the trees are stored round-robin by class — confirm
    # against how the model was trained/dumped.
    report_df['Class'] = report_df['Tree'].apply(lambda t: classes[t % num_classes])

    # Prepare output file
    out_csv = f'detailed_report_sample_{sample_idx}.csv'

    # Save report DataFrame to CSV
    report_df.to_csv(out_csv, index=False)

    # Extract leaf outputs and compute class scores
    leaf_df = report_df[report_df['Feature'] == 'Leaf'][['Class', 'Result']]
    scores = leaf_df.groupby('Class')['Result'].sum().astype(float)

    # Softmax probabilities. Subtracting the largest score first leaves the
    # result mathematically unchanged but keeps np.exp from overflowing to inf
    # (which would turn the probabilities into NaN) for large score sums.
    shifted = scores.values - scores.values.max()
    exp_vals = np.exp(shifted)
    prob_vals = exp_vals / exp_vals.sum()
    probs = pd.Series(prob_vals, index=scores.index)

    # Prepare lines for appending
    lines = []
    lines.append('\nCalculation of class scores and probabilities:')
    for cls in classes:
        # Classes without any leaf row are reported with score/probability 0.0.
        raw = scores.get(cls, 0.0)
        p = probs.get(cls, 0.0)
        lines.append(f"  Class {cls}:")
        lines.append(f"    - Sum of leaf values = {raw}")
        lines.append(f"    - Probability (softmax) = {p:.4f}")
    pred = probs.idxmax()
    lines.append(f"\nPredicted class for sample {sample_idx}: {pred}\n")

    # Append lines to the same CSV file
    # NOTE: the appended summary is free-form text, not CSV rows, so the file
    # is no longer a pure CSV after this step.
    with open(out_csv, 'a') as f:
        for line in lines:
            f.write(line + '\n')

    print(f"Report and calculations saved to '{out_csv}'")

# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


Report and calculations saved to 'detailed_report_sample_1.csv'
