# Decision tree analysis for data discovery
This program is used to perform analysis on data category separation, exploring more valuable information.

In [588]:
import sys
from pathlib import Path
import pandas as pd
import json
from sqlalchemy import text

sys.path.append(f"{Path.cwd().parent.absolute()}/")
from setup import setup

db = setup()

## Required parameters
* OID (int): data object id in database
* target (str): target column name
* skip_features (list[str]): exclude feature from attribute (feature) list
* datetime_format (str): if there has datetime column, use this format to parse datetime column

In [589]:
OID = 139
skip_features = []
datetime_format = ""
target = "教育程度類別"
# target = "信用卡交易金額[新台幣]"
# target = "性別"
# target = "產業別"
# target = "信用卡交易筆數"

### Fetch data from database

In [590]:
query = text(f"SELECT * FROM [RawDB].[dbo].[D{OID}]")
data = db.execute(query)
query = text("SELECT * FROM [DV].[dbo].[Object] where OID = :OID")
object = db.execute(query, OID=OID).fetchall()
df = pd.DataFrame(data.fetchall())
data_object = pd.DataFrame(object)
print(data_object["CName"])
print(df.shape)
df.head()

0    各教育程度持卡人於十六縣消費樣態 (男女)
Name: CName, dtype: object
(147840, 7)


Unnamed: 0,年月,地區,產業別,性別,教育程度類別,信用卡交易筆數,信用卡交易金額[新台幣]
0,201511,南投縣,百貨,1,高中高職,419,386373
1,201511,南投縣,百貨,1,其他,246,196553
2,201511,南投縣,其他,1,博士,121,560606
3,201511,南投縣,其他,1,碩士,1397,6765858
4,201511,南投縣,其他,1,大學,4596,23640387


## Exploratory data analysis and feature engineering
* Clean and pre-processing data
* Split data to training sets (70% - 80%) and test sets
* Feature engineering: category values are encoded and other suitable changes are made to the data
* Predictive model is ready

### Show all column of this data

In [591]:
column_names: list[str] = df.columns.to_list()
column_names

['年月', '地區', '產業別', '性別', '教育程度類別', '信用卡交易筆數', '信用卡交易金額[新台幣]']

### Check if has null value

In [592]:
df.isnull().sum()

年月              0
地區              0
產業別             0
性別              0
教育程度類別          0
信用卡交易筆數         0
信用卡交易金額[新台幣]    0
dtype: int64

### Datetime column quantization

##### Initial variables

In [593]:
from typing import Tuple, Type


datetime_format_list = ["%Y-%m-%d", "%Y-%m", "%Y%m%d", "%Y%m", "%Y"]
# [column_name, earliest_time, latest_time]
datetime_column_earliest_latest_tuple: Tuple[str, Type[pd.Timestamp], Type[pd.Timestamp]] = []

if len(datetime_format) != 0:
    datetime_format_list.insert(0, datetime_format)

#### Find datetime column, also record min and max value

In [594]:
# to_datetime reference: https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html

for col in column_names:
    is_datetime = df[col].dtype == "datetime64[ns]"
    if is_datetime:
        datetime_column_earliest_latest_tuple.append(
            [
                col,
                df[col].min(),
                df[col].max(),
            ]
        )
        continue
    for test_format in datetime_format_list:
        is_numeric = df[col].dtype == "int64"
        is_datetime = is_numeric and (
            True
            not in pd.to_datetime(arg=df[col].astype("str"), format=test_format, errors="coerce")
            .isna()
            .value_counts()
            .index.to_list()
        )
        if is_datetime:
            parsed_datetime = pd.to_datetime(arg=df[col], format=test_format, errors="coerce")
            df[col] = parsed_datetime
            datetime_column_earliest_latest_tuple.append(
                [
                    col,
                    parsed_datetime.min(),
                    parsed_datetime.max(),
                ]
            )
            break

datetime_column_earliest_latest_tuple

[['年月', Timestamp('2014-01-01 00:00:00'), Timestamp('2023-02-01 00:00:00')]]

#### Quantize found datetime column

In [595]:
# Quantize datetime problem reference:
# https://stackoverflow.com/questions/43500894/pandas-pd-cut-binning-datetime-column-series
# ! this section can only execute once
for tuple in datetime_column_earliest_latest_tuple:
    # date_range reference: https://pandas.pydata.org/docs/reference/api/pandas.date_range.html
    # Frequency reference: https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases
    # 季度開始 -> QS
    # 年份開始 -> YS
    # TODO: 測試時間用哪個頻率進行離散化效果最好，季度 or 年度 or 月 etc.
    # * 逐一比較不同頻率的交叉驗證平均分數 (cross validation average score)
    # * 分數越高代表頻率更適合
    datetime_range = pd.date_range(start=tuple[1], end=tuple[2], freq="QS")
    datetime_range = datetime_range.union([tuple[2]])
    labels = ["({}, {}]".format(datetime_range[i - 1], datetime_range[i]) for i in range(1, len(datetime_range))]
    # ! this line of code can only cut once
    df[tuple[0]] = pd.cut(df[tuple[0]], bins=datetime_range, labels=labels, include_lowest=True)

### Numerical column

#### Counting min and max value

In [596]:
# [column_name, minimum_value, maximum_value]
numerical_column_max_min_tuple: Tuple[str, int, int] = []
for col in column_names:
    is_numeric = df[col].dtype == "int64"
    is_category_column = len(df[col].unique()) <= 10
    if is_numeric and (not is_category_column):
        numerical_column_max_min_tuple.append([col, df[col].min(), df[col].max()])

numerical_column_max_min_tuple

[['信用卡交易筆數', -3, 478692], ['信用卡交易金額[新台幣]', -1795429, 844247746]]

### Handle target and features

In [597]:
# 1. 處理不同型別的屬性 (類別型，數值型) -> 都可當作分析目標
# 2. 排除不想要的屬性，並且不加入 features (X) 裡面
# * 針對路徑中沒被用到的屬性
#   1. 類別型: 代表所有類別
#   2. 數值型: 代表在 MIN - MAX 區間
#   3. 時間型: 代表在最早與最晚的區間內

# Count target column ratio to determine its data type
target_column_ratio = len(df[target].unique()) / df[target].count()
# if not, it's numeric
is_category_column = df[target].dtype == "object" or target_column_ratio < 0.01
X: pd.DataFrame
try:
    X = df.drop([target] + skip_features, axis=1)
except KeyError:
    print("Column of target or skip features not exist in data frame")
feature_names = X.columns.to_list()
# If value of target column are numeric, divide it into multiple intervals (discretize)
quantization_labels = ["low", "middle", "high"]
discrete_bin_num = 3
y = (
    df[target].astype("string")
    if is_category_column
    else pd.qcut(df[target], q=discrete_bin_num, labels=quantization_labels)
)
target_class_names = y.unique().tolist()
target_class_names

['高中高職', '其他', '博士', '碩士', '大學', '專科']

### Encode category column of features

#### Initialization

In [598]:
import category_encoders as ce

# ! Prior knowledge
# TODO: 類別型欄位，是否有大小關係? ⇒ 讓使用者去決定這個順序, 可以利用 Python 的 category type
category_frame = X.select_dtypes(include=["object", "category"])
category_frame.head()

Unnamed: 0,年月,地區,產業別
0,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,百貨
1,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,百貨
2,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,其他
3,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,其他
4,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,其他


In [599]:
encoder = ce.OrdinalEncoder(cols=category_frame.columns)
X = pd.DataFrame(encoder.fit_transform(X))
X.head()

Unnamed: 0,年月,地區,產業別,性別,信用卡交易筆數,信用卡交易金額[新台幣]
0,8,1,1,1,419,386373
1,8,1,1,1,246,196553
2,8,1,2,1,121,560606
3,8,1,2,1,1397,6765858
4,8,1,2,1,4596,23640387


#### Encoder mapping

In [600]:
category_column_mapping = encoder.mapping
category_column_mapping

[{'col': '年月',
  'mapping': (2014-01-01 00:00:00, 2014-04-01 00:00:00]     1
  (2014-04-01 00:00:00, 2014-07-01 00:00:00]     2
  (2014-07-01 00:00:00, 2014-10-01 00:00:00]     3
  (2014-10-01 00:00:00, 2015-01-01 00:00:00]     4
  (2015-01-01 00:00:00, 2015-04-01 00:00:00]     5
  (2015-04-01 00:00:00, 2015-07-01 00:00:00]     6
  (2015-07-01 00:00:00, 2015-10-01 00:00:00]     7
  (2015-10-01 00:00:00, 2016-01-01 00:00:00]     8
  (2016-01-01 00:00:00, 2016-04-01 00:00:00]     9
  (2016-04-01 00:00:00, 2016-07-01 00:00:00]    10
  (2016-07-01 00:00:00, 2016-10-01 00:00:00]    11
  (2016-10-01 00:00:00, 2017-01-01 00:00:00]    12
  (2017-01-01 00:00:00, 2017-04-01 00:00:00]    13
  (2017-04-01 00:00:00, 2017-07-01 00:00:00]    14
  (2017-07-01 00:00:00, 2017-10-01 00:00:00]    15
  (2017-10-01 00:00:00, 2018-01-01 00:00:00]    16
  (2018-01-01 00:00:00, 2018-04-01 00:00:00]    17
  (2018-04-01 00:00:00, 2018-07-01 00:00:00]    18
  (2018-07-01 00:00:00, 2018-10-01 00:00:00]    19
  (20

### Split data to training and test dataset
Purpose: find dependencies between target and feature column

In [601]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [602]:
X_train.shape, X_test.shape

((99052, 6), (48788, 6))

## Fitting the model, evaluating the results and visualizing the trees
* Data totally prepared
* Classifier is instantiated
* Model is fit onto the data
* Ensure the model is neither over fitting and under fitting the data
* Evaluate classifier: confusion matrix, precision score, f1 score, recall, support scores

### Initial variables

In [603]:
from sklearn.tree import DecisionTreeClassifier

row_counts = len(X.index)
max_depth = 10
min_samples_split = 0
min_samples_leaf = 0

is_big_data = row_counts > 10000

if is_big_data:
    # 確保葉節點有足夠的樣本進行有意義的分析，同時避免過度細分
    # 100 - 1000
    min_samples_leaf = 100
    # 確保在分割內部節點之前有足夠的樣本數
    # 10 - 50
    min_samples_split = 10
else:
    # 確保每個葉節點至少有一些樣本進行分析
    # 1 or 2
    min_samples_leaf = 1
    # 確保在內部節點的樣本數較少時也可以進行分割
    # 2 - 5
    min_samples_split = 2

### Fitting training data into decision tree classifier

In [604]:
clf = DecisionTreeClassifier(
    criterion="entropy",
    splitter="best",
    max_depth=max_depth,
    random_state=0,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
decision_tree = clf.fit(X_train, y_train)

### Dependencies

#### Feature importance

In [605]:
# Pair importance and feature
feature_importance = clf.feature_importances_
feature_importance_pairs = list(zip(feature_names, feature_importance))
# Sort importance
feature_importance_pairs.sort(key=lambda pair: pair[1], reverse=True)
feature_importance_pairs

[('信用卡交易金額[新台幣]', 0.40439773919227434),
 ('地區', 0.227029385571455),
 ('產業別', 0.14638860759054895),
 ('信用卡交易筆數', 0.13945205366067234),
 ('性別', 0.04604622590085605),
 ('年月', 0.03668598808419333)]

#### Cross validation

In [606]:
from sklearn.model_selection import cross_val_score

# cv: k-fold, default is 5-fold
cross_validation_score = cross_val_score(clf, X, y, cv=5)
print("交叉驗證分數:", cross_validation_score)
print("平均分數:", cross_validation_score.mean())

交叉驗證分數: [0.44666531 0.39343209 0.47578463 0.50378788 0.46367695]
平均分數: 0.4566693722943723


In [607]:
y_predict_test = clf.predict(X_test)
y_predict_test

array(['其他', '高中高職', '博士', ..., '博士', '專科', '其他'], dtype=object)

In [608]:
y_predict_train = clf.predict(X_train)
y_predict_train

array(['碩士', '專科', '其他', ..., '大學', '高中高職', '博士'], dtype=object)

#### Model accuracy

In [609]:
from sklearn.metrics import accuracy_score

print("Training set score: {:.4f}".format(accuracy_score(y_train, y_predict_train)))
print("Test set score: {0:0.4f}".format(accuracy_score(y_test, y_predict_test)))

Training set score: 0.5177
Test set score: 0.4980


### Resolve decision tree structure to json

In [610]:
from sklearn import tree
from graphviz import Source

output_file_path = f"{Path.cwd().absolute()}/temp/temp.dot"

#### Types definitions

In [611]:
from dataclasses import dataclass


@dataclass
class DecisionTreeNode:
    id: int
    labels: list[str]


@dataclass
class DecisionTreeEdge:
    id: int
    label: str
    head: int
    tail: int


@dataclass
class DecisionTreeGraph:
    nodes: list[DecisionTreeNode]
    edges: dict[str, DecisionTreeEdge]  # node1_node2 as key value


@dataclass
class DecisionTreePath:
    path: list[int]
    nodeLabel: dict[int, list[str]]

#### Export decision tree from model and reconstruct DecisionTreeGraph

In [612]:
# The analysis goal is discovering data, not just training model
# ! re-fit the hole data (target and features, X and y), not splitted data

decision_tree = clf.fit(X, y)

# * Scikit-learn decision tree:
# Using optimized version of the CART algorithm
# Not support categorical variable for now, that is, categorical variable need to encode

# * Entropy range:
# From 0 to 1 for binary classification (target has only two classes, true or false)
# From 0 to log base 2 k where k is the number of classes

dotData = tree.export_graphviz(
    clf,
    out_file=output_file_path,
    feature_names=feature_names,
    class_names=target_class_names,
    max_depth=max_depth,
    label="all",
    rounded=True,
    filled=True,
)

with open(output_file_path, "r", encoding="utf-8") as f:
    dotData = f.read()

# Use graphviz lib to convert dot format to json format
source = Source(dotData)
jsonGraph = source.pipe(format="json").decode("utf-8")
dictGraph: dict = json.loads(jsonGraph)

# Filter needed part
nodes = list(
    map(
        lambda o: {"id": o.get("_gvid"), "labels": o.get("label").split("\\n")},
        dictGraph.get("objects"),
    )
)

edges = dict(
    map(
        lambda o: (
            str(o.get("tail")) + "_" + str(o.get("head")),
            {
                "id": o.get("_gvid"),
                "label": o.get("headlabel"),
                "head": o.get("tail"),
                "tail": o.get("head"),
            },
        ),
        dictGraph.get("edges"),
    )
)

#### Store information

In [613]:
# Information storage
# * type is category => store unique value
# * type is numeric => store min and max value
# * type is datetime => store min and max value

# target name -> str
# target unique values -> []
# feature names -> []
# feature values -> {
#   feature1: {
#     type, value
#   }
#   ...
# }
analysis_information: dict[str, str or list or dict] = {}
analysis_information["target_name"] = target
analysis_information["target_values"] = target_class_names
analysis_information["feature_names"] = feature_names

# Numeric
feature_values: dict[str, dict[str, str or list]] = {}
for n in numerical_column_max_min_tuple:
    feature_values[n[0]] = {"type": "numeric", "value": [n[1], n[2]]}

# Datetime
for d in datetime_column_earliest_latest_tuple:
    format = "%Y-%m-%d %X"
    feature_values[d[0]] = {
        "type": "datetime",
        "value": [d[1].strftime(format), d[2].strftime(format)],
    }

# Category
for c in category_column_mapping:
    feature_values[c["col"]] = {
        "type": "category",
        "value": (c["mapping"].index.to_list()),
        "mapping": pd.Series(dict((v, k) for k, v in c["mapping"].items())),
    }
    feature_values[c["col"]]["value"].pop()

unstored_features = list(set(feature_names) - set(list(feature_values.keys())))

# TODO: Custom mapping => 使用者指定 1 男, 2 女
for f in unstored_features:
    value = list(df[f].astype("string").unique())
    feature_values[f] = {
        "type": "category",
        "value": value,
        "mapping": pd.Series(dict((v, v) for v in value)),
    }

analysis_information["feature_values"] = feature_values
# ! This is important information of analysis process
analysis_information

{'target_name': '教育程度類別',
 'target_values': ['高中高職', '其他', '博士', '碩士', '大學', '專科'],
 'feature_names': ['年月', '地區', '產業別', '性別', '信用卡交易筆數', '信用卡交易金額[新台幣]'],
 'feature_values': {'信用卡交易筆數': {'type': 'numeric', 'value': [-3, 478692]},
  '信用卡交易金額[新台幣]': {'type': 'numeric', 'value': [-1795429, 844247746]},
  '年月': {'type': 'category',
   'value': ['(2014-01-01 00:00:00, 2014-04-01 00:00:00]',
    '(2014-04-01 00:00:00, 2014-07-01 00:00:00]',
    '(2014-07-01 00:00:00, 2014-10-01 00:00:00]',
    '(2014-10-01 00:00:00, 2015-01-01 00:00:00]',
    '(2015-01-01 00:00:00, 2015-04-01 00:00:00]',
    '(2015-04-01 00:00:00, 2015-07-01 00:00:00]',
    '(2015-07-01 00:00:00, 2015-10-01 00:00:00]',
    '(2015-10-01 00:00:00, 2016-01-01 00:00:00]',
    '(2016-01-01 00:00:00, 2016-04-01 00:00:00]',
    '(2016-04-01 00:00:00, 2016-07-01 00:00:00]',
    '(2016-07-01 00:00:00, 2016-10-01 00:00:00]',
    '(2016-10-01 00:00:00, 2017-01-01 00:00:00]',
    '(2017-01-01 00:00:00, 2017-04-01 00:00:00]',
    '(2017

### Decision tree path parser

In [614]:
from math import log


def DecisionTreePathParser(graph: DecisionTreeGraph, root_id: int = 0):
    paths: list[DecisionTreePath] = []

    # DFS: Depth-First Search
    def dfs(current_id: int = 0, path: list[int] = []):
        if not graph:
            return
        path.append(current_id)
        edge_values = list(map(lambda edge: DecisionTreeEdge(**edge), list(graph.edges.values())))
        outgoing_edges = list(filter(lambda edge: edge.head == current_id, edge_values))

        # 如果目前節點沒有出邊（即為最底層節點），將路徑加入結果中
        if len(outgoing_edges) == 0:
            last_id = path[len(path) - 1]
            last_node = DecisionTreeNode(**(graph.nodes[last_id]))
            node_labels: dict[int, list[str]] = {}
            # FIXME:a Maybe here will cause some bugs
            # ! 排除 entropy 高的 path
            entropy = float(last_node.labels[0].split(" ")[2])

            if entropy > log(len(feature_names), 2) / 2:
                path.pop()
                return
            # ! ####################
            node_labels[last_id] = last_node.labels

            for i in range(0, len(path) - 1):
                node_id = path[i]
                labels = DecisionTreeNode(**(graph.nodes[node_id])).labels
                # 如果下一個 node id 是上個 +1 則是 true，不然的話是 false
                # * left edge <= => true
                # * right edge > => false
                next_id = path[i + 1]
                if node_id + 1 != next_id:
                    new_labels = [*labels]
                    condition = new_labels[0]
                    split_condition = condition.split(" ")
                    split_condition[1] = ">"
                    new_labels[0] = " ".join(split_condition)
                    node_labels[node_id] = new_labels
                    continue
                node_labels[node_id] = [*labels]

            paths.append(DecisionTreePath([*path], node_labels))

        # 遍歷目前節點的所有出邊
        else:
            for edge in outgoing_edges:
                next_id = edge.tail

                # 遞迴呼叫深度優先搜索
                dfs(next_id, path)

        # 回溯，從路徑中移除目前節點
        path.pop()

    dfs(root_id)

    return paths


decision_tree_graph = DecisionTreeGraph(nodes, edges)
paths = DecisionTreePathParser(decision_tree_graph, 0)
print("Path counts = {}".format(len(paths)))

Path counts = 172


### Decision tree path analyzer

In [615]:
# Decision tree path analyzer purpose: discover more information of paths
# TODO: write decision tree path analyzer
def DecisionTreePathAnalyzer(paths: list[DecisionTreePath], target_unique: list[str], feature_names: list[str]):
    path_analysis: dict = {}
    for value in target_unique:
        path_analysis[value] = []

    for path in paths:
        path_analysis_part = {}
        for feature in feature_names:
            path_analysis_part[feature] = analysis_information["feature_values"][feature]["value"]
        for node_id in path.path:
            if node_id == path.path[len(path.path) - 1]:
                target_class = path.nodeLabel[node_id][3].split(" ")[2]
                path_analysis[target_class].append(path_analysis_part)
                continue

            labels = path.nodeLabel[node_id][0].split(" ")
            feature = labels[0]
            symbol = labels[1]
            type = analysis_information["feature_values"][labels[0]]["type"]

            if type == "category":
                mapping = analysis_information["feature_values"][feature]["mapping"]

            status = [symbol, type]

            match status:
                case ["<=", "category"]:
                    pass
                case ["<=", "numeric"]:
                    pass
                case [">", "category"]:
                    pass
                case [">", "numeric"]:
                    pass
                case _:
                    print("no match case")

    return path_analysis


path_analysis = DecisionTreePathAnalyzer(paths=paths, target_unique=target_class_names, feature_names=feature_names)

path_analysis

{'高中高職': [{'年月': ['(2014-01-01 00:00:00, 2014-04-01 00:00:00]',
    '(2014-04-01 00:00:00, 2014-07-01 00:00:00]',
    '(2014-07-01 00:00:00, 2014-10-01 00:00:00]',
    '(2014-10-01 00:00:00, 2015-01-01 00:00:00]',
    '(2015-01-01 00:00:00, 2015-04-01 00:00:00]',
    '(2015-04-01 00:00:00, 2015-07-01 00:00:00]',
    '(2015-07-01 00:00:00, 2015-10-01 00:00:00]',
    '(2015-10-01 00:00:00, 2016-01-01 00:00:00]',
    '(2016-01-01 00:00:00, 2016-04-01 00:00:00]',
    '(2016-04-01 00:00:00, 2016-07-01 00:00:00]',
    '(2016-07-01 00:00:00, 2016-10-01 00:00:00]',
    '(2016-10-01 00:00:00, 2017-01-01 00:00:00]',
    '(2017-01-01 00:00:00, 2017-04-01 00:00:00]',
    '(2017-04-01 00:00:00, 2017-07-01 00:00:00]',
    '(2017-07-01 00:00:00, 2017-10-01 00:00:00]',
    '(2017-10-01 00:00:00, 2018-01-01 00:00:00]',
    '(2018-01-01 00:00:00, 2018-04-01 00:00:00]',
    '(2018-04-01 00:00:00, 2018-07-01 00:00:00]',
    '(2018-07-01 00:00:00, 2018-10-01 00:00:00]',
    '(2018-10-01 00:00:00, 2019-01-0

### Path data to JSON string

In [616]:
paths_json_str = json.dumps(list(map(lambda path: path.__dict__, paths)))
paths_object = json.loads(paths_json_str)
# ! This is important path analysis result include nodes and their labels
paths_object

[{'path': [0, 1, 2, 3, 4, 5, 6, 7, 11, 12],
  'nodeLabel': {'12': ['entropy = 0.0',
    'samples = 132',
    'value = [0, 132, 0, 0, 0, 0]',
    'class = 其他'],
   '0': ['信用卡交易金額[新台幣] <= 1572301.5',
    'entropy = 2.585',
    'samples = 147840',
    'value = [24640, 24640, 24640, 24640, 24640, 24640]',
    'class = 高中高職'],
   '1': ['信用卡交易筆數 <= 232.5',
    'entropy = 2.13',
    'samples = 44557',
    'value = [4225, 22028, 2921, 4674, 7487, 3222]',
    'class = 其他'],
   '2': ['地區 <= 9.5',
    'entropy = 1.863',
    'samples = 27795',
    'value = [2131, 16740, 1341, 2347, 3501, 1735]',
    'class = 其他'],
   '3': ['性別 <= 1.5',
    'entropy = 1.063',
    'samples = 11869',
    'value = [412, 9616, 126, 377, 1150, 188]',
    'class = 其他'],
   '4': ['產業別 <= 4.5',
    'entropy = 1.473',
    'samples = 5343',
    'value = [387, 3804, 102, 320, 567, 163]',
    'class = 其他'],
   '5': ['信用卡交易筆數 <= 73.5',
    'entropy = 1.737',
    'samples = 4035',
    'value = [380, 2535, 101, 314, 545, 160]',
 