# Decision tree analysis for data discovery
This program is used to perform analysis on data category separation, exploring more valuable information.

In [367]:
import sys
from pathlib import Path
import pandas as pd
import json
from sqlalchemy import text

sys.path.append(f"{Path.cwd().parent.absolute()}/")
from setup import setup

db = setup()

## Required parameters
* OID (int): data object id in database
* target (str): target column name
* skip_features (list[str]): exclude feature from attribute (feature) list
* datetime_format (str): if there has datetime column, use this format to parse datetime column

In [368]:
OID = 139
analysis_depth = 30
skip_features = []
datetime_format = ""
target = "教育程度類別"
# target = "信用卡交易金額[新台幣]"
# target = "性別"
# target = "產業別"
# target = "信用卡交易筆數"

### Fetch data from database

In [369]:
query = text(f"SELECT * FROM [RawDB].[dbo].[D{OID}]")
data = db.execute(query)
query = text("SELECT * FROM [DV].[dbo].[Object] where OID = :OID")
object = db.execute(query, OID=OID).fetchall()
df = pd.DataFrame(data.fetchall())
data_object = pd.DataFrame(object)
print(data_object["CName"])
print(df.shape)
df.head()

0    各教育程度持卡人於十六縣消費樣態 (男女)
Name: CName, dtype: object
(147840, 7)


Unnamed: 0,年月,地區,產業別,性別,教育程度類別,信用卡交易筆數,信用卡交易金額[新台幣]
0,201511,南投縣,百貨,1,高中高職,419,386373
1,201511,南投縣,百貨,1,其他,246,196553
2,201511,南投縣,其他,1,博士,121,560606
3,201511,南投縣,其他,1,碩士,1397,6765858
4,201511,南投縣,其他,1,大學,4596,23640387


## Exploratory data analysis and feature engineering
* Clean and pre-processing data
* Split data to training sets (70% - 80%) and test sets
* Feature engineering: category values are encoded and other suitable changes are made to the data
* Predictive model is ready

### Show all column of this data

In [370]:
column_names: list[str] = df.columns.to_list()
column_names

['年月', '地區', '產業別', '性別', '教育程度類別', '信用卡交易筆數', '信用卡交易金額[新台幣]']

### Check if has null value

In [371]:
df.isnull().sum()

年月              0
地區              0
產業別             0
性別              0
教育程度類別          0
信用卡交易筆數         0
信用卡交易金額[新台幣]    0
dtype: int64

### Datetime column quantization

##### Initial variables

In [372]:
from typing import Tuple, Type


datetime_format_list = ["%Y-%m-%d", "%Y-%m", "%Y%m%d", "%Y%m", "%Y"]
# [column_name, earliest_time, latest_time]
datetime_column_earliest_latest_tuple: Tuple[str, Type[pd.Timestamp], Type[pd.Timestamp]] = []

if len(datetime_format) != 0:
    datetime_format_list.insert(0, datetime_format)

#### Find datetime column, also record min and max value

In [373]:
# to_datetime reference: https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html

for col in column_names:
    is_datetime = df[col].dtype == "datetime64[ns]"
    if is_datetime:
        datetime_column_earliest_latest_tuple.append(
            [
                col,
                df[col].min(),
                df[col].max(),
            ]
        )
        continue
    for test_format in datetime_format_list:
        is_numeric = df[col].dtype == "int64"
        is_datetime = is_numeric and (
            True
            not in pd.to_datetime(arg=df[col].astype("str"), format=test_format, errors="coerce")
            .isna()
            .value_counts()
            .index.to_list()
        )
        if is_datetime:
            parsed_datetime = pd.to_datetime(arg=df[col], format=test_format, errors="coerce")
            df[col] = parsed_datetime
            datetime_column_earliest_latest_tuple.append(
                [
                    col,
                    parsed_datetime.min(),
                    parsed_datetime.max(),
                ]
            )
            break

datetime_column_earliest_latest_tuple

[['年月', Timestamp('2014-01-01 00:00:00'), Timestamp('2023-02-01 00:00:00')]]

#### Quantize found datetime column

In [374]:
# Quantize datetime problem reference:
# https://stackoverflow.com/questions/43500894/pandas-pd-cut-binning-datetime-column-series
# ! this section can only execute once
quantization_datetime_columns = []

for tuple in datetime_column_earliest_latest_tuple:
    # date_range reference: https://pandas.pydata.org/docs/reference/api/pandas.date_range.html
    # Frequency reference: https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases
    # 季度開始 -> QS
    # 年份開始 -> YS
    # TODO: 測試時間用哪個頻率進行離散化效果最好，季度 or 年度 or 月 etc.
    # * 逐一比較不同頻率的交叉驗證平均分數 (cross validation average score)
    # * 分數越高代表頻率更適合
    datetime_range = pd.date_range(start=tuple[1], end=tuple[2], freq="QS")
    datetime_range = datetime_range.union([tuple[2]])
    labels = ["({}, {}]".format(datetime_range[i - 1], datetime_range[i]) for i in range(1, len(datetime_range))]
    # ! this line of code can only cut once
    df[tuple[0]] = pd.cut(df[tuple[0]], bins=datetime_range, labels=labels, include_lowest=True)
    quantization_datetime_columns.append(tuple[0])

### Numerical column

#### Counting min and max value

In [375]:
# [column_name, minimum_value, maximum_value]
numerical_column_max_min_tuple: Tuple[str, int, int] = []
for col in column_names:
    is_numeric = df[col].dtype == "int64"
    is_category_column = len(df[col].unique()) <= 10
    if is_numeric and (not is_category_column):
        numerical_column_max_min_tuple.append([col, df[col].min(), df[col].max()])

numerical_column_max_min_tuple

[['信用卡交易筆數', -3, 478692], ['信用卡交易金額[新台幣]', -1795429, 844247746]]

### Handle target and features

In [376]:
# 1. 處理不同型別的屬性 (類別型，數值型) -> 都可當作分析目標
# 2. 排除不想要的屬性，並且不加入 features (X) 裡面

# TODO: Try to quantize all numeric column
quantile_mapping = {}

for column_name in column_names:
    column_ratio = len(df[column_name].unique()) / df[column_name].count()
    is_categorical_column = (
        df[column_name].dtype == "object" or df[column_name].dtype == "category" or column_ratio < 0.01
    )
    is_numerical_column = df[column_name].dtype == "int64" and not is_categorical_column

    if is_numerical_column:
        quantile_labels = ["low", "middle", "high"]
        discrete_bin_num = 3
        quantile = pd.qcut(df[column_name], q=discrete_bin_num).cat.categories
        quantile_mapping[column_name] = {"col": column_name, "quantile": quantile, "labels": quantile_labels}
        df[column_name] = pd.qcut(df[column_name], q=discrete_bin_num, labels=quantile_labels)

print(quantile_mapping)

X: pd.DataFrame
try:
    X = df.drop([target] + skip_features, axis=1)
except KeyError:
    print("Column of target or skip features are not exist in data frame")

feature_names = X.columns.to_list()

y = df[target].astype("string")
target_values = y.unique().tolist()
target_values

{'信用卡交易筆數': {'col': '信用卡交易筆數', 'quantile': IntervalIndex([(-3.001, 879.0], (879.0, 4356.0], (4356.0, 478692.0]], dtype='interval[float64, right]'), 'labels': ['low', 'middle', 'high']}, '信用卡交易金額[新台幣]': {'col': '信用卡交易金額[新台幣]', 'quantile': IntervalIndex([(-1795429.001, 1993207.667], (1993207.667, 11233651.667], (11233651.667, 844247746.0]], dtype='interval[float64, right]'), 'labels': ['low', 'middle', 'high']}}


['高中高職', '其他', '博士', '碩士', '大學', '專科']

### Encode category column of features

#### Initialization

In [377]:
import category_encoders as ce

# ! Prior knowledge
# TODO: 類別型欄位，是否有大小關係? ⇒ 讓使用者去決定這個順序, 可以利用 Python 的 category type
category_frame = X.select_dtypes(include=["object", "category"])
category_frame.head()

Unnamed: 0,年月,地區,產業別,信用卡交易筆數,信用卡交易金額[新台幣]
0,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,百貨,low,low
1,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,百貨,low,low
2,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,其他,low,low
3,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,其他,middle,middle
4,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,其他,high,high


In [378]:
encoder = ce.OrdinalEncoder(cols=category_frame.columns)
X = pd.DataFrame(encoder.fit_transform(X))
X.head()

Unnamed: 0,年月,地區,產業別,性別,信用卡交易筆數,信用卡交易金額[新台幣]
0,8,1,1,1,1,1
1,8,1,1,1,1,1
2,8,1,2,1,1,1
3,8,1,2,1,2,2
4,8,1,2,1,3,3


#### Encoder mapping

In [379]:
category_column_mapping = encoder.mapping
category_column_mapping

[{'col': '年月',
  'mapping': (2014-01-01 00:00:00, 2014-04-01 00:00:00]     1
  (2014-04-01 00:00:00, 2014-07-01 00:00:00]     2
  (2014-07-01 00:00:00, 2014-10-01 00:00:00]     3
  (2014-10-01 00:00:00, 2015-01-01 00:00:00]     4
  (2015-01-01 00:00:00, 2015-04-01 00:00:00]     5
  (2015-04-01 00:00:00, 2015-07-01 00:00:00]     6
  (2015-07-01 00:00:00, 2015-10-01 00:00:00]     7
  (2015-10-01 00:00:00, 2016-01-01 00:00:00]     8
  (2016-01-01 00:00:00, 2016-04-01 00:00:00]     9
  (2016-04-01 00:00:00, 2016-07-01 00:00:00]    10
  (2016-07-01 00:00:00, 2016-10-01 00:00:00]    11
  (2016-10-01 00:00:00, 2017-01-01 00:00:00]    12
  (2017-01-01 00:00:00, 2017-04-01 00:00:00]    13
  (2017-04-01 00:00:00, 2017-07-01 00:00:00]    14
  (2017-07-01 00:00:00, 2017-10-01 00:00:00]    15
  (2017-10-01 00:00:00, 2018-01-01 00:00:00]    16
  (2018-01-01 00:00:00, 2018-04-01 00:00:00]    17
  (2018-04-01 00:00:00, 2018-07-01 00:00:00]    18
  (2018-07-01 00:00:00, 2018-10-01 00:00:00]    19
  (20

### Split data to training and test dataset
Purpose: find dependencies between target and feature column

In [380]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [381]:
X_train.shape, X_test.shape

((99052, 6), (48788, 6))

## Fitting the model, evaluating the results and visualizing the trees
* Data totally prepared
* Classifier is instantiated
* Model is fit onto the data
* Ensure the model is neither over fitting and under fitting the data
* Evaluate classifier: confusion matrix, precision score, f1 score, recall, support scores

### Initial variables

In [382]:
from sklearn.tree import DecisionTreeClassifier

row_counts = len(X.index)
max_depth = analysis_depth
min_samples_split = 0
min_samples_leaf = 0

is_big_data = row_counts > 10000

if is_big_data:
    # 確保葉節點有足夠的樣本進行有意義的分析，同時避免過度細分
    # 100 - 1000
    min_samples_leaf = 100
    # 確保在分割內部節點之前有足夠的樣本數
    # 10 - 50
    min_samples_split = 10
else:
    # 確保每個葉節點至少有一些樣本進行分析
    # 1 or 2
    min_samples_leaf = 1
    # 確保在內部節點的樣本數較少時也可以進行分割
    # 2 - 5
    min_samples_split = 2

### Fitting training data into decision tree classifier

In [383]:
clf = DecisionTreeClassifier(
    criterion="entropy",
    splitter="best",
    max_depth=max_depth,
    random_state=0,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
decision_tree = clf.fit(X_train, y_train)

### Dependencies

#### Feature importance

In [384]:
# Pair importance and feature
feature_importance = clf.feature_importances_
feature_importance_pairs = list(zip(feature_names, feature_importance))
# Sort importance
feature_importance_pairs.sort(key=lambda pair: pair[1], reverse=True)
feature_importance_pairs

[('信用卡交易金額[新台幣]', 0.3546239835426098),
 ('地區', 0.2532279207551779),
 ('產業別', 0.17436739794266182),
 ('年月', 0.07766202584870012),
 ('信用卡交易筆數', 0.07124271335185399),
 ('性別', 0.0688759585589964)]

#### Cross validation

In [385]:
from sklearn.model_selection import cross_val_score

# cv: k-fold, default is 5-fold
cross_validation_score = cross_val_score(clf, X, y, cv=5)
print("交叉驗證分數:", cross_validation_score)
print("平均分數:", cross_validation_score.mean())

交叉驗證分數: [0.39086174 0.3773336  0.41541531 0.4240395  0.39434524]
平均分數: 0.4003990800865801


In [386]:
y_predict_test = clf.predict(X_test)
y_predict_test

array(['大學', '其他', '博士', ..., '碩士', '博士', '專科'], dtype=object)

In [387]:
y_predict_train = clf.predict(X_train)
y_predict_train

array(['博士', '大學', '高中高職', ..., '大學', '高中高職', '博士'], dtype=object)

#### Model accuracy

In [388]:
from sklearn.metrics import accuracy_score

print("Training set score: {:.4f}".format(accuracy_score(y_train, y_predict_train)))
print("Test set score: {0:0.4f}".format(accuracy_score(y_test, y_predict_test)))

Training set score: 0.4333
Test set score: 0.3929


### Resolve decision tree structure to json

In [389]:
from sklearn import tree
from graphviz import Source

output_file_path = f"{Path.cwd().absolute()}/temp/temp.dot"

#### Types definitions

In [390]:
from dataclasses import dataclass


@dataclass
class DecisionTreeNode:
    id: int
    labels: list[str]


@dataclass
class DecisionTreeEdge:
    id: int
    label: str
    head: int
    tail: int


@dataclass
class DecisionTreeGraph:
    nodes: list[DecisionTreeNode]
    edges: dict[str, DecisionTreeEdge]  # node1_node2 as key value


@dataclass
class DecisionTreePath:
    path: list[int]
    nodeLabel: dict[int, list[str]]

#### Export decision tree from model and reconstruct DecisionTreeGraph

In [391]:
# The analysis goal is discovering data, not just training model
# ! re-fit the hole data (target and features, X and y), not splitted data

decision_tree = clf.fit(X, y)

# * Scikit-learn decision tree:
# Using optimized version of the CART algorithm
# Not support categorical variable for now, that is, categorical variable need to encode

# * Entropy range:
# From 0 to 1 for binary classification (target has only two classes, true or false)
# From 0 to log base 2 k where k is the number of classes

dotData = tree.export_graphviz(
    clf,
    out_file=output_file_path,
    feature_names=feature_names,
    class_names=target_values,
    max_depth=max_depth,
    label="all",
    rounded=True,
    filled=True,
)

with open(output_file_path, "r", encoding="utf-8") as f:
    dotData = f.read()

# Use graphviz lib to convert dot format to json format
source = Source(dotData)
json_graph = source.pipe(format="json").decode("utf-8")
dict_graph: dict = json.loads(json_graph)

# Filter needed part
nodes = list(
    map(
        lambda o: {"id": o.get("_gvid"), "labels": o.get("label").split("\\n")},
        dict_graph.get("objects"),
    )
)

edges = dict(
    map(
        lambda o: (
            str(o.get("tail")) + "_" + str(o.get("head")),
            {
                "id": o.get("_gvid"),
                "label": o.get("headlabel"),
                "head": o.get("tail"),
                "tail": o.get("head"),
            },
        ),
        dict_graph.get("edges"),
    )
)

#### Store information

In [392]:
data_information: dict[str, str or list or dict] = {}
data_information["target_name"] = target
data_information["target_values"] = target_values
data_information["feature_names"] = feature_names

# Numeric
feature_values: dict[str, dict[str, str or list]] = {}

for n in numerical_column_max_min_tuple:
    feature_values[n[0]] = {"type": "numeric", "value": [n[1], n[2]]}

# Datetime
for d in datetime_column_earliest_latest_tuple:
    format = "%Y-%m-%d %X"
    feature_values[d[0]] = {
        "type": "datetime",
        "value": [d[1].strftime(format), d[2].strftime(format)],
    }

# Category
for c in category_column_mapping:
    is_datetime_column = c["col"] in quantization_datetime_columns
    feature_values[c["col"]] = {
        "type": "datetime" if is_datetime_column else "category",
        "value": (c["mapping"].index.to_list()),
        "mapping": pd.Series(dict((v, k) for k, v in c["mapping"].items())),
    }
    feature_values[c["col"]]["value"].pop()

unstored_features = list(set(feature_names) - set(list(feature_values.keys())))

# TODO: Custom mapping => 使用者指定 1 男, 2 女
for f in unstored_features:
    split_value = df[f].astype("string").unique().tolist()
    mapping_pairs = dict((i + 1, split_value[i]) for i in range(len(split_value)))
    mapping_pairs[-2] = "nan"
    mapping = pd.Series(mapping_pairs)
    feature_values[f] = {
        "type": "category",
        "value": split_value,
        "mapping": mapping,
    }

data_information["feature_values"] = feature_values

data_information

{'target_name': '教育程度類別',
 'target_values': ['高中高職', '其他', '博士', '碩士', '大學', '專科'],
 'feature_names': ['年月', '地區', '產業別', '性別', '信用卡交易筆數', '信用卡交易金額[新台幣]'],
 'feature_values': {'信用卡交易筆數': {'type': 'category',
   'value': ['low', 'middle', 'high'],
   'mapping':  1       low
    2    middle
    3      high
   -2       NaN
   dtype: object},
  '信用卡交易金額[新台幣]': {'type': 'category',
   'value': ['low', 'middle', 'high'],
   'mapping':  1       low
    2    middle
    3      high
   -2       NaN
   dtype: object},
  '年月': {'type': 'datetime',
   'value': ['(2014-01-01 00:00:00, 2014-04-01 00:00:00]',
    '(2014-04-01 00:00:00, 2014-07-01 00:00:00]',
    '(2014-07-01 00:00:00, 2014-10-01 00:00:00]',
    '(2014-10-01 00:00:00, 2015-01-01 00:00:00]',
    '(2015-01-01 00:00:00, 2015-04-01 00:00:00]',
    '(2015-04-01 00:00:00, 2015-07-01 00:00:00]',
    '(2015-07-01 00:00:00, 2015-10-01 00:00:00]',
    '(2015-10-01 00:00:00, 2016-01-01 00:00:00]',
    '(2016-01-01 00:00:00, 2016-04-01 00:00:00]',

### Decision tree path parser

In [393]:
from math import log


def DecisionTreePathParser(graph: DecisionTreeGraph, root_id: int = 0):
    paths: list[DecisionTreePath] = []

    # DFS: Depth-First Search
    def SearchPathByDFS(current_id: int = 0, path: list[int] = []):
        if not graph:
            return

        path.append(current_id)

        edge_values = list(map(lambda edge: DecisionTreeEdge(**edge), list(graph.edges.values())))
        outgoing_edges = list(filter(lambda edge: edge.head == current_id, edge_values))

        # 如果目前節點沒有出邊（即為最底層節點），將路徑加入結果中
        if len(outgoing_edges) == 0:
            last_id = path[len(path) - 1]
            last_node = DecisionTreeNode(**(graph.nodes[last_id]))
            node_labels: dict[int, list[str]] = {}

            # ! 排除 entropy 高的 path
            entropy = float(last_node.labels[0].split(" ")[2])

            if entropy > log(len(feature_names), 2) / 2:
                path.pop()
                return
            # ! ####################

            node_labels[last_id] = last_node.labels

            for i in range(0, len(path) - 1):
                node_id = path[i]
                labels = DecisionTreeNode(**(graph.nodes[node_id])).labels

                # 如果下一個的 node id 是上一個 +1 則是 true，不然的話是 false
                # * left edge <= => true
                # * right edge > => false

                next_id = path[i + 1]

                if node_id + 1 != next_id:
                    new_labels = [*labels]
                    condition = new_labels[0]
                    split_condition = condition.split(" ")
                    split_condition[1] = ">"
                    new_labels[0] = " ".join(split_condition)
                    node_labels[node_id] = new_labels
                    continue

                node_labels[node_id] = [*labels]

            paths.append(DecisionTreePath([*path], node_labels))

        # 遍歷目前節點的所有出邊
        else:
            for edge in outgoing_edges:
                next_id = edge.tail

                # 遞迴呼叫深度優先搜索
                SearchPathByDFS(next_id, path)

        # 回溯，從路徑中移除目前節點
        path.pop()

    SearchPathByDFS(root_id)

    return paths


decision_tree_graph = DecisionTreeGraph(nodes, edges)
paths = DecisionTreePathParser(decision_tree_graph, 0)
print("Path counts = {}".format(len(paths)))

Path counts = 189


### Decision tree path analyzer

In [394]:
from math import ceil, floor


def DecisionTreePathAnalyzer(paths: list[DecisionTreePath], target_values: list[str], feature_names: list[str]):
    path_analysis_result: dict = {}
    for split_value in target_values:
        path_analysis_result[split_value] = []

    for path in paths:
        path_analysis_result_part = {}

        for feature_name in feature_names:
            path_analysis_result_part[feature_name] = data_information["feature_values"][feature_name]["value"].copy()

        for node_id in path.path:
            labels = path.nodeLabel[node_id][0].split(" ")
            feature_name = labels[0]
            split_symbol = labels[1]
            split_value = float(labels[2])

            if node_id == path.path[len(path.path) - 1]:
                class_name = path.nodeLabel[node_id][3].split(" ")[2]

                sample_value = " ".join(path.nodeLabel[node_id][2].split(" ")[2:]).split(", ")
                sample_value[0] = sample_value[0][1:]
                sample_value[len(sample_value) - 1] = sample_value[len(sample_value) - 1][0:-1]

                path_analysis_result_part["entropy"] = float(split_value)
                path_analysis_result_part["samples"] = list(map(lambda value: int(value), sample_value))
                path_analysis_result_part["labels"] = target_values
                path_analysis_result_part["class"] = class_name

                path_analysis_result[class_name].append(path_analysis_result_part)
                break

            feature_type = data_information["feature_values"][labels[0]]["type"]
            split_situation = [split_symbol, feature_type]

            match split_situation:
                case ["<=", "category"]:
                    mapping: pd.Series = data_information["feature_values"][feature_name]["mapping"]
                    filter_values = mapping.drop(-2).loc[1 : floor(split_value)].tolist()
                    path_analysis_result_part[feature_name] = filter_values
                case ["<=", "datetime"]:
                    mapping: pd.Series = data_information["feature_values"][feature_name]["mapping"]
                    filter_values = mapping.drop(-2).loc[1 : floor(split_value)].tolist()
                    path_analysis_result_part[feature_name] = filter_values
                case ["<=", "numeric"]:
                    if split_value < path_analysis_result_part[feature_name][1]:
                        path_analysis_result_part[feature_name][1] = split_value
                case [">", "category"]:
                    mapping: pd.Series = data_information["feature_values"][feature_name]["mapping"]
                    filter_values = mapping.drop(-2).loc[ceil(split_value) :].tolist()
                    path_analysis_result_part[feature_name] = filter_values
                case [">", "datetime"]:
                    mapping: pd.Series = data_information["feature_values"][feature_name]["mapping"]
                    filter_values = mapping.drop(-2).loc[ceil(split_value) :].tolist()
                    path_analysis_result_part[feature_name] = filter_values
                case [">", "numeric"]:
                    if split_value > path_analysis_result_part[feature_name][0]:
                        path_analysis_result_part[feature_name][0] = split_value
                case _:
                    print("no match case")

    return path_analysis_result


path_analysis_result = DecisionTreePathAnalyzer(paths=paths, target_values=target_values, feature_names=feature_names)

path_analysis_result

{'高中高職': [{'年月': ['(2016-01-01 00:00:00, 2016-04-01 00:00:00]',
    '(2016-04-01 00:00:00, 2016-07-01 00:00:00]',
    '(2016-07-01 00:00:00, 2016-10-01 00:00:00]',
    '(2016-10-01 00:00:00, 2017-01-01 00:00:00]',
    '(2017-01-01 00:00:00, 2017-04-01 00:00:00]',
    '(2017-04-01 00:00:00, 2017-07-01 00:00:00]',
    '(2017-07-01 00:00:00, 2017-10-01 00:00:00]',
    '(2017-10-01 00:00:00, 2018-01-01 00:00:00]',
    '(2018-01-01 00:00:00, 2018-04-01 00:00:00]',
    '(2018-04-01 00:00:00, 2018-07-01 00:00:00]',
    '(2018-07-01 00:00:00, 2018-10-01 00:00:00]',
    '(2018-10-01 00:00:00, 2019-01-01 00:00:00]',
    '(2019-01-01 00:00:00, 2019-04-01 00:00:00]',
    '(2019-04-01 00:00:00, 2019-07-01 00:00:00]',
    '(2019-07-01 00:00:00, 2019-10-01 00:00:00]',
    '(2019-10-01 00:00:00, 2020-01-01 00:00:00]',
    '(2020-01-01 00:00:00, 2020-04-01 00:00:00]',
    '(2020-04-01 00:00:00, 2020-07-01 00:00:00]',
    '(2020-07-01 00:00:00, 2020-10-01 00:00:00]',
    '(2020-10-01 00:00:00, 2021-01-0

### Path data to JSON string

In [395]:
json_str = json.dumps(path_analysis_result)
json_object = json.loads(json_str)

for k in json_object:
    print(k, len(json_object[k]))

高中高職 1
其他 103
博士 36
碩士 4
大學 44
專科 1


In [396]:
for k in quantile_mapping:
    print(quantile_mapping[k]["col"])
    print(quantile_mapping[k]["quantile"].to_list())
    print(quantile_mapping[k]["labels"])

信用卡交易筆數
[Interval(-3.001, 879.0, closed='right'), Interval(879.0, 4356.0, closed='right'), Interval(4356.0, 478692.0, closed='right')]
['low', 'middle', 'high']
信用卡交易金額[新台幣]
[Interval(-1795429.001, 1993207.667, closed='right'), Interval(1993207.667, 11233651.667, closed='right'), Interval(11233651.667, 844247746.0, closed='right')]
['low', 'middle', 'high']


In [397]:
df.head()

Unnamed: 0,年月,地區,產業別,性別,教育程度類別,信用卡交易筆數,信用卡交易金額[新台幣]
0,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,百貨,1,高中高職,low,low
1,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,百貨,1,其他,low,low
2,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,其他,1,博士,low,low
3,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,其他,1,碩士,middle,middle
4,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,其他,1,大學,high,high


In [398]:
df.tail()

Unnamed: 0,年月,地區,產業別,性別,教育程度類別,信用卡交易筆數,信用卡交易金額[新台幣]
147835,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,文教康樂,1,其他,middle,middle
147836,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,百貨,1,博士,low,low
147837,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,百貨,1,碩士,low,low
147838,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,百貨,1,大學,low,low
147839,"(2015-10-01 00:00:00, 2016-01-01 00:00:00]",南投縣,百貨,1,專科,low,low


In [399]:
X

Unnamed: 0,年月,地區,產業別,性別,信用卡交易筆數,信用卡交易金額[新台幣]
0,8,1,1,1,1,1
1,8,1,1,1,1,1
2,8,1,2,1,1,1
3,8,1,2,1,2,2
4,8,1,2,1,3,3
...,...,...,...,...,...,...
147835,8,1,7,1,2,2
147836,8,1,1,1,1,1
147837,8,1,1,1,1,1
147838,8,1,1,1,1,1


In [400]:
y

0         高中高職
1           其他
2           博士
3           碩士
4           大學
          ... 
147835      其他
147836      博士
147837      碩士
147838      大學
147839      專科
Name: 教育程度類別, Length: 147840, dtype: string

In [403]:
data_information

{'target_name': '教育程度類別',
 'target_values': ['高中高職', '其他', '博士', '碩士', '大學', '專科'],
 'feature_names': ['年月', '地區', '產業別', '性別', '信用卡交易筆數', '信用卡交易金額[新台幣]'],
 'feature_values': {'信用卡交易筆數': {'type': 'category',
   'value': ['low', 'middle', 'high'],
   'mapping':  1       low
    2    middle
    3      high
   -2       NaN
   dtype: object},
  '信用卡交易金額[新台幣]': {'type': 'category',
   'value': ['low', 'middle', 'high'],
   'mapping':  1       low
    2    middle
    3      high
   -2       NaN
   dtype: object},
  '年月': {'type': 'datetime',
   'value': ['(2014-01-01 00:00:00, 2014-04-01 00:00:00]',
    '(2014-04-01 00:00:00, 2014-07-01 00:00:00]',
    '(2014-07-01 00:00:00, 2014-10-01 00:00:00]',
    '(2014-10-01 00:00:00, 2015-01-01 00:00:00]',
    '(2015-01-01 00:00:00, 2015-04-01 00:00:00]',
    '(2015-04-01 00:00:00, 2015-07-01 00:00:00]',
    '(2015-07-01 00:00:00, 2015-10-01 00:00:00]',
    '(2015-10-01 00:00:00, 2016-01-01 00:00:00]',
    '(2016-01-01 00:00:00, 2016-04-01 00:00:00]',

In [401]:
json_object

{'高中高職': [{'年月': ['(2016-01-01 00:00:00, 2016-04-01 00:00:00]',
    '(2016-04-01 00:00:00, 2016-07-01 00:00:00]',
    '(2016-07-01 00:00:00, 2016-10-01 00:00:00]',
    '(2016-10-01 00:00:00, 2017-01-01 00:00:00]',
    '(2017-01-01 00:00:00, 2017-04-01 00:00:00]',
    '(2017-04-01 00:00:00, 2017-07-01 00:00:00]',
    '(2017-07-01 00:00:00, 2017-10-01 00:00:00]',
    '(2017-10-01 00:00:00, 2018-01-01 00:00:00]',
    '(2018-01-01 00:00:00, 2018-04-01 00:00:00]',
    '(2018-04-01 00:00:00, 2018-07-01 00:00:00]',
    '(2018-07-01 00:00:00, 2018-10-01 00:00:00]',
    '(2018-10-01 00:00:00, 2019-01-01 00:00:00]',
    '(2019-01-01 00:00:00, 2019-04-01 00:00:00]',
    '(2019-04-01 00:00:00, 2019-07-01 00:00:00]',
    '(2019-07-01 00:00:00, 2019-10-01 00:00:00]',
    '(2019-10-01 00:00:00, 2020-01-01 00:00:00]',
    '(2020-01-01 00:00:00, 2020-04-01 00:00:00]',
    '(2020-04-01 00:00:00, 2020-07-01 00:00:00]',
    '(2020-07-01 00:00:00, 2020-10-01 00:00:00]',
    '(2020-10-01 00:00:00, 2021-01-0

### Select data from database with path parameters

In [402]:
# category column : add into where statement on select query
# numeric column
# ! 先手動編寫選擇語句，之後再用 code 去自動控制選擇的語句
query = text(f"SELECT * FROM [RawDB].[dbo].[D{OID}]")