## Step 1: Importing the required libraries and datasets

In [None]:
import sys
from pathlib import Path
import pandas as pd
import json
from sqlalchemy import text

# Append the project root to sys.path at run time so that setup.py can be imported
sys.path.append(f"{Path.cwd().parent.absolute()}/")
from setup import setup

In [None]:
# Call setup function to connect to database
db = setup()

### Parameters

In [None]:
# Parameters required by this classification notebook
OID = 139  # numeric suffix of the raw table that is queried (table D{OID})
skip_features = ["年月"]  # columns dropped from the feature set ("年月" = year-month)
# Candidate target columns; keep exactly one `target` assignment active
# target = "信用卡交易金額[新台幣]"  # credit card transaction amount [NTD]
# target = "性別"  # gender
target = "教育程度類別"  # education level category
# target = "產業別"  # industry
# target = "信用卡交易筆數"  # number of credit card transactions

### Fetch data

In [None]:
# Load every row of raw table D{OID}.
# OID is interpolated into the statement because a table name cannot be passed as a bound
# parameter; it must therefore stay a trusted, hard-coded value.
query = text(f"SELECT * FROM [RawDB].[dbo].[D{OID}]")
result = db.execute(query)
# Read the column names from the result metadata (before the rows are consumed) instead of
# relying on pandas to infer them from the row objects, so the frame keeps its named columns
# even when the query returns no rows.
column_labels = list(result.keys())
df = pd.DataFrame(result.fetchall(), columns=column_labels)

### Show original data shape

In [None]:
df.shape

In [None]:
df.head()

## Step 2: Exploratory data analysis and feature engineering
* Clean and pre-process the data
* Split the data into a training set (70% - 80%) and a test set
* Feature engineering: categorical values are encoded and other suitable changes are made to the data
* Predictive model is ready

In [None]:
# Keep the column labels as a plain Python list; the value-count loop iterates over it
column_names: list[str] = df.columns.to_list()
column_names

In [None]:
# Print the value counts of every column and record [name, min, max] for each numeric column
# that has too many distinct values (more than 10) to be treated as categorical.
numerical_column_max_min_tuple = []
for col in column_names:
    column = df[col]
    # Accept every numeric dtype (int32, float64, ...), not only int64, so that
    # float or narrower-integer columns are not silently left out of the summary
    is_numeric = pd.api.types.is_numeric_dtype(column)
    # dropna=False counts a missing value as its own distinct value, like len(unique()) does
    is_category_column = column.nunique(dropna=False) <= 10
    print(column.value_counts(), "\n")
    if is_numeric and not is_category_column:
        numerical_column_max_min_tuple.append([col, column.min(), column.max()])

In [None]:
numerical_column_max_min_tuple

In [None]:
# Check if columns have any null value
df.isnull().sum()

### Handle target and features selection

In [None]:
# Choose one target column (the target attribute, y) and drop it, together with the skipped
# columns, from the features (X).
# Validate the requested columns first: a missing column must stop the cell with a clear
# message instead of being printed and then surfacing later as an unrelated NameError on X.
missing_columns = [col for col in [target, *skip_features] if col not in df.columns]
if missing_columns:
    raise KeyError(f"Column of target or skip features not exist in data frame: {missing_columns}")

# The target is treated as categorical when it has object dtype or at most 10 distinct values
is_category_column = df[target].dtype == "object" or len(df[target].unique()) <= 10
labels = ["low", "middle", "high"]
discrete_bin_num = 3
X: pd.DataFrame = df.drop([target] + skip_features, axis=1)
feature_names = X.columns
# If the values of the target column are numeric, divide them into quantile intervals
# (discretize) named by `labels`; otherwise use the values themselves as class labels
y = df[target].astype("string") if is_category_column else pd.qcut(df[target], q=discrete_bin_num, labels=labels)
# NOTE: unique() lists the classes in order of first appearance, which is not necessarily sorted
class_names = y.unique()

In [None]:
class_names

### Show features (X) and target (y)

In [None]:
X.head()

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible
split_options = {"test_size": 0.33, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
X_train.shape, X_test.shape

In [None]:
X.dtypes

In [None]:
# Collect the object-dtype feature columns; their column names are what the encoder is told to encode
category_frame = X.select_dtypes(include=["object"])

In [None]:
category_frame.head()

### Encoding category value of features

In [None]:
# Transform the categorical attributes into ordinal integer codes
import category_encoders as ce

encoder = ce.OrdinalEncoder(cols=category_frame.columns)
# Fit the encoder exactly once and reuse that single mapping for every frame.
# Re-fitting on each frame would assign different integer codes to the same category in the
# training and test sets, so the tree would be evaluated on features whose meaning differs
# from what it was trained on. The encoder is fitted on the full feature frame so that no
# category is unknown in any split; only feature values (never the target) are used.
X = encoder.fit_transform(X)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [None]:
# Show, for each encoded column, its name followed by the mapping the encoder holds for it
for column_entry in encoder.mapping:
    column_label, code_mapping = column_entry["col"], column_entry["mapping"]
    print(column_label)
    print(code_mapping, "\n")

In [None]:
# After category value has been encoded
X_train.head()


## Step 3: Fitting the model, evaluating the results and visualizing the trees
* Data totally prepared
* Classifier is instantiated
* Model is fit onto the data
* Ensure the model is neither overfitting nor underfitting the data
* Evaluate classifier: confusion matrix, precision score, f1 score, recall, support scores

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

### Fitting data into decision tree classifier

In [None]:
# Pick the leaf/split thresholds from the size of the data set, then fit the tree on the
# training split.
row_counts = len(X.index)
max_depth = 10
is_big_data = row_counts > 10000
if is_big_data:
    # Leaf: make sure leaf nodes hold enough samples for a meaningful analysis while avoiding
    # over-splitting (suggested range 100 - 1000).
    # Split: make sure there are enough samples before an internal node is split
    # (suggested range 10 - 50).
    min_samples_leaf, min_samples_split = 100, 10
else:
    # Leaf: make sure every leaf node still has at least some samples (1 or 2).
    # Split: allow a split even when an internal node holds few samples (suggested range 2 - 5).
    min_samples_leaf, min_samples_split = 1, 2

tree_params = {
    "criterion": "entropy",
    "splitter": "best",
    "max_depth": max_depth,
    "random_state": 0,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
}
clf = DecisionTreeClassifier(**tree_params)
decision_tree = clf.fit(X_train, y_train)

In [None]:
y_predict_test = clf.predict(X_test)
y_predict_test

In [None]:
y_predict_train = clf.predict(X_train)
y_predict_train

### Model accuracy

In [None]:
# A low accuracy for a classification target means that the target is only weakly related to
# the other attributes, i.e. the other attributes are not sufficient to classify (predict) it
# accurately.
train_accuracy = accuracy_score(y_train, y_predict_train)
print(f"Training set score: {train_accuracy:.4f}")
test_accuracy = accuracy_score(y_test, y_predict_test)
print(f"Test set score: {test_accuracy:0.4f}")

### Export tree structure as json

In [None]:
from sklearn import tree
from graphviz import Source

# Path of the intermediate Graphviz DOT file: <current working directory>/temp/temp.dot
output_file_path = f"{Path.cwd().absolute()}/temp/temp.dot"

In [None]:
# Refit on the complete data set so the exported tree is built from every row,
# not only from the training split.
decision_tree = clf.fit(X, y)

# export_graphviz matches class_names to the classifier's classes in ascending order, so the
# names are taken from clf.classes_ (which is sorted). y.unique() is in order of first
# appearance and would attach the wrong class label to the nodes.
# out_file=None makes export_graphviz return the DOT source as a string.
dotData = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=list(feature_names),
    class_names=[str(class_label) for class_label in clf.classes_],
    max_depth=max_depth,
    label="all",
    rounded=True,
    filled=True,
)

# Still persist the DOT source at the configured location, creating the folder when needed
dot_file = Path(output_file_path)
dot_file.parent.mkdir(parents=True, exist_ok=True)
dot_file.write_text(dotData, encoding="utf-8")

# Use graphviz lib to convert dot format to json format
source = Source(dotData)
jsonGraph = source.pipe(format="json").decode("utf-8")
dictGraph: dict = json.loads(jsonGraph)

# Keep only the needed parts of the graph.
# A tree that consists of a single node has no edges, and the "edges" key may then be absent
# from the JSON, so both collections default to an empty list.
result = {
    # One entry per tree node; the node label is split into its individual text lines
    "nodes": [
        {"id": node.get("_gvid"), "labels": node.get("label").split("\\n")}
        for node in dictGraph.get("objects", [])
    ],
    # Edges keyed by "<tail id>_<head id>"
    "edges": {
        f'{edge.get("tail")}_{edge.get("head")}': {
            "id": edge.get("_gvid"),
            "label": edge.get("headlabel"),
            # NOTE(review): "head" carries graphviz's tail and "tail" carries graphviz's head.
            # Kept exactly as before because consumers of this structure may rely on it -
            # confirm whether the swap is intentional.
            "head": edge.get("tail"),
            "tail": edge.get("head"),
        }
        for edge in dictGraph.get("edges", [])
    },
}

### Final result

In [None]:
result["nodes"]

In [None]:
result["edges"]