## Step 1: Importing the required libraries and datasets

In [833]:
import sys
from pathlib import Path
import pandas as pd
import json
from sqlalchemy import text

# Append root path on execution time that it can find setup.py
sys.path.append(f"{Path.cwd().parent.absolute()}/")
from setup import setup

In [834]:
# Call setup function to connect to database
db = setup()
db

Engine(mssql+pyodbc://dataviz:***@10.21.24.190\SQLEXPRESS:1433/RawDB?driver=ODBC+Driver+17+for+SQL+Server)

### Parameter

In [835]:
# This classification handler required parameters
OID = 139
skip_features = ["年月"]
target = "信用卡交易金額[新台幣]"
# target = "性別"
# target = "教育程度類別"
# target = "產業別"
# target = "信用卡交易筆數"

In [836]:
# Select data from OID
query = text(f"SELECT * FROM [RawDB].[dbo].[D{OID}]")
result = db.execute(query)
df = pd.DataFrame(result.fetchall())

### Show origin data shape

In [837]:
df.shape

(147840, 7)

In [838]:
df.head()

Unnamed: 0,年月,地區,產業別,性別,教育程度類別,信用卡交易筆數,信用卡交易金額[新台幣]
0,201511,南投縣,百貨,1,高中高職,419,386373
1,201511,南投縣,百貨,1,其他,246,196553
2,201511,南投縣,其他,1,博士,121,560606
3,201511,南投縣,其他,1,碩士,1397,6765858
4,201511,南投縣,其他,1,大學,4596,23640387


## Step 2: Exploratory data analysis and feature engineering
* Clean and pre-processing data
* Split data to training sets (70% - 80%) and test sets
* Feature engineering: category values are encoded and other suitable changes are made to the data
* Predictive model is ready

In [839]:
column_names = df.columns.to_list()
column_names

['年月', '地區', '產業別', '性別', '教育程度類別', '信用卡交易筆數', '信用卡交易金額[新台幣]']

In [840]:
# Value counts of each columns
for col in column_names:
    print(df[col].value_counts(), "\n")

201511    1344
202108    1344
202207    1344
202206    1344
202205    1344
          ... 
201808    1344
201807    1344
201806    1344
201805    1344
201510    1344
Name: 年月, Length: 110, dtype: int64 

南投縣    9240
雲林縣    9240
宜蘭縣    9240
嘉義縣    9240
屏東縣    9240
嘉義市    9240
花蓮縣    9240
台東縣    9240
澎湖縣    9240
金門縣    9240
連江縣    9240
基隆市    9240
新竹市    9240
新竹縣    9240
苗栗縣    9240
彰化縣    9240
Name: 地區, dtype: int64 

百貨      21120
其他      21120
食       21120
衣       21120
住       21120
行       21120
文教康樂    21120
Name: 產業別, dtype: int64 

1    73920
2    73920
Name: 性別, dtype: int64 

高中高職    24640
其他      24640
博士      24640
碩士      24640
大學      24640
專科      24640
Name: 教育程度類別, dtype: int64 

0        1776
1         514
3         355
2         350
4         329
         ... 
6609        1
27418       1
18550       1
30565       1
25795       1
Name: 信用卡交易筆數, Length: 26352, dtype: int64 

0           1774
2000          38
1600          14
3000          13
1800          12
            

In [841]:
# Check if columns have any null value
df.isnull().sum()

年月              0
地區              0
產業別             0
性別              0
教育程度類別          0
信用卡交易筆數         0
信用卡交易金額[新台幣]    0
dtype: int64

### Handle target and features selection

In [842]:
# Choose one target column (or called target attribute, that is, y) and drop from X (features)
labels = ["low", "middle", "high"]
discrete_bin_num = 3
X: pd.DataFrame
try:
    X = df.drop([target] + skip_features, axis=1)
except:
    print("Column of target or skip features not exist in data frame")
feature_names = X.columns
# If value of target column are numeric, divide it into multiple intervals (discretize)
is_category_column = df[target].dtype == "object" or len(df[target].unique()) <= 100
y = df[target].astype("string") if is_category_column else pd.qcut(df[target], q=discrete_bin_num, labels=labels)
class_names = y.unique()

In [843]:
class_names

['low', 'middle', 'high']
Categories (3, object): ['low' < 'middle' < 'high']

### Show features (X) and target (y)

In [844]:
X.head()

Unnamed: 0,地區,產業別,性別,教育程度類別,信用卡交易筆數
0,南投縣,百貨,1,高中高職,419
1,南投縣,百貨,1,其他,246
2,南投縣,其他,1,博士,121
3,南投縣,其他,1,碩士,1397
4,南投縣,其他,1,大學,4596


In [845]:
y.head()

0       low
1       low
2       low
3    middle
4      high
Name: 信用卡交易金額[新台幣], dtype: category
Categories (3, object): ['low' < 'middle' < 'high']

In [846]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [847]:
X_train.shape, X_test.shape

((99052, 5), (48788, 5))

In [848]:
X.dtypes

地區         object
產業別        object
性別          int64
教育程度類別     object
信用卡交易筆數     int64
dtype: object

In [849]:
# Prepare category list for encoding
category_frame = X.select_dtypes(include=["object"])

In [850]:
category_frame.head()

Unnamed: 0,地區,產業別,教育程度類別
0,南投縣,百貨,高中高職
1,南投縣,百貨,其他
2,南投縣,其他,博士
3,南投縣,其他,碩士
4,南投縣,其他,大學


### Encoding category value of features

In [851]:
# Transform category attribute into encoded value
import category_encoders as ce

encoder = ce.OrdinalEncoder(cols=category_frame.columns)
X = encoder.fit_transform(X)
X_train = encoder.fit_transform(X_train)
X_test = encoder.fit_transform(X_test)

In [852]:
# After category value encoded
X_train.head()

Unnamed: 0,地區,產業別,性別,教育程度類別,信用卡交易筆數
129818,1,1,2,1,123
11453,2,2,1,2,15250
142663,3,2,1,3,24631
138042,4,2,1,4,13841
139111,5,3,1,3,1983



## Step 3: Fitting the model, evaluating the results and visualizing the trees
* Data totally prepared
* Classifier is instantiated
* Model is fit onto the data
* Ensure the model is neither over fitting and under fitting the data
* Evaluate classifier: confusion matrix, precision score, f1 score, recall, support scores

In [853]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

### Fitting data into decision tree classifier

In [854]:
row_counts = len(X.index)
max_depth = 20
is_big_data = row_counts > 10000
min_samples_split = 0
min_samples_leaf = 0
if is_big_data:
    # 確保葉節點有足夠的樣本進行有意義的分析，同時避免過度細分
    # 100 - 1000
    min_samples_leaf = 100
    # 確保在分割內部節點之前有足夠的樣本數
    # 10 - 50
    min_samples_split = 10
else:
    # 確保每個葉節點至少有一些樣本進行分析
    # 1 or 2
    min_samples_leaf = 1
    # 確保在內部節點的樣本數較少時也可以進行分割
    # 2 - 5
    min_samples_split = 2

clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=max_depth,
    random_state=0,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
decision_tree = clf.fit(X_train, y_train)

In [855]:
y_predict_test = clf.predict(X_test)
y_predict_test

array(['low', 'high', 'low', ..., 'low', 'low', 'middle'], dtype=object)

In [856]:
y_predict_train = clf.predict(X_train)
y_predict_train


array(['low', 'high', 'high', ..., 'high', 'low', 'low'], dtype=object)

### Model accuracy

In [857]:
# 如果某個分類目標的準確率低，代表這個目標與其他屬性的關聯性低，也就是說，其他屬性不足以準確的分類 (預測) 這個目標
print("Training set score: {:.4f}".format(
    accuracy_score(y_train, y_predict_train)))
print("Test set score: {0:0.4f}".format(
    accuracy_score(y_test, y_predict_test)))


Training set score: 0.9250
Test set score: 0.6928


In [858]:
from sklearn import tree
from graphviz import Source

output_file_path = f"{Path.cwd().absolute()}/temp/temp.dot"


### Export tree structure as json

In [859]:
decision_tree = clf.fit(X, y)

dotData = tree.export_graphviz(
    clf,
    out_file=output_file_path,
    feature_names=feature_names,
    class_names=class_names,
    max_depth=max_depth,
    label="all",
    rounded=True,
    filled=True,
)

with open(output_file_path, "r", encoding="utf-8") as f:
    dotData = f.read()

# Use graphviz lib to convert dot format to json format
source = Source(dotData)
jsonGraph = source.pipe(format="json").decode("utf-8")
dictGraph: dict = json.loads(jsonGraph)
result = {"nodes": [], "edges": []}

# Filter needed part
result["nodes"] = list(
    map(
        lambda o: {"id": o.get("_gvid"), "labels": o.get("label").split("\\n")},
        dictGraph.get("objects"),
    )
)

result["edges"] = dict(
    map(
        lambda o: (
            str(o.get("tail")) + "_" + str(o.get("head")),
            {
                "id": o.get("_gvid"),
                "label": o.get("headlabel"),
                "head": o.get("tail"),
                "tail": o.get("head"),
            },
        ),
        dictGraph.get("edges"),
    )
)

In [860]:
result["nodes"]


[{'id': 0,
  'labels': ['信用卡交易筆數 <= 1725.5',
   'entropy = 1.585',
   'samples = 147840',
   'value = [49280, 49280, 49280]',
   'class = low']},
 {'id': 1,
  'labels': ['信用卡交易筆數 <= 419.5',
   'entropy = 0.872',
   'samples = 66790',
   'value = [204, 48478, 18108]',
   'class = middle']},
 {'id': 2,
  'labels': ['信用卡交易筆數 <= 308.5',
   'entropy = 0.113',
   'samples = 36148',
   'value = [0, 35600, 548]',
   'class = middle']},
 {'id': 3,
  'labels': ['信用卡交易筆數 <= 145.5',
   'entropy = 0.039',
   'samples = 31719',
   'value = [0, 31587, 132]',
   'class = middle']},
 {'id': 4,
  'labels': ['entropy = 0.0',
   'samples = 22113',
   'value = [0, 22113, 0]',
   'class = middle']},
 {'id': 5,
  'labels': ['產業別 <= 6.5',
   'entropy = 0.105',
   'samples = 9606',
   'value = [0, 9474, 132]',
   'class = middle']},
 {'id': 6,
  'labels': ['信用卡交易筆數 <= 265.5',
   'entropy = 0.059',
   'samples = 8036',
   'value = [0, 7981, 55]',
   'class = middle']},
 {'id': 7,
  'labels': ['地區 <= 12.5',
   '

In [861]:
result["edges"]


{'0_1': {'id': 0, 'label': 'True', 'head': 0, 'tail': 1},
 '1_2': {'id': 2, 'label': None, 'head': 1, 'tail': 2},
 '2_3': {'id': 4, 'label': None, 'head': 2, 'tail': 3},
 '3_4': {'id': 6, 'label': None, 'head': 3, 'tail': 4},
 '3_5': {'id': 7, 'label': None, 'head': 3, 'tail': 5},
 '5_6': {'id': 8, 'label': None, 'head': 5, 'tail': 6},
 '6_7': {'id': 10, 'label': None, 'head': 6, 'tail': 7},
 '7_8': {'id': 12, 'label': None, 'head': 7, 'tail': 8},
 '8_9': {'id': 14, 'label': None, 'head': 8, 'tail': 9},
 '8_10': {'id': 15, 'label': None, 'head': 8, 'tail': 10},
 '10_11': {'id': 16, 'label': None, 'head': 10, 'tail': 11},
 '11_12': {'id': 18, 'label': None, 'head': 11, 'tail': 12},
 '12_13': {'id': 20, 'label': None, 'head': 12, 'tail': 13},
 '12_14': {'id': 21, 'label': None, 'head': 12, 'tail': 14},
 '14_15': {'id': 22, 'label': None, 'head': 14, 'tail': 15},
 '15_16': {'id': 24, 'label': None, 'head': 15, 'tail': 16},
 '16_17': {'id': 26, 'label': None, 'head': 16, 'tail': 17},
 '16_