<a href="https://colab.research.google.com/github/dannyycwang/SankeyX/blob/main/SankeyX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Event types that can appear in a session
EVENTS = ['Browse', 'Detail', 'Add', 'Remove']

# Toy sessions: intent group, ordered events, and ground-truth label
data = [
    dict(session='Comparative', sequence=['Browse', 'Detail', 'Add'], label=1),
    dict(session='Comparative', sequence=['Browse', 'Detail', 'Remove'], label=0),
    dict(session='Hesitant', sequence=['Browse', 'Remove'], label=0),
    dict(session='Hesitant', sequence=['Browse', 'Detail', 'Remove'], label=0),
]

df = pd.DataFrame(data)

# Made-up model predictions, one per session
df['prediction'] = [1, 0, 1, 0]

# Made-up SHAP values, one per event of the matching `sequence`
df['shap_values'] = [
    [0.2, 0.3, 0.5],     # all positive
    [0.2, 0.1, -0.5],    # ends with a negative contribution
    [-0.2, -0.8],        # all negative
    [-0.1, 0.1, -0.7],
]

# Utility matrix keyed by (prediction, label)
UTILITY = {
    (1, 1): 3,       # TP
    (1, 0): -1,      # FP
    (0, 1): -2.5,    # FN
    (0, 0): 0,       # TN
}

def get_utility(row):
    """Return the UTILITY entry for this row's (prediction, label) pair."""
    outcome_key = (row['prediction'], row['label'])
    return UTILITY[outcome_key]

# Attach the utility of each session's (prediction, label) outcome
df['utility'] = df.apply(get_utility, axis=1)

# Node colours keyed by node name
event_color_map = {
    'Browse': 'green',
    'Detail': 'skyblue',
    'Add': 'orange',
    'Remove': 'lightsteelblue',
    'True Positive': 'purple',
    'True Negative': 'gray',
    'False Positive': 'red',
    'False Negative': 'pink'
}

# Node list: unique events first, then the four outcomes, then the utility sink
nodes = list(dict.fromkeys(EVENTS))
for outcome in ('True Positive', 'True Negative', 'False Positive', 'False Negative'):
    nodes.append(outcome)
nodes += ['Utility']

# Node name -> position in `nodes`
node_map = dict(zip(nodes, range(len(nodes))))

# Parallel lists, one entry per Sankey link
links = {field: [] for field in ('source', 'target', 'value', 'color', 'label')}

# (prediction, label) -> name of the outcome node
_OUTCOME_NAMES = {
    (1, 1): 'True Positive',
    (0, 0): 'True Negative',
    (1, 0): 'False Positive',
    (0, 1): 'False Negative',
}

for _, row in df.iterrows():
    seq = row['sequence']
    shap = row['shap_values']
    if not seq:
        # A session without events has no last step to attach an outcome to
        continue

    # Event -> event links: the width is |SHAP| of the destination event and
    # the colour encodes its sign (gray when positive, red otherwise)
    for step in range(1, len(seq)):
        contribution = shap[step]
        links['source'].append(node_map[seq[step - 1]])
        links['target'].append(node_map[seq[step]])
        links['value'].append(abs(contribution))
        links['color'].append('gray' if contribution > 0 else 'red')
        links['label'].append('')

    # Last event -> outcome. An explicit lookup raises KeyError for a
    # prediction/label outside {0, 1}; an if/elif chain would instead keep
    # whatever `outcome` was left over from an earlier statement.
    outcome = _OUTCOME_NAMES[(row['prediction'], row['label'])]

    links['source'].append(node_map[seq[-1]])
    links['target'].append(node_map[outcome])
    links['value'].append(1)
    links['color'].append(event_color_map[outcome])
    links['label'].append(outcome)

    # Outcome -> utility: the width is |utility|, the colour encodes its sign
    utility = row['utility']
    links['source'].append(node_map[outcome])
    links['target'].append(node_map['Utility'])
    links['value'].append(abs(utility))
    links['color'].append('gold' if utility > 0 else 'red')
    links['label'].append('Utility: $' + str(utility))

# Draw the Sankey diagram
node_style = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=nodes,
    # Nodes without an entry in the colour map (e.g. 'Utility') fall back to light gray
    color=[event_color_map.get(n, 'lightgray') for n in nodes],
)
link_style = dict(
    source=links['source'],
    target=links['target'],
    value=links['value'],
    color=links['color'],
    label=links['label'],
)
fig = go.Figure(data=[go.Sankey(arrangement="snap", node=node_style, link=link_style)])

fig.update_layout(title_text="SankeyX Visualization Demo", font_size=13)
fig.show()


NameError: name 'links' is not defined

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import ast

# Load the newly uploaded dataset
df = pd.read_csv("results2_with_cluster (1).csv")

# Keep the last n items of a sequence; left-pad with 0 when it is shorter
def last_n(seq, n=20):
    """Return the last ``n`` items of ``seq`` as a list of exactly ``n`` items.

    Sequences shorter than ``n`` are left-padded with 0. Any iterable is
    accepted (it is copied into a list first). ``n <= 0`` yields an empty
    list; note that ``seq[-0:]`` would otherwise return the whole sequence.
    """
    if n <= 0:
        return []
    items = list(seq)
    missing = n - len(items)
    if missing > 0:
        return [0] * missing + items
    return items[-n:]

# Parse the stringified sequences, then pad/truncate each one to 20 steps
df['truncated_sequence'] = df['truncated_sequence'].apply(ast.literal_eval)
df['X_seq'] = df['truncated_sequence'].apply(last_n, n=20)

# Intent flag columns (adjust the names to match the actual dataset);
# missing flags count as False and everything is cast to 0/1
intent_cols = [f'Intent{k}' for k in range(1, 7)]
for col in intent_cols:
    df[col] = df[col].fillna(False).astype(int)

X_seq = np.array(df['X_seq'].tolist())
X_feat = df[intent_cols].to_numpy()
y = df['purchase'].to_numpy()

# 80/20 train/test split with a fixed seed
split = train_test_split(X_seq, X_feat, y, test_size=0.2, random_state=42)
X_seq_tr, X_seq_te, X_feat_tr, X_feat_te, y_tr, y_te = split

# Sequence-only variant (a with_feat variant could be enabled instead)
def create_transformer_model():
    """Build and compile a small Transformer-style binary classifier.

    The network takes a length-20 integer sequence, embeds it, applies one
    self-attention + feed-forward block with residual connections and layer
    normalisation, average-pools over the sequence and ends in a sigmoid unit.
    It is compiled with Adam, binary cross-entropy and an accuracy metric.

    Returns:
        The compiled Keras model.
    """
    # Hyper-parameters. VOCAB_SIZE is the Embedding input_dim, so token ids
    # are presumably expected in [0, VOCAB_SIZE) - TODO confirm against the data.
    MAXLEN, VOCAB_SIZE, EMBED_DIM, FF_DIM, NUM_HEADS, DROPOUT = 20, 20, 32, 64, 2, 0.3
    seq_input = Input(shape=(MAXLEN,), name='sequence_input')
    # mask_zero=True marks token 0 (the padding value produced by last_n) as masked
    x = layers.Embedding(VOCAB_SIZE, EMBED_DIM, mask_zero=True)(seq_input)
    # NOTE(review): this Embedding is called on a concrete tf.range tensor rather
    # than on a symbolic input; it may be evaluated once here and enter the model
    # as a constant, in which case the position embedding would not be trained.
    # Confirm with model.summary() / model.trainable_weights.
    pos_encoding = layers.Embedding(MAXLEN, EMBED_DIM)(tf.range(MAXLEN))
    # NOTE(review): verify that the padding mask survives this addition.
    x = x + pos_encoding
    # transformer block
    attn_output = layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)(x, x)
    attn_output = layers.Dropout(DROPOUT)(attn_output)
    # Residual connection around attention, then layer norm
    out1 = layers.LayerNormalization()(x + attn_output)
    # Position-wise feed-forward network projecting back to EMBED_DIM
    ffn_output = layers.Dense(FF_DIM, activation="relu")(out1)
    ffn_output = layers.Dense(EMBED_DIM)(ffn_output)
    ffn_output = layers.Dropout(DROPOUT)(ffn_output)
    # Residual connection around the feed-forward network, then layer norm
    x = layers.LayerNormalization()(out1 + ffn_output)
    # Collapse the sequence axis before the classification head
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(FF_DIM, activation='relu')(x)
    x = layers.Dropout(DROPOUT)(x)
    output = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=seq_input, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_eval(model, X_train, y_train, X_test, y_test, name, validation_data=None):
    """Train ``model``, print test metrics and save per-row predictions.

    Args:
        model: Compiled Keras model.
        X_train, y_train: Training inputs and binary labels.
        X_test, y_test: Held-out inputs and binary labels used for the report.
        name: Run name; results are written to ``f"{name}_result.csv"``.
        validation_data: Optional ``(X_val, y_val)`` used for early stopping.
            Defaults to ``(X_test, y_test)`` to preserve earlier behaviour.
            With that default the best weights are selected on the test set,
            so the printed metrics are optimistic; pass a separate validation
            split for an unbiased report.
    """
    if validation_data is None:
        validation_data = (X_test, y_test)

    cb = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=10, batch_size=32,
              validation_data=validation_data, callbacks=[cb])

    # reshape(-1) keeps a 1-D vector even for a single test row, where
    # squeeze() would collapse the result to a 0-d scalar
    y_prob = np.asarray(model.predict(X_test, verbose=0)).reshape(-1)
    y_pred = (y_prob > 0.5).astype(int)

    print(f"\n{name} Evaluation:")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    # zero_division=0 on all three scores: same value as the library default
    # for an empty class, but without the undefined-metric warning
    print("Precision:", precision_score(y_test, y_pred, zero_division=0))
    print("Recall   :", recall_score(y_test, y_pred, zero_division=0))
    print("F1 Score :", f1_score(y_test, y_pred, zero_division=0))

    # Optional: persist the per-row results
    pd.DataFrame({
        "y_true": y_test,
        "y_prob": y_prob,
        "y_pred": y_pred
    }).to_csv(f"{name}_result.csv", index=False)
    print(f"Saved: {name}_result.csv")

# Build the model, then train and evaluate it on the sequence inputs only
# NOTE(review): the run name says "seq6" but the sequences above are
# padded/truncated to 20 steps - confirm the intended name.
model = create_transformer_model()
train_eval(model, X_seq_tr, y_tr, X_seq_te, y_te, "A1_seq6only")


  df[col] = df[col].fillna(False).astype(int)
  df[col] = df[col].fillna(False).astype(int)
  df[col] = df[col].fillna(False).astype(int)
  df[col] = df[col].fillna(False).astype(int)
  df[col] = df[col].fillna(False).astype(int)
  df[col] = df[col].fillna(False).astype(int)


Epoch 1/10
[1m5519/5519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 14ms/step - accuracy: 0.9285 - loss: 0.1996 - val_accuracy: 0.9451 - val_loss: 0.1663
Epoch 2/10
[1m5519/5519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 14ms/step - accuracy: 0.9447 - loss: 0.1651 - val_accuracy: 0.9471 - val_loss: 0.1584
Epoch 3/10
[1m5519/5519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 14ms/step - accuracy: 0.9462 - loss: 0.1620 - val_accuracy: 0.9487 - val_loss: 0.1645
Epoch 4/10
[1m5519/5519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 13ms/step - accuracy: 0.9473 - loss: 0.1594 - val_accuracy: 0.9507 - val_loss: 0.1557
Epoch 5/10
[1m5519/5519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 13ms/step - accuracy: 0.9479 - loss: 0.1571 - val_accuracy: 0.9457 - val_loss: 0.1617
Epoch 6/10
[1m5519/5519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 14ms/step - accuracy: 0.9520 - loss: 0.1479 - val_accuracy: 0.9528 - val_loss: 0.1473
Epoc

In [None]:
import shap
import pandas as pd
import numpy as np

# Columns copied from the original dataframe into the exported sample
orig_cols = [
    'session_id_hash',
    'purchase',
    'symbolic_trace',
    'Intent_type',
    'truncated_sequence',
    'truncated_timestamp',
    'Intent1', 'Intent2', 'Intent3', 'Intent4', 'Intent5', 'Intent6'
]

# Step 1. Use the first N_POOL test rows as the explanation pool (adjustable)
N_POOL = 1000
n_test = len(X_seq_te)

# train_test_split shuffles by default, so the test rows are NOT the last
# n_test rows of df. Repeating the split on row positions with the same
# test_size and random_state reproduces the positions behind X_seq_te.
from sklearn.model_selection import train_test_split
_, test_idx = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)
if not np.array_equal(X_seq[test_idx], X_seq_te):
    raise RuntimeError(
        "Recovered test indices do not match X_seq_te; re-run the split cell first."
    )
pool_idx = test_idx[:N_POOL]

X_pool = X_seq_te[:N_POOL]
background = X_seq_tr[:100]

# Step 2. SHAP values for every pool row, using 100 training rows as background
explainer = shap.Explainer(model, background)
shap_values = explainer(X_pool)
shap_array = shap_values.values

# Step 3. Predictions (reshape keeps a 1-D vector even for a one-row pool)
y_prob = np.asarray(model.predict(X_pool)).reshape(-1)
y_pred = (y_prob > 0.5).astype(int)

# Step 4. Join metadata, predictions and SHAP values row by row
df_pool = df.iloc[pool_idx][orig_cols].reset_index(drop=True)
df_pool['y_prob'] = y_prob
df_pool['y_pred'] = y_pred
shap_df = pd.DataFrame(shap_array, columns=[f"SHAP_{i+1}" for i in range(X_pool.shape[1])])
df_concat = pd.concat([df_pool, shap_df], axis=1)

# Step 5. Stratified sample: up to 150 non-purchases and 50 purchases, of
# which 35 are correctly predicted whenever enough of each kind exist
purchase_pos = df_concat[df_concat['purchase'] == 1]
purchase_pos_right = purchase_pos[purchase_pos['y_pred'] == 1]
purchase_pos_wrong = purchase_pos[purchase_pos['y_pred'] == 0]

n_total = min(50, len(purchase_pos))
n_right = min(35, len(purchase_pos_right))
# Never ask for more misclassified rows than exist ...
n_wrong = min(n_total - n_right, len(purchase_pos_wrong))
# ... and top the quota up with correctly predicted rows instead
n_right = min(n_total - n_wrong, len(purchase_pos_right))

right_sample = purchase_pos_right.sample(n=n_right, random_state=42)
wrong_sample = purchase_pos_wrong.sample(n=n_wrong, random_state=42)
purchase_sample = pd.concat([right_sample, wrong_sample]).sample(n=n_total, random_state=42)

not_purchase = df_concat[df_concat['purchase'] == 0]
not_purchase_sample = not_purchase.sample(n=min(150, len(not_purchase)), random_state=42)

# Shuffle the combined sample and export it
final_sample = pd.concat([purchase_sample, not_purchase_sample]).sample(frac=1, random_state=42).reset_index(drop=True)
final_sample.to_csv("shap_sampled_200.csv", index=False)
print("已輸出：shap_sampled_200.csv")


PermutationExplainer explainer: 1001it [35:22,  2.13s/it]

[1m14/32[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m0s[0m 4ms/step 




[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
已輸出：shap_sampled_200.csv


In [None]:
# Merge SHAP values with the sampled rows and export them
# NOTE(review): `df_sample` is not defined in any cell visible in this file;
# this cell presumably relied on leftover notebook state - confirm before re-running.
# NOTE(review): pd.concat(axis=1) aligns rows on the index, so shap_array must
# hold exactly one row per df_sample row, in the same order - confirm
# (shap_array is computed above from a pool of up to 1000 test rows).
shap_df = pd.DataFrame(shap_array, columns=[f"SHAP_{i+1}" for i in range(20)])
df_concat = pd.concat([df_sample, shap_df], axis=1)
df_concat.to_csv("shap_with_original_200.csv", index=False)
print("已儲存: shap_with_original_200.csv")

已儲存: shap_with_original_200.csv


In [9]:
# 1. 必要套件
!pip install plotly pandas

import pandas as pd
import plotly.graph_objects as go

# 2. Load the sampled sessions together with their SHAP columns
df = pd.read_csv('shap_sampled_200.csv')

# 幫你預處理欄位（有些欄可能存成字串要eval）
import ast
def safe_eval(val):
    """Parse ``val`` as a Python literal; return it unchanged if that fails.

    Only the failure modes documented for ``ast.literal_eval`` are caught, so
    KeyboardInterrupt and SystemExit are no longer swallowed the way a bare
    ``except:`` would swallow them.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a literal string (e.g. already a list, NaN, or free text)
        return val

# Parse the stringified sequences back into Python objects
df['truncated_sequence'] = df['truncated_sequence'].apply(safe_eval)

# 3. Keep only sessions with at least 5 events
has_five_steps = df['truncated_sequence'].apply(lambda events: len(events) >= 5)
df = df[has_five_steps].reset_index(drop=True)

# Display controls (edit these)
num_sessions = 5        # how many sessions to draw
session_mode = 'first'  # 'first' or 'last'
intent_filter = None    # e.g. 'Comparative'; None keeps every intent

# Apply the controls
filtered = df[df['Intent_type'] == intent_filter] if intent_filter else df
selected = filtered.head(num_sessions) if session_mode == 'first' else filtered.tail(num_sessions)

# Utility matrix keyed by (y_pred, y_true)
utility_dict = {
    (1, 1): 3,       # TP
    (1, 0): -1,      # FP
    (0, 1): -2.5,    # FN
    (0, 0): 1e-6,    # TN
}

# Event code -> colour
event_color_map = {
    1: '#1f77b4',  # Browse
    2: '#ff7f0e',  # Detail
    3: '#2ca02c',  # Add
    4: '#d62728',  # Remove
    5: '#9467bd',  # Purchase (normally absent from truncated_sequence)
}

# Event code -> display name
event_name_map = {
    1: 'Browse',
    2: 'Detail',
    3: 'Add',
    4: 'Remove',
    5: 'Purchase',
}

# Outcome code -> colour
outcome_color_map = {
    'TP': '#6a3d9a',
    'TN': '#aaaaaa',
    'FP': '#e41a1c',
    'FN': '#ff7f00',
}

# (y_pred, y_true) -> outcome code
outcome_by_pair = {(1, 1): 'TP', (0, 0): 'TN', (1, 0): 'FP', (0, 1): 'FN'}

# 4. One record per session: last 5 events, their SHAP values, and the outcome
records = []
for _, row in selected.iterrows():
    tail = row['truncated_sequence'][-5:]
    # Left-pad with 0 when fewer than 5 events are present
    seq = [0] * (5 - len(tail)) + list(tail)
    # SHAP_16 .. SHAP_20 are the columns read for these five positions
    shap_values = [row[f'SHAP_{i}'] for i in range(16, 21)]
    y_pred = int(row['y_pred'])
    y_true = int(row['purchase'])
    intent = row['Intent_type']

    pair = (y_pred, y_true)
    outcome = outcome_by_pair[pair]
    utility = utility_dict[pair]

    records.append(dict(
        sequence=seq,
        shap=shap_values,
        y_pred=y_pred,
        y_true=y_true,
        outcome=outcome,
        utility=utility,
        intent=intent,
    ))

# 5. Build the Sankey nodes and links
separate_mode = True  # True = one node chain per session, False = nodes shared across sessions

nodes = []
node_labels = []
node_colors = []
node_idx = {}

# Outcome and utility nodes are shared by every session in both modes
for out in ['TP', 'TN', 'FP', 'FN']:
    node_name = f'OUT_{out}'
    nodes.append(node_name)
    node_labels.append(out)
    node_colors.append(outcome_color_map[out])
    node_idx[node_name] = len(nodes) - 1

nodes.append('UTILITY')
node_labels.append('Utility')
node_colors.append('#FFD700')
node_idx['UTILITY'] = len(nodes) - 1

sources = []
targets = []
values = []
colors = []
link_labels = []

for sidx, record in enumerate(records):
    prev = None
    for pos, (event, shap) in enumerate(zip(record['sequence'], record['shap'])):
        if separate_mode:
            # Prefixing with the session index makes every step node unique
            curr_node = f'{sidx}_{pos}_{event}'
            curr_label = f'S{str(sidx)}-{pos}-{event_name_map.get(event,"PAD")}'
        else:
            # Same position + same event share one node across sessions
            curr_node = f'{pos}_{event}'
            curr_label = event_name_map.get(event, 'PAD')

        if curr_node not in node_idx:
            nodes.append(curr_node)
            node_labels.append(curr_label)
            node_colors.append(event_color_map.get(event, '#cccccc'))
            node_idx[curr_node] = len(nodes) - 1

        if prev is not None:
            sources.append(node_idx[prev])
            targets.append(node_idx[curr_node])
            # +0.1 keeps links with a near-zero SHAP value visible
            values.append(abs(shap) + 0.1)
            colors.append(event_color_map.get(event, '#cccccc'))
            link_labels.append(f'SHAP: {shap:.2f}')
        prev = curr_node

    if prev is None:
        # No steps were drawn, so there is nothing to attach an outcome to
        continue

    # Last step -> outcome
    outcome_node = f'OUT_{record["outcome"]}'
    sources.append(node_idx[prev])
    targets.append(node_idx[outcome_node])
    values.append(1.5)
    colors.append(outcome_color_map[record['outcome']])
    link_labels.append(record['outcome'])

    # Outcome -> utility
    sources.append(node_idx[outcome_node])
    targets.append(node_idx['UTILITY'])
    values.append(abs(record['utility']))
    colors.append('#FFD700' if record['utility'] > 0 else '#e41a1c')
    link_labels.append(f'${record["utility"]}')

# Show the summed utility on the UTILITY node. It must be addressed through
# node_idx: step nodes are appended after it, so [-1] is a step node.
total_utility = sum([r['utility'] for r in records])
node_labels[node_idx['UTILITY']] = f'Utility: {total_utility:.2f}'

# Render the Sankey diagram
sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=node_labels,
    color=node_colors,
)
sankey_links = dict(
    source=sources,
    target=targets,
    value=values,
    color=colors,
    label=link_labels,
)
fig = go.Figure(go.Sankey(arrangement="snap", node=sankey_nodes, link=sankey_links))

# The title reflects the node-sharing mode in use
fig.update_layout(
    title_text=f"SankeyX Clickstream Visualization ({'Separate' if separate_mode else 'Grouped'} Mode)",
    font_size=12,
    width=1400,
    height=700,
)
fig.show()





In [14]:
# 1. 必要套件


import pandas as pd
import plotly.graph_objects as go

# 2. Load the sampled sessions together with their SHAP columns
df = pd.read_csv('shap_sampled_200.csv')

# 幫你預處理欄位（有些欄可能存成字串要eval）
import ast
def safe_eval(val):
    """Parse ``val`` as a Python literal; return it unchanged if that fails.

    Only the failure modes documented for ``ast.literal_eval`` are caught, so
    KeyboardInterrupt and SystemExit are no longer swallowed the way a bare
    ``except:`` would swallow them.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a literal string (e.g. already a list, NaN, or free text)
        return val

# Parse the stringified sequences back into Python objects
df['truncated_sequence'] = df['truncated_sequence'].apply(safe_eval)

# 3. Keep sessions with at least one event (five steps are no longer required)
has_events = df['truncated_sequence'].apply(lambda events: len(events) >= 1)
df = df[has_events].reset_index(drop=True)

# Display controls (edit these)
num_sessions = 5        # how many sessions to draw
session_mode = 'first'  # 'first' or 'last'
intent_filter = None    # e.g. 'Comparative'; None keeps every intent

# Apply the controls
filtered = df[df['Intent_type'] == intent_filter] if intent_filter else df
selected = filtered.head(num_sessions) if session_mode == 'first' else filtered.tail(num_sessions)

# Utility matrix keyed by (y_pred, y_true)
utility_dict = {
    (1, 1): 3,       # TP
    (1, 0): -1,      # FP
    (0, 1): -2.5,    # FN
    (0, 0): 1e-6,    # TN
}

# Event code -> colour
event_color_map = {
    1: '#1f77b4',  # Browse
    2: '#ff7f0e',  # Detail
    3: '#2ca02c',  # Add
    4: '#d62728',  # Remove
    5: '#9467bd',  # Purchase (normally absent from truncated_sequence)
}

# Event code -> display name
event_name_map = {
    1: 'Browse',
    2: 'Detail',
    3: 'Add',
    4: 'Remove',
    5: 'Purchase',
}

# Outcome code -> colour
outcome_color_map = {
    'TP': '#6a3d9a',
    'TN': '#aaaaaa',
    'FP': '#e41a1c',
    'FN': '#ff7f00',
}

# (y_pred, y_true) -> outcome code
outcome_by_pair = {(1, 1): 'TP', (0, 0): 'TN', (1, 0): 'FP', (0, 1): 'FN'}

records = []
for _, row in selected.iterrows():
    seq = list(row['truncated_sequence'])[-5:]  # at most the last 5 steps
    seq_len = len(seq)

    # Read the last seq_len SHAP columns (ending at SHAP_20); a missing or
    # NaN column contributes 0.0. The range always yields exactly seq_len
    # entries, so no further length adjustment is needed.
    shap_values = []
    for col_no in range(21 - seq_len, 21):
        shap_col = f'SHAP_{col_no}'
        usable = shap_col in row and pd.notna(row[shap_col])
        shap_values.append(row[shap_col] if usable else 0.0)

    y_pred = int(row['y_pred'])
    y_true = int(row['purchase'])
    intent = row['Intent_type']

    pair = (y_pred, y_true)
    outcome = outcome_by_pair[pair]
    utility = utility_dict[pair]

    records.append(dict(
        sequence=seq,
        shap=shap_values,
        y_pred=y_pred,
        y_true=y_true,
        outcome=outcome,
        utility=utility,
        intent=intent,
    ))


# Sankey nodes & links
separate_mode = True  # True = one node chain per session, False = nodes shared across sessions

# Utilities smaller than this in magnitude are coloured as neutral. The
# utility matrix encodes the TN utility as 1e-6, so a plain `> 0` test would
# paint TN links as positive and never reach the neutral colour.
ZERO_UTILITY_TOL = 1e-3

nodes = []
node_labels = []
node_colors = []
node_idx = {}

# Outcome and utility nodes are shared by every session in both modes
for out in ['TP', 'TN', 'FP', 'FN']:
    node_name = f'OUT_{out}'
    nodes.append(node_name)
    node_labels.append(out)
    node_colors.append(outcome_color_map[out])
    node_idx[node_name] = len(nodes) - 1

nodes.append('UTILITY')
node_labels.append('Utility')
node_colors.append('#FFD700')
node_idx['UTILITY'] = len(nodes) - 1

sources = []
targets = []
values = []
colors = []
link_labels = []

for sidx, record in enumerate(records):
    prev = None
    for pos, (event, shap) in enumerate(zip(record['sequence'], record['shap'])):
        event = int(event)
        if separate_mode:
            curr_node = f'{sidx}_{pos}_{event}'
            curr_label = f'S{sidx}-{pos}-{event_name_map.get(event,"PAD")}'
        else:
            curr_node = f'{pos}_{event}'
            curr_label = event_name_map.get(event, 'PAD')

        if curr_node not in node_idx:
            nodes.append(curr_node)
            node_labels.append(curr_label)
            node_colors.append(event_color_map.get(event, '#cccccc'))
            node_idx[curr_node] = len(nodes) - 1

        if prev is not None:
            sources.append(node_idx[prev])
            targets.append(node_idx[curr_node])
            # +0.1 keeps links with a near-zero SHAP value visible
            values.append(abs(shap) + 0.1)
            # Link colour encodes the SHAP sign: gray for >= 0, pink for < 0
            colors.append('#888888' if shap >= 0 else '#ffb6c1')
            link_labels.append(f'SHAP: {shap:.2f}')
        prev = curr_node

    if prev is None:
        # Empty sequence: draw neither the outcome nor the utility link
        continue

    # Last step -> outcome
    outcome_node = f'OUT_{record["outcome"]}'
    sources.append(node_idx[prev])
    targets.append(node_idx[outcome_node])
    values.append(1.5)
    colors.append(outcome_color_map[record['outcome']])
    link_labels.append(record['outcome'])

    # Outcome -> utility: green / red / gray for positive / negative / ~zero
    utility = record['utility']
    sources.append(node_idx[outcome_node])
    targets.append(node_idx['UTILITY'])
    values.append(abs(utility))
    if abs(utility) < ZERO_UTILITY_TOL:
        colors.append('#aaaaaa')
    elif utility > 0:
        colors.append('#27ae60')
    else:
        colors.append('#e41a1c')
    link_labels.append(f'${record["utility"]}')


# Show the summed utility on the UTILITY node. It must be addressed through
# node_idx: step nodes are appended after it, so [-1] is a step node.
total_utility = sum([r['utility'] for r in records])
node_labels[node_idx['UTILITY']] = f'Utility: {total_utility:.2f}'

# Render the Sankey diagram
sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=node_labels,
    color=node_colors,
)
sankey_links = dict(
    source=sources,
    target=targets,
    value=values,
    color=colors,
    label=link_labels,
)
fig = go.Figure(go.Sankey(arrangement="snap", node=sankey_nodes, link=sankey_links))

# The title reflects the node-sharing mode in use
fig.update_layout(
    title_text=f"SankeyX Clickstream Visualization ({'Separate' if separate_mode else 'Grouped'} Mode)",
    font_size=12,
    width=1400,
    height=700,
)
fig.show()


In [69]:
import pandas as pd
import plotly.graph_objects as go

# 1. Load the sampled sessions together with their SHAP columns
df = pd.read_csv('shap_sampled_200.csv')

# 2. 預處理行為序列欄位
import ast
def safe_eval(val):
    """Parse ``val`` as a Python literal; return it unchanged if that fails.

    Only the failure modes documented for ``ast.literal_eval`` are caught, so
    KeyboardInterrupt and SystemExit are no longer swallowed the way a bare
    ``except:`` would swallow them.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a literal string (e.g. already a list, NaN, or free text)
        return val

# Parse the stringified sequences back into Python objects
df['truncated_sequence'] = df['truncated_sequence'].apply(safe_eval)

# 3. Display controls
num_sessions = 5        # how many sessions to draw
session_mode = 'first'  # 'first' or 'last'
intent_filter = None    # intent to keep, e.g. 'Comparative'; None keeps all

# Apply the controls
filtered = df[df['Intent_type'] == intent_filter] if intent_filter else df
selected = filtered.head(num_sessions) if session_mode == 'first' else filtered.tail(num_sessions)

# 4. Utility matrix keyed by (y_pred, y_true)
utility_dict = {
    (1, 1): 3,       # TP
    (1, 0): -1,      # FP
    (0, 1): -2.5,    # FN
    (0, 0): 1e-6,    # TN
}

# 5. Colour and name lookups
event_color_map = {
    1: '#1f77b4',  # Browse
    2: '#ff7f0e',  # Detail
    3: '#2ca02c',  # Add
    4: '#d62728',  # Remove
    5: '#9467bd',  # Purchase (normally absent from truncated_sequence)
}

event_name_map = {
    1: 'Browse',
    2: 'Detail',
    3: 'Add',
    4: 'Remove',
    5: 'Purchase',
}

outcome_color_map = {
    'TP': '#6a3d9a',
    'TN': '#aaaaaa',
    'FP': '#e41a1c',
    'FN': '#ff7f00',
}

# (y_pred, y_true) -> outcome code
outcome_by_pair = {(1, 1): 'TP', (0, 0): 'TN', (1, 0): 'FP', (0, 1): 'FN'}

# 6. Assemble one record per selected session (no pre-filtering on length)
records = []
for _, row in selected.iterrows():
    seq = list(row['truncated_sequence'])[-5:]  # at most the last 5 steps
    seq_len = len(seq)
    # Columns SHAP_1 .. SHAP_20 are assumed; a missing column reads as 0.0
    all_shap_values = [row.get(f'SHAP_{i}', 0.0) for i in range(1, 21)]
    # Take the trailing seq_len values and scale them by 10
    tail_shap = all_shap_values[-seq_len:] if seq_len > 0 else []
    shap_values = [v * 10 for v in tail_shap]
    y_pred = int(row['y_pred'])
    y_true = int(row['purchase'])
    intent = row['Intent_type']

    pair = (y_pred, y_true)
    outcome = outcome_by_pair[pair]
    utility = utility_dict[pair]

    records.append(dict(
        sequence=seq,
        shap=shap_values,
        y_pred=y_pred,
        y_true=y_true,
        outcome=outcome,
        utility=utility,
        intent=intent,
    ))


# 7. Sankey nodes & links
separate_mode = False  # True = one node chain per session, False = nodes shared across sessions

# Utilities smaller than this in magnitude are coloured as neutral. The
# utility matrix encodes the TN utility as 1e-6, so a plain `> 0` test would
# paint TN links as positive and never reach the neutral colour.
ZERO_UTILITY_TOL = 1e-3

nodes = []
node_labels = []
node_colors = []
node_idx = {}

# Outcome and utility nodes are shared by every session
for out in ['TP', 'TN', 'FP', 'FN']:
    node_name = f'OUT_{out}'
    nodes.append(node_name)
    node_labels.append(out)
    node_colors.append(outcome_color_map[out])
    node_idx[node_name] = len(nodes) - 1

nodes.append('UTILITY')
node_labels.append('Utility')
node_colors.append('#FFD700')
node_idx['UTILITY'] = len(nodes) - 1


sources = []
targets = []
values = []
colors = []
link_labels = []

for sidx, record in enumerate(records):
    # Drop padding steps (event 0) together with their SHAP values. A
    # dedicated name is used so the `filtered` DataFrame built during session
    # selection is not overwritten.
    steps = [(e, s) for e, s in zip(record['sequence'], record['shap']) if e != 0]
    if not steps:
        continue

    prev = None
    for step_idx, (event, shap) in enumerate(steps):
        if separate_mode:
            curr_node = f'{sidx}_step{step_idx}'        # one chain per session
        else:
            curr_node = f'group_{step_idx}_{event}'     # same step + same event merge
        if curr_node not in node_idx:
            nodes.append(curr_node)
            node_labels.append('')  # step nodes carry no label
            node_colors.append(event_color_map.get(event, '#cccccc'))
            node_idx[curr_node] = len(nodes) - 1
        if prev is not None:
            sources.append(node_idx[prev])
            targets.append(node_idx[curr_node])
            # +0.1 keeps links with a near-zero SHAP value visible
            values.append(abs(shap) + 0.1)
            colors.append('#888888' if shap >= 0 else '#ffb6c1')
            link_labels.append('')
        prev = curr_node

    # Last step -> outcome, sized and coloured by the last step's SHAP value
    outcome_node = f'OUT_{record["outcome"]}'
    last_shap = steps[-1][1]
    sources.append(node_idx[prev])
    targets.append(node_idx[outcome_node])
    values.append(abs(last_shap) + 0.1)
    colors.append('#888888' if last_shap >= 0 else '#ffb6c1')
    link_labels.append('')

    # Outcome -> utility: green / red / gray for positive / negative / ~zero
    utility = record['utility']
    sources.append(node_idx[outcome_node])
    targets.append(node_idx['UTILITY'])
    values.append(abs(utility))
    if abs(utility) < ZERO_UTILITY_TOL:
        colors.append('#aaaaaa')
    elif utility > 0:
        colors.append('#27ae60')
    else:
        colors.append('#e41a1c')
    link_labels.append('')


# Show the summed utility of all records on the UTILITY node
total_utility = sum([r['utility'] for r in records])
node_labels[node_idx['UTILITY']] = f'Utility: {total_utility:.2f}'

# Render the Sankey diagram
sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=node_labels,
    color=node_colors,
)
sankey_links = dict(
    source=sources,
    target=targets,
    value=values,
    color=colors,
    label=link_labels,
)
fig = go.Figure(go.Sankey(arrangement="snap", node=sankey_nodes, link=sankey_links))

# The title reflects the node-sharing mode in use
fig.update_layout(
    title_text=f"SankeyX Clickstream Visualization ({'Separate' if separate_mode else 'Grouped'} Mode)",
    font_size=12,
    width=1400,
    height=700,
)
fig.show()


In [65]:
import pandas as pd
import plotly.graph_objects as go

# 1. Load the sampled sessions together with their SHAP columns
df = pd.read_csv('shap_sampled_200.csv')

# 2. 預處理行為序列欄位
import ast
def safe_eval(val):
    """Parse ``val`` as a Python literal; return it unchanged if that fails.

    Only the failure modes documented for ``ast.literal_eval`` are caught, so
    KeyboardInterrupt and SystemExit are no longer swallowed the way a bare
    ``except:`` would swallow them.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a literal string (e.g. already a list, NaN, or free text)
        return val

# Parse the stringified sequences back into Python objects
df['truncated_sequence'] = df['truncated_sequence'].apply(safe_eval)

# 3. Display controls
num_sessions = 5        # how many sessions to draw
session_mode = 'first'  # 'first' or 'last'
intent_filter = None    # intent to keep, e.g. 'Comparative'; None keeps all

# Apply the controls
filtered = df[df['Intent_type'] == intent_filter] if intent_filter else df
selected = filtered.head(num_sessions) if session_mode == 'first' else filtered.tail(num_sessions)

# 4. Utility matrix keyed by (y_pred, y_true)
utility_dict = {
    (1, 1): 3,       # TP
    (1, 0): -1,      # FP
    (0, 1): -2.5,    # FN
    (0, 0): 1e-6,    # TN
}

# 5. Colour and name lookups
event_color_map = {
    1: '#1f77b4',  # Browse
    2: '#ff7f0e',  # Detail
    3: '#2ca02c',  # Add
    4: '#d62728',  # Remove
    5: '#9467bd',  # Purchase (normally absent from truncated_sequence)
}

event_name_map = {
    1: 'Browse',
    2: 'Detail',
    3: 'Add',
    4: 'Remove',
    5: 'Purchase',
}

outcome_color_map = {
    'TP': '#6a3d9a',
    'TN': '#aaaaaa',
    'FP': '#e41a1c',
    'FN': '#ff7f00',
}

# (y_pred, y_true) -> outcome code
outcome_by_pair = {(1, 1): 'TP', (0, 0): 'TN', (1, 0): 'FP', (0, 1): 'FN'}

# 6. Assemble one record per selected session (no pre-filtering on length)
records = []
for _, row in selected.iterrows():
    seq = list(row['truncated_sequence'])[-5:]  # at most the last 5 steps
    seq_len = len(seq)
    # Columns SHAP_1 .. SHAP_20 are assumed; a missing column reads as 0.0
    all_shap_values = [row.get(f'SHAP_{i}', 0.0) for i in range(1, 21)]
    # Take the trailing seq_len values; the scale factor is 1 in this cell
    tail_shap = all_shap_values[-seq_len:] if seq_len > 0 else []
    shap_values = [v * 1 for v in tail_shap]
    y_pred = int(row['y_pred'])
    y_true = int(row['purchase'])
    intent = row['Intent_type']

    pair = (y_pred, y_true)
    outcome = outcome_by_pair[pair]
    utility = utility_dict[pair]

    records.append(dict(
        sequence=seq,
        shap=shap_values,
        y_pred=y_pred,
        y_true=y_true,
        outcome=outcome,
        utility=utility,
        intent=intent,
    ))


# 7. Sankey nodes & links
separate_mode = False  # True = one node chain per session, False = nodes shared across sessions

# Utilities smaller than this in magnitude are coloured as neutral. The
# utility matrix encodes the TN utility as 1e-6, so a plain `> 0` test would
# paint TN links as positive and never reach the neutral colour.
ZERO_UTILITY_TOL = 1e-3

nodes = []
node_labels = []
node_colors = []
node_idx = {}

# Outcome and utility nodes are shared by every session
for out in ['TP', 'TN', 'FP', 'FN']:
    node_name = f'OUT_{out}'
    nodes.append(node_name)
    node_labels.append(out)
    node_colors.append(outcome_color_map[out])
    node_idx[node_name] = len(nodes) - 1

nodes.append('UTILITY')
node_labels.append('Utility')
node_colors.append('#FFD700')
node_idx['UTILITY'] = len(nodes) - 1

sources = []
targets = []
values = []
colors = []
link_labels = []

for sidx, record in enumerate(records):
    # Drop padding steps (event 0) together with their SHAP values. A
    # dedicated name is used so the `filtered` DataFrame built during session
    # selection is not overwritten.
    steps = [(e, s) for e, s in zip(record['sequence'], record['shap']) if e != 0]
    if not steps:
        continue

    prev = None
    for step_idx, (event, shap) in enumerate(steps):
        if separate_mode:
            curr_node = f'{sidx}_step{step_idx}'        # one chain per session
        else:
            curr_node = f'group_{step_idx}_{event}'     # same step + same event merge
        if curr_node not in node_idx:
            nodes.append(curr_node)
            node_labels.append('')  # step nodes carry no label
            node_colors.append(event_color_map.get(event, '#cccccc'))
            node_idx[curr_node] = len(nodes) - 1
        if prev is not None:
            sources.append(node_idx[prev])
            targets.append(node_idx[curr_node])
            # +0.1 keeps links with a near-zero SHAP value visible
            values.append(abs(shap) + 0.1)
            colors.append('#888888' if shap >= 0 else '#ffb6c1')
            link_labels.append('')
        prev = curr_node

    # Last step -> outcome, sized and coloured by the last step's SHAP value
    outcome_node = f'OUT_{record["outcome"]}'
    last_shap = steps[-1][1]
    sources.append(node_idx[prev])
    targets.append(node_idx[outcome_node])
    values.append(abs(last_shap) + 0.1)
    colors.append('#888888' if last_shap >= 0 else '#ffb6c1')
    link_labels.append('')

    # Outcome -> utility: green / red / gray for positive / negative / ~zero
    utility = record['utility']
    sources.append(node_idx[outcome_node])
    targets.append(node_idx['UTILITY'])
    values.append(abs(utility))
    if abs(utility) < ZERO_UTILITY_TOL:
        colors.append('#aaaaaa')
    elif utility > 0:
        colors.append('#27ae60')
    else:
        colors.append('#e41a1c')
    link_labels.append('')


# Show the summed utility of all records on the UTILITY node
total_utility = sum([r['utility'] for r in records])
node_labels[node_idx['UTILITY']] = f'Utility: {total_utility:.2f}'

# Only outcome nodes and the utility node keep a visible label
show_label_node_names = {'TP', 'TN', 'FP', 'FN', 'Utility: %.2f' % total_utility, 'Utility'}
custom_node_labels = [
    lbl if (lbl in show_label_node_names or lbl.startswith('Utility')) else ''
    for lbl in node_labels
]

# Render the Sankey diagram; every link label is blanked out
sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=custom_node_labels,
    color=node_colors,
)
sankey_links = dict(
    source=sources,
    target=targets,
    value=values,
    color=colors,
    label=[''] * len(link_labels),
)
fig = go.Figure(go.Sankey(arrangement="snap", node=sankey_nodes, link=sankey_links))

# Hand-drawn legend entries: (text, swatch colour)
legend_items = [
    # events
    ("Browse", "#1f77b4"),
    ("Detail", "#ff7f0e"),
    ("Add", "#2ca02c"),
    ("Remove", "#d62728"),
    # SHAP sign
    ("SHAP Positive", "#888888"),
    ("SHAP Negative", "#ffb6c1"),
    # outcomes
    ("TP", "#6a3d9a"),
    ("TN", "#aaaaaa"),
    ("FP", "#e41a1c"),
    ("FN", "#ff7f00"),
    # utility sign
    ("Utility+", "#27ae60"),
    ("Utility-", "#e41a1c"),
]
legend_x = 1.11
legend_y_start = 0.98
legend_dy = 0.062

# Entries that open a new legend group; each one shifts all later rows down
group_openers = {"SHAP Positive", "TP", "Utility+"}

for i, (text, color) in enumerate(legend_items):
    if text in group_openers:
        legend_y_start -= 0.01
    # Colour swatch
    fig.add_shape(
        type="rect",
        xref="paper", yref="paper",
        x0=legend_x, x1=legend_x + 0.028,
        y0=legend_y_start - legend_dy * i - 0.022, y1=legend_y_start - legend_dy * i + 0.022,
        fillcolor=color,
        line=dict(width=1, color="#111"),
        layer='above'
    )
    # Caption to the right of the swatch
    fig.add_annotation(
        x=legend_x + 0.045, y=legend_y_start - legend_dy * i,
        xref="paper", yref="paper",
        showarrow=False,
        font=dict(size=15),
        text=text,
        align='left',
        bgcolor="white"
    )

# The wide right margin leaves room for the legend drawn outside the plot area
fig.update_layout(
    title_text="SankeyX Clickstream Visualization (Only TP/TN/FP/FN/Utility Show Labels)",
    font_size=13,
    width=1400,
    height=700,
    plot_bgcolor="white",
    paper_bgcolor="white",
    margin=dict(l=40, r=270, t=70, b=40),
)
fig.show()



In [88]:
import pandas as pd
import plotly.graph_objects as go

# 1. Load the data
df = pd.read_csv('shap_sampled_200.csv')

# 2. 預處理行為序列欄位
import ast
def safe_eval(val):
    """Parse a Python-literal string, falling back to the input on failure.

    Args:
        val: Usually the string form of a literal (e.g. ``"[1, 2, 3]"``).
            Values that are not parseable literal strings are accepted too.

    Returns:
        The object produced by ``ast.literal_eval(val)``, or ``val`` itself
        unchanged when it cannot be parsed as a literal.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the exceptions ast.literal_eval documents for malformed
        # input. Anything else (KeyboardInterrupt, SystemExit, genuine bugs)
        # is deliberately allowed to propagate instead of being swallowed.
        return val

# Turn each stored sequence string back into a Python object (values that
# cannot be parsed are kept as they are by safe_eval).
df['truncated_sequence'] = df['truncated_sequence'].apply(safe_eval)

# 3. Display controls
num_sessions = 10        # how many sessions to draw
session_mode = 'first'   # 'first' takes the head of the table, anything else the tail
intent_filter = None     # intent to keep, e.g. 'Comparative'; a falsy value keeps every row

# Optional intent filter, then pick the first/last num_sessions rows.
filtered = df[df['Intent_type'] == intent_filter] if intent_filter else df
selected = (
    filtered.head(num_sessions)
    if session_mode == 'first'
    else filtered.tail(num_sessions)
)

# 4. Utility matrix, keyed by (y_pred, y_true)
utility_dict = {
    (1, 1): 3,    # TP
    (1, 0): -1,   # FP
    (0, 1): -2.5, # FN
    # NOTE(review): TN uses a tiny positive value instead of 0 — presumably so
    # the TN -> Utility link keeps a non-zero width; confirm.
    (0, 0): 1e-6  # TN
}

# 5. Color mappings (tuned palette)
# Event code -> node color.
event_color_map = {
    1: '#1f77b4',    # Browse (dark blue)
    2: '#ffbe0b',    # Detail (bright yellow-orange)
    3: '#43aa8b',    # Add (teal green)
    4: '#fb8500',    # Remove (dark orange, kept distinct from the FP red)
    5: '#3a86ff',    # Purchase (bright blue)
}
# Event code -> display name. Not referenced elsewhere in this cell.
event_name_map = {
    1: 'Browse',
    2: 'Detail',
    3: 'Add',
    4: 'Remove',
    5: 'Purchase'
}
# Outcome label -> node color.
outcome_color_map = {
    'TP':   '#06d6a0',  # green
    'TN':   '#118ab2',  # aqua blue
    'FP':   '#FF6B6B',  # vivid red
    'FN':   '#ffd166',  # golden yellow
}
# Intent type -> color of the intent group node.
intent_color_map = {
    "Hesitant Buyer":         "#ffe066",  # bright yellow
    "Comparative Buyer":      "#6c63ff",  # blue-violet
    "Unclassified":           "#adb5bd",  # light gray
    "Exploratory Buyer":      "#A3F7BF",  # pale green
    "Intermittent Revisitor": "#9bf6ff",  # cyan
    "Engaged Buyer":          "#ffb4a2",  # orange-pink
    "Uncertain Buyer":        "#b983ff",  # purple
}
utility_node_color = "#457b9d"  # dark blue; avoids clashing with the FP and Remove colors

# 6. Assemble one record per selected session
# (y_pred, y_true) -> confusion-matrix label.
outcome_by_pair = {
    (1, 1): 'TP',
    (0, 0): 'TN',
    (1, 0): 'FP',
    (0, 1): 'FN',
}

records = []
for _, session in selected.iterrows():
    # Only the last five events of the session are drawn.
    events = list(session['truncated_sequence'])[-5:]
    n_events = len(events)

    # Columns SHAP_1 .. SHAP_20 (a missing column counts as 0.0); keep the
    # trailing n_events values and scale them by 10.
    shap_columns = [session.get(f'SHAP_{k}', 0.0) for k in range(1, 21)]
    if n_events > 0:
        recent_shap = shap_columns[-n_events:]
    else:
        # Guard needed: a [-0:] slice would return the whole list.
        recent_shap = []
    scaled_shap = [v * 10 for v in recent_shap]

    y_pred = int(session['y_pred'])
    y_true = int(session['purchase'])
    intent = session['Intent_type']

    # Both lookups raise KeyError for a pair outside {0, 1} x {0, 1}.
    outcome = outcome_by_pair[(y_pred, y_true)]
    utility = utility_dict[(y_pred, y_true)]

    records.append({
        'sequence': events,
        'shap': scaled_shap,
        'y_pred': y_pred,
        'y_true': y_true,
        'outcome': outcome,
        'utility': utility,
        'intent': intent,
    })

# 7. Sankey nodes & links
# True: every session gets its own node per step.
# False: sessions share one node per (step index, event code).
separate_mode = True

# Parallel node lists; node_idx maps an internal node key to its position.
nodes = []
node_labels = []
node_colors = []
node_idx = {}

# Parallel link lists consumed by go.Sankey.
sources = []
targets = []
values = []
colors = []
link_labels = []

SHAP_POSITIVE_COLOR = '#888888'
SHAP_NEGATIVE_COLOR = '#ffb6c1'


def _ensure_node(key, label, color):
    """Register node `key` on first use and return its index."""
    if key not in node_idx:
        node_idx[key] = len(nodes)
        nodes.append(key)
        node_labels.append(label)
        node_colors.append(color)
    return node_idx[key]


def _add_link(source, target, value, color):
    """Append one link (with an empty label) to the parallel link lists."""
    sources.append(source)
    targets.append(target)
    values.append(value)
    colors.append(color)
    link_labels.append('')


def _shap_color(shap):
    """Return the flow color for a SHAP value (>= 0 counts as positive)."""
    return SHAP_POSITIVE_COLOR if shap >= 0 else SHAP_NEGATIVE_COLOR


# Fixed outcome and utility nodes come first so their indices are stable.
for out in ['TP', 'TN', 'FP', 'FN']:
    _ensure_node(f'OUT_{out}', out, outcome_color_map[out])
_ensure_node('UTILITY', 'Utility', utility_node_color)

for sidx, record in enumerate(records):
    # Keep only steps whose event code is non-zero (0 is presumably padding —
    # TODO confirm). A separate name is used so the `filtered` DataFrame from
    # the selection step is not overwritten.
    steps = [
        (event, shap)
        for event, shap in zip(record['sequence'], record['shap'])
        if event != 0
    ]
    if not steps:
        continue

    # One shared group node per intent type.
    intent = record['intent']
    intent_color = intent_color_map.get(intent, '#ffe066')
    intent_node = _ensure_node(f'GROUP_INTENT_{intent}', intent, intent_color)

    prev_node = None
    for step_idx, (event, shap) in enumerate(steps):
        if separate_mode:
            step_key = f'{sidx}_step{step_idx}'
        else:
            step_key = f'group_{step_idx}_{event}'
        curr_node = _ensure_node(step_key, '', event_color_map.get(event, '#cccccc'))

        if prev_node is None:
            # Intent group node -> first event (fixed width 1).
            _add_link(intent_node, curr_node, 1, intent_color)
        else:
            # Event -> next event; width and color use the SHAP of the
            # *target* step, so the first step's SHAP is never drawn.
            # NOTE(review): confirm this is intended rather than an off-by-one.
            _add_link(prev_node, curr_node, abs(shap) + 0.1, _shap_color(shap))
        prev_node = curr_node

    # Last event -> outcome. `steps` is non-empty here, so prev_node is set.
    last_shap = steps[-1][1]
    outcome_node = node_idx[f'OUT_{record["outcome"]}']
    _add_link(prev_node, outcome_node, abs(last_shap) + 0.1, _shap_color(last_shap))

    # Outcome -> utility, colored by the sign of the utility.
    utility = record['utility']
    if utility > 0:
        utility_color = '#27ae60'
    elif utility < 0:
        utility_color = '#e63946'
    else:
        utility_color = '#bfc0c0'
    _add_link(outcome_node, node_idx['UTILITY'], abs(utility), utility_color)

# Total utility shown on the Utility node.
# NOTE(review): this sums over all records, including any skipped above for
# having no drawable steps.
total_utility = sum(r['utility'] for r in records)
node_labels[node_idx['UTILITY']] = f'Utility: {total_utility:.2f}'

# Render the Sankey diagram from the node and link lists built above.
sankey_node_spec = {
    "pad": 15,
    "thickness": 20,
    "line": {"color": "black", "width": 0.5},
    "label": node_labels,
    "color": node_colors,
}
sankey_link_spec = {
    "source": sources,
    "target": targets,
    "value": values,
    "color": colors,
    "label": link_labels,
}
fig = go.Figure(
    go.Sankey(arrangement="snap", node=sankey_node_spec, link=sankey_link_spec)
)

layout_options = {
    "title_text": "SankeyX Clickstream Visualization (Intent Grouped, Event Grouped Before Outcome)",
    "font_size": 12,
    "width": 1400,
    "height": 700,
}
fig.update_layout(**layout_options)
fig.show()


In [104]:
import pandas as pd
import plotly.graph_objects as go

# 1. Load the data
df = pd.read_csv('shap_sampled_200.csv')

# 2. 預處理行為序列欄位
import ast
def safe_eval(val):
    """Parse a Python-literal string, falling back to the input on failure.

    Args:
        val: Usually the string form of a literal (e.g. ``"[1, 2, 3]"``).
            Values that are not parseable literal strings are accepted too.

    Returns:
        The object produced by ``ast.literal_eval(val)``, or ``val`` itself
        unchanged when it cannot be parsed as a literal.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the exceptions ast.literal_eval documents for malformed
        # input. Anything else (KeyboardInterrupt, SystemExit, genuine bugs)
        # is deliberately allowed to propagate instead of being swallowed.
        return val

# Turn each stored sequence string back into a Python object (values that
# cannot be parsed are kept as they are by safe_eval).
df['truncated_sequence'] = df['truncated_sequence'].apply(safe_eval)

# 3. Display controls
num_sessions = 20        # how many sessions to draw
session_mode = 'first'   # 'first' takes the head of the table, anything else the tail
intent_filter = None     # intent to keep; a falsy value keeps every row

# Optional intent filter, then pick the first/last num_sessions rows.
filtered = df[df['Intent_type'] == intent_filter] if intent_filter else df
selected = (
    filtered.head(num_sessions)
    if session_mode == 'first'
    else filtered.tail(num_sessions)
)

# 4. Utility matrix, keyed by (y_pred, y_true)
utility_dict = {
    (1, 1): 3,    # TP
    (1, 0): -1,   # FP
    (0, 1): -2.5, # FN
    # NOTE(review): TN uses a tiny positive value instead of 0 — presumably so
    # the TN -> Utility link keeps a non-zero width; confirm.
    (0, 0): 1e-6  # TN
}

# 5. Color mappings
# Event code -> node color.
event_color_map = {
    1: '#1f77b4',    # Browse
    2: '#ffbe0b',    # Detail
    3: '#43aa8b',    # Add
    4: '#fb8500',    # Remove
    5: '#3a86ff',    # Purchase
}
# Outcome label -> node color.
outcome_color_map = {
    'TP':   '#06d6a0',
    'TN':   '#118ab2',
    'FP':   '#FF6B6B',
    'FN':   '#ffd166',
}
# Intent type -> color of the intent group node.
intent_color_map = {
    "Hesitant Buyer":         "#ffe066",
    "Comparative Buyer":      "#6c63ff",
    "Unclassified":           "#adb5bd",
    "Exploratory Buyer":      "#A3F7BF",
    "Intermittent Revisitor": "#9bf6ff",
    "Engaged Buyer":          "#ffb4a2",
    "Uncertain Buyer":        "#b983ff",
}
utility_node_color = "#457b9d"

# Flow color for the link from an intent node to the first event
intent_flow_color = "#c8d6e5"  # light gray-blue

# 6. Assemble one record per selected session
# (y_pred, y_true) -> confusion-matrix label.
outcome_by_pair = {
    (1, 1): 'TP',
    (0, 0): 'TN',
    (1, 0): 'FP',
    (0, 1): 'FN',
}

records = []
for _, session in selected.iterrows():
    # Only the last five events of the session are drawn.
    events = list(session['truncated_sequence'])[-5:]
    n_events = len(events)

    # Columns SHAP_1 .. SHAP_20 (a missing column counts as 0.0); keep the
    # trailing n_events values and scale them by 10.
    shap_columns = [session.get(f'SHAP_{k}', 0.0) for k in range(1, 21)]
    if n_events > 0:
        recent_shap = shap_columns[-n_events:]
    else:
        # Guard needed: a [-0:] slice would return the whole list.
        recent_shap = []
    scaled_shap = [v * 10 for v in recent_shap]

    y_pred = int(session['y_pred'])
    y_true = int(session['purchase'])
    intent = session['Intent_type']

    # Both lookups raise KeyError for a pair outside {0, 1} x {0, 1}.
    outcome = outcome_by_pair[(y_pred, y_true)]
    utility = utility_dict[(y_pred, y_true)]

    records.append({
        'sequence': events,
        'shap': scaled_shap,
        'y_pred': y_pred,
        'y_true': y_true,
        'outcome': outcome,
        'utility': utility,
        'intent': intent,
    })

# 7. Sankey nodes & links
# True: every session gets its own node per step.
# False: sessions share one node per (step index, event code).
separate_mode = False

# Parallel node lists; node_idx maps an internal node key to its position.
nodes = []
node_labels = []
node_colors = []
node_idx = {}

# Parallel link lists consumed by go.Sankey.
sources = []
targets = []
values = []
colors = []
link_labels = []

SHAP_POSITIVE_COLOR = '#888888'
SHAP_NEGATIVE_COLOR = '#ffb6c1'


def _ensure_node(key, label, color):
    """Register node `key` on first use and return its index."""
    if key not in node_idx:
        node_idx[key] = len(nodes)
        nodes.append(key)
        node_labels.append(label)
        node_colors.append(color)
    return node_idx[key]


def _add_link(source, target, value, color):
    """Append one link (with an empty label) to the parallel link lists."""
    sources.append(source)
    targets.append(target)
    values.append(value)
    colors.append(color)
    link_labels.append('')


def _shap_color(shap):
    """Return the flow color for a SHAP value (>= 0 counts as positive)."""
    return SHAP_POSITIVE_COLOR if shap >= 0 else SHAP_NEGATIVE_COLOR


# Fixed outcome and utility nodes come first so their indices are stable.
for out in ['TP', 'TN', 'FP', 'FN']:
    _ensure_node(f'OUT_{out}', out, outcome_color_map[out])
_ensure_node('UTILITY', 'Utility', utility_node_color)

for sidx, record in enumerate(records):
    # Keep only steps whose event code is non-zero (0 is presumably padding —
    # TODO confirm). A separate name is used so the `filtered` DataFrame from
    # the selection step is not overwritten.
    steps = [
        (event, shap)
        for event, shap in zip(record['sequence'], record['shap'])
        if event != 0
    ]
    if not steps:
        continue

    # One shared group node per intent type.
    intent = record['intent']
    intent_node = _ensure_node(
        f'GROUP_INTENT_{intent}',
        intent,
        intent_color_map.get(intent, '#ffe066'),
    )

    prev_node = None
    for step_idx, (event, shap) in enumerate(steps):
        if separate_mode:
            step_key = f'{sidx}_step{step_idx}'
        else:
            step_key = f'group_{step_idx}_{event}'
        curr_node = _ensure_node(step_key, '', event_color_map.get(event, '#cccccc'))

        if prev_node is None:
            # Intent group node -> first event (fixed width 1); every intent
            # shares the same flow color here.
            _add_link(intent_node, curr_node, 1, intent_flow_color)
        else:
            # Event -> next event; width and color use the SHAP of the
            # *target* step, so the first step's SHAP is never drawn.
            # NOTE(review): confirm this is intended rather than an off-by-one.
            _add_link(prev_node, curr_node, abs(shap) + 0.1, _shap_color(shap))
        prev_node = curr_node

    # Last event -> outcome. `steps` is non-empty here, so prev_node is set.
    last_shap = steps[-1][1]
    outcome_node = node_idx[f'OUT_{record["outcome"]}']
    _add_link(prev_node, outcome_node, abs(last_shap) + 0.1, _shap_color(last_shap))

    # Outcome -> utility, colored by the sign of the utility.
    utility = record['utility']
    if utility > 0:
        utility_color = '#27ae60'
    elif utility < 0:
        utility_color = '#e63946'
    else:
        utility_color = '#bfc0c0'
    _add_link(outcome_node, node_idx['UTILITY'], abs(utility), utility_color)

# Total utility shown on the Utility node.
# NOTE(review): this sums over all records, including any skipped above for
# having no drawable steps.
total_utility = sum(r['utility'] for r in records)
node_labels[node_idx['UTILITY']] = f'Utility: {total_utility:.2f}'
# Render the Sankey diagram from the node and link lists built above.
sankey_node_spec = {
    "pad": 15,
    "thickness": 20,
    "line": {"color": "black", "width": 0.5},
    "label": node_labels,
    "color": node_colors,
}
sankey_link_spec = {
    "source": sources,
    "target": targets,
    "value": values,
    "color": colors,
    "label": link_labels,
}
fig = go.Figure(
    go.Sankey(arrangement="snap", node=sankey_node_spec, link=sankey_link_spec)
)

# ----- The legend must be added after the Sankey figure exists -----
# Clear any shapes/annotations already on the layout before drawing it.
fig.layout.shapes = []
fig.layout.annotations = []

# (caption, swatch color) for each event type, listed top to bottom.
event_legend_items = [
    ("Browse", "#1f77b4"),
    ("Detail", "#ffbe0b"),
    ("Add", "#43aa8b"),
    ("Remove", "#fb8500"),
    ("Purchase", "#3a86ff"),
]

# Legend geometry in paper coordinates (to the right of the plot area).
legend_x = 1.12
legend_y_start = 0.98
legend_box_height = 0.08
legend_box_width = 0.08
legend_dy = 0.17

for row, (caption, swatch) in enumerate(event_legend_items):
    swatch_top = legend_y_start - legend_dy * row
    swatch_bottom = swatch_top - legend_box_height

    # Colored square for this event type.
    fig.add_shape(
        type="rect",
        xref="paper",
        yref="paper",
        x0=legend_x,
        x1=legend_x + legend_box_width,
        y0=swatch_bottom,
        y1=swatch_top,
        fillcolor=swatch,
        line={"width": 1, "color": "#22334b"},
        layer='above',
    )
    # Caption centered just below the square, in Arial.
    fig.add_annotation(
        x=legend_x + legend_box_width / 2,
        y=swatch_bottom - 0.022,
        xref="paper",
        yref="paper",
        showarrow=False,
        font={"size": 16, "family": "Arial", "color": "#22334b"},
        text=caption,
        align='center',
        xanchor='center',
        yanchor='top',
        bgcolor="rgba(255,255,255,0)",
    )

# Global font settings apply to the node labels as well; the wide right
# margin leaves room for the legend.
layout_options = {
    "title_text": "SankeyX Clickstream Visualization",
    "font_size": 15,
    "font_family": "Arial",
    "font_color": "#222222",
    "width": 1400,
    "height": 700,
    "plot_bgcolor": "white",
    "paper_bgcolor": "white",
    "margin": {"l": 40, "r": 240, "t": 70, "b": 40},
}
fig.update_layout(**layout_options)

fig.show()

