In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf


  from cryptography.hazmat.backends import default_backend


In [2]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 判断是否使用TF2.x，如果是则切换到兼容模式
if tf.__version__.startswith('2'):
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

# 固定随机种子
np.random.seed(2024)
tf.set_random_seed(2024)

# ===================== 1. 生成电商样例数据 =====================
def generate_ecommerce_data():
    # 数据参数
    user_num = 10000
    item_num = 5000
    sample_num = 100000
    user_feat_dim = 16  # 用户特征：年龄、性别、购买力等
    item_feat_dim = 16  # 商品特征：品类、价格、评分等
    behavior_feat_dim = 8  # 行为特征：点击频次、浏览时长等

    # 生成连续特征
    user_feats = np.random.normal(0, 1, (sample_num, user_feat_dim))
    item_feats = np.random.normal(0, 1, (sample_num, item_feat_dim))
    behavior_feats = np.random.normal(0, 1, (sample_num, behavior_feat_dim))

    # 生成点击标签（基于用户-商品特征内积 + 噪声）
    logits = np.sum(user_feats * item_feats, axis=1) + np.random.normal(0, 0.5, sample_num)
    labels = (logits > 0).astype(np.float32)

    # 合并特征并划分训练/测试集
    all_feats = np.concatenate([user_feats, item_feats, behavior_feats], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(
        all_feats, labels, test_size=0.2, random_state=2024
    )
    return x_train, x_test, y_train, y_test, all_feats.shape[1]

# ===================== 2. 定义 RankMixer 核心层 =====================
def rank_mixer_layer(inputs, token_num=4, head_num=2, hidden_dim=64):
    batch_size = tf.shape(inputs)[0]
    input_dim = inputs.get_shape().as_list()[-1]

    # Step1: 特征 Token 化（均匀划分）
    token_dim = input_dim // token_num
    tokens = tf.reshape(inputs, [batch_size, token_num, token_dim])  # [B, T, D_t]

    # Step2: 多头 Token 混合（无参跨 Token 交互）
    head_dim = token_dim // head_num
    tokens_split = tf.reshape(tokens, [batch_size, token_num, head_num, head_dim])
    tokens_split = tf.transpose(tokens_split, [0, 2, 1, 3])  # [B, H, T, D_h]

    mixed_tokens = []
    for h in range(head_num):
        head_token = tokens_split[:, h, :, :]
        # 相邻 Token 拼接实现混合
#         head_mixed = tf.concat([head_token[:, 1:, :], head_token[:, :-1, :]], axis=-1)
        head_mixed = head_token
        
        mixed_tokens.append(head_mixed)
    mixed_tokens = tf.concat(mixed_tokens, axis=-1)  # [B, T-1, H*2*D_h]

    # Step3: 逐 Token 前馈网络
    ff_out = tf.layers.dense(mixed_tokens, hidden_dim, activation=tf.nn.relu)
    ff_out = tf.layers.dense(ff_out, token_dim)
    return tf.reduce_mean(ff_out, axis=1)  # Token 维度聚合 [B, D_t]

# ===================== 3. 定义 MHA 核心层 =====================
def mha_layer(inputs, head_num=4, hidden_dim=64):
    batch_size = tf.shape(inputs)[0]
    input_dim = inputs.get_shape().as_list()[-1]

    # 构造 Q/K/V 向量
    q = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)
    k = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)
    v = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)

    # 适配 MHA 3D 输入格式 [B, T, D]，构造 T=1 的序列
    q = tf.expand_dims(q, axis=1)
    k = tf.expand_dims(k, axis=1)
    v = tf.expand_dims(v, axis=1)

    # 自定义多头注意力计算
    def scaled_dot_product_attention(q, k, v):
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_logits = matmul_qk / tf.math.sqrt(dk)
        attention_weights = tf.nn.softmax(scaled_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        return output

    # 分头计算
    q_split = tf.split(q, head_num, axis=-1)
    k_split = tf.split(k, head_num, axis=-1)
    v_split = tf.split(v, head_num, axis=-1)

    outputs = []
    for q_h, k_h, v_h in zip(q_split, k_split, v_split):
        outputs.append(scaled_dot_product_attention(q_h, k_h, v_h))
    concat_output = tf.concat(outputs, axis=-1)
    
    # 全连接层整合
    output = tf.layers.dense(concat_output, hidden_dim)
    return tf.squeeze(output, axis=1)

# ===================== 4. 构建完整模型 =====================
def build_model(inputs, labels, model_type="rankmixer"):
    input_layer = tf.reshape(inputs, [-1, inputs.get_shape().as_list()[-1]])
    # 共享底层特征映射
    hidden = tf.layers.dense(input_layer, 128, activation=tf.nn.relu)
    hidden = tf.layers.dropout(hidden, rate=0.2, training=True)

    # 核心层选择
    if model_type == "rankmixer":
        core_out = rank_mixer_layer(hidden)
    elif model_type == "mha":
        core_out = mha_layer(hidden)
    else:
        raise ValueError("model_type must be 'rankmixer' or 'mha'")

    # 输出层（二分类）
    logits = tf.layers.dense(core_out, 1)
    preds = tf.nn.sigmoid(logits)

    # 损失函数与优化器
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=tf.reshape(labels, [-1, 1])
        )
    )
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    return preds, loss, optimizer

# ===================== 5. 模型训练与评估 =====================
def train_and_evaluate(model_type, x_train, x_test, y_train, y_test, feat_dim):
    graph = tf.Graph()
    with graph.as_default():
        # 定义占位符
        x_ph = tf.placeholder(tf.float32, [None, feat_dim])
        y_ph = tf.placeholder(tf.float32, [None])
        preds, loss, optimizer = build_model(x_ph, y_ph, model_type)
        init = tf.global_variables_initializer()

        # 训练参数
        epochs = 20
        batch_size = 256
        train_steps = len(x_train) // batch_size

        with tf.Session() as sess:
            sess.run(init)
            print("=== Training {} Model ===".format(model_type.upper()))
            for epoch in range(epochs):
                epoch_loss = 0.0
                # 批次训练
                for step in range(train_steps):
                    start_idx = step * batch_size
                    end_idx = start_idx + batch_size
                    batch_x = x_train[start_idx:end_idx]
                    batch_y = y_train[start_idx:end_idx]
                    _, batch_loss_val = sess.run(
                        [optimizer, loss],
                        feed_dict={x_ph: batch_x, y_ph: batch_y}
                    )
                    epoch_loss += batch_loss_val
                # 计算测试集 AUC
                y_pred_val = sess.run(preds, feed_dict={x_ph: x_test, y_ph: y_test})
                auc_val = roc_auc_score(y_test, y_pred_val)
                print("Epoch {}/{}, Avg Loss: {:.4f}, Test AUC: {:.4f}".format(
                    epoch+1, epochs, epoch_loss/train_steps, auc_val
                ))
        return auc_val

# ===================== 6. 主函数执行 =====================
if __name__ == "__main__":
    # 生成数据
    x_train, x_test, y_train, y_test, feat_dim = generate_ecommerce_data()

    # 训练并评估两个模型
    auc_rankmixer = train_and_evaluate("rankmixer", x_train, x_test, y_train, y_test, feat_dim)
    auc_mha = train_and_evaluate("mha", x_train, x_test, y_train, y_test, feat_dim)

    # 结果对比
    print("\n=== Final AUC Comparison ===")
    print("RankMixer AUC: {:.4f}".format(auc_rankmixer))
    print("MHA AUC: {:.4f}".format(auc_mha))
    print("AUC Difference (RankMixer - MHA): {:.4f}".format(auc_rankmixer - auc_mha))


=== Training RANKMIXER Model ===
Epoch 1/20, Avg Loss: 0.6156, Test AUC: 0.8407
Epoch 2/20, Avg Loss: 0.4348, Test AUC: 0.8939
Epoch 3/20, Avg Loss: 0.3836, Test AUC: 0.9109
Epoch 4/20, Avg Loss: 0.3614, Test AUC: 0.9178
Epoch 5/20, Avg Loss: 0.3499, Test AUC: 0.9205
Epoch 6/20, Avg Loss: 0.3403, Test AUC: 0.9240
Epoch 7/20, Avg Loss: 0.3325, Test AUC: 0.9281
Epoch 8/20, Avg Loss: 0.3302, Test AUC: 0.9268
Epoch 9/20, Avg Loss: 0.3214, Test AUC: 0.9315
Epoch 10/20, Avg Loss: 0.3168, Test AUC: 0.9326
Epoch 11/20, Avg Loss: 0.3110, Test AUC: 0.9340
Epoch 12/20, Avg Loss: 0.3088, Test AUC: 0.9324
Epoch 13/20, Avg Loss: 0.3038, Test AUC: 0.9353
Epoch 14/20, Avg Loss: 0.3009, Test AUC: 0.9371
Epoch 15/20, Avg Loss: 0.2951, Test AUC: 0.9365
Epoch 16/20, Avg Loss: 0.2950, Test AUC: 0.9405
Epoch 17/20, Avg Loss: 0.2900, Test AUC: 0.9382
Epoch 18/20, Avg Loss: 0.2855, Test AUC: 0.9405
Epoch 19/20, Avg Loss: 0.2857, Test AUC: 0.9399
Epoch 20/20, Avg Loss: 0.2827, Test AUC: 0.9430
=== Training MHA

### 1.0 简单的token混合

In [3]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 判断是否使用TF2.x，如果是则切换到兼容模式
if tf.__version__.startswith('2'):
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

# 固定随机种子
np.random.seed(2024)
tf.set_random_seed(2024)

# ===================== 1. 生成电商样例数据 =====================
def generate_ecommerce_data():
    # 数据参数
    user_num = 10000
    item_num = 5000
    sample_num = 100000
    user_feat_dim = 16  # 用户特征：年龄、性别、购买力等
    item_feat_dim = 16  # 商品特征：品类、价格、评分等
    behavior_feat_dim = 8  # 行为特征：点击频次、浏览时长等

    # 生成连续特征
    user_feats = np.random.normal(0, 1, (sample_num, user_feat_dim))
    item_feats = np.random.normal(0, 1, (sample_num, item_feat_dim))
    behavior_feats = np.random.normal(0, 1, (sample_num, behavior_feat_dim))

    # 生成点击标签（基于用户-商品特征内积 + 噪声）
    logits = np.sum(user_feats * item_feats, axis=1) + np.random.normal(0, 0.5, sample_num)
    labels = (logits > 0).astype(np.float32)

    # 合并特征并划分训练/测试集
    all_feats = np.concatenate([user_feats, item_feats, behavior_feats], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(
        all_feats, labels, test_size=0.2, random_state=2024
    )
    return x_train, x_test, y_train, y_test, all_feats.shape[1]

# ===================== 2. 定义 RankMixer 核心层 =====================
def rank_mixer_layer(inputs, token_num=4, head_num=2, hidden_dim=64):
    # 打印输入 shape
    inputs_shape = inputs.get_shape().as_list()
    inputs = tf.Print(inputs, [tf.shape(inputs)], message="[rank_mixer_layer] inputs shape: ", summarize=10)
    print("[rank_mixer_layer] inputs static shape:", inputs_shape)
    
    batch_size = tf.shape(inputs)[0]
    input_dim = inputs.get_shape().as_list()[-1]
    print("[rank_mixer_layer] batch_size (dynamic):", batch_size)
    print("[rank_mixer_layer] input_dim:", input_dim)

    # Step1: 特征 Token 化（均匀划分）
    token_dim = input_dim // token_num
    print("[rank_mixer_layer] token_dim:", token_dim)
    tokens = tf.reshape(inputs, [batch_size, token_num, token_dim])  # [B, T, D_t]
    tokens_shape = tokens.get_shape().as_list()
    tokens = tf.Print(tokens, [tf.shape(tokens)], message="[rank_mixer_layer] tokens shape after reshape: ", summarize=10)
    print("[rank_mixer_layer] tokens static shape:", tokens_shape)

    # Step2: 多头 Token 混合（无参跨 Token 交互）
    head_dim = token_dim // head_num
    print("[rank_mixer_layer] head_dim:", head_dim)
    tokens_split = tf.reshape(tokens, [batch_size, token_num, head_num, head_dim])
    tokens_split_shape = tokens_split.get_shape().as_list()
    tokens_split = tf.Print(tokens_split, [tf.shape(tokens_split)], message="[rank_mixer_layer] tokens_split shape after reshape: ", summarize=10)
    print("[rank_mixer_layer] tokens_split static shape:", tokens_split_shape)
    
    tokens_split = tf.transpose(tokens_split, [0, 2, 1, 3])  # [B, H, T, D_h]
    tokens_split_shape_transposed = tokens_split.get_shape().as_list()
    tokens_split = tf.Print(tokens_split, [tf.shape(tokens_split)], message="[rank_mixer_layer] tokens_split shape after transpose: ", summarize=10)
    print("[rank_mixer_layer] tokens_split static shape after transpose:", tokens_split_shape_transposed)

    mixed_tokens = []
    for h in range(head_num):
        head_token = tokens_split[:, h, :, :]
        head_token_shape = head_token.get_shape().as_list()
        head_token = tf.Print(head_token, [tf.shape(head_token)], message="[rank_mixer_layer] head_token[{}] shape: ".format(h), summarize=10)
        print("[rank_mixer_layer] head_token[{}] static shape:".format(h), head_token_shape)
        
#         # 相邻 Token 拼接实现混合
#         head_token_shift1 = head_token[:, 1:, :]
#         head_token_shift2 = head_token[:, :-1, :]
#         head_token_shift1_shape = head_token_shift1.get_shape().as_list()
#         head_token_shift2_shape = head_token_shift2.get_shape().as_list()
#         head_token_shift1 = tf.Print(head_token_shift1, [tf.shape(head_token_shift1)], message="[rank_mixer_layer] head_token[{}] shift1 ([:, 1:, :]) shape: ".format(h), summarize=10)
#         head_token_shift2 = tf.Print(head_token_shift2, [tf.shape(head_token_shift2)], message="[rank_mixer_layer] head_token[{}] shift2 ([:, :-1, :]) shape: ".format(h), summarize=10)
#         print("[rank_mixer_layer] head_token[{}] shift1 static shape:".format(h), head_token_shift1_shape)
#         print("[rank_mixer_layer] head_token[{}] shift2 static shape:".format(h), head_token_shift2_shape)
        
    
#         head_mixed = tf.concat([head_token_shift1, head_token_shift2], axis=-1)
        head_mixed = head_token
        head_mixed_shape = head_mixed.get_shape().as_list()
        head_mixed = tf.Print(head_mixed, [tf.shape(head_mixed)], message="[rank_mixer_layer] head_mixed[{}] shape after concat: ".format(h), summarize=10)
        print("[rank_mixer_layer] head_mixed[{}] static shape:".format(h), head_mixed_shape)
        mixed_tokens.append(head_mixed)
    
    mixed_tokens = tf.concat(mixed_tokens, axis=-1)  # [B, T-1, H*2*D_h]
    mixed_tokens_shape = mixed_tokens.get_shape().as_list()
    mixed_tokens = tf.Print(mixed_tokens, [tf.shape(mixed_tokens)], message="[rank_mixer_layer] mixed_tokens shape after concat all heads: ", summarize=10)
    print("[rank_mixer_layer] mixed_tokens static shape:", mixed_tokens_shape)

    # Step3: 逐 Token 前馈网络
    ff_out = tf.layers.dense(mixed_tokens, hidden_dim, activation=tf.nn.relu)
    ff_out_shape = ff_out.get_shape().as_list()
    ff_out = tf.Print(ff_out, [tf.shape(ff_out)], message="[rank_mixer_layer] ff_out shape after first dense (hidden_dim={}): ".format(hidden_dim), summarize=10)
    print("[rank_mixer_layer] ff_out static shape after first dense:", ff_out_shape)
    
    ff_out = tf.layers.dense(ff_out, token_dim)
    ff_out_shape2 = ff_out.get_shape().as_list()
    ff_out = tf.Print(ff_out, [tf.shape(ff_out)], message="[rank_mixer_layer] ff_out shape after second dense (token_dim={}): ".format(token_dim), summarize=10)
    print("[rank_mixer_layer] ff_out static shape after second dense:", ff_out_shape2)
    
    output = tf.reduce_mean(ff_out, axis=1)  # Token 维度聚合 [B, D_t]
    output_shape = output.get_shape().as_list()
    output = tf.Print(output, [tf.shape(output)], message="[rank_mixer_layer] output shape after reduce_mean: ", summarize=10)
    print("[rank_mixer_layer] output static shape:", output_shape)
    
    return output

# ===================== 3. 定义 MHA 核心层 =====================
def mha_layer(inputs, head_num=4, hidden_dim=64):
    batch_size = tf.shape(inputs)[0]
    input_dim = inputs.get_shape().as_list()[-1]

    # 构造 Q/K/V 向量
    q = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)
    k = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)
    v = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)

    # 适配 MHA 3D 输入格式 [B, T, D]，构造 T=1 的序列
    q = tf.expand_dims(q, axis=1)
    k = tf.expand_dims(k, axis=1)
    v = tf.expand_dims(v, axis=1)

    # 自定义多头注意力计算
    def scaled_dot_product_attention(q, k, v):
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_logits = matmul_qk / tf.math.sqrt(dk)
        attention_weights = tf.nn.softmax(scaled_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        return output

    # 分头计算
    q_split = tf.split(q, head_num, axis=-1)
    k_split = tf.split(k, head_num, axis=-1)
    v_split = tf.split(v, head_num, axis=-1)

    outputs = []
    for q_h, k_h, v_h in zip(q_split, k_split, v_split):
        outputs.append(scaled_dot_product_attention(q_h, k_h, v_h))
    concat_output = tf.concat(outputs, axis=-1)
    
    # 全连接层整合
    output = tf.layers.dense(concat_output, hidden_dim)
    return tf.squeeze(output, axis=1)

# ===================== 4. 构建完整模型 =====================
def build_model(inputs, labels, model_type="rankmixer"):
    input_layer = tf.reshape(inputs, [-1, inputs.get_shape().as_list()[-1]])
    # 共享底层特征映射
    hidden = tf.layers.dense(input_layer, 128, activation=tf.nn.relu)
    hidden = tf.layers.dropout(hidden, rate=0.2, training=True)

    # 核心层选择
    if model_type == "rankmixer":
        core_out = rank_mixer_layer(hidden)
    elif model_type == "mha":
        core_out = mha_layer(hidden)
    else:
        raise ValueError("model_type must be 'rankmixer' or 'mha'")

    # 输出层（二分类）
    logits = tf.layers.dense(core_out, 1)
    preds = tf.nn.sigmoid(logits)

    # 损失函数与优化器
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=tf.reshape(labels, [-1, 1])
        )
    )
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    return preds, loss, optimizer

# ===================== 5. 模型训练与评估 =====================
def train_and_evaluate(model_type, x_train, x_test, y_train, y_test, feat_dim):
    graph = tf.Graph()
    with graph.as_default():
        # 定义占位符
        x_ph = tf.placeholder(tf.float32, [None, feat_dim])
        y_ph = tf.placeholder(tf.float32, [None])
        preds, loss, optimizer = build_model(x_ph, y_ph, model_type)
        init = tf.global_variables_initializer()

        # 训练参数
        epochs = 20
        batch_size = 256
        train_steps = len(x_train) // batch_size

        with tf.Session() as sess:
            sess.run(init)
            print("=== Training {} Model ===".format(model_type.upper()))
            for epoch in range(epochs):
                epoch_loss = 0.0
                # 批次训练
                for step in range(train_steps):
                    start_idx = step * batch_size
                    end_idx = start_idx + batch_size
                    batch_x = x_train[start_idx:end_idx]
                    batch_y = y_train[start_idx:end_idx]
                    _, batch_loss_val = sess.run(
                        [optimizer, loss],
                        feed_dict={x_ph: batch_x, y_ph: batch_y}
                    )
                    epoch_loss += batch_loss_val
                # 计算测试集 AUC
                y_pred_val = sess.run(preds, feed_dict={x_ph: x_test, y_ph: y_test})
                auc_val = roc_auc_score(y_test, y_pred_val)
                print("Epoch {}/{}, Avg Loss: {:.4f}, Test AUC: {:.4f}".format(
                    epoch+1, epochs, epoch_loss/train_steps, auc_val
                ))
        return auc_val

# ===================== 6. 主函数执行 =====================
if __name__ == "__main__":
    # 生成数据
    x_train, x_test, y_train, y_test, feat_dim = generate_ecommerce_data()

    # 训练并评估两个模型
    auc_rankmixer = train_and_evaluate("rankmixer", x_train, x_test, y_train, y_test, feat_dim)
    auc_mha = train_and_evaluate("mha", x_train, x_test, y_train, y_test, feat_dim)

    # 结果对比
    print("\n=== Final AUC Comparison ===")
    print("RankMixer AUC: {:.4f}".format(auc_rankmixer))
    print("MHA AUC: {:.4f}".format(auc_mha))
    print("AUC Difference (RankMixer - MHA): {:.4f}".format(auc_rankmixer - auc_mha))


('[rank_mixer_layer] inputs static shape:', [None, 128])
('[rank_mixer_layer] batch_size (dynamic):', <tf.Tensor 'strided_slice:0' shape=() dtype=int32>)
('[rank_mixer_layer] input_dim:', 128)
('[rank_mixer_layer] token_dim:', 32)
('[rank_mixer_layer] tokens static shape:', [None, 4, 32])
('[rank_mixer_layer] head_dim:', 16)
('[rank_mixer_layer] tokens_split static shape:', [None, 4, 2, 16])
('[rank_mixer_layer] tokens_split static shape after transpose:', [None, 2, 4, 16])
('[rank_mixer_layer] head_token[0] static shape:', [None, 4, 16])
('[rank_mixer_layer] head_mixed[0] static shape:', [None, 4, 16])
('[rank_mixer_layer] head_token[1] static shape:', [None, 4, 16])
('[rank_mixer_layer] head_mixed[1] static shape:', [None, 4, 16])
('[rank_mixer_layer] mixed_tokens static shape:', [None, 4, 32])
('[rank_mixer_layer] ff_out static shape after first dense:', [None, 4, 64])
('[rank_mixer_layer] ff_out static shape after second dense:', [None, 4, 32])
('[rank_mixer_layer] output static sh

### 2.0 增加gate门控

In [14]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 判断是否使用TF2.x，如果是则切换到兼容模式
if tf.__version__.startswith('2'):
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

# 固定随机种子
np.random.seed(2024)
tf.set_random_seed(2024)

    
# ===================== 1. 生成电商样例数据（未修改）====================
def generate_ecommerce_data():
    # 数据参数
    user_num = 10000
    item_num = 5000
    sample_num = 100000
    user_feat_dim = 16  # 用户特征：年龄、性别、购买力等
    item_feat_dim = 16  # 商品特征：品类、价格、评分等
    behavior_feat_dim = 8  # 行为特征：点击频次、浏览时长等

    # 生成连续特征
    user_feats = np.random.normal(0, 1, (sample_num, user_feat_dim))
    item_feats = np.random.normal(0, 1, (sample_num, item_feat_dim))
    behavior_feats = np.random.normal(0, 1, (sample_num, behavior_feat_dim))

    # 生成点击标签（基于用户-商品特征内积 + 噪声）
    logits = np.sum(user_feats * item_feats, axis=1) + np.random.normal(0, 0.5, sample_num)
    labels = (logits > 0).astype(np.float32)

    # 合并特征并划分训练/测试集
    all_feats = np.concatenate([user_feats, item_feats, behavior_feats], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(
        all_feats, labels, test_size=0.2, random_state=2024
    )
    return x_train, x_test, y_train, y_test, all_feats.shape[1]

# ===================== 2. 定义 RankMixer 核心层（修复变量作用域）====================
def rank_mixer_layer(inputs, token_num=4, head_num=2, hidden_dim=64, is_gate=False):
    # 添加顶层变量作用域防止重复
    with tf.variable_scope('rank_mixer', reuse=tf.AUTO_REUSE):
        # 打印输入 shape
        inputs_shape = inputs.get_shape().as_list()
        inputs = tf.Print(inputs, [tf.shape(inputs)], message="[rank_mixer_layer] inputs shape: ", summarize=10)
        print("[rank_mixer_layer] inputs static shape:", inputs_shape)
        
        batch_size = tf.shape(inputs)[0]
        input_dim = inputs.get_shape().as_list()[-1]
        print("[rank_mixer_layer] batch_size (dynamic):", batch_size)
        print("[rank_mixer_layer] input_dim:", input_dim)

        # Step1: 特征 Token 化（均匀划分）
        token_dim = input_dim // token_num
        print("[rank_mixer_layer] token_dim:", token_dim)
        tokens = tf.reshape(inputs, [batch_size, token_num, token_dim])  # [B, T, D_t]
        tokens_shape = tokens.get_shape().as_list()
        tokens = tf.Print(tokens, [tf.shape(tokens)], message="[rank_mixer_layer] tokens shape after reshape: ", summarize=10)
        print("[rank_mixer_layer] tokens static shape:", tokens_shape)

        # Step2: 多头 Token 混合（无参跨 Token 交互）
        head_dim = token_dim // head_num
        print("[rank_mixer_layer] head_dim:", head_dim)
        tokens_split = tf.reshape(tokens, [batch_size, token_num, head_num, head_dim])
        tokens_split_shape = tokens_split.get_shape().as_list()
        tokens_split = tf.Print(tokens_split, [tf.shape(tokens_split)], message="[rank_mixer_layer] tokens_split shape after reshape: ", summarize=10)
        print("[rank_mixer_layer] tokens_split static shape:", tokens_split_shape)
        
        tokens_split = tf.transpose(tokens_split, [0, 2, 1, 3])  # [B, H, T, D_h]
        tokens_split_shape_transposed = tokens_split.get_shape().as_list()
        tokens_split = tf.Print(tokens_split, [tf.shape(tokens_split)], message="[rank_mixer_layer] tokens_split shape after transpose: ", summarize=10)
        print("[rank_mixer_layer] tokens_split static shape after transpose:", tokens_split_shape_transposed)

        mixed_tokens = []
        for h in range(head_num):
            with tf.variable_scope('head_{}_gate'.format(h), reuse=tf.AUTO_REUSE):
                head_token = tokens_split[:, h, :, :]
                head_token_shape = head_token.get_shape().as_list()
                head_token = tf.Print(head_token, [tf.shape(head_token)], 
                                     message="[rank_mixer_layer] head_token[{}] shape: ".format(h), summarize=10)
                head_mixed = head_token
                if is_gate:
                    # 新增门控机制（类似LSTM的Gate）
                    gate = tf.layers.dense(  # 生成Gate权重
                        inputs=head_token,
                        units=head_token.shape[-1],  # 保持维度一致
                        activation=tf.nn.sigmoid,
                        name='gate_dense'  # 固定名称但放在独立作用域下
                    )

                    # 应用门控
                    gated_head_token = head_token * gate
                    gated_head_token = tf.Print(gated_head_token, 
                                               [tf.shape(gated_head_token)], 
                                               message="[rank_mixer_layer] gated_head_token[{}] shape: ".format(h), summarize=10)

                    # 增加跨Token交互（保持原有逻辑）
                    head_mixed = tf.concat([  # 使用拼接混合
                        gated_head_token[:, 1:, :],  # 向前位移1
                        gated_head_token[:, :-1, :],  # 向后位移1
                    ], axis=-1)
                
                head_mixed_shape = head_mixed.get_shape().as_list()
                head_mixed = tf.Print(head_mixed, [tf.shape(head_mixed)], 
                                     message="[rank_mixer_layer] head_mixed[{}] shape after concat: ".format(h), summarize=10)
                mixed_tokens.append(head_mixed)
        
        mixed_tokens = tf.concat(mixed_tokens, axis=-1)  # [B, T-1, H*2*D_h]
        mixed_tokens_shape = mixed_tokens.get_shape().as_list()
        mixed_tokens = tf.Print(mixed_tokens, [tf.shape(mixed_tokens)], 
                              message="[rank_mixer_layer] mixed_tokens shape after concat all heads: ", summarize=10)
        
        # Step3: 逐 Token 前馈网络（保持原有逻辑）
        ff_out = tf.layers.dense(mixed_tokens, hidden_dim, activation=tf.nn.relu)
        ff_out = tf.Print(ff_out, [tf.shape(ff_out)], 
                         message="[rank_mixer_layer] ff_out shape after first dense (hidden_dim={}): ".format(hidden_dim), summarize=10)
        
        ff_out = tf.layers.dense(ff_out, token_dim)
        ff_out = tf.Print(ff_out, [tf.shape(ff_out)], 
                         message="[rank_mixer_layer] ff_out shape after second dense (token_dim={}): ".format(token_dim), summarize=10)
        
        output = tf.reduce_mean(ff_out, axis=1)  # Token 维度聚合 [B, D_t]
        output_shape = output.get_shape().as_list()
        output = tf.Print(output, [tf.shape(output)], 
                         message="[rank_mixer_layer] output shape after reduce_mean: ", summarize=10)
        
        return output

# ===================== 3. 其他层和模型构建（未修改）====================
def mha_layer(inputs, head_num=4, hidden_dim=64):
    batch_size = tf.shape(inputs)[0]
    input_dim = inputs.get_shape().as_list()[-1]

    # 构造 Q/K/V 向量
    q = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)
    k = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)
    v = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)

    # 适配 MHA 3D 输入格式 [B, T, D]，构造 T=1 的序列
    q = tf.expand_dims(q, axis=1)
    k = tf.expand_dims(k, axis=1)
    v = tf.expand_dims(v, axis=1)

    # 自定义多头注意力计算
    def scaled_dot_product_attention(q, k, v):
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_logits = matmul_qk / tf.math.sqrt(dk)
        attention_weights = tf.nn.softmax(scaled_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        return output

    # 分头计算
    q_split = tf.split(q, head_num, axis=-1)
    k_split = tf.split(k, head_num, axis=-1)
    v_split = tf.split(v, head_num, axis=-1)

    outputs = []
    for q_h, k_h, v_h in zip(q_split, k_split, v_split):
        outputs.append(scaled_dot_product_attention(q_h, k_h, v_h))
    concat_output = tf.concat(outputs, axis=-1)
    
    # 全连接层整合
    output = tf.layers.dense(concat_output, hidden_dim)
    return tf.squeeze(output, axis=1)

def build_model(inputs, labels, model_type="rankmixer"):
    input_layer = tf.reshape(inputs, [-1, inputs.get_shape().as_list()[-1]])
    # 共享底层特征映射
    hidden = tf.layers.dense(input_layer, 128, activation=tf.nn.relu)
    hidden = tf.layers.dropout(hidden, rate=0.2, training=True)

    # 核心层选择
    if model_type == "rankmixer":
        core_out = rank_mixer_layer(hidden,token_num=4, head_num=2, hidden_dim=64, is_gate=False)
    elif model_type == "mha":
        core_out = mha_layer(hidden)
    elif model_type == "rankmixer_gate":
        core_out = rank_mixer_layer(hidden,token_num=4, head_num=2, hidden_dim=64, is_gate=True)
    else:
        raise ValueError("model_type must be 'rankmixer' or 'mha'")

    # 输出层（二分类）
    logits = tf.layers.dense(core_out, 1)
    preds = tf.nn.sigmoid(logits)

    # 损失函数与优化器
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=tf.reshape(labels, [-1, 1])
        )
    )
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    return preds, loss, optimizer

def train_and_evaluate(model_type, x_train, x_test, y_train, y_test, feat_dim):
    graph = tf.Graph()
    with graph.as_default():
        # 定义占位符
        x_ph = tf.placeholder(tf.float32, [None, feat_dim])
        y_ph = tf.placeholder(tf.float32, [None])
        preds, loss, optimizer = build_model(x_ph, y_ph, model_type)
        init = tf.global_variables_initializer()

        # 训练参数
        epochs = 20
        batch_size = 256
        train_steps = len(x_train) // batch_size

        with tf.Session() as sess:
            sess.run(init)
            print("=== Training {} Model ===".format(model_type.upper()))
            for epoch in range(epochs):
                epoch_loss = 0.0
                # 批次训练
                for step in range(train_steps):
                    start_idx = step * batch_size
                    end_idx = start_idx + batch_size
                    batch_x = x_train[start_idx:end_idx]
                    batch_y = y_train[start_idx:end_idx]
                    _, batch_loss_val = sess.run(
                        [optimizer, loss],
                        feed_dict={x_ph: batch_x, y_ph: batch_y}
                    )
                    epoch_loss += batch_loss_val
                # 计算测试集 AUC
                y_pred_val = sess.run(preds, feed_dict={x_ph: x_test, y_ph: y_test})
                auc_val = roc_auc_score(y_test, y_pred_val)
                print("Epoch {}/{}, Avg Loss: {:.4f}, Test AUC: {:.4f}".format(
                    epoch+1, epochs, epoch_loss/train_steps, auc_val
                ))
        return auc_val

# ===================== 4. 主函数执行（未修改）====================
if __name__ == "__main__":
    # 生成数据
    x_train, x_test, y_train, y_test, feat_dim = generate_ecommerce_data()

    # 训练并评估两个模型
    auc_rankmixer = train_and_evaluate("rankmixer", x_train, x_test, y_train, y_test, feat_dim)
    auc_rankmixer_gate = train_and_evaluate("rankmixer_gate", x_train, x_test, y_train, y_test, feat_dim)
    auc_mha = train_and_evaluate("mha", x_train, x_test, y_train, y_test, feat_dim)

    # 结果对比
    print("\n=== Final AUC Comparison ===")
    print("RankMixer AUC: {:.4f}".format(auc_rankmixer))
    print("RankMixerGate AUC: {:.4f}".format(auc_rankmixer_gate))
    print("MHA AUC: {:.4f}".format(auc_mha))
    print("AUC Difference (RankMixer - MHA): {:.4f}".format(auc_rankmixer - auc_mha))


('[rank_mixer_layer] inputs static shape:', [None, 128])
('[rank_mixer_layer] batch_size (dynamic):', <tf.Tensor 'rank_mixer/strided_slice:0' shape=() dtype=int32>)
('[rank_mixer_layer] input_dim:', 128)
('[rank_mixer_layer] token_dim:', 32)
('[rank_mixer_layer] tokens static shape:', [None, 4, 32])
('[rank_mixer_layer] head_dim:', 16)
('[rank_mixer_layer] tokens_split static shape:', [None, 4, 2, 16])
('[rank_mixer_layer] tokens_split static shape after transpose:', [None, 2, 4, 16])
=== Training RANKMIXER Model ===
Epoch 1/20, Avg Loss: 0.6001, Test AUC: 0.8580
Epoch 2/20, Avg Loss: 0.4171, Test AUC: 0.9010
Epoch 3/20, Avg Loss: 0.3733, Test AUC: 0.9137
Epoch 4/20, Avg Loss: 0.3514, Test AUC: 0.9224
Epoch 5/20, Avg Loss: 0.3414, Test AUC: 0.9256
Epoch 6/20, Avg Loss: 0.3308, Test AUC: 0.9272
Epoch 7/20, Avg Loss: 0.3264, Test AUC: 0.9283
Epoch 8/20, Avg Loss: 0.3199, Test AUC: 0.9317
Epoch 9/20, Avg Loss: 0.3161, Test AUC: 0.9335
Epoch 10/20, Avg Loss: 0.3112, Test AUC: 0.9330
Epoch 

In [35]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 判断是否使用TF2.x，如果是则切换到兼容模式
if tf.__version__.startswith('2'):
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

# 固定随机种子
np.random.seed(2024)
tf.set_random_seed(2024)

def gelu(x):
    return x * 0.5 * (1.0 + tf.erf(x / tf.sqrt(2.0)))

def generate_ecommerce_data():
    # 数据参数
    user_num = 10000
    item_num = 5000
    sample_num = 100000
    user_feat_dim = 16  # 用户特征：年龄、性别、购买力等
    item_feat_dim = 16  # 商品特征：品类、价格、评分等
    behavior_feat_dim = 8  # 行为特征：点击频次、浏览时长等

    # 生成连续特征
    user_feats = np.random.normal(0, 1, (sample_num, user_feat_dim))
    item_feats = np.random.normal(0, 1, (sample_num, item_feat_dim))
    behavior_feats = np.random.normal(0, 1, (sample_num, behavior_feat_dim))

    # 生成点击标签（基于用户-商品特征内积 + 噪声）
    logits = np.sum(user_feats * item_feats, axis=1) + np.random.normal(0, 0.5, sample_num)
    labels = (logits > 0).astype(np.float32)

    # 合并特征并划分训练/测试集
    all_feats = np.concatenate([user_feats, item_feats, behavior_feats], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(
        all_feats, labels, test_size=0.2, random_state=2024
    )
    return x_train, x_test, y_train, y_test, all_feats.shape[1]

def rank_mixer_layer(inputs, token_num=4, head_num=2, hidden_dim=64, is_gate="rankmixer"):
    with tf.variable_scope('rank_mixer', reuse=tf.AUTO_REUSE):
        batch_size = tf.shape(inputs)[0]
        input_dim = inputs.shape[-1].value  # 获取静态维度
        token_dim = input_dim // token_num
        tokens = tf.reshape(inputs, [batch_size, token_num, token_dim])

        head_dim = token_dim // head_num
        tokens_split = tf.reshape(tokens, [batch_size, token_num, head_num, head_dim])
        tokens_split = tf.transpose(tokens_split, [0, 2, 1, 3])

        mixed_tokens = []
        for h in range(head_num):
            with tf.variable_scope('head_{}_mixer'.format(h), reuse=tf.AUTO_REUSE):
                head_token = tokens_split[:, h, :, :]
                
                if is_gate == "rankmixer_gate":

                    # 局部混合：使用静态计算的head_dim
                    local_mix = tf.layers.dense(
                        head_token, head_dim,
                        activation=gelu,
                        kernel_initializer=tf.keras.initializers.glorot_uniform()
                    )

                    # 全局混合
                    global_avg = tf.reduce_mean(head_token, axis=1, keep_dims=True)
                    global_mix = tf.tile(global_avg, [1, head_token.shape[1], 1])

                    # 特征拼接
                    combined = tf.concat([head_token, local_mix, global_mix], axis=-1)

                    # 门控机制：使用静态计算的combined维度
                    combined_dim = combined.shape[-1].value
                    gate = tf.layers.dense(
                        combined, combined_dim,
                        activation=tf.nn.sigmoid,
                        kernel_initializer=tf.keras.initializers.he_normal()
                    )
                    gated = combined * gate
                    mixed = tf.layers.dense(gated, head_dim)
                elif is_gate == "rankmixer_mlp":
                    mixed = tf.layers.dense(head_token, head_dim)
                else:
                    mixed = head_token

                # 维度变换：使用head_dim
                
                mixed_tokens.append(mixed)

        # 合并所有头的输出
        mixed_all = tf.concat(mixed_tokens, axis=-1)
        mixed_all = tf.reshape(mixed_all, [batch_size, token_num, token_dim])  # 显式设置形状

        # 最终变换
        ff_out = tf.layers.dense(mixed_all, hidden_dim, activation=gelu)
        ff_out = tf.layers.dense(ff_out, token_dim)

        # 输出聚合
        output = tf.reduce_mean(ff_out, axis=1)
        return output

def mha_layer(inputs, head_num=4, hidden_dim=64):
    batch_size = tf.shape(inputs)[0]
    input_dim = inputs.shape[-1]

    q = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)
    k = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)
    v = tf.layers.dense(inputs, hidden_dim, activation=tf.nn.relu)

    q = tf.expand_dims(q, axis=1)
    k = tf.expand_dims(k, axis=1)
    v = tf.expand_dims(v, axis=1)

    def scaled_dot_product_attention(q, k, v):
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_logits = matmul_qk / tf.math.sqrt(dk)
        attention_weights = tf.nn.softmax(scaled_logits, axis=-1)
        return tf.matmul(attention_weights, v)

    q_split = tf.split(q, head_num, axis=-1)
    k_split = tf.split(k, head_num, axis=-1)
    v_split = tf.split(v, head_num, axis=-1)

    outputs = []
    for q_h, k_h, v_h in zip(q_split, k_split, v_split):
        outputs.append(scaled_dot_product_attention(q_h, k_h, v_h))
    concat_output = tf.concat(outputs, axis=-1)
    
    output = tf.layers.dense(concat_output, hidden_dim)
    return tf.squeeze(output, axis=1)

def build_model(inputs, labels, model_type="rankmixer"):
    input_layer = tf.reshape(inputs, [-1, inputs.shape[-1]])
    hidden = tf.layers.dense(input_layer, 128, activation=tf.nn.relu)
    hidden = tf.layers.dropout(hidden, rate=0.2, training=True)
    
    print("hidden shape:{}".format(hidden.shape.as_list()))

    if model_type == "rankmixer":
        core_out = rank_mixer_layer(hidden,token_num=4, head_num=2, hidden_dim=64, is_gate="rankmixer")
    elif model_type == "rankmixer_mlp":
        core_out = rank_mixer_layer(hidden,token_num=4, head_num=2, hidden_dim=64, is_gate="rankmixer_mlp")
    elif model_type == "rankmixer_gate":
        core_out = rank_mixer_layer(hidden,token_num=4, head_num=2, hidden_dim=64, is_gate="rankmixer_gate")
        
    elif model_type == "rankmixer_4_2_64":
        core_out_tmp = mha_layer(hidden)
        input_dim = hidden.shape[-1]
        hidden = tf.layers.dense(core_out_tmp, input_dim)
        core_out = rank_mixer_layer(hidden,token_num=4, head_num=2, hidden_dim=64, is_gate="rankmixer")

    elif model_type == "rankmixer_2_4_64":
        core_out = rank_mixer_layer(hidden,token_num=2, head_num=4, hidden_dim=64, is_gate="rankmixer")
    elif model_type == "rankmixer_1_8_64":
        core_out = rank_mixer_layer(hidden,token_num=1, head_num=8, hidden_dim=64, is_gate="rankmixer")
    elif model_type == "mha":
        core_out = mha_layer(hidden)
    else:
        raise ValueError("model_type must be 'rankmixer' or 'mha'")

        
    print("core_out shape:{}".format(core_out.shape.as_list()))
    logits = tf.layers.dense(core_out, 1)
    preds = tf.nn.sigmoid(logits)

    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=tf.reshape(labels, [-1, 1])
        )
    )
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    return preds, loss, optimizer

def train_and_evaluate(model_type, x_train, x_test, y_train, y_test, feat_dim):
    graph = tf.Graph()
    with graph.as_default():
        x_ph = tf.placeholder(tf.float32, [None, feat_dim])
        y_ph = tf.placeholder(tf.float32, [None])
        preds, loss, optimizer = build_model(x_ph, y_ph, model_type)
        init = tf.global_variables_initializer()

        epochs = 20
        batch_size = 256
        train_steps = len(x_train) // batch_size

        with tf.Session() as sess:
            sess.run(init)
            print("=== Training {} Model ===".format(model_type.upper()))
            for epoch in range(epochs):
                epoch_loss = 0.0
                for step in range(train_steps):
                    batch_x = x_train[step*batch_size:(step+1)*batch_size]
                    batch_y = y_train[step*batch_size:(step+1)*batch_size]
                    _, loss_val = sess.run([optimizer, loss], 
                                          {x_ph: batch_x, y_ph: batch_y})
                    epoch_loss += loss_val

                y_pred = sess.run(preds, {x_ph: x_test})
                auc = roc_auc_score(y_test, y_pred)
                print("Epoch {}/{}, Avg Loss: {:.4f}, Test AUC: {:.4f}".format(
                    epoch+1, epochs, epoch_loss/train_steps, auc
                ))

            return roc_auc_score(y_test, sess.run(preds, {x_ph: x_test}))

if __name__ == "__main__":
    x_train, x_test, y_train, y_test, feat_dim = generate_ecommerce_data()
    auc_rankmixer = train_and_evaluate("rankmixer", x_train, x_test, y_train, y_test, feat_dim)
#     auc_rankmixer_mlp = train_and_evaluate("rankmixer_mlp", x_train, x_test, y_train, y_test, feat_dim)
#     auc_rankmixer_gate = train_and_evaluate("rankmixer_gate", x_train, x_test, y_train, y_test, feat_dim)
    auc_rankmixer_4_2_64 = train_and_evaluate("rankmixer_4_2_64", x_train, x_test, y_train, y_test, feat_dim)
    auc_rankmixer_2_4_64 = train_and_evaluate("rankmixer_2_4_64", x_train, x_test, y_train, y_test, feat_dim)
    auc_rankmixer_1_8_64 = train_and_evaluate("rankmixer_1_8_64", x_train, x_test, y_train, y_test, feat_dim)

    auc_mha = train_and_evaluate("mha", x_train, x_test, y_train, y_test, feat_dim)
    
    print("\n=== Final AUC Comparison ===")
    print("RankMixer AUC: {:.4f}".format(auc_rankmixer))
#     print("RankMixerMLP AUC: {:.4f}".format(auc_rankmixer_gate))
#     print("RankMixerGATE AUC: {:.4f}".format(auc_rankmixer_gate))

    print("RankMixer auc_rankmixer_4_2_64 AUC: {:.4f}".format(auc_rankmixer_4_2_64))
    print("RankMixer auc_rankmixer_2_4_64 AUC: {:.4f}".format(auc_rankmixer_2_4_64))
    print("RankMixer auc_rankmixer_1_8_64 AUC: {:.4f}".format(auc_rankmixer_1_8_64))

    print("MHA AUC: {:.4f}".format(auc_mha))
    print("AUC Difference (RankMixer - MHA): {:.4f}".format(auc_rankmixer - auc_mha))


hidden shape:[None, 128]
core_out shape:[None, 32]
=== Training RANKMIXER Model ===
Epoch 1/20, Avg Loss: 0.5957, Test AUC: 0.8637
Epoch 2/20, Avg Loss: 0.4177, Test AUC: 0.9065
Epoch 3/20, Avg Loss: 0.3723, Test AUC: 0.9174
Epoch 4/20, Avg Loss: 0.3508, Test AUC: 0.9234
Epoch 5/20, Avg Loss: 0.3429, Test AUC: 0.9254
Epoch 6/20, Avg Loss: 0.3328, Test AUC: 0.9284
Epoch 7/20, Avg Loss: 0.3233, Test AUC: 0.9283
Epoch 8/20, Avg Loss: 0.3160, Test AUC: 0.9341
Epoch 9/20, Avg Loss: 0.3096, Test AUC: 0.9328
Epoch 10/20, Avg Loss: 0.3065, Test AUC: 0.9371
Epoch 11/20, Avg Loss: 0.3047, Test AUC: 0.9383
Epoch 12/20, Avg Loss: 0.2978, Test AUC: 0.9380
Epoch 13/20, Avg Loss: 0.2954, Test AUC: 0.9412
Epoch 14/20, Avg Loss: 0.2900, Test AUC: 0.9415
Epoch 15/20, Avg Loss: 0.2898, Test AUC: 0.9401
Epoch 16/20, Avg Loss: 0.2835, Test AUC: 0.9444
Epoch 17/20, Avg Loss: 0.2788, Test AUC: 0.9440
Epoch 18/20, Avg Loss: 0.2761, Test AUC: 0.9452
Epoch 19/20, Avg Loss: 0.2783, Test AUC: 0.9443
Epoch 20/20, 