论文链接（Deep & Cross Network for Ad Click Predictions，DCN）：https://arxiv.org/abs/1708.05123

In [None]:
!pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


In [None]:
import pandas as pd
import numpy as np

# Columns that are never used as model inputs: the row identifier, the label,
# and every "ps_calc_*" column (ps_calc_01..14 plus the ps_calc_15..20 "_bin"
# flags).
IGNORE_COLS = (
    ["id", "target"]
    + ["ps_calc_{:02d}".format(n) for n in range(1, 15)]
    + ["ps_calc_{}_bin".format(n) for n in range(15, 21)]
)

# Columns passed to the model as raw numeric values.  The binary "*_bin"
# indicator columns are deliberately NOT listed here (they were previously
# present only as a commented-out group).
NUMERIC_COLS = (
    # continuous features from the raw data
    ["ps_reg_{:02d}".format(n) for n in (1, 2, 3)]
    + ["ps_car_{}".format(n) for n in range(12, 16)]
    # engineered features created by the preprocessing step in load_data()
    + ["missing_feat", "ps_car_13_x_ps_reg_03"]
)

# Default locations of the train/test CSV files (the remote copies this
# notebook has always read from).
_TRAIN_CSV_URL = "https://testonly-2023.oss-cn-hangzhou.aliyuncs.com/data/common/train.csv"
_TEST_CSV_URL = "https://testonly-2023.oss-cn-hangzhou.aliyuncs.com/data/common/test.csv"


def _add_engineered_features(df):
    """Add the two engineered feature columns to ``df`` in place and return it.

    Args:
        df (pd.DataFrame): Raw data set; must contain ``ps_car_13`` and
            ``ps_reg_03``.

    Returns:
        pd.DataFrame: The same object, with ``missing_feat`` and
        ``ps_car_13_x_ps_reg_03`` added.
    """
    # Every column except the identifier and the label takes part in the count.
    raw_cols = [c for c in df.columns if c not in ("id", "target")]

    # Per-row number of entries equal to -1 (the value this data set uses to
    # mark a missing entry).  Computed before any engineered column is added,
    # so the new columns never influence the count.
    df["missing_feat"] = np.sum((df[raw_cols] == -1).values, axis=1)

    # Simple interaction feature: element-wise product of two raw columns.
    df["ps_car_13_x_ps_reg_03"] = df["ps_car_13"] * df["ps_reg_03"]

    return df


def load_data(train_path=_TRAIN_CSV_URL, test_path=_TEST_CSV_URL):
    """Load the train/test CSV files, add engineered features, and build arrays.

    Args:
        train_path: Anything ``pd.read_csv`` accepts (URL, local path, buffer)
            for the training set.  Defaults to the remote training CSV.
        test_path: Same, for the test set.  Defaults to the remote test CSV.

    Returns:
        tuple: ``(dfTrain, dfTest, X_train, y_train, X_test, ids_test)`` where

        * ``dfTrain`` / ``dfTest`` (pd.DataFrame): the data sets with the
          engineered columns added;
        * ``X_train`` / ``X_test`` (np.ndarray): values of every column that
          is not ``id``, ``target`` or listed in ``IGNORE_COLS``, in the
          training frame's column order;
        * ``y_train`` (np.ndarray): the training ``target`` column;
        * ``ids_test`` (np.ndarray): the test ``id`` column.

    Raises:
        KeyError: If a required column (``target``, ``id``, ``ps_car_13``,
            ``ps_reg_03``, or a training feature column absent from the test
            set) is missing.
    """
    dfTrain = _add_engineered_features(pd.read_csv(train_path))
    dfTest = _add_engineered_features(pd.read_csv(test_path))

    # "id"/"target" are excluded unconditionally, even if IGNORE_COLS is later
    # edited to no longer list them.
    excluded = set(IGNORE_COLS) | {"id", "target"}
    cols = [c for c in dfTrain.columns if c not in excluded]

    # The feature columns are taken from the training frame and reused for the
    # test frame so both matrices share the same column order.
    X_train = dfTrain[cols].values
    y_train = dfTrain["target"].values
    X_test = dfTest[cols].values
    ids_test = dfTest["id"].values

    return dfTrain, dfTest, X_train, y_train, X_test, ids_test

# Load both splits (with engineered features) and the dense arrays built from them.
dfTrain, dfTest, X_train, y_train, X_test, ids_test = load_data()

# Quick sanity check: report the shape of every array that was produced.
for _caption, _array in (
    ("\nShape of X_train:", X_train),
    ("Shape of y_train:", y_train),
    ("Shape of X_test:", X_test),
    ("Shape of ids_test:", ids_test),
):
    print(_caption, _array.shape)
# Do not leave the loop helpers behind in the notebook namespace.
del _caption, _array

In [None]:
def generate_feature_dictionary(df, ignore_cols, numeric_cols):
    """Assign a globally unique integer index to every categorical value.

    Columns are visited in ``df.columns`` order.  Each column that is neither
    ignored nor numeric receives a contiguous block of indices, one per
    distinct value, in the order ``Series.unique()`` reports them.

    Args:
        df (pd.DataFrame): Data whose columns are to be indexed.
        ignore_cols (list): Column names to skip entirely.
        numeric_cols (list): Numeric column names; these are skipped as well.

    Returns:
        tuple: ``(feature_dictionary, feature_dimension)`` where
        ``feature_dictionary`` maps column name -> {value: index} and
        ``feature_dimension`` is the total number of indices handed out.
    """
    mapping = {}
    next_index = 0

    for name in df.columns:
        if not (name in ignore_cols or name in numeric_cols):
            levels = df[name].unique()
            block_end = next_index + len(levels)
            # Indices next_index .. block_end-1 belong to this column.
            mapping[name] = dict(zip(levels, range(next_index, block_end)))
            next_index = block_end

    return mapping, next_index

# Stack train and test row-wise so the vocabulary covers every value that
# occurs in either split.
combined_df = pd.concat([dfTrain, dfTest], axis=0)

feature_dict, feature_dim = generate_feature_dictionary(
    df=combined_df,
    ignore_cols=IGNORE_COLS,
    numeric_cols=NUMERIC_COLS,
)

In [None]:
def parse(df, ignore_cols, numeric_cols, feat_dict=None, has_label=False):
    """Split a data frame into categorical indices, categorical values,
    numeric values, and labels (or ids).

    ``df`` itself is never modified.

    Args:
        df (pd.DataFrame): Input data.  Must contain ``id`` when
            ``has_label`` is False and ``target`` when it is True, plus every
            column named in ``numeric_cols``.
        ignore_cols (list): Column names excluded from the categorical output.
        numeric_cols (list): Column names returned as-is in ``numeric_Xv``.
        feat_dict (dict): Column name -> {value: index} mapping for every
            categorical column.  Required despite the ``None`` default.
        has_label (bool): True to return the ``target`` column as the last
            element, False to return the ``id`` column instead.

    Returns:
        tuple: ``(cate_Xi, cate_Xv, numeric_Xv, y_or_ids)`` where

        * ``cate_Xi`` (np.ndarray): per-row feature indices of the categorical
          columns, obtained through ``feat_dict``.  A value missing from the
          mapping becomes NaN (``Series.map`` behaviour);
        * ``cate_Xv`` (np.ndarray): float64 array of ones with the same shape
          as ``cate_Xi``;
        * ``numeric_Xv`` (np.ndarray): values of ``numeric_cols``;
        * ``y_or_ids`` (np.ndarray): ``target`` values if ``has_label`` else
          ``id`` values.

    Raises:
        AssertionError: If ``feat_dict`` is None.
        KeyError: If a required column is missing from ``df`` or a categorical
            column has no entry in ``feat_dict``.
    """
    assert feat_dict is not None, "feat_dict must be provided"

    # Copy the label/id vector so the returned array never aliases ``df``.
    if has_label:
        meta_cols = ("id", "target")
        y_or_ids = df["target"].values.copy()
    else:
        meta_cols = ("id",)
        y_or_ids = df["id"].values.copy()

    numeric_Xv = df[numeric_cols].values

    # Everything that is not metadata, numeric, or ignored is categorical.
    # Deciding this once up front avoids dropping columns from a frame while
    # looping over that same frame's columns.
    excluded = set(meta_cols).union(numeric_cols, ignore_cols)
    cate_cols = [col for col in df.columns if col not in excluded]

    # Only the categorical subset is copied; the caller's frame stays intact.
    cate_index = df[cate_cols].copy()
    for col in cate_cols:
        cate_index[col] = cate_index[col].map(feat_dict[col])

    cate_Xi = cate_index.values
    # One-hot style encoding: every active categorical index has value 1.0.
    cate_Xv = np.ones(cate_Xi.shape, dtype=np.float64)

    return cate_Xi, cate_Xv, numeric_Xv, y_or_ids


# Encode both splits with the shared vocabulary; the training call returns the
# labels, the test call returns the row ids.
(cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train) = parse(
    dfTrain, IGNORE_COLS, NUMERIC_COLS, feat_dict=feature_dict, has_label=True
)
(cate_Xi_test, cate_Xv_test, numeric_Xv_test, ids_test) = parse(
    dfTest, IGNORE_COLS, NUMERIC_COLS, feat_dict=feature_dict, has_label=False
)

# Shapes of the encoded training arrays.
print("Shape of Category Feature Indices (cate_Xi):", cate_Xi_train.shape)
print("Shape of Category Feature Values (cate_Xv):", cate_Xv_train.shape)
print("Size of Numeric Feature Values (numeric_Xv):", numeric_Xv_train.shape)
print("Size of Target Labels (y):", y_train.shape)

# First training row: its categorical indices, then the matching values.
print(cate_Xi_train[0], cate_Xv_train[0], sep="\n")

In [None]:
import tensorflow as tf

def initialize_graph(cate_feature_size: int, cate_field_num: int, embedding_size: int,
                     random_seed: int = 2024):
    """Build the input-layer graph and return a session with its tensors.

    The graph embeds every categorical index, scales each embedding by its
    feature value, flattens the result, and concatenates it after the numeric
    features to form ``x0``.

    Args:
        cate_feature_size (int): Number of rows of the embedding table, i.e.
            the total number of distinct categorical indices.
        cate_field_num (int): Number of categorical fields (columns) per row.
        embedding_size (int): Length of each embedding vector.
        random_seed (int): Graph-level random seed.  Defaults to 2024.

    Returns:
        tuple: ``(session, x0, feat_index, feat_value, numeric_value, label)``

        * ``session``: session bound to the new graph, variables initialised;
        * ``x0``: tensor of shape
          ``[batch, numeric_dim + cate_field_num * embedding_size]``;
        * ``feat_index``: int32 placeholder for the categorical indices;
        * ``feat_value``: float32 placeholder for the categorical values;
        * ``numeric_value``: float32 placeholder for the numeric features;
        * ``label``: float32 ``[batch, 1]`` placeholder.  It is created here
          but not connected to ``x0``.

    Note:
        The caller owns the returned session and is responsible for closing it.
    """
    # TensorFlow 2.x no longer exposes the graph-mode API (placeholder,
    # Session, set_random_seed, random_normal, ...) at the top level; it lives
    # under tf.compat.v1.  TensorFlow 1.x releases that ship tf.compat.v1
    # alias the very same functions there, and older ones fall back to ``tf``.
    tf1 = getattr(getattr(tf, "compat", None), "v1", tf)

    graph = tf.Graph()
    with graph.as_default():
        tf1.set_random_seed(random_seed)

        # Inputs.  Both dimensions are left dynamic except for the label.
        feat_index = tf1.placeholder(tf.int32, shape=[None, None], name='feat_index')
        feat_value = tf1.placeholder(tf.float32, shape=[None, None], name='feat_value')
        numeric_value = tf1.placeholder(tf.float32, [None, None], name='num_value')
        label = tf1.placeholder(tf.float32, shape=[None, 1], name='label')

        # Embedding table: one row per categorical index, small normal init.
        feature_embedding_weight = tf1.Variable(
            tf1.random_normal([cate_feature_size, embedding_size], 0.0, 0.01),
            name='feature_embeddings'
        )

        # [batch, cate_field_num, embedding_size]
        embeddings = tf1.nn.embedding_lookup(feature_embedding_weight, feat_index)

        # Give the values a trailing axis so they broadcast over the embedding
        # dimension, then scale each embedding by its feature value.
        feat_value_reshape = tf1.reshape(feat_value, shape=[-1, cate_field_num, 1])
        embeddings = tf1.multiply(embeddings, feat_value_reshape)

        # Flatten the per-field embeddings into one vector per row.
        embeddings_reshape = tf1.reshape(
            embeddings, shape=[-1, cate_field_num * embedding_size]
        )

        # Numeric features first, flattened embeddings after.
        x0 = tf1.concat([numeric_value, embeddings_reshape], axis=1)

        initializer = tf1.global_variables_initializer()

        # Bind the session to this graph explicitly and initialise variables.
        session = tf1.Session(graph=graph)
        session.run(initializer)

    return session, x0, feat_index, feat_value, numeric_value, label
           

# Input-layer hyper-parameters.
embedding_size = 8
cate_feature_dim = feature_dim            # total number of categorical indices
cate_field_num = cate_Xi_train.shape[1]   # categorical columns per row

(session, x0, feat_index, feat_value, numeric_value, label) = initialize_graph(
    cate_feature_dim, cate_field_num, embedding_size
)

# Push the whole training set through the input layer once.
feed_dict = {
    feat_index: cate_Xi_train,
    feat_value: cate_Xv_train,
    numeric_value: numeric_Xv_train,
    # the label placeholder expects a column vector
    label: y_train.reshape(-1, 1),
}
x0_result = session.run(x0, feed_dict=feed_dict)

print(x0_result.shape)