# DeepFM

In [1]:
# 导入
import sys

sys.path.append("E:/dataFiles/github/MFlow")

In [2]:
# 数据生成
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# TODO: 不使用pandas以及sklearn

# 数据读取，并去掉第一列
data = pd.read_csv("E:/dataFiles/github/MFlow/data/Titanic.csv").drop(
    ["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
# 将字符串标签替换为数字并转换为one-hot
le = LabelEncoder()
ohe = OneHotEncoder(sparse=False)
pclass = ohe.fit_transform(le.fit_transform(data["Pclass"].fillna(0)).reshape(-1, 1))
sex = ohe.fit_transform(le.fit_transform(data["Sex"].fillna("")).reshape(-1, 1))
embarked = ohe.fit_transform(le.fit_transform(data["Embarked"].fillna("")).reshape(-1, 1))
# 特征组合
xs = np.concatenate([
    pclass, sex, data[["Age"]].fillna(0), data[["SibSp"]].fillna(0),
    data[["Parch"]].fillna(0), data[["Fare"]].fillna(0), embarked], axis=1)
# 标签
ys = data["Survived"].values * 2 - 1

# 特征维数
FEAT_DIM = xs.shape[1]
# 嵌入向量维度
HIDDEN_DIM = 2

print(xs.shape, ys.shape)
print(xs[0], ys[0])

(891, 13) (891,)
[ 0.    0.    1.    0.    1.   22.    1.    0.    7.25  0.    0.    0.
  1.  ] -1


In [3]:
# 训练
from mflow import core, ops, opts, lays

# 超参数
lr = 0.005
epoch = 200
batch_size = 16

with core.NameScope("DeepFM"):
    # 初始化变量
    x = core.Variable(size=(FEAT_DIM, 1), trainable=False)
    w = core.Variable(size=(1, FEAT_DIM), trainable=True)
    x_pclass = core.Variable(size=(pclass.shape[1], 1), trainable=False)
    x_sex = core.Variable(size=(sex.shape[1], 1), trainable=False)
    x_embarked = core.Variable(size=(embarked.shape[1], 1), trainable=False)
    w_pclass = core.Variable(size=(HIDDEN_DIM, pclass.shape[1]), trainable=True)
    w_sex = core.Variable(size=(HIDDEN_DIM, sex.shape[1]), trainable=True)
    w_embarked = core.Variable(size=(HIDDEN_DIM, embarked.shape[1]), trainable=True)
    b = core.Variable(size=(1, 1), trainable=True)
    y = core.Variable(size=(1, 1), trainable=False)
    # 模型定义
    # deep部分
    embedding_pclass = ops.MatMul(w_pclass, x_pclass)
    embedding_sex = ops.MatMul(w_sex, x_sex)
    embedding_embarked = ops.MatMul(w_embarked, x_embarked)
    embedding = ops.Concat(embedding_pclass, embedding_sex, embedding_embarked)
    hidden_1 = lays.Linear(embedding, 3 * HIDDEN_DIM, 8, "RELU")
    hidden_2 = lays.Linear(hidden_1, 8, 4, "ReLU")
    deep = lays.Linear(hidden_2, 4, 1, None)
    # FM部分
    fm = ops.Add(
        ops.MatMul(ops.Reshape(embedding, shape=(1, 3 * HIDDEN_DIM)), embedding),
        ops.MatMul(w, x)
    )
    # 组合
    pred = ops.Add(fm, deep, b)
    predicter = ops.Logistic(pred)
    loss = ops.loss.LogLoss(ops.Multiply(y, pred))
    adam = opts.Adam(core.DefaultGraph, loss, lr)
    # 开始训练
    for ep in range(1, epoch + 1):
        bs_idx = 0  # 批次计数
        # 这是一个epoch的过程
        for i, (feat, lab) in enumerate(zip(xs, ys)):
            x.setValue(np.mat(feat).T)
            x_pclass.setValue(np.mat(feat[:3]).T)
            x_sex.setValue(np.mat(feat[3:5]).T)
            x_embarked.setValue(np.mat(feat[9:]).T)
            y.setValue(np.mat(lab))
            adam.step()
            bs_idx += 1
            if bs_idx == batch_size:
                adam.update()
                bs_idx = 0
        # 一个epoch完成后进行评估
        preds = []
        for feat in xs:
            x.setValue(np.mat(feat).T)
            x_pclass.setValue(np.mat(feat[:3]).T)
            x_sex.setValue(np.mat(feat[3:5]).T)
            x_embarked.setValue(np.mat(feat[9:]).T)
            predicter.forward()
            preds.append(predicter.value[0, 0])  # 结果
        preds = (np.array(preds) > 0.5).astype("int") * 2 - 1
        acc = (ys == preds).astype("uint8").sum() / len(xs)
        print("Epoch: {:d}, acc: {:.3f}.".format(ep, acc))

Epoch: 1, acc: 0.746.
Epoch: 2, acc: 0.788.
Epoch: 3, acc: 0.808.
Epoch: 4, acc: 0.806.
Epoch: 5, acc: 0.804.
Epoch: 6, acc: 0.804.
Epoch: 7, acc: 0.802.
Epoch: 8, acc: 0.805.
Epoch: 9, acc: 0.804.
Epoch: 10, acc: 0.804.
Epoch: 11, acc: 0.805.
Epoch: 12, acc: 0.805.
Epoch: 13, acc: 0.804.
Epoch: 14, acc: 0.804.
Epoch: 15, acc: 0.802.
Epoch: 16, acc: 0.802.
Epoch: 17, acc: 0.802.
Epoch: 18, acc: 0.804.
Epoch: 19, acc: 0.804.
Epoch: 20, acc: 0.801.
Epoch: 21, acc: 0.802.
Epoch: 22, acc: 0.804.
Epoch: 23, acc: 0.804.
Epoch: 24, acc: 0.804.
Epoch: 25, acc: 0.802.
Epoch: 26, acc: 0.801.
Epoch: 27, acc: 0.802.
Epoch: 28, acc: 0.804.
Epoch: 29, acc: 0.805.
Epoch: 30, acc: 0.804.
Epoch: 31, acc: 0.801.
Epoch: 32, acc: 0.802.
Epoch: 33, acc: 0.800.
Epoch: 34, acc: 0.801.
Epoch: 35, acc: 0.802.
Epoch: 36, acc: 0.802.
Epoch: 37, acc: 0.801.
Epoch: 38, acc: 0.801.
Epoch: 39, acc: 0.801.
Epoch: 40, acc: 0.801.
Epoch: 41, acc: 0.800.
Epoch: 42, acc: 0.801.
Epoch: 43, acc: 0.800.
Epoch: 44, acc: 0.80