In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from deepctr.models import WDL
from deepctr.feature_column import SparseFeat, get_feature_names
from sklearn.metrics import mean_absolute_error

In [3]:
# 数据加载
df = pd.read_csv('F:/BI/第五周/核心板/L5/libfm/ratings.csv')

In [5]:
# 选取特征
sparse_features = ["userId", "movieId", "timestamp"]
target = ['rating']

In [6]:
for feature in sparse_features:
    transfer = LabelEncoder()
    df[feature] = transfer.fit_transform(df[feature])

In [8]:
fixlen_feature_columns = [SparseFeat(f, df[f].nunique()) for f in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [11]:
# 计算特征gender中不同特征值的个数
df['userId'].nunique()

7120

In [12]:
SparseFeat(name='userId', vocabulary_size=7120, embedding_dim=4)

SparseFeat(name='userId', vocabulary_size=7120, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x0000017496B07088>, embedding_name='userId', group_name='default_group', trainable=True)

In [None]:
# 数据集拆分

In [13]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=30)

In [14]:
df_train.shape, df_test.shape

((838860, 4), (209715, 4))

In [15]:
train_model_input = {n:df_train[n].values for n in feature_names}
test_model_input = {n:df_test[n].values for n in feature_names}

In [None]:
# 模型的建立

In [16]:
model = WDL(
    linear_feature_columns,
    dnn_feature_columns,
    task='regression',
    dnn_hidden_units=(256, 256),
    l2_reg_linear=10,
    l2_reg_embedding=10,
    l2_reg_dnn=1,
    dnn_dropout=0.5,
    dnn_activation='relu',
    seed=1020
           )

In [17]:
# 模型编译
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

In [18]:
# 训练模型
%%time
EPOCH = 10
history = model.fit(
    train_model_input,
    df_train[target].values,
    batch_size=256,
    epochs=EPOCH,
    validation_split=0.2,
    shuffle=True,
    verbose=1,
    workers=2
)

hist_dict  = history.history

Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 17min 20s


In [19]:
hist_dict.keys()

dict_keys(['loss', 'mse', 'val_loss', 'val_mse'])

In [20]:
# 预测
test_preds = model.predict(test_model_input, batch_size=256)

In [21]:
test_preds[:2]

array([[3.5383964],
       [3.5379453]], dtype=float32)

In [22]:
# 计算MSE
mean_squared_error(df_test[target].values, test_preds)

1.109423841748677

In [23]:
df_peds = df_test.copy()

In [24]:
df_peds.shape

(209715, 4)

In [25]:
df_peds.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
817193,5450,1190,3.5,588296


In [26]:
test_preds.shape

(209715, 1)

In [27]:
df_peds['preds'] = test_preds[:, 0]

In [28]:
# 真实值与预测值对比
df_peds[['rating', 'preds']].head()

Unnamed: 0,rating,preds
817193,3.5,3.538396
438441,2.0,3.537945
370515,5.0,3.537991
840250,4.5,3.538244
250809,3.0,3.538232
