In [10]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat,get_feature_names

In [25]:
def get_model_input(feature_columns, data):
    _dict = data[feature_columns].to_dict(orient='list')

    for name in feature_columns:
        _dict[name] = np.array(_dict[name])
        
    return _dict

def get_fixed_feature_columns(feature_columns, feature_max_idx, n_dim):
    _columns = []
    
    for name in feature_columns:
        _columns.append(SparseFeat(name, feature_max_idx[name] + 1, n_dim))
        
    return _columns

In [26]:
# 加载训练样本与测试样本
train_df = pd.read_parquet('ml1M-train.parquet', engine='pyarrow')
test_df = pd.read_parquet('ml1M-test.parquet', engine='pyarrow')

with open('ml1M_feature_max_idx.json', 'r') as json_file:
    feature_max_idx = json.load(json_file)
    feature_max_idx["rating"] = 5
    
train_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip,watch_movie_seq,watch_genre_seq,seq_len
0,5505,355,4,959732229,"Lion King, The (1994)",3,1,3,5,107,"[581, 741, 958, 1916, 1839, 968, 1, 2899, 2484...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 6, 6, 10,...",33
1,4446,1125,2,965089335,Alien (1979),1,2,2,19,2047,"[2064, 3208, 633, 1963, 3257, 2, 309, 59, 1788...","[1, 1, 1, 2, 2, 2, 4, 2, 2, 1, 2, 2, 5, 3, 2, ...",50
2,2231,673,5,974602217,Faces (1968),8,2,6,12,2680,"[3427, 2953, 3546, 2799, 3185, 3229, 149, 3237...","[8, 2, 8, 8, 8, 2, 8, 8, 8, 8, 6, 8, 8, 8, 8, ...",50
3,4942,2439,4,962642043,Superman (1978),1,2,5,13,1356,"[1730, 2427, 2262, 2461, 2215, 3459, 2335, 151...","[13, 1, 11, 1, 5, 8, 1, 8, 1, 1, 1, 2, 5, 1, 1...",44
4,1067,1766,4,974952597,On the Waterfront (1954),6,2,4,13,3082,"[2480, 1140, 241, 2786, 3271, 864, 1130, 2899,...","[8, 5, 8, 5, 8, 8, 11, 3, 15, 15, 10, 8, 5, 3,...",50


In [27]:
# 构建模型的输入
fixlen_feature_columns = ['user_id', 'movie_id', 'genres', 'rating', 'gender', 'age', 'occupation', 'zip']
target = ['rating']
train_model_input = get_model_input(fixlen_feature_columns, train_df)
test_model_input = get_model_input(fixlen_feature_columns, test_df)
fixlen_feature_columns = get_fixed_feature_columns(fixlen_feature_columns, feature_max_idx, n_dim=4)

In [29]:
# DeepFM模型构建
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )

In [30]:
# 训练模型
history = model.fit(train_model_input, train_df[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Epoch 1/10
2814/2814 - 6s - loss: 0.1128 - mse: 0.1128 - val_loss: 4.6847e-05 - val_mse: 8.9520e-06
Epoch 2/10
2814/2814 - 5s - loss: 6.7540e-05 - mse: 3.9774e-05 - val_loss: 3.0285e-04 - val_mse: 2.8457e-04
Epoch 3/10
2814/2814 - 5s - loss: 1.4338e-04 - mse: 1.2849e-04 - val_loss: 7.8458e-05 - val_mse: 6.0494e-05
Epoch 4/10
2814/2814 - 5s - loss: 1.6168e-04 - mse: 1.4149e-04 - val_loss: 1.3720e-04 - val_mse: 1.0663e-04
Epoch 5/10
2814/2814 - 5s - loss: 1.5907e-04 - mse: 1.3169e-04 - val_loss: 1.7109e-04 - val_mse: 1.4189e-04
Epoch 6/10
2814/2814 - 5s - loss: 1.5024e-04 - mse: 1.2377e-04 - val_loss: 1.4935e-04 - val_mse: 1.2794e-04
Epoch 7/10
2814/2814 - 5s - loss: 1.3862e-04 - mse: 1.1257e-04 - val_loss: 1.1233e-04 - val_mse: 1.0250e-04
Epoch 8/10
2814/2814 - 5s - loss: 1.3675e-04 - mse: 1.1083e-04 - val_loss: 1.6253e-04 - val_mse: 1.4622e-04
Epoch 9/10
2814/2814 - 5s - loss: 1.3157e-04 - mse: 1.0590e-04 - val_loss: 4.2893e-05 - val_mse: 2.4776e-05
Epoch 10/10
2814/2814 - 5s - loss: 1

In [32]:
pred_ans = model.predict(test_model_input, batch_size=256)
print("test MSE", round(mean_squared_error(test_df[target].values, pred_ans), 4))

test MSE 0.0


In [39]:
pred_ans[1001]

array([4.0016527], dtype=float32)

In [40]:
test_df[target].values[1001]

array([4])