In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM


def split(x):
    """Encode a '|'-separated string as a list of integer ids.

    Each distinct token is looked up in the module-level ``key2index`` dict.
    A token seen for the first time is registered with the next free id,
    ``len(key2index) + 1``, so ids start at 1 and 0 is never handed out
    (0 is reserved as the padding value for sequence inputs).

    Args:
        x: string whose tokens are separated by '|'.

    Returns:
        List of integer ids, one per token, in the order the tokens appear.
    """
    encoded = []
    for token in x.split('|'):
        if token not in key2index:
            # Ids are 1-based: 0 must stay free for padding.
            key2index[token] = len(key2index) + 1
        encoded.append(key2index[token])
    return encoded


if __name__ == "__main__":
    # Fix the global NumPy / PyTorch RNG seeds so that a fresh
    # "Restart Kernel -> Run All" repeats the same training run.
    # NOTE(review): assumes DeepFM takes its randomness (weight init, batch
    # shuffling) from these global generators -- confirm against deepctr_torch.
    seed = 1024
    np.random.seed(seed)
    torch.manual_seed(seed)

    data = pd.read_csv("./movielens_sample.txt")

    # Categorical columns fed to the model as single-value sparse features.
    # NOTE(review): an earlier variant of this cell also listed "user_id",
    # "occupation" and "zip"; they are left out here, as in the original.
    sparse_features = ["movie_id", "gender", "age"]
    target = ['rating']

    # 1. Label-encode every sparse feature in place (values become 0..n-1).
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # Encode the '|'-separated genres column as integer sequences.
    # `split` fills `key2index` as a side effect, so the dict must be
    # (re)created before mapping; ids start at 1 because 0 means "padding".
    key2index = {}
    genres_list = list(map(split, data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    # Right-pad every sequence with 0 up to the longest one (padding='post').
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

    # 2. Build the feature configs: one SparseFeat per encoded column, sized
    #    by its number of distinct values, plus one variable-length feature
    #    for the genre sequence.
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features]

    # vocabulary_size is len(key2index) + 1 because id 0 is the padding slot.
    varlen_feature_columns = [
        VarLenSparseFeat(
            SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
            maxlen=max_len, combiner='mean')
    ]

    # The linear part and the DNN part share the same feature columns.
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Assemble the model input: one entry per sparse column plus the
    #    padded genre matrix under the key "genres".
    model_input = {name: data[name] for name in sparse_features}
    model_input["genres"] = genres_list

    # 4. Define, compile and train the model.
    # Training runs on CPU unless `use_cuda` is switched on AND a CUDA
    # device is actually available.
    device = 'cpu'
    use_cuda = False
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    # The rating is fitted as a regression target with an MSE loss.
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
    model.compile("adam", "mse", metrics=['mse'])

    # 20% of the rows are held out for validation by `validation_split`.
    history = model.fit(model_input, data[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2)

cpu
Train on 160 samples, validate on 40 samples, 1 steps per epoch
Epoch 1/10
0s - loss:  14.3369 - mse:  14.3369 - val_mse:  13.2144
Epoch 2/10
0s - loss:  13.9993 - mse:  13.9993 - val_mse:  12.9205
Epoch 3/10
0s - loss:  13.6893 - mse:  13.6893 - val_mse:  12.6459
Epoch 4/10
0s - loss:  13.3997 - mse:  13.3997 - val_mse:  12.4208
Epoch 5/10
0s - loss:  13.1610 - mse:  13.1610 - val_mse:  12.2067
Epoch 6/10
0s - loss:  12.9335 - mse:  12.9335 - val_mse:  11.9843
Epoch 7/10
0s - loss:  12.6973 - mse:  12.6973 - val_mse:  11.7535
Epoch 8/10
0s - loss:  12.4522 - mse:  12.4522 - val_mse:  11.5141
Epoch 9/10
0s - loss:  12.1982 - mse:  12.1982 - val_mse:  11.2660
Epoch 10/10
0s - loss:  11.9348 - mse:  11.9348 - val_mse:  11.0091


In [2]:
# test_model_input = {}
# for name in sparse_features:
#      test_model_input[name] = model_input[name][0:5]
from sklearn.metrics import mean_squared_error

# Build a small prediction batch from the first five rows: one entry per
# sparse feature plus the matching rows of the padded genre matrix.
test_model_input = dict(
    (name, data[name][0:5]) for name in sparse_features
)
test_model_input["genres"] = genres_list[0:5]

# Predict those rows and compare against their recorded target values.
pred_ans = model.predict(test_model_input, batch_size=256)
true_ans = data[target][0:5].values

print("test MSE", round(mean_squared_error(y_true=true_ans, y_pred=pred_ans), 4))

test MSE 12.218


In [None]:
# Echo the five-row feature dict that is passed to model.predict.
test_model_input

In [None]:
# Echo the model's predictions for the five-row batch.
pred_ans

In [None]:
# Echo the recorded target values for the same five rows.
true_ans

In [None]:
# For every model input, show its name, a separator, its first element and
# the container type holding it (one item per line).
for key, values in model_input.items():
    print(key, '-----', values[0], type(values), sep='\n')

In [2]:
# Number of rows in the target frame.
len(data[target])

200

In [3]:
# NOTE: only the last expression of a cell is echoed, so the result of the
# type(...) call below is computed and then discarded; the output recorded
# for this cell is the shape of the target array.
type(data[target].values)
data[target].values.shape

(200, 1)

In [9]:
# Distinct target values and how often each occurs; the counts are printed
# first, then the values themselves.
# NOTE(review): the recorded output lists well over a hundred distinct values,
# which does not look like a rating column -- confirm what `target` / `data`
# held when this cell was last executed (execution counts are out of order).
unique_items, counts = np.unique(data[target].values, return_counts=True)
print(counts, unique_items, sep='\n')

[1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1
 1 1]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 15

In [10]:
# Container type of the assembled model input.
type(model_input)

dict