In [36]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from deepctr.models import WDL
from deepctr.feature_column import SparseFeat,get_feature_names

In [12]:
# Load the MovieLens sample ratings from a CSV file addressed relative to the
# current working directory.
data = pd.read_csv('./movielens_sample.csv')

In [13]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009


In [14]:
# Categorical columns that are fed to the model as sparse (embedding) features.
sparse_features = [
    'user_id',
    'movie_id',
    'genres',
    'gender',
    'age',
    'occupation',
    'zip',
]

# Replace each categorical column, in place, with integer codes produced by a
# fresh LabelEncoder fitted on that column alone.
for column in sparse_features:
    data[column] = LabelEncoder().fit_transform(data[column])

In [15]:
# Preview again: the columns listed in sparse_features now hold integer codes
# instead of their original values.
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,107,12,4,968035345,Ed Wood (1994),43,0,2,4,35
1,123,169,3,966536874,Patriot Games (1992),25,1,1,4,118
2,12,6,4,976203603,"Bridges of Madison County, The (1995)",63,0,2,13,99
3,21,112,3,975430389,Indiana Jones and the Temple of Doom (1984),1,1,1,18,55
4,187,45,5,957782527,"Apartment, The (1960)",43,1,5,19,41


In [18]:
# One SparseFeat per categorical column: the vocabulary size is the number of
# distinct codes in that column, and each feature gets a 4-dimensional embedding.
fixlen_feature_columns = [
    SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
    for name in sparse_features
]

In [19]:
# Display the SparseFeat definitions built for the model.
fixlen_feature_columns

[SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f8510902750>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f8511715dd0>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='genres', vocabulary_size=79, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f8510a940d0>, embedding_name='genres', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.init

In [22]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and therefore every downstream metric) reproducible across kernel restarts;
# without it each run shuffles the rows differently.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# The model consumes a dict mapping each feature name to a 1-D array of codes.
train_model_input = {name: train[name].values for name in sparse_features}
# NOTE: the misspelled name is kept as-is because the prediction cell refers to it.
test_model_inpput = {name: test[name].values for name in sparse_features}

In [43]:
# Wide & Deep model for rating regression; the same feature columns feed both
# the linear (wide) part and the DNN (deep) part.
model = WDL(
    linear_feature_columns=fixlen_feature_columns,
    dnn_feature_columns=fixlen_feature_columns,
    task='regression',
)

# Adam optimizer with mean squared error as both the loss and the reported metric.
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Train for 15 epochs, holding out 20% of the training rows for validation;
# verbose=2 prints one summary line per epoch.
history = model.fit(
    x=train_model_input,
    y=train['rating'].values,
    epochs=15,
    validation_split=0.2,
    verbose=2,
)


Epoch 1/15
4/4 - 0s - loss: 13.6588 - mse: 13.6588 - val_loss: 15.4258 - val_mse: 15.4258
Epoch 2/15
4/4 - 0s - loss: 13.0078 - mse: 13.0078 - val_loss: 14.6888 - val_mse: 14.6888
Epoch 3/15
4/4 - 0s - loss: 12.2070 - mse: 12.2070 - val_loss: 13.7579 - val_mse: 13.7579
Epoch 4/15
4/4 - 0s - loss: 11.1827 - mse: 11.1827 - val_loss: 12.5871 - val_mse: 12.5871
Epoch 5/15
4/4 - 0s - loss: 9.8993 - mse: 9.8993 - val_loss: 11.1344 - val_mse: 11.1344
Epoch 6/15
4/4 - 0s - loss: 8.3841 - mse: 8.3841 - val_loss: 9.3807 - val_mse: 9.3807
Epoch 7/15
4/4 - 0s - loss: 6.5263 - mse: 6.5263 - val_loss: 7.3602 - val_mse: 7.3602
Epoch 8/15
4/4 - 0s - loss: 4.5855 - mse: 4.5855 - val_loss: 5.1909 - val_mse: 5.1909
Epoch 9/15
4/4 - 0s - loss: 2.6930 - mse: 2.6930 - val_loss: 3.1665 - val_mse: 3.1665
Epoch 10/15
4/4 - 0s - loss: 1.4326 - mse: 1.4325 - val_loss: 1.6974 - val_mse: 1.6973
Epoch 11/15
4/4 - 0s - loss: 1.0143 - mse: 1.0143 - val_loss: 1.0937 - val_mse: 1.0937
Epoch 12/15
4/4 - 0s - loss: 1.297

In [41]:
# Run the trained model on the held-out test inputs.
pred_ans = model.predict(test_model_inpput)

In [44]:
# Print each prediction next to the true rating of the same test row,
# separated by a tab.
true_ratings = test['rating']
for predicted_row, actual in zip(pred_ans, true_ratings):
    print(predicted_row[0], '\t', actual)

2.471647 	 1
2.6054475 	 1
2.3183064 	 3
2.6303968 	 5
2.8857474 	 3
2.3274262 	 2
2.8408983 	 4
2.3871212 	 5
2.3965783 	 4
2.5380304 	 4
2.7082858 	 4
2.7082698 	 5
2.5533893 	 3
2.4517272 	 3
2.8611135 	 4
3.324551 	 5
2.92032 	 1
2.6374516 	 3
2.7672143 	 4
2.471625 	 4
3.5648694 	 4
2.7082913 	 5
2.4306126 	 3
3.1205094 	 5
2.793662 	 5
3.0300746 	 2
2.8875585 	 5
2.410688 	 5
2.910089 	 2
2.644645 	 4
2.3943613 	 1
2.951304 	 3
2.6461034 	 1
2.708271 	 4
3.1370192 	 4
2.5138237 	 3
2.5666418 	 4
2.5946863 	 5
2.7083027 	 2
2.6809695 	 3


In [45]:
# Select the rows whose index labels are 1 and 2.
data.loc[[1,2]]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
1,123,169,3,966536874,Patriot Games (1992),25,1,1,4,118
2,12,6,4,976203603,"Bridges of Madison County, The (1995)",63,0,2,13,99


In [46]:
# Built-in round() with a single argument returns the nearest integer (2 here).
round(2.4)

2