In [143]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from deepctr.models import WDL
from deepctr.feature_column import SparseFeat,get_feature_names
import random
import numpy as np
import math

In [9]:
data = pd.read_csv('./u.data',header=None,sep='\t')

In [11]:
data.columns=['user_id','item_id','rating','timestamp']

In [17]:
data = data.sort_values(['user_id','item_id','rating'])

In [15]:
user = pd.read_csv('./u.user',header=None,sep='|')

In [18]:
user.columns=['user_id','age','gender','occupation','zip']

In [27]:
user

Unnamed: 0,user_id,age,gender,occupation,zip
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [28]:
data

Unnamed: 0,user_id,item_id,rating,timestamp
32236,1,1,5,874965758
23171,1,2,3,876893171
83307,1,3,4,878542960
62631,1,4,3,876893119
47638,1,5,3,889751712
...,...,...,...,...
68857,943,1067,2,875501756
74200,943,1074,4,888640250
78704,943,1188,3,888640250
86600,943,1228,3,888640275


In [31]:
data.join(user,on='user_id')

ValueError: columns overlap but no suffix specified: Index(['user_id'], dtype='object')

In [32]:
data.join(user,on='user_id',lsuffix='_caller', rsuffix='_other')

Unnamed: 0,user_id_caller,item_id,rating,timestamp,user_id_other,age,gender,occupation,zip
32236,1,1,5,874965758,2.0,53.0,F,other,94043
23171,1,2,3,876893171,2.0,53.0,F,other,94043
83307,1,3,4,878542960,2.0,53.0,F,other,94043
62631,1,4,3,876893119,2.0,53.0,F,other,94043
47638,1,5,3,889751712,2.0,53.0,F,other,94043
...,...,...,...,...,...,...,...,...,...
68857,943,1067,2,875501756,,,,,
74200,943,1074,4,888640250,,,,,
78704,943,1188,3,888640250,,,,,
86600,943,1228,3,888640275,,,,,


In [47]:
user_c = user.set_index('user_id')
data_c = data.set_index('user_id')
pd.concat([data_c,user_c],axis=1)

Unnamed: 0_level_0,item_id,rating,timestamp,age,gender,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,5,874965758,24,M,technician,85711
1,2,3,876893171,24,M,technician,85711
1,3,4,878542960,24,M,technician,85711
1,4,3,876893119,24,M,technician,85711
1,5,3,889751712,24,M,technician,85711
...,...,...,...,...,...,...,...
943,1067,2,875501756,22,M,student,77841
943,1074,4,888640250,22,M,student,77841
943,1188,3,888640250,22,M,student,77841
943,1228,3,888640275,22,M,student,77841


In [56]:
data_all = data.merge(user)
print(data_all['user_id'].nunique())
data_all.head()

942


Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation,zip
0,1,0,4,888550871,24,M,technician,85711
1,1,9,2,888551853,24,M,technician,85711
2,1,12,4,888551922,24,M,technician,85711
3,1,13,4,888551853,24,M,technician,85711
4,1,18,3,888550871,24,M,technician,85711


In [57]:
feature_name = ['user_id','item_id','age','gender','occupation','zip']
for feat in feature_name :
    lab = LabelEncoder()
    data_all[feat] = lab.fit_transform(data_all[feat])

In [59]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data_all[feat].nunique(),embedding_dim=4) for feat in feature_name]


In [64]:
train,test = train_test_split(data_all,test_size=0.2)
train_model_input = {name:train[name].values for name in feature_name}
test_model_input = {name:test[name].values for name in feature_name}
model = WDL(fixlen_feature_columns,fixlen_feature_columns,task='regression')
model.compile('adam','mse',metrics=['mse'])
history = model.fit(train_model_input,train['rating'].values,epochs=10,validation_split=0.2,verbose=2)

Epoch 1/10
1995/1995 - 3s - loss: 1.1616 - mse: 1.1614 - val_loss: 0.9070 - val_mse: 0.9067
Epoch 2/10
1995/1995 - 2s - loss: 0.9012 - mse: 0.9008 - val_loss: 0.8940 - val_mse: 0.8936
Epoch 3/10
1995/1995 - 2s - loss: 0.8738 - mse: 0.8732 - val_loss: 0.8868 - val_mse: 0.8860
Epoch 4/10
1995/1995 - 2s - loss: 0.8535 - mse: 0.8526 - val_loss: 0.8912 - val_mse: 0.8901
Epoch 5/10
1995/1995 - 2s - loss: 0.8379 - mse: 0.8367 - val_loss: 0.8678 - val_mse: 0.8665
Epoch 6/10
1995/1995 - 2s - loss: 0.8236 - mse: 0.8221 - val_loss: 0.8897 - val_mse: 0.8881
Epoch 7/10
1995/1995 - 2s - loss: 0.8081 - mse: 0.8062 - val_loss: 0.8889 - val_mse: 0.8869
Epoch 8/10
1995/1995 - 2s - loss: 0.7933 - mse: 0.7912 - val_loss: 0.8651 - val_mse: 0.8628
Epoch 9/10
1995/1995 - 2s - loss: 0.7765 - mse: 0.7740 - val_loss: 0.8647 - val_mse: 0.8621
Epoch 10/10
1995/1995 - 2s - loss: 0.7612 - mse: 0.7584 - val_loss: 0.8657 - val_mse: 0.8627


In [149]:
train,test = train_test_split(data_all,test_size=0.2)

In [153]:
for i in range(1,16):
    print(i)
    train_model_input = {name:train[name].values for name in feature_name}
    test_model_input = {name:test[name].values for name in feature_name}
    model = WDL(fixlen_feature_columns,fixlen_feature_columns,task='regression')
    model.compile('adam','mse',metrics=['mse'])
    history = model.fit(train_model_input,train['rating'].values,epochs=i,validation_split=0.2,verbose=2)
    pred_ans = model.predict(test_model_input)
    rating_devition = 0 
    for pred,real in zip(pred_ans,test['rating'].values):
        rating_devition += abs(pred - real)
    print(rating_devition)
    print('-----------------')

1
1995/1995 - 3s - loss: 1.1508 - mse: 1.1506 - val_loss: 0.9488 - val_mse: 0.9486
[15204.914]
-----------------
2
Epoch 1/2
1995/1995 - 3s - loss: 1.1380 - mse: 1.1379 - val_loss: 0.9138 - val_mse: 0.9136
Epoch 2/2
1995/1995 - 2s - loss: 0.8921 - mse: 0.8917 - val_loss: 0.8976 - val_mse: 0.8971
[15006.773]
-----------------
3
Epoch 1/3
1995/1995 - 3s - loss: 1.1404 - mse: 1.1403 - val_loss: 0.9470 - val_mse: 0.9468
Epoch 2/3
1995/1995 - 2s - loss: 0.8965 - mse: 0.8961 - val_loss: 0.9054 - val_mse: 0.9050
Epoch 3/3
1995/1995 - 2s - loss: 0.8664 - mse: 0.8658 - val_loss: 0.8914 - val_mse: 0.8907
[14964.966]
-----------------
4
Epoch 1/4
1995/1995 - 3s - loss: 1.1438 - mse: 1.1436 - val_loss: 0.9282 - val_mse: 0.9279
Epoch 2/4
1995/1995 - 2s - loss: 0.8946 - mse: 0.8942 - val_loss: 0.8921 - val_mse: 0.8916
Epoch 3/4
1995/1995 - 2s - loss: 0.8635 - mse: 0.8629 - val_loss: 0.8894 - val_mse: 0.8887
Epoch 4/4
1995/1995 - 2s - loss: 0.8470 - mse: 0.8461 - val_loss: 0.8843 - val_mse: 0.8833
[1

In [138]:
def train_test(data,size=0.2):
    #按照用户切割
    train_index = []
    test_index = []
    random.seed(666)
    for i in range(942):
        index =list(data[data['user_id']==i].index.values) 
        random.shuffle(index)
        length = round(len(index)*(1-size))
        train_index +=index[:length]
        test_index +=index[length:]
    return data.loc[train_index],data.loc[test_index]

In [155]:
train,test = train_test(data_all,size=0.2)

In [141]:

train_model_input = {name:train[name].values for name in feature_name}
test_model_input = {name:test[name].values for name in feature_name}
model = WDL(fixlen_feature_columns,fixlen_feature_columns,task='regression')
model.compile('adam','mse',metrics=['mse'])
history = model.fit(train_model_input,train['rating'].values,epochs=50,validation_split=0.2,verbose=2)
pred_ans = model.predict(test_model_input)

Epoch 1/50
1995/1995 - 3s - loss: 1.1452 - mse: 1.1451 - val_loss: 1.0302 - val_mse: 1.0299
Epoch 2/50
1995/1995 - 2s - loss: 0.8954 - mse: 0.8951 - val_loss: 1.0173 - val_mse: 1.0169
Epoch 3/50
1995/1995 - 2s - loss: 0.8703 - mse: 0.8698 - val_loss: 1.0247 - val_mse: 1.0241
Epoch 4/50
1995/1995 - 2s - loss: 0.8534 - mse: 0.8526 - val_loss: 1.0406 - val_mse: 1.0397
Epoch 5/50
1995/1995 - 2s - loss: 0.8396 - mse: 0.8385 - val_loss: 1.0318 - val_mse: 1.0305
Epoch 6/50
1995/1995 - 2s - loss: 0.8224 - mse: 0.8211 - val_loss: 1.0800 - val_mse: 1.0785
Epoch 7/50
1995/1995 - 2s - loss: 0.8005 - mse: 0.7989 - val_loss: 1.0324 - val_mse: 1.0306
Epoch 8/50
1995/1995 - 2s - loss: 0.7805 - mse: 0.7786 - val_loss: 1.0949 - val_mse: 1.0928
Epoch 9/50
1995/1995 - 2s - loss: 0.7608 - mse: 0.7586 - val_loss: 1.0680 - val_mse: 1.0656
Epoch 10/50
1995/1995 - 2s - loss: 0.7477 - mse: 0.7452 - val_loss: 1.0592 - val_mse: 1.0566
Epoch 11/50
1995/1995 - 2s - loss: 0.7352 - mse: 0.7325 - val_loss: 1.0782 - va

In [157]:
for i in range(1,10):
    print(i)
    train_model_input = {name:train[name].values for name in feature_name}
    test_model_input = {name:test[name].values for name in feature_name}
    model = WDL(fixlen_feature_columns,fixlen_feature_columns,task='regression')
    model.compile('adam','mse',metrics=['mse'])
    history = model.fit(train_model_input,train['rating'].values,epochs=i,validation_split=0.2,verbose=2)
    pred_ans = model.predict(test_model_input)
    rating_devition = 0 
    for pred,real in zip(pred_ans,test['rating'].values):
        rating_devition += abs(pred - real)
    print(rating_devition)
    print('-----------------')

1
1995/1995 - 3s - loss: 1.1636 - mse: 1.1635 - val_loss: 1.0450 - val_mse: 1.0448
[15389.212]
-----------------
2
Epoch 1/2
1995/1995 - 3s - loss: 1.1543 - mse: 1.1542 - val_loss: 1.0199 - val_mse: 1.0197
Epoch 2/2
1995/1995 - 2s - loss: 0.9031 - mse: 0.9028 - val_loss: 1.0426 - val_mse: 1.0422
[15491.081]
-----------------
3
Epoch 1/3
1995/1995 - 3s - loss: 1.1484 - mse: 1.1483 - val_loss: 1.0255 - val_mse: 1.0253
Epoch 2/3
1995/1995 - 3s - loss: 0.9055 - mse: 0.9051 - val_loss: 1.0032 - val_mse: 1.0028
Epoch 3/3
1995/1995 - 2s - loss: 0.8798 - mse: 0.8792 - val_loss: 1.0197 - val_mse: 1.0190
[14874.8545]
-----------------
4
Epoch 1/4
1995/1995 - 3s - loss: 1.1423 - mse: 1.1422 - val_loss: 1.0354 - val_mse: 1.0352
Epoch 2/4
1995/1995 - 2s - loss: 0.9052 - mse: 0.9049 - val_loss: 1.0199 - val_mse: 1.0195
Epoch 3/4
1995/1995 - 2s - loss: 0.8787 - mse: 0.8782 - val_loss: 1.0283 - val_mse: 1.0276
Epoch 4/4
1995/1995 - 2s - loss: 0.8613 - mse: 0.8605 - val_loss: 1.0460 - val_mse: 1.0450
[

In [98]:
a =[]

[1, 2, 2, 3, 2]

In [119]:
a += list(range(50))
a

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49]

In [108]:
a

[1, 2, 3]