- https://www.kaggle.com/code/leejunseok97/deepfm-deepctr-torch

In [1]:
import os, random, gc
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
dtype={'id': np.dtype(int), # personal_id
    'click': np.dtype(int), # target
    'hour': np.dtype(str), # datetime
    'C1': np.dtype(str), # anonymized categorical variable
    'banner_pos': np.dtype(str), # showing banner
    'site_id': np.dtype(str), 
    'site_domain': np.dtype(str), 
    'site_category': np.dtype(str), 
    'app_id': np.dtype(str), 
    'app_domain': np.dtype(str), 
    'app_category': np.dtype(str), 
    'device_id': np.dtype(str), 
    'device_ip': np.dtype(str), 
    'device_model': np.dtype(str),
    'device_type': np.dtype(str),
    'device_conn_type': np.dtype(str),
    'C14': np.dtype(str), # C14 ~ C21: anonymized categorical variables
    'C15': np.dtype(str),
    'C16': np.dtype(str),
    'C17': np.dtype(str),
    'C18': np.dtype(str),
    'C19': np.dtype(str),
    'C20': np.dtype(str),
    'C21':np.dtype(str)
      }
num_records = 40428967
sample_size = 50000
skip_values = sorted(random.sample(range(1,num_records), num_records - sample_size))
parse_date = lambda val : pd.datetime.strptime(val, '%y%m%d%H')

train = pd.read_csv("../data/avazu/train.gz", parse_dates=['hour'], date_parser=parse_date, dtype=dtype, skiprows=skip_values)

  parse_date = lambda val : pd.datetime.strptime(val, '%y%m%d%H')


In [3]:
test = pd.read_csv('../data/avazu/test.gz')

print('Train dataset:',train.shape)
print('Test dataset:',test.shape)

Train dataset: (50000, 24)
Test dataset: (4577464, 23)


In [8]:
train.set_index('hour',inplace=True)
test.set_index('hour',inplace=True)

In [4]:
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

In [9]:
data = train 

In [10]:
sparse_features = train.iloc[::,2:14].columns.values.tolist()
dense_features = train.iloc[::,15:].columns.values.tolist()
sparse_features.append('id')
target = ['click']

In [11]:
test.head()

Unnamed: 0_level_0,id,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14103100,1.000017e+19,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,8330,320,50,761,3,175,100075,23
14103100,1.000018e+19,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,22676,320,50,2616,0,35,100083,51
14103100,1.000055e+19,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,22676,320,50,2616,0,35,100083,51
14103100,1.000109e+19,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,a99f214a,...,1,0,18648,320,50,1092,3,809,100156,61
14103100,1.000138e+19,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,a99f214a,...,1,0,23160,320,50,2667,0,47,-1,221


In [12]:
test_sparse_features = test.iloc[::,0:14].columns.values.tolist()
test_dense_features = test.iloc[::,14:].columns.values.tolist()

In [13]:
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
scaler = MinMaxScaler(feature_range=(0,1))
encoder = LabelEncoder()

In [14]:
for feat in sparse_features:
    data[feat] = encoder.fit_transform(data[feat])
data[dense_features] = scaler.fit_transform(data[dense_features])

In [15]:
for feat in test_sparse_features:
    test[feat] = encoder.fit_transform(test[feat])
test[dense_features] = scaler.fit_transform(test[test_dense_features])

In [16]:
fixlen_feature_columns = [
  SparseFeat(feat,data[feat].nunique()) for feat in sparse_features] + [
  DenseFeat(feat,1,) for feat in dense_features
]

fixlen_feature_columns

[SparseFeat(name='C1', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C1', group_name='default_group'),
 SparseFeat(name='banner_pos', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='banner_pos', group_name='default_group'),
 SparseFeat(name='site_id', vocabulary_size=1160, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='site_id', group_name='default_group'),
 SparseFeat(name='site_domain', vocabulary_size=1006, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='site_domain', group_name='default_group'),
 SparseFeat(name='site_category', vocabulary_size=20, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='site_category', group_name='default_group'),
 SparseFeat(name='app_id', vocabulary_size=982, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='app_id', group_name='default_group'),
 SparseFeat(name='app_domain', vocabulary_size=70, embedding_dim=4, use_hash=

In [17]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

In [18]:
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['C1',
 'banner_pos',
 'site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
 'device_id',
 'device_ip',
 'device_model',
 'device_type',
 'id',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21']

In [19]:
train_model_input = {name: data[name]for name in feature_names}

In [20]:
test_model_input = {name: test[name] for name in feature_names}

In [21]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
model = DeepFM(linear_feature_columns,dnn_feature_columns,
              task='binary',device=device,dnn_dropout=0.7)
model

DeepFM(
  (embedding_dict): ModuleDict(
    (C1): Embedding(7, 4)
    (banner_pos): Embedding(7, 4)
    (site_id): Embedding(1160, 4)
    (site_domain): Embedding(1006, 4)
    (site_category): Embedding(20, 4)
    (app_id): Embedding(982, 4)
    (app_domain): Embedding(70, 4)
    (app_category): Embedding(20, 4)
    (device_id): Embedding(8531, 4)
    (device_ip): Embedding(41537, 4)
    (device_model): Embedding(2585, 4)
    (device_type): Embedding(4, 4)
    (id): Embedding(50000, 4)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (C1): Embedding(7, 1)
      (banner_pos): Embedding(7, 1)
      (site_id): Embedding(1160, 1)
      (site_domain): Embedding(1006, 1)
      (site_category): Embedding(20, 1)
      (app_id): Embedding(982, 1)
      (app_domain): Embedding(70, 1)
      (app_category): Embedding(20, 1)
      (device_id): Embedding(8531, 1)
      (device_ip): Embedding(41537, 1)
      (device_model): Embedding(2585, 1)
      (device_type): Embedding(4, 1)


In [23]:
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['binary_crossentropy','auc'])

In [24]:
%%time
history = model.fit(train_model_input, train[target].values, batch_size=1024, epochs=10, verbose=1,
                        validation_split=0.2)

cpu
Train on 40000 samples, validate on 10000 samples, 40 steps per epoch


1it [00:00,  8.20it/s]


RuntimeError: Output 0 of UnbindBackward0 is a view and its base or another view of its base has been modified inplace. This view is the output of a function that returns multiple views. Such functions do not allow the output views to be modified inplace. You should replace the inplace operation by an out-of-place one.