In [1]:
! pip install deepctr

Collecting deepctr
  Downloading deepctr-0.7.5-py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 94 kB/s 
Installing collected packages: deepctr
Successfully installed deepctr-0.7.5


In [2]:
from sklearn.metrics import mean_squared_error
import numpy as np 
import pandas as pd 
import gc
import os
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat,get_feature_names



In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [4]:
# Kaggle input directory that holds the gzipped competition files.
file_path = '/kaggle/input/avazu-ctr-prediction/'
# Build the train / test / sample-submission paths in one pass.
fp_train, fp_test, fp_sample = (
    os.path.join(file_path, archive)
    for archive in ('train.gz', 'test.gz', 'sampleSubmission.gz')
)

In [5]:
# Seed for the row sampling below, so every run reads the same subset of rows.
SAMPLE_SEED = 2020
# Choose 30,000,000 line numbers out of 1..39,999,999 to skip while reading.
# Line 0 (the header) is never skipped, and neither is any line numbered
# 40,000,000 or higher, because the range stops there.
# A dedicated Random instance keeps the global `random` state untouched.
skip = sorted(random.Random(SAMPLE_SEED).sample(range(1, 40000000), 30000000))
df_train = pd.read_csv(fp_train, skiprows = skip)

In [6]:
# Read the whole test file (no row sampling is applied here).
df_test = pd.read_csv(fp_test)

In [7]:
def _hour_of_day(raw_hour):
    """Parse values in YYMMDDHH form and return only the hour component."""
    return pd.to_datetime(raw_hour, format='%y%m%d%H').dt.hour


# Overwrite the raw timestamp column with the hour of day in both frames.
df_train['hour'] = _hour_of_day(df_train['hour'])
df_test['hour'] = _hour_of_day(df_test['hour'])

In [8]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype covering their range.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64 and float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are left alone.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    The target dtype is chosen from the column's min/max only. For float
    columns this means a downcast to float16/float32 can lose precision even
    though every value is within range.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are tried from narrowest to widest; the first that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: use float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df



In [9]:
# Downcast both frames; the right-hand side is evaluated train first, then test.
df_train, df_test = reduce_mem_usage(df_train), reduce_mem_usage(df_test)

Mem. usage decreased to 994.58 Mb (47.9% reduction)
Mem. usage decreased to 432.18 Mb (46.2% reduction)


In [10]:
# Tag every row with the split it came from, then stack the two frames so
# they can be processed together.
df_train['label'], df_test['label'] = 'train', 'test'
df = pd.concat(objs=[df_train, df_test])

In [11]:
# The stacked frame holds all rows now; drop the per-split names and run a GC pass.
del df_train
del df_test
gc.collect()

40

In [12]:
# Columns treated as categorical model inputs, in the order they are processed.
sparse_features = [
    'hour',
    'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
] + [f'C{i}' for i in range(14, 22)]  # 'C14' .. 'C21'
# Label column, kept as a one-element list.
target = ['click']


In [13]:

# Integer-encode each categorical column in place and, in the same pass,
# record its number of distinct codes as the SparseFeat vocabulary size.
fixlen_feature_columns = []
for feature in sparse_features:
    lbe = LabelEncoder()
    df[feature] = lbe.fit_transform(df[feature])
    fixlen_feature_columns.append(SparseFeat(feature, df[feature].nunique()))
# The linear part and the DNN part use the very same column definitions.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
print(fixlen_feature_columns)
print(feature_names)

[SparseFeat(name='hour', vocabulary_size=24, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='hour', group_name='default_group'), SparseFeat(name='C1', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C1', group_name='default_group'), SparseFeat(name='banner_pos', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='banner_pos', group_name='default_group'), SparseFeat(name='site_id', vocabulary_size=4177, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='site_id', group_name='default_group'), SparseFeat(name='site_domain', vocabulary_size=5880, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='site_domain', group_name='default_group'), SparseFeat(name='site_category', vocabulary_size=26, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='site_category', group_name='default_group'), SparseFeat(name='app_id', vocabulary_size=7212, embedding_dim=4, use_hash=False, dtype=

In [14]:
# Separate the stacked frame again using the split marker column.
train = df.loc[df['label'].eq('train')]
test = df.loc[df['label'].eq('test')]

In [15]:
# Drop the combined frame now that train/test have been split out of it,
# then force a garbage-collection pass.
del df
gc.collect()

40

In [16]:
# Map each feature name to that column's underlying array, for both splits.
train_model_input, test_model_input = {}, {}
for name in feature_names:
    train_model_input[name] = train[name].values
    test_model_input[name] = test[name].values


In [17]:
# Train a DeepFM binary classifier on the encoded features.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
# binary_crossentropy is the log loss; it is used as the loss and reported as the metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)
# One epoch, with validation_split=0.2 holding out part of the data for validation.
history = model.fit(
    train_model_input,
    train[target].values,
    batch_size=256,
    epochs=1,
    verbose=True,
    validation_split=0.2,
)


Train on 8343173 samples, validate on 2085794 samples


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




In [18]:
# Predict on the test inputs with the trained DeepFM model
pred_ans = model.predict(test_model_input, batch_size=256)
# Report RMSE or MSE (left disabled)
# NOTE(review): presumably disabled because the test rows carry no 'click' labels — confirm
# mse = round(mean_squared_error(test[target].values, pred_ans), 4)
# rmse = mse ** 0.5
# print("test RMSE", rmse)


In [20]:
# Use the id column of the sample submission as the skeleton of the output file.
sample = pd.read_csv(fp_sample)
# .copy() makes `result` an independent frame rather than a slice of `sample`,
# so adding a column to it later does not raise SettingWithCopyWarning.
result = sample[['id']].copy()

In [21]:
# Store the first column of the prediction array as the predicted click value.
result['click'] = pred_ans[:,0]

In [23]:
# Write the submission without the row index; `index` is documented as a
# boolean, so pass False explicitly instead of relying on None being falsy.
result.to_csv('result_deepfm.csv', index=False)