In [1]:
import pandas as pd
import numpy as np

from deepctr.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.inputs import  SparseFeat, DenseFeat,get_feature_names


DeepCTR version 0.7.4 detected. Your version is 0.7.3.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.7.4


In [2]:
# 数据加载
train = pd.read_csv('../../data/rs6-attrition-predict/train.csv')
test = test1 = pd.read_csv('../../data/rs6-attrition-predict/test.csv')
#print(train['Attrition'].value_counts())

# 设置标记位
test['Attrition']=-1
test = test[train.columns]
data = pd.concat([train, test])
# 处理Attrition字段
data['Attrition']=data['Attrition'].map(lambda x:1 if x=='Yes' else -1 if x==-1 else 0)
# 分类特征
cate = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
# 使用factorize将离散值对应为 index值
for feature in cate:
    data[feature] = pd.factorize(data[feature])[0]
#data.to_csv('temp.csv')
#print(data)

# 去掉没用的列 员工号码，标准工时（=80）
data = data.drop(['user_id', 'EmployeeNumber', 'StandardHours'], axis=1)
# 训练集测试集分离, 如果Attrition=-1 说明是测试集
train, test = data[data['Attrition']!=-1], data[data['Attrition']==-1]
#train = train.drop('Attrition', axis=1)
#print(train)
#print(len(train[train['Attrition']==1]),len(train[train['Attrition']==0]))

In [3]:
sparse_features = cate
# 除了分类特征以外，其余都是稠密类型
dense_features = list(set([i if i not in cate else '' for i in train.drop('Attrition', axis=1).columns]))
dense_features.remove('')
# 处理缺失值
train[sparse_features] = train[sparse_features].fillna('-1', )
train[dense_features] = train[dense_features].fillna(0, )
# 对离散特征进行标签编码
target = ['Attrition']
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
#print(data)

# 对数据进行0-1规划反
mms = MinMaxScaler(feature_range=(0, 1))
train[dense_features] = mms.fit_transform(train[dense_features])
test[dense_features] = mms.fit_transform(test[dense_features])

# 处理定长离散特征
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                          for feat in sparse_features] + [DenseFeat(feat, 1,)
                          for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
print(fixlen_feature_columns)

# 得到所有特证名
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

train_model_input = {name:train[name] for name in feature_names}
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )

history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=50, verbose=2, validation_split=0.2, )

# 对测试集进行预测
test_model_input = {name:test[name] for name in feature_names}
#print(test_model_input)
predict = model.predict(test_model_input, batch_size=256)
## 转化为二分类输出
test1['Attrition'] = predict
test1['Attrition']=test1['Attrition']#.map(lambda x:1 if x>=0.5 else 0)
# 使用user_id作为索引
test1.set_index(["user_id"], inplace=True)
test1[['Attrition']].to_csv('submit_deepfm.csv')

[SparseFeat(name='BusinessTravel', vocabulary_size=3, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='BusinessTravel', group_name='default_group'), SparseFeat(name='Department', vocabulary_size=3, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='Department', group_name='default_group'), SparseFeat(name='EducationField', vocabulary_size=6, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='EducationField', group_name='default_group'), SparseFeat(name='Gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='Gender', group_name='default_group'), SparseFeat(name='JobRole', vocabulary_size=9, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='JobRole', group_name='default_group'), SparseFeat(name='MaritalStatus', vocabulary_size=3, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='MaritalStatus', group_name='default_group'), SparseFeat(name='Over18', vocabulary_size=1, embedding_dim=4,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://p

































































































































Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor






Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


































































































































































































Train on 940 samples, validate on 236 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/50
940/940 - 1s - loss: 0.6907 - binary_crossentropy: 0.6907 - val_loss: 0.6096 - val_binary_crossentropy: 0.6095
Epoch 2/50
940/940 - 0s - loss: 0.5431 - binary_crossentropy: 0.5431 - val_loss: 0.5165 - val_binary_crossentropy: 0.5165
Epoch 3/50
940/940 - 0s - loss: 0.4508 - binary_crossentropy: 0.4508 - val_loss: 0.4807 - val_binary_crossentropy: 0.4807
Epoch 4/50
940/940 - 0s - loss: 0.4142 - binary_crossentropy: 0.4142 - val_loss: 0.4925 - val_binary_crossentropy: 0.4925
Epoch 5/50
940/940 - 0s - loss: 0.4155 - binary_crossentropy: 0.4155 - val_loss: 0.5112 - val_binary_crossentropy: 0.5112
Epoch 6/50
940/940 - 0s - loss: 0.4228 - binary_crossentropy: 0.4228 - val_loss: 0.5098 - val_binary_crossentropy: 0.5098
Epoch 7/50
940/940 - 0s - loss: 0.4147 - binary_crossentropy: 0.4147 - val_loss: 0.4915 - val_binary_crossentropy: 0.4915
Epoch 8/50
940/940 - 0s - loss: 0.4032 - binary_crossentropy: 0.4032 - val_loss: 0.4768 - val_binary_crossentropy: 0.4768
Epoch 9/50
940/940 - 0s 