# 尝试建立模型提取特征

## 读取数据

In [121]:
import numpy as np
import pandas as pd
import pickle
import os

In [2]:
TRAIN_PATH = 'model_data/train_data.pkl'
TEST_PATH = 'model_data/test_data.pkl'

In [3]:
with open(TRAIN_PATH,'rb') as f:
    model_train_df = pickle.load(f)
with open(TEST_PATH,'rb') as f:
    model_test_df = pickle.load(f)

In [4]:
model_train_df[:4]

Unnamed: 0,buyer_admin_id,item_id,num_shop,num_item,pop_unpop_rate,item_price_sum,item_price_max,item_price_min,item_price_mean,item_price_median,num_item_cate,num_store_cate,shoptime_1,shoptime_2,shoptime_3,num_sell,cate_id,store_id,item_price,label
0,8362078,1,12,10,0.0,16655,8742,38,1387.916667,147.5,7,10,4.0,6.0,2.0,1,2324,10013,4501,1
1,8362078,3346056,12,10,0.0,16655,8742,38,1387.916667,147.5,7,10,4.0,6.0,2.0,28,1243,3185,168,1
2,2436524,3346056,21,21,0.238095,19744,9200,46,940.190476,497.0,9,17,0.0,20.0,1.0,28,1243,3185,168,1
3,2792675,3346056,25,25,0.12,48236,19339,47,1929.44,200.0,15,19,5.0,16.0,4.0,28,1243,3185,168,1


In [139]:
model_test_df[:3]

Unnamed: 0,buyer_admin_id,num_shop,num_item,pop_unpop_rate,item_price_sum,item_price_max,item_price_min,item_price_mean,item_price_median,num_item_cate,num_store_cate,shoptime_1,shoptime_2,shoptime_3,item_id,num_sell,cate_id,store_id,item_price,label
0,1061132,0.011552,0.068493,0.058824,0.001182,0.051224,0.003088,0.029665,0.021775,0.075,0.103175,0.004464,0.016484,0.029762,189045,1,1506,11599,237,1
1,2129504,0.004332,0.009132,0.0,0.000287,0.012819,0.028032,0.01786,0.015737,0.0,0.007937,0.010417,0.0,0.0,189045,1,1506,11599,237,1
2,2129504,0.004332,0.009132,0.0,0.000287,0.012819,0.028032,0.01786,0.015737,0.0,0.007937,0.010417,0.0,0.0,189045,1,1506,11599,237,1


In [138]:
model_train_df.shape

(12843064, 20)

## 创建Dataset

In [107]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

In [74]:
def data_norm(data_df):
    # 先整理列名方便后面检索，然后进行数据的归一化
    data_df = data_df[['buyer_admin_id',
                       'num_shop',  #购买次数，
                       'num_item',  #购买商品件数，
                       'pop_unpop_rate',  #热销商品与非热销商品件数的比重，
                       'item_price_sum',  # 商品总价格
                       'item_price_max',  #商品最大价格
                       'item_price_min',  #商品最小价格
                       'item_price_mean',  #商品价格均值
                       'item_price_median', #商品价格中值
                       'num_item_cate',  #商品类别数
                       'num_store_cate', #商店类别数
                       'shoptime_1',  #在0-8时间段购买商品数所占比重
                       'shoptime_2', #在8-16时间段...
                       'shoptime_3',  #在16-24时间段..
                       'item_id',
                       'num_sell',
                       'cate_id',
                       'store_id',
                       'item_price',
                       'label'
                        ]]
    data_df.iloc[:,1:14] = data_df.iloc[:,1:14].apply(lambda x : (x-np.min(x))/(np.max(x)-np.min(x))) # 数据相关归一化
    return data_df

In [150]:
test = np.array([1,2,3])

In [160]:
test.astype(np.float32)

array([1., 2., 3.], dtype=float32)

In [175]:
class AntaiDataset(Dataset):
    def __init__(self,data_df):
        self.data_df = data_df
        
    def __len__(self):
        return self.data_df.shape[0]
    
    def __getitem__(self, index):
        data_series = self.data_df.iloc[index]
        label = data_series['label']
        admin_features = data_series.iloc[:14].values
        item_features = data_series.iloc[14:-1].values
        return admin_features.astype(np.float32), item_features.astype(np.float32), np.array(label)

In [62]:
model_train_df  = data_norm(model_train_df)

In [75]:
model_test_df = data_norm(model_test_df)

In [65]:
model_train_df[:2]

Unnamed: 0,buyer_admin_id,num_shop,num_item,pop_unpop_rate,item_price_sum,item_price_max,item_price_min,item_price_mean,item_price_median,num_item_cate,num_store_cate,shoptime_1,shoptime_2,shoptime_3,item_id,num_sell,cate_id,store_id,item_price,label
0,8362078,0.000257,0.005498,0.0,0.000649,0.432074,0.001842,0.068651,0.007251,0.012658,0.007311,0.000116,0.002007,0.000116,1,1,2324,10013,4501,1
1,8362078,0.000257,0.005498,0.0,0.000649,0.432074,0.001842,0.068651,0.007251,0.012658,0.007311,0.000116,0.002007,0.000116,3346056,28,1243,3185,168,1


In [68]:
anti_trian_dataset[0][0]  # 用户特征信息

array([8.36207800e+06, 2.57309942e-04, 5.49786194e-03, 0.00000000e+00,
       6.48785394e-04, 4.32074352e-01, 1.84162063e-03, 6.86509435e-02,
       7.25068052e-03, 1.26582278e-02, 7.31112916e-03, 1.15938668e-04,
       2.00736032e-03, 1.15540150e-04])

In [94]:
anti_trian_dataset[0][0][0]

8362078.0

In [71]:
anti_trian_dataset[0][0].shape  # 用户特征数

(14,)

In [69]:
anti_trian_dataset[0][1] #商品特征信息

array([1.0000e+00, 1.0000e+00, 2.3240e+03, 1.0013e+04, 4.5010e+03])

In [72]:
anti_trian_dataset[0][1].shape #商品特征数

(5,)

In [70]:
anti_trian_dataset[0][2]  # 标签

array(1.)

In [76]:
model_test_df[:3]

Unnamed: 0,buyer_admin_id,num_shop,num_item,pop_unpop_rate,item_price_sum,item_price_max,item_price_min,item_price_mean,item_price_median,num_item_cate,num_store_cate,shoptime_1,shoptime_2,shoptime_3,item_id,num_sell,cate_id,store_id,item_price,label
0,1061132,0.011552,0.068493,0.058824,0.001182,0.051224,0.003088,0.029665,0.021775,0.075,0.103175,0.004464,0.016484,0.029762,189045,1,1506,11599,237,1
1,2129504,0.004332,0.009132,0.0,0.000287,0.012819,0.028032,0.01786,0.015737,0.0,0.007937,0.010417,0.0,0.0,189045,1,1506,11599,237,1
2,2129504,0.004332,0.009132,0.0,0.000287,0.012819,0.028032,0.01786,0.015737,0.0,0.007937,0.010417,0.0,0.0,189045,1,1506,11599,237,1


## 构建模型

模型参数

In [144]:
# 用户ID数
admin_id_max = model_train_df['buyer_admin_id'].max()
# admin_id_max =1000
# 商品ID数
item_id_max =  model_train_df['item_id'].max()
# item_id_max = 1000
# 商品卖出数
item_sellnum_max =  model_train_df['num_sell'].max()
# item_sellnum_max = 1000
# 商品种类数目
cate_id_max  = model_train_df['cate_id'].max()
# cate_id_max = 400
# 商店种类数
store_id_max = model_train_df['store_id'].max()
# store_id_max = 400
#商品价格
item_price_max = model_train_df['item_price'].max()
# item_price_max = 100
# embedding 维度
embed_dim = 64
# fc 大小
fc1_dim = 128
fc2_dim = 256

#admin features
admin_feature_dim = 13

模型初始化

In [112]:
import ipdb

In [182]:
class AntaiRSModel(nn.Module):
    def __init__(self):
        super(AntaiRSModel,self).__init__()
        torch.manual_seed(1) # 随机初始化embedding举证
        self.admin_id_embeds = nn.Embedding(admin_id_max, embed_dim) # 64
        self.item_id_embeds = nn.Embedding(item_id_max, embed_dim)
        self.item_sellnum_embeds = nn.Embedding(item_sellnum_max, embed_dim)
        self.cate_id_embeds = nn.Embedding(cate_id_max, embed_dim//2)
        self.store_id_embeds = nn.Embedding(store_id_max, embed_dim//2)
        self.item_price_embeds = nn.Embedding(item_price_max, embed_dim)
        # 用户dense 初始化
        self.uid_fc = nn.Linear(embed_dim, fc1_dim)
        self.admin_fc1 = nn.Linear(admin_feature_dim,fc1_dim)
        self.admin_fc2 = nn.Linear(fc1_dim*2,fc2_dim)
        # 商品dense 初始化
        self.itemid_fc = nn.Linear(embed_dim, fc1_dim)
        self.sellnum_fc = nn.Linear(embed_dim, fc1_dim)
        self.cateid_fc = nn.Linear(embed_dim//2, fc1_dim//2)
        self.storeid_fc = nn.Linear(embed_dim//2, fc1_dim//2)
        self.itemprice = nn.Linear(embed_dim, fc1_dim)
        self.item_fc = nn.Linear(fc1_dim*4, fc2_dim)
        
    def forward(self,admin, item): # bs x 14, bs x5
        # 用户信息embedding
#         admin = admin.float()
#         item = item.float()
#         ipdb.set_trace()
        uid_embed_layer = self.admin_id_embeds(admin[:,0].long()) # bs x 1x embed_dim => bs x 64
        # 用户dense
        uid_dense = self.uid_fc(uid_embed_layer) # bs  x128
        admin_dense = self.admin_fc1(admin[:,1:])  # bs x 128
        # 用户concat + dense
        admin_concat = torch.cat((uid_dense, admin_dense),dim=1) # bsx 256
        admin_concat_out = self.admin_fc2(admin_concat)  # bsx 256
        # 商品信息embedding
        itemid_embed_layer = self.item_id_embeds(item[:,0].long())  # bsx64
        sellnum_embed_layer = self.item_sellnum_embeds(item[:,1].long()) # bs x64
        cateid_embed_layer = self.cate_id_embeds(item[:,2].long()) # bs x32
        storeid_embed_layer = self.store_id_embeds(item[:,3].long()) # bs x32
        itemprice_embed_layer = self.item_price_embeds(item[:,4].long())
        # 商品dense
        itemid_dense = self.itemid_fc(itemid_embed_layer) # bsx128
        sellnum_dense = self.sellnum_fc(sellnum_embed_layer) # bsx  128
        cateid_dense = self.cateid_fc(cateid_embed_layer) # bs x64
        storeid_dense = self.storeid_fc(storeid_embed_layer)  # bs x 64
        itemprice_dense = self.itemprice(itemprice_embed_layer)  # bs  x128
        # 商品concat + dense
        item_concat = torch.cat((itemid_dense,sellnum_dense,cateid_dense,storeid_dense, itemprice_dense),dim=1)  # bs x 128*4
        item_concat_out = self.item_fc(item_concat)  # bs x 256
        out = torch.sigmoid(torch.sum(admin_concat_out*item_concat_out,1))    # bs x 1    
        return out

In [183]:
model = AntaiRSModel()

In [184]:
anti_trian_dataset = AntaiDataset(model_train_df)
data_loader = DataLoader(anti_trian_dataset,batch_size=32, num_workers=1)

In [148]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
device = torch.device('cpu')

In [185]:
model.to(device)
model.eval()
for i, batch_input in enumerate(data_loader):
    ipdb.set_trace()
    admin = batch_input[0].to(device)
    item = batch_input[1].to(device)
    model(admin, item)

> [0;32m<ipython-input-185-82bb04663f65>[0m(5)[0;36m<module>[0;34m()[0m
[0;32m      4 [0;31m    [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 5 [0;31m    [0madmin[0m [0;34m=[0m [0mbatch_input[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m.[0m[0mto[0m[0;34m([0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m    [0mitem[0m [0;34m=[0m [0mbatch_input[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m.[0m[0mto[0m[0;34m([0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> n
> [0;32m<ipython-input-185-82bb04663f65>[0m(6)[0;36m<module>[0;34m()[0m
[0;32m      5 [0;31m    [0madmin[0m [0;34m=[0m [0mbatch_input[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m.[0m[0mto[0m[0;34m([0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 6 [0;31m    [0mitem[0m [0;34m=[0m [0mbatch_input[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m.[0m

ipdb> n
> [0;32m<ipython-input-182-9f6b4f06e0ba>[0m(40)[0;36mforward[0;34m()[0m
[0;32m     39 [0;31m        [0mstoreid_embed_layer[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mstore_id_embeds[0m[0;34m([0m[0mitem[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m[0;36m3[0m[0;34m][0m[0;34m.[0m[0mlong[0m[0;34m([0m[0;34m)[0m[0;34m)[0m [0;31m# bs x32[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 40 [0;31m        [0mitemprice_embed_layer[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mitem_price_embeds[0m[0;34m([0m[0mitem[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m[0;36m4[0m[0;34m][0m[0;34m.[0m[0mlong[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     41 [0;31m        [0;31m# 商品dense[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> n
> [0;32m<ipython-input-182-9f6b4f06e0ba>[0m(42)[0;36mforward[0;34m()[0m
[0;32m     41 [0;31m        [0;31m# 商品dense[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 42 [0;31m        [0mi



> [0;32m<ipython-input-182-9f6b4f06e0ba>[0m(51)[0;36mforward[0;34m()[0m
[0;32m     49 [0;31m        [0mitem_concat_out[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mitem_fc[0m[0;34m([0m[0mitem_concat[0m[0;34m)[0m  [0;31m# bs x 256[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     50 [0;31m        [0mout[0m [0;34m=[0m [0mF[0m[0;34m.[0m[0msigmoid[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0msum[0m[0;34m([0m[0madmin_concat_out[0m[0;34m*[0m[0mitem_concat_out[0m[0;34m,[0m[0;36m1[0m[0;34m)[0m[0;34m)[0m    [0;31m# bs x 1[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 51 [0;31m        [0;32mreturn[0m [0mout[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> out.shape
torch.Size([32])
ipdb> out[:4]
tensor([0.4489, 0.0275, 0.5566, 0.2115], grad_fn=<SliceBackward>)
ipdb> tmp2 = torch.sigmoid(torch.sum(admin_concat_out*item_concat_out,1))
ipdb> tmp2[:4]
tensor([0.4489, 0.0275, 0.5566, 0.2115], grad_fn=<SliceBackward>)
ipdb> q


BdbQuit: 