# 尝试建立模型提取特征

## 读取数据

In [3]:
import numpy as np
import pandas as pd
import pickle
import os

In [276]:
TRAIN_PATH = 'model_data/all_train_data.pkl'
TEST_PATH = 'model_data/test_data.pkl'
# SUB_DATA_PATH = 'model_data/yulao_sub_data1.pkl'
SUB_DATA_PATH = 'model_data/yulao_sub_data2.pkl'

In [277]:
with open(TRAIN_PATH,'rb') as f:
    model_train_df = pickle.load(f)
with open(TEST_PATH,'rb') as f:
    model_test_df = pickle.load(f)
with open(SUB_DATA_PATH,'rb') as f:
    model_sub_df = pickle.load(f)

In [48]:
model_train_df[:4]

Unnamed: 0,buyer_admin_id,cate_id,item_id,item_price,item_price_max,item_price_mean,item_price_median,item_price_min,item_price_sum,label,num_item,num_item_cate,num_sell,num_shop,num_store_cate,pop_unpop_rate,shoptime_1,shoptime_2,shoptime_3,store_id
0,8362078,2324,1,4501,8742,1387.916667,147.5,38,16655,1,10,7,1.0,12,10,0.0,4.0,6.0,2.0,10013
1,8362078,1243,3346056,168,8742,1387.916667,147.5,38,16655,1,10,7,28.0,12,10,0.0,4.0,6.0,2.0,3185
2,2436524,1243,3346056,168,9200,940.190476,497.0,46,19744,1,21,9,28.0,21,17,0.238095,0.0,20.0,1.0,3185
3,2792675,1243,3346056,168,19339,1929.44,200.0,47,48236,1,25,15,28.0,25,19,0.12,5.0,16.0,4.0,3185


In [278]:
model_sub_df[:3]

Unnamed: 0,buyer_admin_id,item_id,num_shop,num_item,pop_unpop_rate,item_price_sum,item_price_max,item_price_min,item_price_mean,item_price_median,num_item_cate,num_store_cate,shoptime_1,shoptime_2,shoptime_3,num_sell,cate_id,store_id,item_price,label
0,152,8410857,7.0,7.0,0.0,14538.0,7479.0,200.0,2076.857143,1279.0,4.0,7.0,7.0,0.0,0.0,2,2686,6213,800,-1
1,152,7937154,7.0,7.0,0.0,14538.0,7479.0,200.0,2076.857143,1279.0,4.0,7.0,7.0,0.0,0.0,61,2686,40393,1301,-1
2,152,8472223,7.0,7.0,0.0,14538.0,7479.0,200.0,2076.857143,1279.0,4.0,7.0,7.0,0.0,0.0,6,2686,79168,1234,-1


In [8]:
model_train_df.shape

(23064775, 20)

In [66]:
model_sub_df.shape

(341378, 20)

## 创建Dataset

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

In [10]:
def data_norm(data_df):
    # 先整理列名方便后面检索，然后进行数据的归一化
    data_df = data_df[['buyer_admin_id',
                       'num_shop',  #购买次数，
                       'num_item',  #购买商品件数，
                       'pop_unpop_rate',  #热销商品与非热销商品件数的比重，
                       'item_price_sum',  # 商品总价格
                       'item_price_max',  #商品最大价格
                       'item_price_min',  #商品最小价格
                       'item_price_mean',  #商品价格均值
                       'item_price_median', #商品价格中值
                       'num_item_cate',  #商品类别数
                       'num_store_cate', #商店类别数
                       'shoptime_1',  #在0-8时间段购买商品数所占比重
                       'shoptime_2', #在8-16时间段...
                       'shoptime_3',  #在16-24时间段..
                       'item_id',
                       'num_sell',
                       'cate_id',
                       'store_id',
                       'item_price',
                       'label'
                        ]]
    data_df.iloc[:,1:14] = data_df.iloc[:,1:14].apply(lambda x : (x-np.min(x))/(np.max(x)-np.min(x))) # 数据相关归一化
    return data_df

In [11]:
class AntaiDataset(Dataset):
    def __init__(self,data_df):
        self.data_df = data_df
        
    def __len__(self):
        return self.data_df.shape[0]
    
    def __getitem__(self, index):
        data_series = self.data_df.iloc[index]
        label = data_series['label']
        admin_features = data_series.iloc[:14].values
        item_features = data_series.iloc[14:-1].values
        return admin_features.astype(np.float32), item_features.astype(np.float32), np.array(label)

## 构建模型

模型参数

In [67]:
# 用户ID数
admin_id_max = model_train_df['buyer_admin_id'].max()+1
print('用户ID数: %s'%admin_id_max)
item_id_max =  model_train_df['item_id'].max()+1
print('商品ID数: %s'%item_id_max)
# 商品卖出数
item_sellnum_max =  model_train_df['num_sell'].max()+1
print('商品卖出数: %s'%item_sellnum_max)
# 商品种类数目
cate_id_max  = model_train_df['cate_id'].max()+1
print('商品种类数目: %s'%cate_id_max)
# 商店种类数
store_id_max = model_train_df['store_id'].max()+1
print('商店种类数: %s'%store_id_max)
#商品价格
item_price_max = model_train_df['item_price'].max()+1
print('商品价格: %s'%item_price_max)
# embedding 维度
embed_dim = 64
# fc 大小
fc1_dim = 128
fc2_dim = 256

#admin features
admin_feature_dim = 13

用户ID数: 13046722
商品ID数: 13046735
商品卖出数: 112660
商品种类数目: 4244
商店种类数: 95106
商品价格: 20231


模型初始化

In [74]:
import config as cfg
import ipdb

In [75]:
class AntaiRSModel(nn.Module):
    def __init__(self):
        super(AntaiRSModel,self).__init__()
        torch.manual_seed(1) # 随机初始化embedding举证
#         print('init admin id embedding layer...')
#         self.admin_id_embeds = nn.Embedding(admin_id_max, cfg.EMBED_DIM) # 64
#         print('init item id embedding layer...')
#         self.item_id_embeds = nn.Embedding(item_id_max, cfg.EMBED_DIM)
        print('init item sellnum embedding layer...')
        self.item_sellnum_embeds = nn.Embedding(item_sellnum_max, cfg.EMBED_DIM)
        print('init cate id embedding layer...')
        self.cate_id_embeds = nn.Embedding(cate_id_max, cfg.EMBED_DIM)
        print('init store id embedding layer...')
        self.store_id_embeds = nn.Embedding(store_id_max, cfg.EMBED_DIM)
        print('init item price embedding layer...')
        self.item_price_embeds = nn.Embedding(item_price_max, cfg.EMBED_DIM)
        print('init dense layer...')
        # 用户dense 初始化
        self.uid_fc = nn.Linear(cfg.EMBED_DIM, cfg.FC1_DIM)
        self.admin_fc1 = nn.Linear(cfg.ADMIN_FEATURE_DIM,cfg.FC1_DIM)
        self.admin_fc2 = nn.Linear(cfg.FC1_DIM,cfg.FC2_DIM)
        # 商品dense 初始化
#         self.itemid_fc = nn.Linear(cfg.EMBED_DIM, cfg.FC1_DIM)
        self.sellnum_fc = nn.Linear(cfg.EMBED_DIM, cfg.FC1_DIM)
        self.cateid_fc = nn.Linear(cfg.EMBED_DIM, cfg.FC1_DIM)
        self.storeid_fc = nn.Linear(cfg.EMBED_DIM, cfg.FC1_DIM)
        self.itemprice = nn.Linear(cfg.EMBED_DIM, cfg.FC1_DIM)
        self.item_fc = nn.Linear(cfg.FC1_DIM*4, cfg.FC2_DIM)
        
    def forward(self,admin, item): # bs x 14, bs x5
        # 用户信息embedding
#         uid_embed_layer = self.admin_id_embeds(admin[:,0].long()) # bs x 1x embed_dim => bs x 64
#         # 用户dense
#         uid_dense = self.uid_fc(uid_embed_layer) # bs  x128
        admin_dense = self.admin_fc1(admin[:,1:])  # bs x 128
        # 用户concat + dense
#         admin_concat = torch.cat((uid_dense, admin_dense),dim=1) # bsx 256
        admin_concat_out = self.admin_fc2(admin_dense)  # bsx 256
        # 商品信息embedding
#         itemid_embed_layer = self.item_id_embeds(item[:,0].long())  # bsx64
        sellnum_embed_layer = self.item_sellnum_embeds(item[:,1].long()) # bs x64
        cateid_embed_layer = self.cate_id_embeds(item[:,2].long()) # bs x64
        storeid_embed_layer = self.store_id_embeds(item[:,3].long()) # bs x64
        itemprice_embed_layer = self.item_price_embeds(item[:,4].long())  # 64
        # 商品dense
        sellnum_dense = self.sellnum_fc(sellnum_embed_layer) # bsx  128
        cateid_dense = self.cateid_fc(cateid_embed_layer) # bs x128
        storeid_dense = self.storeid_fc(storeid_embed_layer)  # bs x 128
        itemprice_dense = self.itemprice(itemprice_embed_layer)  # bs  x128
        # 商品concat + dense
        item_concat = torch.cat([sellnum_dense,cateid_dense,storeid_dense, itemprice_dense],dim=1)  # bs x 128*4
        item_concat_out = self.item_fc(item_concat)  # bs x 256
        out = torch.sigmoid(torch.sum(admin_concat_out*item_concat_out,1))    # bs x 1    
        return out

## 预测最后结果

In [88]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
device = torch.device('cuda')

In [71]:
MODEL_PATH = '/users/qipccc/competitions/antai/src/stored_data/antaiv1_bs2048_lr0.001_20190730_07_28_32_21.9136/model/model_best'

In [81]:
model = AntaiRSModel()
model = nn.DataParallel(model) # 因为训练的时候进行了封装，所以读取模型的时候也需要进行封装后读取模型参数

init item sellnum embedding layer...
init cate id embedding layer...
init store id embedding layer...
init item price embedding layer...
init dense layer...


In [84]:
state = torch.load(MODEL_PATH)
model.load_state_dict(state)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [279]:
model_sub_df = data_norm(model_sub_df)
anti_sub_dataset = AntaiDataset(model_sub_df)
data_loader = DataLoader(anti_sub_dataset,batch_size=32, num_workers=32)

In [280]:
model.to(device)
model.eval()
all_pred = []
for i, batch_input in enumerate(data_loader):
#     ipdb.set_trace()
    admin = batch_input[0].to(device)
    item = batch_input[1].to(device)
    pred = model(admin, item)
    all_pred.extend(pred.cpu().detach().numpy())
    print('step : %d'%i, end='\r')

step : 10668

In [281]:
with open('sub_data/yulao_sub_pred_2.pkl', 'wb') as output_file: # 保存概率预测值
        pickle.dump(np.array(all_pred), output_file)

# 生成最后的submission文件

In [205]:
from tqdm import trange

In [282]:
yulao_sub = pd.read_csv('../submission.csv',header=None)

In [283]:
yulao_sub.shape

(11398, 31)

In [284]:
model_sub_df[:3]

Unnamed: 0,buyer_admin_id,num_shop,num_item,pop_unpop_rate,item_price_sum,item_price_max,item_price_min,item_price_mean,item_price_median,num_item_cate,num_store_cate,shoptime_1,shoptime_2,shoptime_3,item_id,num_sell,cate_id,store_id,item_price,label
0,152,0.004332,0.027397,0.0,0.002501,0.375258,0.023637,0.155643,0.08642,0.0375,0.047619,0.010417,0.0,0.0,8410857,2,2686,6213,800,-1
1,152,0.004332,0.027397,0.0,0.002501,0.375258,0.023637,0.155643,0.08642,0.0375,0.047619,0.010417,0.0,0.0,7937154,61,2686,40393,1301,-1
2,152,0.004332,0.027397,0.0,0.002501,0.375258,0.023637,0.155643,0.08642,0.0375,0.047619,0.010417,0.0,0.0,8472223,6,2686,79168,1234,-1


In [285]:
sub_df = pd.DataFrame(columns=['buyer_admin_id','item_id','pred'])
sub_df['buyer_admin_id'] = model_sub_df['buyer_admin_id']
sub_df['item_id'] = model_sub_df['item_id']
sub_df['pred'] = all_pred

In [286]:
yulao_sub = pd.read_csv('../submission.csv',header=None)
new_sub = yulao_sub.values
for i in trange(new_sub.shape[0]):
    admin_id = new_sub[i,0]
    single_admin = sub_df[sub_df['buyer_admin_id'] == admin_id]
    if len(single_admin) != 0:
        single_admin_item_pred = single_admin.sort_values('pred',ascending=False)['item_id'].to_list()  # 针对每一个admin得到其对应的可能购买的商品的排序
        if len(single_admin_item_pred) == 30:
            new_sub[i,1:] = single_admin_item_pred
        else:
            tmp_items = []
            k = 0
            for j in range(1,31):
                if new_sub[i,j] in single_admin_item_pred:
                    tmp_items.append(single_admin_item_pred[k])
                    k +=1 
                else:
                    tmp_items.append(new_sub[i,j])
            new_sub[i,1:] = tmp_items

100%|██████████| 11398/11398 [00:19<00:00, 589.60it/s]


In [287]:
new_sub[12]

array([    8464,  2093281,  5111816,  2686621, 11246931,  4569637,
         115517,  8632863,  4157502,   476783,  2076583,  8084213,
        8941084,  8673091, 11055796,  8994530,  4781216,  8673145,
        1160884,  2094230,    70466,  8797012,   992917,  4589918,
       10322913,  2914396,  9024875,   463417,  8241688,   747330,
         662430])

In [288]:
# admin商品信息不足30条id
admin_item_count = sub_df.groupby(['buyer_admin_id']).size().to_list()
no30admin_idx = np.where(np.array(admin_item_count) != 30)[0]
no30admin_id = sub_df['buyer_admin_id'].unique()[no30admin_idx]
no30admin_id[:10]
no30admin_idx[:10]

array([  7,  12,  21,  53,  60, 149, 153, 219, 227, 234])

In [252]:
single_admin = sub_df[sub_df['buyer_admin_id'] == 8464]
single_admin_item_pred = single_admin.sort_values('pred',ascending=False)['item_id'].to_list()  # 针对每一个admin得到其对应的可能购买的商品的排序
single_admin_item_pred

[2093281,
 5111816,
 11246931,
 4569637,
 115517,
 8632863,
 4157502,
 476783,
 2076583,
 8084213,
 8941084,
 11055796,
 8673091,
 4781216,
 8673145,
 2094230,
 70466,
 8797012,
 4589918,
 10322913,
 9024875,
 2686621,
 463417,
 8241688,
 2914396,
 747330,
 992917,
 1160884,
 662430]

经过检查应该没有问题

输出最后文件

In [289]:
new_submission = pd.DataFrame(new_sub)
new_submission.to_csv('net_submission2.csv',header=False, index=False)

In [264]:
new_submission.values[:,0]

array([     152,      282,      321, ..., 13044048, 13046354, 13046601])

In [267]:
# 对比原始文件查看是否能对应上
sub_src = pd.read_csv('../submission.csv',header=None)
sub_src.values[:,0]

In [269]:
for a,b in zip(new_submission.values[:,0],sub_src.values[:,0]):
    if a != b:
        print(1)