In [10]:
import os
import yaml
from pathlib import Path
import pandas as pd
import torch
import torch.nn as nn
from typing import Dict

In [2]:
cur_path = Path.cwd()
# Keep this cell idempotent. On the first run the kernel starts in the
# notebook folder (models/<name>/) and must climb two levels to the repo root.
# On a re-run the kernel already sits in the repo root, recognised here by the
# config/ and models/ folders that later cells read from, so climbing again
# would leave the repository.
at_repo_root = (cur_path / "config").is_dir() and (cur_path / "models").is_dir()
work_dir = cur_path if at_repo_root else cur_path.parent.parent
print(f'change working dir from {cur_path} to {work_dir}')
os.chdir(work_dir)

change working dir from /Users/brian/Documents/git/RecJourney/models/MMOE to /Users/brian/Documents/git/RecJourney


# 阿里妈妈数据集
Download the dataset from https://tianchi.aliyun.com/dataset/147588

## 任务
本次比赛以阿里电商广告为研究对象，提供了淘宝平台的海量真实交易数据，参赛选手通过人工智能技术构建预测模型预估用户的购买意向，即给定广告点击相关的用户（user）、广告商品（ad）、检索词（query）、上下文内容（context）、商店（shop）等信息的条件下预测广告产生购买行为的概率（pCVR），形式化定义为：pCVR=P(conversion=1 | query, user, ad, context, shop)。

结合淘宝平台的业务场景和不同的流量特点，我们定义了以下两类挑战：

（1）日常的转化率预估

（2）特殊日期的转化率预估

## 数据说明

本次比赛为参赛选手提供了5类数据（基础数据、广告商品信息、用户信息、上下文信息和店铺信息）。基础数据表提供了搜索广告最基本的信息，以及“是否交易”的标记。广告商品信息、用户信息、上下文信息和店铺信息等4类数据，提供了对转化率预估可能有帮助的辅助信息。

### 基础数据

| 字段        | 解释                                                         |
| :---------- | :----------------------------------------------------------- |
| instance_id | 样本编号，Long                                               |
| is_trade    | 是否交易的标记位，Int类型；取值是0或者1，其中1 表示这条样本最终产生交易，0 表示没有交易 |
| item_id     | 广告商品编号，Long类型                                       |
| user_id     | 用户的编号，Long类型                                         |
| context_id  | 上下文信息的编号，Long类型                                   |
| shop_id     | 店铺的编号，Long类型                                         |

### 广告商品信息

| 字段                 | 解释                                                         |
| :------------------- | :----------------------------------------------------------- |
| item_id              | 广告商品编号，Long类型                                       |
| item_category_list   | 广告商品的类目列表，String类型；从根类目（最粗略的一级类目）向叶子类目（最精细的类目）依次排列，数据拼接格式为 "category_0;category_1;category_2"，其中 category_1 是 category_0 的子类目，category_2 是 category_1 的子类目 |
| item_property_list   | 广告商品的属性列表，String类型；数据拼接格式为 "property_0;property_1;property_2"，各个属性没有从属关系 |
| item_brand_id        | 广告商品的品牌编号，Long类型                                 |
| item_city_id         | 广告商品的城市编号，Long类型                                 |
| item_price_level     | 广告商品的价格等级，Int类型；取值从0开始，数值越大表示价格越高 |
| item_sales_level     | 广告商品的销量等级，Int类型；取值从0开始，数值越大表示销量越大 |
| item_collected_level | 广告商品被收藏次数的等级，Int类型；取值从0开始，数值越大表示被收藏次数越大 |
| item_pv_level        | 广告商品被展示次数的等级，Int类型；取值从0开始，数值越大表示被展示次数越大 |

### 用户信息

| 字段               | 解释                                                         |
| :----------------- | :----------------------------------------------------------- |
| user_id            | 用户的编号，Long类型                                         |
| user_gender_id     | 用户的预测性别编号，Int类型；0表示女性用户，1表示男性用户，2表示家庭用户 |
| user_age_level     | 用户的预测年龄等级，Int类型；数值越大表示年龄越大            |
| user_occupation_id | 用户的预测职业编号，Int类型                                  |
| user_star_level    | 用户的星级编号，Int类型；数值越大表示用户的星级越高          |

### 上下文信息

| 字段                      | 解释                                                         |
| :------------------------ | :----------------------------------------------------------- |
| context_id                | 上下文信息的编号，Long类型                                   |
| context_timestamp         | 广告商品的展示时间，Long类型；取值是以秒为单位的Unix时间戳，以1天为单位对时间戳进行了偏移 |
| context_page_id           | 广告商品的展示页面编号，Int类型；取值从1开始，依次增加；在一次搜索的展示结果中第一屏的编号为1，第二屏的编号为2 |
| predict_category_property | 根据查询词预测的类目属性列表，String类型；数据拼接格式为“category_A:property_A_1,property_A_2,property_A_3;category_B:-1;category_C:property_C_1,property_C_2” ，其中 category_A、category_B、category_C 是预测的三个类目；property_B 取值为-1，表示预测的第二个类目 category_B 没有对应的预测属性 |

### 店铺信息

| 字段                      | 解释                                                         |
| :------------------------ | :----------------------------------------------------------- |
| shop_id                   | 店铺的编号，Long类型                                         |
| shop_review_num_level     | 店铺的评价数量等级，Int类型；取值从0开始，数值越大表示评价数量越多 |
| shop_review_positive_rate | 店铺的好评率，Double类型；取值在0到1之间，数值越大表示好评率越高 |
| shop_star_level           | 店铺的星级编号，Int类型；取值从0开始，数值越大表示店铺的星级越高 |
| shop_score_service        | 店铺的服务态度评分，Double类型；取值在0到1之间，数值越大表示评分越高 |
| shop_score_delivery       | 店铺的物流服务评分，Double类型；取值在0到1之间，数值越大表示评分越高 |
| shop_score_description    | 店铺的描述相符评分，Double类型；取值在0到1之间，数值越大表示评分越高 |

在上述各张数据表中，绝大部分样本包含了完整的字段数据，也有少部分样本缺乏特定字段的数据。如果一条样本的某个字段为“-1”，表示这个样本的对应字段缺乏数据。


In [3]:
# Path of the raw training file, resolved against the repository root.
# (Despite its name, `data_dir` points at a file, not a directory.)
data_dir = work_dir.joinpath(
    "data",
    "round1_ijcai_18_train",
    "round1_ijcai_18_train_20180301.txt",
)
# Number of rows per chunk when the file is streamed.
chunk_size = 10000
print(f"data dir:{data_dir}")

data dir:/Users/brian/Documents/git/RecJourney/data/round1_ijcai_18_train/round1_ijcai_18_train_20180301.txt


In [4]:
# Parse the dataset configuration into plain Python objects.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(work_dir / "config" / "ijcai18.yml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

In [5]:
# NOTE(review): this project-local import sits below the chdir cell, presumably
# because `preprocess` is only importable once cwd is the repo root — confirm.
from preprocess.dtypes import DataConfig
# Expand the parsed YAML mapping into keyword arguments of the typed config object.
data_cfg = DataConfig(**cfg)
print(data_cfg)

DataConfig(train_dir='data/round1_ijcai_18_train', test_dir='data/round1_ijcai_18_test_a', data_columns=['instance_id', 'item_id', 'item_category_list', 'item_property_list', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id', 'context_timestamp', 'context_page_id', 'predict_category_property', 'shop_id', 'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description', 'is_trade'], pipelines=[pipeline(col_in='user_id', col_out='user_id_emb', fillna=0, source='user', feature_type='sparse', ops=[functools.partial(<function str_hash at 0x30081bd00>, num_embeddings=220000)], dtype='int', tensor_type='int', num_embeddings=220000), pipeline(col_in='item_id', col_out='item_id_emb', fillna=0, source='item', feature_type='sparse', ops=[functools.partial(<f

In [6]:
# Prediction target: 1 if the sample ended in a trade, 0 otherwise.
label_column = "is_trade"

# Identifier columns of the base table.
id_columns = ["instance_id", "item_id", "user_id", "context_id", "shop_id"]

# Attributes of the advertised item.
item_columns = [
    "item_category_list", "item_property_list",
    "item_brand_id", "item_city_id",
    "item_price_level", "item_sales_level",
    "item_collected_level", "item_pv_level",
]

# Attributes of the user.
user_columns = [
    "user_gender_id", "user_age_level",
    "user_occupation_id", "user_star_level",
]

# Attributes of the search context.
context_columns = [
    "context_timestamp", "context_page_id",
    "predict_category_property",
]

# Attributes of the shop.
shop_columns = [
    "shop_review_num_level", "shop_review_positive_rate",
    "shop_star_level", "shop_score_service",
    "shop_score_delivery", "shop_score_description",
]


In [7]:
# With `chunksize` set, read_csv returns a lazy chunk reader instead of a
# DataFrame, so `df` is an iterator that yields DataFrames of up to
# `chunk_size` rows.
df = pd.read_csv(data_dir, sep=' ', chunksize=chunk_size)
# Pull the first chunk for inspection. This advances the reader, so any later
# iteration over `df` resumes from the second chunk.
chunk = df.get_chunk(chunk_size)
print(chunk.columns)
# Bare last expression: rendered as the cell's output.
chunk

Index(['instance_id', 'item_id', 'item_category_list', 'item_property_list',
       'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level',
       'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id',
       'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id',
       'context_timestamp', 'context_page_id', 'predict_category_property',
       'shop_id', 'shop_review_num_level', 'shop_review_positive_rate',
       'shop_star_level', 'shop_score_service', 'shop_score_delivery',
       'shop_score_description', 'is_trade'],
      dtype='object')


Unnamed: 0,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,...,context_page_id,predict_category_property,shop_id,shop_review_num_level,shop_review_positive_rate,shop_star_level,shop_score_service,shop_score_delivery,shop_score_description,is_trade
0,108641074714126964,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4006,5799347067982556520:-1;509660095530134768:-1;5...,6765930309048922341,4,1.0,5002,1.000000,1.000000,1.000000,0
1,5754713551599725161,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4001,5799347067982556520:9172976955054793469;790838...,6765930309048922341,4,1.0,5002,1.000000,1.000000,1.000000,0
2,842679481291040981,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4001,5799347067982556520:5131280576272319091;725801...,6765930309048922341,4,1.0,5002,1.000000,1.000000,1.000000,0
3,937088850059189027,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4016,509660095530134768:-1;5799347067982556520:-1;7...,6765930309048922341,4,1.0,5002,1.000000,1.000000,1.000000,0
4,7975697065017708072,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4001,5799347067982556520:9172976955054793469;790838...,6765930309048922341,4,1.0,5002,1.000000,1.000000,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,5914623189257523594,1502399167133432846,7908382889764677758;8277336076276184272,5977512434884267894;2072967855524022579;513128...,7024713306861377790,7534238860363577544,7,11,11,17,...,4003,"8277336076276184272:2636395404473730413,820214...",4954894036019286224,17,1.0,5015,0.973817,0.971494,0.985987,1
9996,5082808267147521726,2723541945574109807,7908382889764677758;509660095530134768,2072967855524022579;5131280576272319091;263639...,7024713306861377790,7534238860363577544,7,15,16,20,...,4001,509660095530134768:-1;2011981573061447208:-1;5...,4954894036019286224,17,1.0,5015,0.973817,0.971494,0.985987,0
9997,5016137884844600866,2723541945574109807,7908382889764677758;509660095530134768,2072967855524022579;5131280576272319091;263639...,7024713306861377790,7534238860363577544,7,15,16,20,...,4001,"509660095530134768:9148482949976129397,2636395...",4954894036019286224,17,1.0,5015,0.973817,0.971494,0.985987,0
9998,6811589004997751056,4399478434881285132,7908382889764677758;5755694407684602296,2072967855524022579;5131280576272319091;263639...,7024713306861377790,7534238860363577544,6,12,13,17,...,4014,5755694407684602296:2636395404473730413;201198...,4954894036019286224,17,1.0,5015,0.973817,0.971494,0.985987,0


In [8]:
# Convert the next chunk of the reader into a dict of tensors, one entry per
# configured pipeline (keyed by the pipeline's output column name).
for chunk_idx, chunk in enumerate(df):
    tensor_dict = dict()
    for feat_idx, pipe in enumerate(data_cfg.pipelines):
        # Series.fillna returns a new Series; re-bind it, otherwise the missing
        # values survive and reach the ops below.
        x = chunk[pipe.col_in].fillna(pipe.fillna)
        for op in pipe.ops:
            # Apply the pipeline's operations element-wise, in order.
            x = x.apply(op)
        tensor = torch.tensor(x.to_list())
        tensor_dict[pipe.col_out] = tensor
    # Only one chunk is needed for this demonstration.
    break

In [11]:
from preprocess.emb import build_emb_dict, cal_feat_dim
from models.MMOE.mmoe import SharedBottomModel

# ---- hyper-parameters ----
emb_dim = 8                 # forwarded as emb_dim to build_emb_dict / cal_feat_dim
dim_out = 64
dim_hidden = [32, 32]
dims = [dim_out] * 2 + [1]  # i.e. [dim_out, dim_out, 1]
task_num = 1
drop_out = 0.1

# ---- embeddings and input width derived from the data config ----
emb_dict = build_emb_dict(data_cfg, emb_dim=emb_dim)
feat_dims = cal_feat_dim(data_cfg, emb_dim=emb_dim)

# ---- base model ----
base_model = SharedBottomModel(
    dropout=drop_out,
    task_num=task_num,
    dims=dims,
    dim_hidden=dim_hidden,
    dim_out=dim_out,
    dim_in=feat_dims,
)


class Trainer(nn.Module):
    """Stream the training data in chunks and fit ``base_model`` on it.

    Every pipeline in ``cfg.pipelines`` turns one raw column into a tensor;
    sparse features are additionally looked up in ``emb_dict``. The per-feature
    tensors are concatenated and passed to ``base_model``, which is optimised
    with Adam (lr=0.001) against ``nn.BCELoss``.

    Args:
        cfg: Data configuration. Attributes read here: ``pipelines``,
            ``train_dir`` and, when present, ``sep``.
        emb_dict: Embedding modules keyed by each sparse pipeline's ``col_out``.
        base_model: Model applied to the concatenated feature matrix.
        epochs: Number of passes over the training data.
        device: Torch device on which the modules and tensors are placed.
        chunk_size: Number of rows per chunk when streaming the training file.
        label_col: Name of the target column in the raw data.
    """

    def __init__(
        self,
        cfg: "DataConfig",
        emb_dict: nn.ModuleDict,
        base_model: nn.Module,
        epochs: int,
        device: str = "cpu",
        chunk_size: int = 10000,
        label_col: str = "is_trade",
    ) -> None:
        super().__init__()
        self.epochs = epochs
        self.cfg = cfg
        self.emb_dict = emb_dict
        self.base_model = base_model
        self.chunk_size = chunk_size
        self.label_col = label_col
        self.device = torch.device(device)
        self.loss_fn = nn.BCELoss()
        # Place the sub-modules on the requested device before the optimizer is
        # built, so it tracks the parameters that are actually trained.
        self.to(self.device)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.001)

    def _chunk_to_tensor(self, chunk: pd.DataFrame, pipe, reduce: str = "sum"):
        """Run one pipeline on ``chunk`` and return the resulting feature tensor.

        Args:
            chunk: Rows to convert.
            pipe: Pipeline description; ``col_in``, ``col_out``, ``fillna``,
                ``ops`` and ``feature_type`` are read.
            reduce: Pooling applied to sequence embeddings, ``"sum"`` or ``"mean"``.

        Returns:
            For sparse features a ``(batch, emb)`` tensor; otherwise the raw
            tensor built from the transformed column.

        Raises:
            NotImplementedError: Unknown ``reduce`` value or unsupported
                embedding rank.
        """
        # Series.fillna returns a new Series, so the result has to be re-bound.
        x = chunk[pipe.col_in].fillna(pipe.fillna)
        for op in pipe.ops:
            # Apply the pipeline's operations element-wise, in order.
            x = x.apply(op)
        tensor = torch.tensor(x.to_list()).to(self.device)
        if pipe.feature_type.endswith("sparse"):
            if reduce not in ("sum", "mean"):
                raise NotImplementedError(f"unsupported reduce: {reduce!r}")
            tensor = self.emb_dict[pipe.col_out](tensor.long())
            # The embedding output is (batch, emb) for single ids and
            # (batch, seq_len, emb) for id sequences. Only sequences need
            # pooling, and pooling always runs over the sequence axis.
            if tensor.dim() == 3:
                tensor = tensor.sum(dim=1) if reduce == "sum" else tensor.mean(dim=1)
            elif tensor.dim() != 2:
                raise NotImplementedError(
                    f"unsupported embedding shape: {tuple(tensor.shape)}"
                )
        return tensor

    def _preprocess(self, x: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Concatenate the per-feature tensors into one ``(batch, total_dim)`` matrix.

        Features are taken in the dict's insertion order; 1-D features become a
        single column.
        NOTE(review): assumes ``base_model`` consumes one concatenated float
        matrix whose width matches its ``dim_in`` — confirm against the model.
        """
        columns = [t.float().reshape(t.shape[0], -1) for t in x.values()]
        return torch.cat(columns, dim=-1)

    def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Concatenate the feature tensors and run the base model on them."""
        x = self._preprocess(x)
        return self.base_model(x)

    def train_one_chunk(
        self,
        x: Dict[str, torch.Tensor],
        y: torch.Tensor,
        chunk_index: int,
        epoch: int,
    ):
        """Run one optimisation step on a chunk and return its loss as a float.

        Args:
            x: Feature tensors keyed by pipeline output column.
            y: Binary targets, one per row.
            chunk_index: Running index of the chunk within the epoch.
            epoch: Index of the current epoch.
        """
        self.optimizer.zero_grad()
        y_pred = self(x)
        # NOTE(review): a multi-task model may return one output per task; with
        # a single label only the first is used here — confirm.
        if isinstance(y_pred, (list, tuple)):
            y_pred = y_pred[0]
        # NOTE(review): BCELoss expects probabilities in [0, 1]; presumably the
        # model ends in a sigmoid — confirm.
        loss = self.loss_fn(y_pred.reshape(-1), y.reshape(-1).float())
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _train_files(self):
        """Return the training files: ``cfg.train_dir`` itself when it is a
        file, otherwise the non-hidden files directly inside it, sorted by name."""
        path = Path(self.cfg.train_dir)
        if path.is_dir():
            return sorted(
                p for p in path.iterdir()
                if p.is_file() and not p.name.startswith(".")
            )
        return [path]

    def train_one_epoch(self, epoch: int):
        """Stream every training file once and train on each chunk."""
        # Bypass the overridden train() below and switch to training mode.
        super().train(True)
        # Fall back to a single space when the config has no `sep` attribute.
        sep = getattr(self.cfg, "sep", " ")
        chunk_idx = 0
        for path in self._train_files():
            # The context manager closes the reader even if a step fails.
            with pd.read_csv(path, sep=sep, chunksize=self.chunk_size) as reader:
                for chunk in reader:
                    tensor_dict = dict()
                    for pipe in self.cfg.pipelines:
                        # Never feed the target itself as an input feature.
                        if pipe.col_in == self.label_col:
                            continue
                        tensor_dict[pipe.col_out] = self._chunk_to_tensor(chunk, pipe)
                    y = torch.tensor(
                        chunk[self.label_col].to_list(),
                        dtype=torch.float32,
                        device=self.device,
                    )
                    self.train_one_chunk(tensor_dict, y, chunk_idx, epoch)
                    chunk_idx += 1

    def fit(self) -> None:
        """Run the full training loop for ``self.epochs`` epochs."""
        for e in range(self.epochs):
            self.train_one_epoch(e)

    def train(self, mode=None):
        """Run training, or set the module's training mode.

        ``nn.Module.eval()`` and parent modules call ``train(mode)`` to toggle
        training mode, so that contract has to keep working. Called without an
        argument this keeps its original meaning and runs the training loop.
        """
        if mode is None:
            self.fit()
            return self
        return super().train(mode)

tensor([[ 0, 34, 29],
        [ 0, 34, 29],
        [ 0, 34, 29],
        ...,
        [ 0, 34, 29],
        [ 0, 34, 29],
        [ 0, 34, 29]])