# 概要
> このjupyter notebookでは、「ユーザ$u$の商品$i$を再購買するまでの期間」を予測する。再購買までの期間を予測するために、時間依存性共変量(特徴ベクトル)を入力したときにハザード関数値を出力するニューラルネットを構築する。以後、このニューラルネットを**ニューラルハザードモデル**と呼称する。

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# データ用意

[サンプルデータセット（EC） – Arm Treasure Data](https://support.treasuredata.com/hc/ja/articles/213697057-%E3%82%B5%E3%83%B3%E3%83%97%E3%83%AB%E3%83%87%E3%83%BC%E3%82%BF%E3%82%BB%E3%83%83%E3%83%88-EC-)

## データの読み込み

In [12]:
DATASET_PATH = '../dataset/ECサンプルデータセット/{file_name}'


# メンバーID入りアクセスログ
login_ec_df = pd.read_csv(
        DATASET_PATH.format(file_name='login_ec.csv')
    )

login_ec_df.head()

Unnamed: 0,member_id,time
0,79509,1299079509
1,2079509,1299079509
2,1079509,1299079509
3,2079860,1299079860
4,79860,1299079860


In [13]:
# カートアクション
cart_action_df = pd.read_csv(
        DATASET_PATH.format(file_name='cart.csv')
    )

cart_action_df.head()

Unnamed: 0,cart_id,goods_id,action,create_at,update_at,last_update,time
0,108750017,583266,UPD,2013-11-26 03:11:06,2013-11-26 03:11:06,2013-11-26 03:11:06,1385478215
1,108750017,662680,UPD,2013-11-26 03:11:06,2013-11-26 03:11:06,2013-11-26 03:11:06,1385478215
2,108750017,664077,UPD,2013-11-26 03:11:06,2013-11-26 03:11:06,2013-11-26 03:11:06,1385478215
3,108199875,661648,ADD,2013-11-26 03:11:10,2013-11-26 03:11:10,2013-11-26 03:11:10,1385478215
4,105031004,661231,ADD,2013-11-26 03:11:41,2013-11-26 03:11:41,2013-11-26 03:11:41,1385478215


In [14]:
# メンバー情報
member_df = pd.read_csv(
        DATASET_PATH.format(file_name='member.csv')
    )

member_df.head()

Unnamed: 0,sex,last_update,closed_account_time,age,state,device,country,city,member_id,time
0,1,2003-08-18 04:19:00,0000-00-00 00:00:00,,ontario,desktop,canada,sarnia,111491,1061147940
1,1,2002-06-23 07:01:00,0000-00-00 00:00:00,,texas,tablet,usa,austin,49321,1024783260
2,1,2012-08-24 05:52:54,0000-00-00 00:00:00,,,smart phone,sweden,upplv_sby,2074992,1345755174
3,1,2002-04-13 03:24:00,0000-00-00 00:00:00,,connecticut,desktop,usa,wallingford,40307,1018635840
4,0,2002-08-06 09:17:00,0000-00-00 00:00:00,,montana,unknown,usa,laurel,53350,1028593020


In [15]:
# 伝票，レシート(注文履歴)
sales_slip_df = pd.read_csv(
        DATASET_PATH.format(file_name='sales_slip.csv')
    )

sales_slip_df.head()

Unnamed: 0,member_id,goods_id,category,sub_category,delivery_type,is_canceled,order_date,ship_date,amount,price,last_update,time
0,2046517,577461,Electronics and Computers,Portable Audio and Accessories,1.0,0,1900-01-01 00:00:00,2021-01-01,1,1,2012-06-04 04:35:00,1338784500
1,2046511,577461,Electronics and Computers,Portable Audio and Accessories,1.0,0,1900-01-01 00:00:00,2021-01-01,1,1,2012-06-04 04:57:21,1338785841
2,2046531,577458,Electronics and Computers,Camera and Photo and Video,1.0,0,1900-01-01 00:00:00,2021-01-01,1,1,2012-06-04 04:54:30,1338785670
3,2039923,577461,Electronics and Computers,Portable Audio and Accessories,1.0,0,1900-01-01 00:00:00,2021-01-01,1,1,2012-06-04 04:58:15,1338785895
4,2046509,577461,Electronics and Computers,Portable Audio and Accessories,1.0,0,1900-01-01 00:00:00,2021-01-01,1,1,2012-06-04 04:58:17,1338785897


In [16]:
# 原価と売価
price_list_df = pd.read_csv(
        DATASET_PATH.format(file_name='price_list.csv')
    )

price_list_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,goods_id,selling_price,cost_price,rate
0,528919,2648,2949,0.9
1,528261,952,1299,0.73
2,528637,8571,16617,0.52
3,528714,1904,3292,0.58
4,529093,1886,1925,0.98


## データの加工

ユーザの1日毎&sub_category毎の購入履歴を作成する.

<table>
    <tr>
        <th>member_id</th>
        <th>order_date</th>
        <th>sub_category</th>
        <th>survival_time</th>
        <th>event</th>
        <th>...</th>
    </tr>
    <tr>
        <td rowspan="4">2046511</td>
        <td rowspan="2">2007-01-09</td>
        <td>Portable Audio and Accessories</td>
        <td>21</td>
        <td>1</td>
        <td>...</td>
    </tr>
    <tr>
        <td>Camera and Photo and Video</td>
        <td>60</td>
        <td>1</td>
        <td>...</td>
    </tr>
    <tr>
        <td>2007-01-20</td>
        <td>Video Games </td>
        <td>123</td>
        <td>1</td>
        <td>...</td>
    </tr>
    <tr>
        <td>2007-01-30</td>
        <td>Portable Audio and Accessories</td>
        <td>60</td>
        <td>0</td>
        <td>...</td>
    </tr>
    <tr>
        <td>2046509</td>
        <td>2009-06-23</td>
        <td>Baby Registry </td>
        <td>60</td>
        <td>0</td>
        <td>...</td>
    </tr>
    <tr>
        <td>...</td>
        <td>...</td>
        <td>...</td>
        <td>...</td>
        <td>...</td>
        <td>...</td>
    </tr>
</table>

### 加工準備

In [17]:
# categoryとsub_categoryを1つのカラムへ
sales_slip_df['category_path'] = sales_slip_df['category'] + '>' + sales_slip_df['sub_category']

In [18]:
# order_date(注文日)に"0000-00-00 00:00:00"という日付データが含まれるので一時的に修正
sales_slip_df['order_date'].replace({'0000-00-00 00:00:00': '1900-01-01 00:00:00'}, inplace=True)

# order_date(注文日),ship_date(配送日)カラムを日付型(date)へ
sales_slip_df['order_date'] = pd.to_datetime(sales_slip_df['order_date']).dt.date
sales_slip_df['ship_date'] = pd.to_datetime(sales_slip_df['ship_date']).dt.date

### 無効なデータの除外

In [19]:
from datetime import datetime


where_exclude = (
    (pd.to_datetime(sales_slip_df['order_date']).dt.year < 1950) # order_dateが1950年より以前
    | (pd.to_datetime(sales_slip_df['order_date']) > datetime.now())  # order_dateが今日よりも後
    | (pd.to_datetime(sales_slip_df['ship_date']).dt.year < 1950) # ship_dateが1950年より以前
    | (pd.to_datetime(sales_slip_df['ship_date']) > datetime.now())  # ship_dateが今日よりも後
)
exclude_indexs = sales_slip_df[where_exclude].index
sales_slip_df.drop(exclude_indexs, inplace=True)

In [20]:
where_exclude.any()

True

In [21]:
sales_slip_df['order_date'].value_counts().sort_index()

2004-12-28        8
2005-01-04        1
2005-01-05       18
2005-01-06        4
2005-01-07       37
2005-01-11     9907
2005-01-17       22
2005-01-18    22355
2005-01-19      260
2005-01-25    21763
2005-01-26       12
2005-02-01    21636
2005-02-08    24563
2005-02-15    26518
2005-02-21        1
2005-02-22    27791
2005-03-01    28827
2005-03-04        4
2005-03-08    27704
2005-03-15    24586
2005-03-22    30780
2005-03-29    27406
2005-04-05    26113
2005-04-12    29432
2005-04-19    30152
2005-04-26    25923
2005-05-03    25422
2005-05-10    26063
2005-05-17    26350
2005-05-24    29403
              ...  
2013-07-02    20744
2013-07-05       18
2013-07-08        4
2013-07-09    22380
2013-07-11       15
2013-07-12        1
2013-07-16    21059
2013-07-23    21687
2013-07-30    18935
2013-08-06    18708
2013-08-08      190
2013-08-09     5678
2013-08-20    31680
2013-08-27    17578
2013-09-03    15738
2013-09-10    19117
2013-09-17    19071
2013-09-24    21128
2013-10-01    19405


In [22]:
sales_slip_df['ship_date'].value_counts().sort_index()

2004-12-30        8
2005-01-10       40
2005-01-11      128
2005-01-13       85
2005-01-14     8389
2005-01-16        4
2005-01-18      176
2005-01-21    20015
2005-01-24       11
2005-01-28    21387
2005-02-04    20786
2005-02-05       15
2005-02-06       31
2005-02-08       67
2005-02-11    23037
2005-02-18    29528
2005-02-24        7
2005-02-25    25610
2005-03-03       12
2005-03-04    26405
2005-03-07       11
2005-03-11    25608
2005-03-14       41
2005-03-18    28095
2005-03-21       21
2005-03-23        5
2005-03-25    31038
2005-04-01    29172
2005-04-04       28
2005-04-08    24302
              ...  
2013-08-30    17807
2013-09-06    15985
2013-09-13    19374
2013-09-20    19299
2013-09-27    21516
2013-10-04    19585
2013-10-11    16049
2013-10-18    16096
2013-10-25    19715
2013-11-01    23608
2013-11-08    21971
2013-11-15    19482
2013-11-22    18314
2013-11-29    19676
2013-12-06    19548
2013-12-13    18966
2013-12-20    44670
2013-12-27     9979
2014-01-03        2


### 分析対象データの抽出

> - `is_canceled`が0のデータ
> - `member_id`が0以外のデータ
> - 再購買(同一memberが以前注文したカテゴリの商品を再び購入)された`category_path`の注文データ

In [23]:
sales_slip_df.head()

Unnamed: 0,member_id,goods_id,category,sub_category,delivery_type,is_canceled,order_date,ship_date,amount,price,last_update,time,category_path
1024,121168,664900,Home and Garden and Tools,Fine Art,,0,NaT,2013-12-20,1,1886,2013-12-17 10:25:14,1387275914,Home and Garden and Tools>Fine Art
1025,1487069,632279,Home and Garden and Tools,Bedding and Bath,,0,NaT,2014-04-11,1,2682,2013-12-17 10:15:43,1387275343,Home and Garden and Tools>Bedding and Bath
1026,1038419,618926,Electronics and Computers,Office and School Supplies,,0,NaT,2014-02-21,1,3420,2013-12-17 10:15:55,1387275355,Electronics and Computers>Office and School Su...
1027,122883,618926,Electronics and Computers,Office and School Supplies,,0,NaT,2014-02-14,1,3420,2013-12-17 10:15:34,1387275334,Electronics and Computers>Office and School Su...
1028,2245152,632279,Home and Garden and Tools,Bedding and Bath,,0,NaT,2014-04-11,1,2682,2013-12-17 10:16:01,1387275361,Home and Garden and Tools>Bedding and Bath


In [24]:
print(sales_slip_df['order_date'].isnull().any())
print(sales_slip_df['order_date'].isnull().all())
print(sales_slip_df['order_date'].isnull().value_counts())
print(sales_slip_df['ship_date'].isnull().any())

True
False
False    13779871
True        56936
Name: order_date, dtype: int64
False


In [25]:
print(sales_slip_df['delivery_type'].value_counts())
print(sales_slip_df['delivery_type'].isnull().value_counts())

1.0    13786871
Name: delivery_type, dtype: int64
False    13786871
True        49936
Name: delivery_type, dtype: int64


In [26]:
sales_slip_df['member_id'].value_counts()

0          933979
1385684      3751
1833816      3710
647431       3062
1478814      2978
850537       2475
623453       2412
130897       2393
1351089      2389
1421483      2387
802239       2354
999829       2351
1336981      2301
777155       2224
640123       2203
916524       2196
363735       2142
344049       2110
572435       2040
288865       2029
1963430      1954
838979       1893
517094       1892
491706       1879
1213999      1872
407528       1871
1091162      1857
800268       1857
266844       1837
1916325      1810
            ...  
1566880         1
1529634         1
1558692         1
1550504         1
1542316         1
1079238         1
1071042         1
1370304         1
1362116         1
1300402         1
1267618         1
1435872         1
1411308         1
1193870         1
1394932         1
1185674         1
1415034         1
1439606         1
1755396         1
1390446         1
1730832         1
1722644         1
1406822         1
1357662         1
1374038   

In [27]:
# 再購買された注文データ
is_re_purchase = sales_slip_df.duplicated(['member_id', 'category_path'], keep=False)
# しかし、同日内に再購買された注文データは除く
not_same_day = ~sales_slip_df.duplicated(['member_id', 'ship_date', 'category_path'], keep=False)
re_purchase_categories = sales_slip_df[is_re_purchase&not_same_day]['category_path'].unique()
is_in_re_purchase_categories = sales_slip_df['category_path'].isin(re_purchase_categories)

where_include = (
    (sales_slip_df['is_canceled'] == 0)
    & (sales_slip_df['member_id'] != 0)
    & is_in_re_purchase_categories
)

sales_slip_df = sales_slip_df.loc[where_include]

In [28]:
sales_slip_df.head()

Unnamed: 0,member_id,goods_id,category,sub_category,delivery_type,is_canceled,order_date,ship_date,amount,price,last_update,time,category_path
1024,121168,664900,Home and Garden and Tools,Fine Art,,0,NaT,2013-12-20,1,1886,2013-12-17 10:25:14,1387275914,Home and Garden and Tools>Fine Art
1025,1487069,632279,Home and Garden and Tools,Bedding and Bath,,0,NaT,2014-04-11,1,2682,2013-12-17 10:15:43,1387275343,Home and Garden and Tools>Bedding and Bath
1026,1038419,618926,Electronics and Computers,Office and School Supplies,,0,NaT,2014-02-21,1,3420,2013-12-17 10:15:55,1387275355,Electronics and Computers>Office and School Su...
1027,122883,618926,Electronics and Computers,Office and School Supplies,,0,NaT,2014-02-14,1,3420,2013-12-17 10:15:34,1387275334,Electronics and Computers>Office and School Su...
1028,2245152,632279,Home and Garden and Tools,Bedding and Bath,,0,NaT,2014-04-11,1,2682,2013-12-17 10:16:01,1387275361,Home and Garden and Tools>Bedding and Bath


### 教師データ&特徴ベクトル生成前の下準備

このセクションを実行することで、以下のデータセットができる

<table>
    <tr>
        <th>member_id</th>
        <th>order_date</th>
        <th>sub_category</th>
        <th>...</th>
    </tr>
    <tr>
        <td rowspan="4">2046511</td>
        <td rowspan="2">2007-01-09</td>
        <td>Portable Audio and Accessories</td>
        <td>...</td>
    </tr>
    <tr>
        <td>Camera and Photo and Video</td>
        <td>...</td>
    </tr>
    <tr>
        <td>2007-01-20</td>
        <td>Video Games </td>
        <td>...</td>
    </tr>
    <tr>
        <td>2007-01-30</td>
        <td>Portable Audio and Accessories</td>
        <td>...</td>
    </tr>
    <tr>
        <td>2046509</td>
        <td>2009-06-23</td>
        <td>Baby Registry </td>
        <td>...</td>
    </tr>
    <tr>
        <td>...</td>
        <td>...</td>
        <td>...</td>
        <td>...</td>
    </tr>
</table>

In [29]:
from tqdm import tqdm


check_cols = ['member_id', 'ship_date', 'category_path']
is_duplicated = sales_slip_df.duplicated(check_cols, keep=False)


# メンバーID, 配送日, 購入カテゴリが重複してないデータ
not_duplicated_df = sales_slip_df[~is_duplicated][['member_id', 'ship_date', 'category_path', 'amount', 'price']]


# 重複しているデータ
duplicated_df = sales_slip_df[is_duplicated][['member_id', 'ship_date', 'category_path', 'amount', 'price']]

fixed_duplication_df = duplicated_df.groupby(['member_id', 'ship_date', 'category_path'])[['amount', 'price']].mean().reset_index()

# 重複、非重複データを結合
category_order_df = pd.concat([fixed_duplication_df, not_duplicated_df], ignore_index=True)
category_order_df.reset_index(drop=True, inplace=True)

In [30]:
category_order_df.head()

Unnamed: 0,member_id,ship_date,category_path,amount,price
0,5,2009-12-18,Electronics and Computers>Portable Audio and A...,1.0,1858.0
1,17,2007-05-18,Sports and Outdoors>Team Sports,1.0,2400.0
2,25,2005-02-04,Movies and Music and Games>CDs and Vinyl,1.0,2000.0
3,25,2008-11-21,Electronics and Computers>Software,1.0,1980.0
4,25,2010-02-05,Home and Garden and Tools>Kitchen and Bath Fix...,1.0,1685.5


In [31]:
print('どこからの列で欠損しているか？\n', category_order_df.isnull().any())
print(fixed_duplication_df.shape[0] + not_duplicated_df.shape[0])
print(category_order_df.shape[0])
print((category_order_df.groupby(check_cols).size() == 1).all())
print('インデックスは重複しているか？', category_order_df.index.duplicated().any())

どこからの列で欠損しているか？
 member_id        False
ship_date        False
category_path    False
amount           False
price            False
dtype: bool
11901287
11901287
True
インデックスは重複しているか？ False


### 教師データの生成

<table>
    <tr>
        <th>member_id</th>
        <th>order_date</th>
        <th>category_path</th>
        <th>survival_time</th>
        <th>event</th>
        <th>...</th>
    </tr>
    <tr>
        <td rowspan="4">2046511</td>
        <td rowspan="2">2007-01-09</td>
        <td>Electronics and Computers>Portable Audio and Accessories</td>
        <td>21</td>
        <td>1</td>
        <td>...</td>
    </tr>
    <tr>
        <td>Sports and Outdoors>Team Sport</td>
        <td>60</td>
        <td>1</td>
        <td>...</td>
    </tr>
    <tr>
        <td>2007-01-20</td>
        <td>Home and Garden and Tools>Kitchen and Bath Fixtures</td>
        <td>123</td>
        <td>1</td>
        <td>...</td>
    </tr>
    <tr>
        <td>2007-01-30</td>
        <td>Electronics and Computers>Portable Audio and Accessories</td>
        <td>60</td>
        <td>0</td>
        <td>...</td>
    </tr>
    <tr>
        <td>2046509</td>
        <td>2009-06-23</td>
        <td>Electronics and Computers>Software</td>
        <td>60</td>
        <td>0</td>
        <td>...</td>
    </tr>
    <tr>
        <td>...</td>
        <td>...</td>
        <td>...</td>
        <td>...</td>
        <td>...</td>
        <td>...</td>
    </tr>
</table>

#### 階層インデックス化

教師データを生成するにあたり、分析のしやすさと生成速度を向上させるために、`member_id`, `category_path`, `ship_date`をマルチインデックスとするデータフレームを作成する。

In [32]:
# 各列のデータ型をチェック
print(category_order_df.info())

# member_id, category_pathがobjectのため、そのままインデックス化すると参照速度が低下するため、intにする.
# # ついでにほかのobjectもintへ
converted_list = ['member_id', 'amount', 'price']
category_order_df.loc[:, converted_list] = category_order_df[converted_list].convert_objects(convert_numeric=True)


categories = category_order_df['category_path'].unique().tolist()
category_path_id_df = pd.DataFrame(np.arange(1, len(categories)+1), index=categories, columns=['category_path_id'])

category_order_df = pd.merge(
    category_order_df, category_path_id_df,
    left_on='category_path', right_index=True,
    how='left'
)

# 各列のデータ型をチェック
print(category_order_df.info())

# 階層index化
category_order_df.set_index(['member_id', 'category_path_id', 'ship_date'], drop=False, inplace=True)
category_order_df.sort_index(inplace=True)

category_order_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11901287 entries, 0 to 11901286
Data columns (total 5 columns):
member_id        int64
ship_date        object
category_path    object
amount           float64
price            float64
dtypes: float64(2), int64(1), object(2)
memory usage: 454.0+ MB
None


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  import sys


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11901287 entries, 0 to 11901286
Data columns (total 6 columns):
member_id           int64
ship_date           object
category_path       object
amount              float64
price               float64
category_path_id    int64
dtypes: float64(2), int64(2), object(2)
memory usage: 544.8+ MB
None


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,member_id,ship_date,category_path,amount,price,category_path_id
member_id,category_path_id,ship_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,69,2005-09-09,3,2005-09-09,Sports and Outdoors>Outdoor Gear,1.0,1980.0,69
3,74,2005-09-09,3,2005-09-09,Electronics and Computers>Computer Accessories...,1.0,1980.0,74
5,1,2009-12-18,5,2009-12-18,Electronics and Computers>Portable Audio and A...,1.0,1858.0,1
5,15,2008-11-28,5,2008-11-28,Clothing and Shoes and Jewelry>Girls,1.0,2800.0,15
5,15,2010-06-11,5,2010-06-11,Clothing and Shoes and Jewelry>Girls,1.0,9505.0,15


#### 再購買してないが、再購買実績のあるカテゴリを購入しているケース

In [33]:
from datetime import date


is_calc_target = category_order_df.duplicated(['member_id', 'category_path_id'], keep=False)

# 再購買してないが、再購買実績のあるカテゴリを購入しているケース
not_repurchase_case_df = category_order_df.loc[~is_calc_target]
not_repurchase_case_df.loc[:, 'end_date'] = date(2015, 11, 13)
not_repurchase_case_df.loc[:, 'survival_time'] = (not_repurchase_case_df['end_date'] - not_repurchase_case_df['ship_date']).dt.days
not_repurchase_case_df.loc[:, 'event'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [34]:
not_repurchase_case_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,member_id,ship_date,category_path,amount,price,category_path_id,end_date,survival_time,event
member_id,category_path_id,ship_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3,69,2005-09-09,3,2005-09-09,Sports and Outdoors>Outdoor Gear,1.0,1980.0,69,2015-11-13,3717,0
3,74,2005-09-09,3,2005-09-09,Electronics and Computers>Computer Accessories...,1.0,1980.0,74,2015-11-13,3717,0
5,1,2009-12-18,5,2009-12-18,Electronics and Computers>Portable Audio and A...,1.0,1858.0,1,2015-11-13,2156,0
5,23,2009-12-18,5,2009-12-18,Movies and Music and Games>Video Games,1.0,7429.0,23,2015-11-13,2156,0
5,25,2005-03-04,5,2005-03-04,Beauty and Health and Grocery>Specialty Diets,1.0,2980.0,25,2015-11-13,3906,0


#### 再購買のケース

In [35]:
# 再購買のケース
repurchase_case_df = category_order_df.loc[is_calc_target]
# repurchase_case_df.head()
# repurchase_case_df.shift(-1).head()

In [36]:
# データ量が多いため、groupbyやforloopは使用しないで再購買期間を算出する.
# sortしたdfである行とその１つ下の行のship_dateの差delta.dt.daysを計算する.
# その際、双方の行のmember_idとcategory_path_idが合致している行だけを計算する.
is_not_invalidate_df = (
    repurchase_case_df[['member_id', 'category_path_id']].shift(-1) == repurchase_case_df[['member_id', 'category_path_id']]
)
is_not_invalidate_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,member_id,category_path_id
member_id,category_path_id,ship_date,Unnamed: 3_level_1,Unnamed: 4_level_1
5,15,2008-11-28,True,True
5,15,2010-06-11,False,False
10,55,2005-01-14,True,True
10,55,2005-07-29,False,False
17,42,2007-06-08,True,True
17,42,2007-06-15,False,False
22,2,2007-08-17,True,True
22,2,2010-02-26,True,False
22,87,2005-09-09,True,True
22,87,2005-09-16,False,False


In [37]:
# 横(行)に対してすべてTrueか否や
is_not_invalidate_arr = is_not_invalidate_df.all(axis=1).values
# is_not_invalidate_arr

In [38]:
repurchase_case_df['end_date'] = np.where(is_not_invalidate_arr, repurchase_case_df['ship_date'].shift(-1), date(2015, 11, 13))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
repurchase_case_df.loc[:, 'survival_time'] = (repurchase_case_df['end_date'] - repurchase_case_df['ship_date']).dt.days
repurchase_case_df.loc[:, 'event'] = np.where(is_not_invalidate_arr, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [40]:
repurchase_case_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,member_id,ship_date,category_path,amount,price,category_path_id,end_date,survival_time,event
member_id,category_path_id,ship_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,15,2008-11-28,5,2008-11-28,Clothing and Shoes and Jewelry>Girls,1.0,2800.0,15,2010-06-11,560,1
5,15,2010-06-11,5,2010-06-11,Clothing and Shoes and Jewelry>Girls,1.0,9505.0,15,2015-11-13,1981,0
10,55,2005-01-14,10,2005-01-14,Electronics and Computers>Wearable Technology,1.0,8940.0,55,2005-07-29,196,1
10,55,2005-07-29,10,2005-07-29,Electronics and Computers>Wearable Technology,1.0,3000.0,55,2015-11-13,3759,0
17,42,2007-06-08,17,2007-06-08,Clothing and Shoes and Jewelry>Baby,3.0,2980.0,42,2007-06-15,7,1


In [41]:
print('survival_time column is null any? {0}'.format(repurchase_case_df['survival_time'].isnull().any()))

survival_time column is null any? False


#### 再購買ケースと非再購買ケースの結合

In [42]:
cols = [
    'member_id', 'category_path_id', 'category_path', 'ship_date', 'end_date', 
    'survival_time', 'event', 'price', 'amount'
]
category_order_df = pd.concat([not_repurchase_case_df, repurchase_case_df]).loc[:, cols]

In [43]:
category_order_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,member_id,category_path_id,category_path,ship_date,end_date,survival_time,event,price,amount
member_id,category_path_id,ship_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3,69,2005-09-09,3,69,Sports and Outdoors>Outdoor Gear,2005-09-09,2015-11-13,3717,0,1980.0,1.0
3,74,2005-09-09,3,74,Electronics and Computers>Computer Accessories...,2005-09-09,2015-11-13,3717,0,1980.0,1.0
5,1,2009-12-18,5,1,Electronics and Computers>Portable Audio and A...,2009-12-18,2015-11-13,2156,0,1858.0,1.0
5,23,2009-12-18,5,23,Movies and Music and Games>Video Games,2009-12-18,2015-11-13,2156,0,7429.0,1.0
5,25,2005-03-04,5,25,Beauty and Health and Grocery>Specialty Diets,2005-03-04,2015-11-13,3906,0,2980.0,1.0


In [44]:
print('Q1. どこからのデータ列で欠損が発生しているか?')
print('A.')
print(category_order_df.isnull().any())
print('Q2. 重複している行があるのか？')
print('A2.')
print(category_order_df.duplicated().any())
print('Q3. 生存時間が負値になっている?')
print('A3.')
print((category_order_df['survival_time'] <= 0).any())
print('Q4. eventの頻度は?')
print('A4.')
print(category_order_df['event'].value_counts())

Q1. どこからのデータ列で欠損が発生しているか?
A.
member_id           False
category_path_id    False
category_path       False
ship_date           False
end_date            False
survival_time       False
event               False
price               False
amount              False
dtype: bool
Q2. 重複している行があるのか？
A2.
False
Q3. 生存時間が負値になっている?
A3.
False
Q4. eventの頻度は?
A4.
0    7793409
1    4107878
Name: event, dtype: int64


#### データセットの一時保存

In [45]:
# category_order_df.to_csv(
#     DATASET_PATH.format(file_name='category_order_' + date.today().strftime('%Y%m%d') + '.csv.gz'), 
#     compression='gzip',
#     encoding='UTF-8'
# )

### 特徴量の生成

In [29]:
DATASET_PATH = '../dataset/ECサンプルデータセット/{file_name}'

category_order_df = pd.read_csv(
        DATASET_PATH.format(file_name='category_order_20181111.csv.gz'),
        encoding="utf-8",
        header=0,
        index_col=[0,1,2]
    )

category_order_df.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,member_id.1,category_path_id.1,category_path,ship_date.1,end_date,survival_time,event,price,amount
member_id,category_path_id,ship_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3,69,2005-09-09,3,69,Sports and Outdoors>Outdoor Gear,2005-09-09,2015-11-13,3717,0,1980.0,1.0
3,74,2005-09-09,3,74,Electronics and Computers>Computer Accessories...,2005-09-09,2015-11-13,3717,0,1980.0,1.0
5,1,2009-12-18,5,1,Electronics and Computers>Portable Audio and A...,2009-12-18,2015-11-13,2156,0,1858.0,1.0
5,23,2009-12-18,5,23,Movies and Music and Games>Video Games,2009-12-18,2015-11-13,2156,0,7429.0,1.0
5,25,2005-03-04,5,25,Beauty and Health and Grocery>Specialty Diets,2005-03-04,2015-11-13,3906,0,2980.0,1.0


In [18]:
category_order_df.rename(columns={
    'member_id.1': 'member_id', 
    'category_path_id.1': 'category_path_id',
    'ship_date.1': 'ship_date'
}, inplace=True)

category_order_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,member_id,category_path_id,category_path,ship_date,end_date,survival_time,event,price,amount
member_id,category_path_id,ship_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3,69,2005-09-09,3,69,Sports and Outdoors>Outdoor Gear,2005-09-09,2015-11-13,3717,0,1980.0,1.0
3,74,2005-09-09,3,74,Electronics and Computers>Computer Accessories...,2005-09-09,2015-11-13,3717,0,1980.0,1.0
5,1,2009-12-18,5,1,Electronics and Computers>Portable Audio and A...,2009-12-18,2015-11-13,2156,0,1858.0,1.0
5,23,2009-12-18,5,23,Movies and Music and Games>Video Games,2009-12-18,2015-11-13,2156,0,7429.0,1.0
5,25,2005-03-04,5,25,Beauty and Health and Grocery>Specialty Diets,2005-03-04,2015-11-13,3906,0,2980.0,1.0


## データの時系列化
このセクションでは、教師データ・特徴量を時系列化します。


### 時系列データ化するカテゴルの選定

In [21]:
# すべての商品の購買データを時系列化するのは、時間がかかるため、
# 今回は生活必需品でかつ定期的な再購買が多いと予想される
# 「美容・健康・食品」カテゴリの購買データだけを時系列化し、分析する.
r = r'^Beauty and Health and Grocery>Grocery and Gourmet Food'
is_contain = category_order_df['category_path'].str.contains(r)
beauty_health_grocery_order_df = category_order_df[is_contain]

In [22]:
beauty_health_grocery_order_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,member_id,category_path_id,category_path,ship_date,end_date,survival_time,event,price,amount
member_id,category_path_id,ship_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
19,31,2005-02-18,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980.0,1.0
42,31,2005-06-17,42,31,Beauty and Health and Grocery>Grocery and Gour...,2005-06-17,2015-11-13,3801,0,4200.0,1.0
112,31,2005-06-03,112,31,Beauty and Health and Grocery>Grocery and Gour...,2005-06-03,2015-11-13,3815,0,3980.0,1.0
145,31,2006-06-09,145,31,Beauty and Health and Grocery>Grocery and Gour...,2006-06-09,2015-11-13,3444,0,4800.0,1.0
180,31,2005-02-18,180,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980.0,1.0


In [23]:
beauty_health_grocery_order_df.event.value_counts()

0    88841
1    51758
Name: event, dtype: int64

In [24]:
beauty_health_grocery_order_df[beauty_health_grocery_order_df['event'] == 1]['survival_time'].value_counts().iloc[:15].sort_index()

7      1684
14     1300
21     1356
28     1156
35     1111
42      973
49     1009
56      959
63      931
70      818
77      754
84      794
91      800
98      731
105     735
Name: survival_time, dtype: int64

In [25]:
where = (
    (beauty_health_grocery_order_df[beauty_health_grocery_order_df['event'] == 1.0]['survival_time'] <= 365)
)
where.value_counts()

True     32214
False    19544
Name: survival_time, dtype: int64

### データフレームの時系列化

 - 1つのデータ行から$max_{t}$日まで$d$日毎にデータを作成する

In [26]:
def row_series_to_time_series_df(ser, max_t=180, interval=6):
    """
    Row data that is pandas.Series type converted topandas.DataFrame.
    """
    if type(ser) != pd.core.series.Series:
        raise TypeError('Seriesじゃないよ！')
        
    df = pd.DataFrame([], columns=ser.index.tolist())

    if ser['event'] == 0:
        # イベントが発生しなかった場合、max_tまでの時系列DataFrameをinterval間隔で生成する.
        df.loc[:, 'today'] = pd.date_range(
                ser['ship_date'], 
                periods=int(max_t/interval), 
                freq='{0}D'.format(interval)
            )
    else:
        # イベントが発生した場合、end_dateまでの時系列DataFrameをinterval間隔で生成する
        df.loc[:, 'today'] = pd.date_range(
                ser['ship_date'], 
                periods=np.ceil(ser['survival_time']/interval), 
                freq='{0}D'.format(interval)
            )

    df.loc[:, ser.index.tolist()] = ser.values
    df.set_index(['member_id', 'category_path_id', 'ship_date', 'today'])
    return df

In [27]:
from tqdm import tqdm


MAX_T = 180
INTERVAL_D = 10

time_series_df_list = []
for ix, gdf in tqdm(beauty_health_grocery_order_df.groupby(level=[0,1,2])):
    df = row_series_to_time_series_df(gdf.iloc[0], MAX_T, INTERVAL_D)
    time_series_df_list.append(df)

Exception in thread Thread-5:
Traceback (most recent call last):
  File "/Users/taiyou/.pyenv/versions/anaconda3-5.0.1/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/taiyou/.pyenv/versions/anaconda3-5.0.1/lib/python3.6/site-packages/tqdm/_tqdm.py", line 147, in run
    if instance.miniters > 1 and \
AttributeError: 'tqdm' object has no attribute 'miniters'

100%|██████████| 140599/140599 [37:43<00:00, 62.11it/s]


In [None]:
survival_time_series_df = pd.concat(time_series_df_list, ignore_index=True)
survival_time_series_df.head()

In [None]:
survival_time_series_df = survival_time_series_df.set_index(['member_id', 'category_path_id', 'ship_date', 'today'], drop=False)
survival_time_series_df.head()

In [None]:
survival_time_series_df['end_date'] = pd.to_datetime(survival_time_series_df['end_date'])
survival_time_series_df['ship_date'] = pd.to_datetime(survival_time_series_df['ship_date'])
survival_time_series_df['10days_later'] = pd.to_datetime(survival_time_series_df['10days_later'])

In [54]:
from datetime import timedelta


survival_time_series_df['{D}days_later'.format(D=INTERVAL_D)] = survival_time_series_df['today'] + timedelta(INTERVAL_D)
survival_time_series_df['passed_time'] = (
        survival_time_series_df['today'] - survival_time_series_df['ship_date']
    ).dt.days
will_event_happen_in_days = (
        (survival_time_series_df['event'] == 1) &
        (survival_time_series_df['end_date'] >= survival_time_series_df['today']) &
        (survival_time_series_df['end_date'] < survival_time_series_df['10days_later'])
    )
survival_time_series_df['will_event_happen_in_days'] = np.where(will_event_happen_in_days, 1, 0)

In [55]:
survival_time_series_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,member_id,category_path_id,category_path,ship_date,end_date,survival_time,event,price,amount,today,10days_later,passed_time,will_event_happen_in_days
member_id,category_path_id,ship_date,today,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
19,31,2005-02-18,2005-02-18,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-02-18,2005-02-28,0,0
19,31,2005-02-18,2005-02-28,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-02-28,2005-03-10,10,0
19,31,2005-02-18,2005-03-10,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-03-10,2005-03-20,20,0
19,31,2005-02-18,2005-03-20,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-03-20,2005-03-30,30,0
19,31,2005-02-18,2005-03-30,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-03-30,2005-04-09,40,0


In [63]:
survival_time_series_df.rename(columns={
    'ship_date': 'purchase_date', 'end_date': 'repurchase_date', 
    'today': 'start_date', '10days_later': 'end_date', 
}, inplace=True)

In [64]:
survival_time_series_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,member_id,category_path_id,category_path,purchase_date,repurchase_date,survival_time,event,price,amount,start_date,end_date,passed_time,will_event_happen_in_days
member_id,category_path_id,ship_date,today,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
19,31,2005-02-18,2005-02-18,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-02-18,2005-02-28,0,0
19,31,2005-02-18,2005-02-28,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-02-28,2005-03-10,10,0
19,31,2005-02-18,2005-03-10,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-03-10,2005-03-20,20,0
19,31,2005-02-18,2005-03-20,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-03-20,2005-03-30,30,0
19,31,2005-02-18,2005-03-30,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-03-30,2005-04-09,40,0
19,31,2005-02-18,2005-04-09,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-04-09,2005-04-19,50,0
19,31,2005-02-18,2005-04-19,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-04-19,2005-04-29,60,0
19,31,2005-02-18,2005-04-29,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-04-29,2005-05-09,70,0
19,31,2005-02-18,2005-05-09,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-05-09,2005-05-19,80,0
19,31,2005-02-18,2005-05-19,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-05-19,2005-05-29,90,0


In [65]:
print(survival_time_series_df.shape)
print(survival_time_series_df['will_event_happen_in_days'].value_counts())

(3717890, 13)
0    3670654
1      47236
Name: will_event_happen_in_days, dtype: int64


---
# モデルの概要

**ネットワーク**

**出力層**
$$
\frac {f\left(t\right)}{S\left(t\right)} = h\left( t \right) \approx \frac {1}{1 + \mathrm {exp}\left( - \mathrm {\boldsymbol {u} }\left(t\right) \right)}
$$

**損失関数**
$$
E\left( \mathrm {\boldsymbol {w}} \right) = \sum _{n=1}^{N}{\sum _{t_{n} = 1}^{ max_{t_{n}} }{\left\{ repurchase\_weight_{n} \cdot c_{nt}\log {h\left( t_{n} \right)} + \left(1 - c_{nt}\right) \log {\left[ 1 - h\left( t_{n} \right) \right]} \right\}}}
$$

# データセットの系列化

| `n` | `member_id` | `category_path_id` | 生存時間 | 打ち切り | 特徴ベクトル |
|:-----:|:-----------------:|:--------------------------:|:----------:|:---------:|:---------------:|
| `1` | 1 | 31 | $y\left( \tau + 1 \right)$ | $c\left( \tau + 1 \right)$ | $y\left( \tau \right), \dots, y\left( 1 \right)$ |
| `2` | 1 | 31 | $y\left( \tau + 2 \right)$ | $c\left( \tau + 2 \right)$ | $y\left( \tau + 1 \right), \dots, y\left( 2 \right)$ |
| `3` | 1 | 31 | $\vdots $ | $\vdots $ |
| `4` | 1 | 31 | $y\left( t + 1 \right)$ | $c\left( t + 1 \right)$ | $y\left( t \right), \dots, y\left( t - \tau + 1 \right)$ |

In [68]:
survival_time_series_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,member_id,category_path_id,category_path,purchase_date,repurchase_date,survival_time,event,price,amount,start_date,end_date,passed_time,will_event_happen_in_days
member_id,category_path_id,ship_date,today,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
19,31,2005-02-18,2005-02-18,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-02-18,2005-02-28,0,0
19,31,2005-02-18,2005-02-28,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-02-28,2005-03-10,10,0
19,31,2005-02-18,2005-03-10,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-03-10,2005-03-20,20,0
19,31,2005-02-18,2005-03-20,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-03-20,2005-03-30,30,0
19,31,2005-02-18,2005-03-30,19,31,Beauty and Health and Grocery>Grocery and Gour...,2005-02-18,2015-11-13,3920,0,1980,1,2005-03-30,2005-04-09,40,0


In [28]:
from tqdm import tqdm


maxlen = 25

x_cols = ['price', 'amount', 'passed_time']
y_col = ['will_event_happen_in_days']

X = np.array([])
Y = np.array([])

# 過去の購買履歴を考慮したデータ作り

# # ユーザとカテゴリごと
# for n, group_df in tqdm(survival_time_series_df.groupby(['member_id', 'category_path_id'])):
#     # そのカテゴリの再購買回数
#     for count in range(0, group_df.groupby(['ship_date'])):
#         # 経過日数
#         for t in range(0, group_df.iloc[['ship_date'] == count]['today']):
#             x_t = survival_time_series_df.loc[n]
#             np.append(X, x_t, axis=0)
#             np.append(Y, y_t, axis=0)
#             break

NameError: name 'survival_time_series_df' is not defined

In [None]:
# 過去の購買履歴を考慮していないデータ作り

# ユーザとカテゴリごと
for n, group_df in tqdm(survival_time_series_df.groupby(['member_id', 'category_path_id', 'ship_date'])):
    # 経過日数
    for t in range(0, group_df['today'].shape[0]):
        x_t = group_df.iloc[:t][x_cols].values
        y_t = group_df.iloc[:t][y_col].values

        np.append(X, x_t, axis=0)
        np.append(Y, y_t, axis=0)
        break

# データセットの分割

In [None]:
from sklearn.model_selection import train_test_split

# `member_id`,`category_path_id`単位で学習データと検証データを分割
survival_time_series_df.groupby(['member_id', 'category_path_id']).

---
# モデル設定

In [None]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import SimpleRNN


n_in = len(X_train[0][0])  # 1
n_hidden = 20
n_out = 1  # 1


def weight_variable(shape, name=None):
    return np.random.normal(scale=0.01, size=shape)


model = Sequential()
model.add(
    SimpleRNN(
        n_hidden,
        kernel_initializer=weight_variable,
        input_shape=(maxlen, n_in)
    )
)
model.add(Dense(n_out, kernel_initializer=weight_variable))
model.add(Activation('sigmoid'))

# モデルのコンパイル

In [None]:
from keras.optimizers import Adam


optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999)

model.compile(
    loss='mean_squared_error',
    optimizer=optimizer
)

---
# モデル学習

In [None]:
from keras.callbacks import EarlyStopping


epochs = 500
batch_size = 10

early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

model.fit(
    X_train, Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, Y_test),
    callbacks=[early_stopping]
)

---
# モデル予測&評価

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html#sklearn.model_selection.TimeSeriesSplit

In [None]:
truncate = maxlen
Z = X[:1]  # 元データの最初の一部だけ切り出し

original = [f[i] for i in range(maxlen)]
predicted = [None for i in range(maxlen)]

for i in range(length_of_sequences - maxlen + 1):
    # 入力
    z_ = Z[-1:]
    # 出力
    y_ = model.predict(z_)
    sequence_ = np.concatenate(
        (z_.reshape(maxlen, n_in)[1:], y_),
        axis=0
    ).reshape(1, maxlen, n_in)

    Z = np.append(Z, sequence_, axis=0)
    predicted.append(y_.reshape(-1))

---
# グラフで可視化