In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import sys
import os
import matplotlib.pyplot as plt
import json
import logging
from MSSQLDB_Datamanager import DataManager

## Logger setup
now = datetime.now()
log_filename = 'DBA_finproject_{}.log'.format(now.strftime('%Y-%m-%d'))
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
# Build the paths with os.path.join so that the directory that gets created
# and the directory the handler writes into are the same one on every OS
# (a literal '.\\var\\log' is a single odd file name outside Windows).
log_dir = os.path.join('.', 'var', 'log')
os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, log_filename)
# Re-running this cell must not attach a second handler for the same file,
# otherwise every record would be written twice. FileHandler stores the
# absolute path in `baseFilename`, so compare against that.
file_handler = next(
    (h for h in logger.handlers
     if getattr(h, 'baseFilename', None) == os.path.abspath(log_path)),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

## Load the configuration
cfg_path = os.path.join('.', 'config.json')
# 'utf-8-sig' transparently skips a UTF-8 byte-order mark if the file has one.
with open(cfg_path, 'r', encoding='utf-8-sig') as f:
    cfg = json.load(f)
# Database connection information
db_cfg = cfg['db_connect']
# The config stores the name of a module as a string; replace it with the
# imported module object (presumably the DB driver handed to DataManager --
# TODO confirm). Note: for a dotted name __import__ returns the top-level package.
db_cfg['creator'] = __import__(db_cfg['creator'])

In [None]:
## Date range of the data to load
time_start, time_end = '2022-03-01', '2023-04-01'

## Query the data
DM = DataManager(logger, db_cfg, time_start, time_end)
# Alternative: fetch every table in one call
# Behavior,MemberData,OrderData,OrderSlave,SalePageData,SegmentData = DM.read_ALL_data_from_db()
# Here only one table is fetched
OrderData = DM.read_data_from_db(TableName='OrderData')

In [None]:
# Keep only orders whose status is 'Finish' or 'Return'.
OrderData = OrderData[OrderData['StatusDef'].isin(['Finish', 'Return'])].reset_index(drop=True)
# Normalise OrderDateTime to whole seconds but keep it as a real datetime
# column: the interval computation (.diff() / .dt.total_seconds()) and the
# Timedelta arithmetic performed later both require datetimes, and would
# raise on the strings that strftime() produces.
OrderData['OrderDateTime'] = pd.to_datetime(OrderData['OrderDateTime']).dt.floor('s')

In [None]:
def cal_median(data):
    """Return the median gap, in seconds, between consecutive orders of a member.

    The gap of a row is the time elapsed since the previous row (in
    OrderDateTime order) that has the same ShopMemberId. Only rows whose
    StatusDef is 'Finish' and whose gap is non-zero enter the median.

    Side effects (kept for existing callers): ``data`` is sorted by
    OrderDateTime in place, its index is reset, and an ``interval`` column
    holding the gap in seconds (0 for a member's first order) is added.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ShopMemberId, OrderDateTime (datetime dtype) and
        StatusDef.

    Returns
    -------
    float
        The median rounded to whole seconds; NaN when no row qualifies.
    """
    # In place on purpose: callers observe the sorted, re-indexed frame.
    data.sort_values('OrderDateTime', inplace=True)
    data.reset_index(drop=True, inplace=True)
    # A member's first order has no predecessor -> NaT -> NaN seconds -> 0.
    gaps = data.groupby('ShopMemberId')['OrderDateTime'].diff()
    # Assign the filled result directly; a chained
    # `data['interval'].fillna(..., inplace=True)` does not write back to
    # `data` when pandas Copy-on-Write is active.
    data['interval'] = gaps.dt.total_seconds().fillna(0)
    qualifying = (data['StatusDef'] == 'Finish') & (data['interval'] != 0)
    eta = round(data.loc[qualifying, 'interval'].median(), 0)
    return eta

In [None]:
def labeling(x, overall_interval_median, period_multiplier=3, shrink_weight=0.5):
    """Assign a rule-based label to one member from their finished orders.

    Reads the module-level ``OrderData`` frame and calls ``cal_median`` on the
    member's own finished orders.

    Rules (F = number of the member's 'Finish' rows with a TradesGroupCode,
    R = number of distinct TradesGroupCode values inside the look-back window):

    * ``'A'`` -- F > 1 and R > 0
    * ``'P'`` -- F > 1 and R == 0
    * ``'N'`` -- F == 1 and R == 1
    * ``'L'`` -- anything else

    The look-back window ends at the latest OrderDateTime found in the whole
    of ``OrderData`` and spans ``period_multiplier`` purchase periods. For a
    member with F > 1 the period is the member's own median interval pulled
    towards ``overall_interval_median`` by ``shrink_weight``; otherwise (or
    when the member's median is undefined) it is ``overall_interval_median``.

    Parameters
    ----------
    x : scalar
        The ShopMemberId to label.
    overall_interval_median : float
        Median purchase interval of the whole data set, in seconds.
    period_multiplier : float, default 3
        Number of purchase periods the look-back window covers.
    shrink_weight : float, default 0.5
        Fraction of the distance between the member's median and the overall
        median that is added to the member's median.

    Returns
    -------
    str or None
        One of 'A', 'P', 'N', 'L'; ``None`` when the member has no 'Finish'
        order.
    """
    is_member_finish = (OrderData['ShopMemberId'] == x) & (OrderData['StatusDef'] == 'Finish')
    member_orders = (
        OrderData[is_member_finish]
        .reset_index(drop=True)
        # No-op for a datetime column; lets string timestamps work as well.
        .assign(OrderDateTime=lambda frame: pd.to_datetime(frame['OrderDateTime']))
    )
    if member_orders.empty:
        return None

    # The frame holds a single member, so a plain count replaces the
    # groupby(...).count()[0] lookup, which indexed an id-labelled Series
    # with the label 0.
    F = member_orders['TradesGroupCode'].count()

    med = overall_interval_median
    if F > 1:
        member_median = cal_median(member_orders)
        # The member median is NaN when every interval is zero (e.g. all
        # rows share one timestamp); keep the overall median in that case.
        if not pd.isna(member_median):
            med = abs(member_median + (overall_interval_median - member_median) * shrink_weight)

    # Latest order timestamp in the whole data set: the end of the window.
    last_buy_time = pd.to_datetime(OrderData['OrderDateTime'].max())
    window_start = last_buy_time - pd.Timedelta(seconds=period_multiplier * med)

    # Distinct purchases (TradesGroupCode) inside the window, bounds inclusive.
    in_window = member_orders['OrderDateTime'].between(window_start, last_buy_time)
    purchase_times_in_window = member_orders.loc[in_window, 'TradesGroupCode'].nunique()

    # Rule-based labelling. `and` is required here: `F>1 & n > 0` would be
    # parsed as the chained comparison `F > (1 & n) > 0`.
    if F > 1 and purchase_times_in_window > 0:
        return 'A'
    if F > 1 and purchase_times_in_window == 0:
        return 'P'
    if F == 1 and purchase_times_in_window == 1:
        return 'N'
    return 'L'

In [None]:
from tqdm import tqdm

# Median purchase interval over the whole data set (cal_median also sorts
# OrderData by OrderDateTime in place and adds an `interval` column to it).
overall_median = cal_median(OrderData)

# One row per distinct member id.
labeled_data = (
    OrderData
    .drop_duplicates(subset='ShopMemberId')
    .loc[:, ['ShopMemberId']]
    .reset_index(drop=True)
)

# Label every member; tqdm is imported above in case a row-by-row loop with
# a progress bar is preferred over apply.
labeled_data['Label'] = labeled_data['ShopMemberId'].apply(
    lambda member_id: labeling(member_id, overall_median)
)

# Members for which labeling returned nothing are dropped.
labeled_data = labeled_data.dropna().reset_index(drop=True)

In [None]:
# Distinct labels that were actually assigned.
labeled_data['Label'].unique()