In [None]:
# Core data-handling packages
import pandas as pd
import numpy as np

# Date handling and notebook progress bars
from datetime import datetime
from tqdm.notebook import tqdm, tnrange
tqdm.pandas(desc="Loading...")  # set the progress-bar label used for pandas operations
import swifter

# Tell pyecharts which notebook front-end it renders into
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

# pyecharts theme enum
from pyecharts.globals import ThemeType

# pyecharts chart classes and option builders
from pyecharts.charts import Funnel, Bar, Line, Grid, TreeMap, Page, Pie, Boxplot
from pyecharts import options as opts
# NOTE(review): ThemeType is already imported a few lines above; this second import is redundant
from pyecharts.globals import ThemeType

# Static plotting
import matplotlib.pyplot as plt
import squarify

### 一、資料清洗的過程

In [None]:
# Load the raw behaviour log: the CSV has no header row and at most 10,000,000 rows are read
df = pd.read_csv(
    'Data/UserBehavior.csv',
    header=None,
    nrows=10000000,
    parse_dates=True,
)

# Report how many rows were actually loaded
print('Loading Finished! The file length is {} rows'.format(df.shape[0]))

In [None]:
# Give the five positional columns meaningful names
df.columns = pd.Index(['User_Id', 'Item_Id', 'Category_Id', 'Behavior', 'Time'])

# Confirm that the rename step ran
print('Renamed column Finished...')

In [None]:
# Interpret Time as epoch seconds, then build the two (exclusive) bounds of the study window
df['Time'] = pd.to_datetime(df['Time'], unit="s")
mask_1 = df['Time'].gt(datetime(2017, 11, 24))
mask_2 = df['Time'].lt(datetime(2017, 12, 3))

# Keep rows satisfying both bounds and renumber them from zero
df = df.loc[mask_1 & mask_2].reset_index(drop=True)

# Confirm that the filter step ran
print('Filtering data Finished!')

In [None]:
# Print a structural summary (columns and dtypes) of the filtered frame
df.info()

In [None]:
# Preview the first five rows of the filtered frame
df.head()

#### i. 縮減資料量

In [None]:
# Keep only the columns the behaviour analysis uses, on a fresh 0..n-1 index
df_behav = df[['User_Id', 'Time', 'Behavior']].reset_index(drop=True)

# Derive the calendar date and the hour of day from each timestamp
df_behav['Date'] = df_behav['Time'].dt.date
df_behav['Hour'] = df_behav['Time'].dt.hour

In [None]:
# Convert the four grouping columns to the category dtype in a single pass
df_behav = df_behav.astype({
    'Date': 'category',
    'Hour': 'category',
    'Behavior': 'category',
    'User_Id': 'category',
})

In [None]:
# Print a structural summary of the behaviour frame after the dtype conversion
df_behav.info()

In [None]:
# Preview the first five rows of the behaviour frame
df_behav.head()

### 二、分析內容

#### RFM 分析流程

In [None]:
# Purchase events only, restricted to the columns the RFM steps read
df_rfm = df_behav.query('Behavior == "buy"')[['User_Id', 'Date', 'Time']]

# Print a structural summary of the purchase frame
df_rfm.info()

#### 對 Recency 進行分析

In [None]:
# Latest purchase timestamp per user; rows left without a timestamp are dropped
r = (
    df_rfm
    .groupby('User_Id')['Time']
    .max()
    .reset_index()
    .dropna()
)

In [None]:
# Recency = whole days between the reference date 2017-12-03 and the user's last purchase;
# the raw timestamp column is no longer needed afterwards
r = (
    r.assign(Recency=(pd.to_datetime('2017-12-03') - r['Time']).dt.days)
     .drop(columns='Time')
)

In [None]:
# 製作 Recency 的分布表格

def boxplot_r() -> Boxplot:
    '''
    Build a box plot of the Recency distribution.

    Reads the module-level frame ``r`` (column ``Recency``) and returns a
    pyecharts ``Boxplot`` with a single category on the x-axis.
    '''
    r_data = r.Recency.to_list()
    # prepare_data turns the raw values into the box statistics pyecharts plots
    prepared = Boxplot().prepare_data([r_data])

    boxplot = (
        Boxplot()
        # Axis label spelled "Recency" (was the typo "Rrecency")
        .add_xaxis(["Recency"])
        .add_yaxis("", prepared)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="Recency 資料分布"
            ),
            legend_opts=opts.LegendOpts(
                pos_top="5%"
            )
        )
    )
    return boxplot
boxplot_r().render_notebook()

#### 對 Frequency 進行分析

In [None]:
# Frequency = number of purchase rows per user; users with a zero count are removed
f = (
    df_rfm
    .groupby(['User_Id'])['Time']
    .count()
    .reset_index()
    .loc[lambda counts: counts['Time'] > 0]
    .reset_index(drop=True)
)
f.columns = ['User_Id', 'Frequency']
f.head()

In [None]:
# 製作 Frequency 的分布表格

def boxplot_f() -> Boxplot:
    '''
    Build a box plot of the Frequency distribution.

    Reads the module-level frame ``f`` (column ``Frequency``) and returns a
    pyecharts ``Boxplot`` with a single category on the x-axis.
    '''
    # Box statistics for the one series being plotted
    box_stats = Boxplot().prepare_data([f.Frequency.to_list()])

    chart_title = opts.TitleOpts(title="Frequency 資料分布")
    chart_legend = opts.LegendOpts(pos_top="5%")

    chart = Boxplot()
    chart.add_xaxis(["Frequency"])
    chart.add_yaxis("", box_stats)
    chart.set_global_opts(title_opts=chart_title, legend_opts=chart_legend)
    return chart
boxplot_f().render_notebook()

#### 進行分組建模

In [None]:
# Join recency and frequency on the user key; only users present in both frames are kept
rfm_table = r.merge(f, on='User_Id', how='inner')
rfm_table.head()

In [None]:
# Quartiles of the two numeric RFM inputs.
# The columns are selected explicitly because rfm_table also carries the categorical
# User_Id column, and DataFrame.quantile raises on non-numeric columns in pandas 2.x
# (older versions skipped them silently).
rfm_table[['Recency', 'Frequency']].quantile([.25, .5, .75])

In [None]:
# Score each user with left-closed bins (right=False):
#   Recency   [0,2) -> 3, [2,4) -> 2, [4,11) -> 1    (a more recent purchase scores higher)
#   Frequency 1 -> 1, 2 -> 2, 3 -> 3, 4 -> 4, 5+ -> 5 (more purchases score higher)
# The frequency labels must ascend with the bins: with descending labels a one-time buyer
# would receive the top score and every later "above average" flag would be inverted.
rfm_table['R_Score'] = pd.cut(rfm_table['Recency'], bins=[0, 2, 4, 11], labels=[3,2,1], right=False).astype('float')
rfm_table['F_Score'] = pd.cut(rfm_table['Frequency'], bins=[1, 2, 3, 4, 5, 100000], labels=[1,2,3,4,5], right=False).astype('float')

In [None]:
# Flag (1/0) whether each score is strictly above the mean of its column
r_mean = rfm_table['R_Score'].mean()
rfm_table['R_Big_then_Avg'] = rfm_table['R_Score'].gt(r_mean).mul(1)
f_mean = rfm_table['F_Score'].mean()
rfm_table['F_Big_then_Avg'] = rfm_table['F_Score'].gt(f_mean).mul(1)

# Combine both flags into a two-digit segment code: tens digit = R flag, units digit = F flag
rfm_table['Type'] = rfm_table['R_Big_then_Avg'].mul(10).add(rfm_table['F_Big_then_Avg'].mul(1))

#### 繪製用戶分配比例

In [None]:
# 進行資料對應轉換
mapping_type = {
    0: "重點挽留用戶",
    1: "一般保持用戶",
    10: "潛在消費用戶",
    11: "重要價值用戶"
}


# 分組計算個別數值
rfm_type = rfm_table.Type.value_counts().reset_index()
rfm_type.columns = ['Type', 'Count']

# 映射分類欄位
rfm_type['Type'] = rfm_type.Type.map(mapping_type)

# 計算比例
rfm_type['Ratio'] = round(100 * rfm_type.Count / rfm_type.Count.sum(), 1)

# 顯示結果
rfm_type

In [None]:
# 繪製 RFM 表格
def rfm_chart():
    '''
    Build a pie chart of the user segments.

    Reads the module-level frame ``rfm_type`` and plots each segment name
    (``Type``) against its percentage share (``Ratio``).
    '''
    # pyecharts expects a list of [name, value] pairs
    segments = [
        [segment, share]
        for segment, share in zip(rfm_type.Type.to_list(), rfm_type.Ratio.to_list())
    ]

    pie = Pie()
    pie.add("", segments)
    pie.set_global_opts(title_opts=opts.TitleOpts(title="RF 用戶分類圖"))
    pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}%"))
    return pie
rfm_chart().render_notebook()

### AARRR 模型

#### 用戶一週活躍行為：

In [None]:
# Only the user and date columns are needed for the daily-activity counts
df_date = df_behav[['User_Id', 'Date']]
df_date.info()

In [None]:
# Distinct users per calendar date, returned as a two-column frame (Date, Volume)
daily_active = (
    df_date
    .groupby('Date')['User_Id']
    .nunique()
    .reset_index(name='Volume')
)

In [None]:
# 繪製每日活躍人數的圖表
def daily_act():
    '''
    Build a line chart of daily active users.

    Reads the module-level frame ``daily_active``; its first row is left out
    of the chart and the remaining dates are shown as text on the x-axis.
    '''
    # Everything after the first row
    plotted = daily_active.iloc[1:]
    user_counts = plotted['Volume'].to_list()
    day_labels = plotted['Date'].astype(str).str.slice(stop=10).to_list()

    chart = Line()
    chart.add_xaxis(day_labels)
    chart.add_yaxis("活躍人數", user_counts)
    chart.set_global_opts(title_opts=opts.TitleOpts(title="每日用戶活躍數"))
    return chart
daily_act().render_notebook()

#### 於 12-02 的活躍高點佔總活躍人數比

In [None]:
# Distinct users active on the peak day versus distinct users over the whole window.
# Date holds datetime.date objects (it is built from Time.dt.date), so the comparison
# value must be a date too: comparing against the string "2017-12-02" matches no rows
# and makes the ratio 0.
peak_day = datetime(2017, 12, 2).date()
high_point = df_behav[df_behav['Date'] == peak_day].User_Id.nunique()
total_user = df_behav.User_Id.nunique()

# Print the share as a percentage
print('活躍用戶佔比為：{:.2f}%'.format(100 * high_point / total_user))

#### 一週內用戶行為變化

In [None]:
def week_behav(col, data=None):
    '''
    Return the per-day event counts for one behaviour type.

    Parameters
    ----------
    col : str
        Behaviour label to count (e.g. 'buy', 'cart', 'fav', 'pv').
    data : pandas.DataFrame, optional
        Frame with ``Date`` and ``Behavior`` columns. Defaults to the
        module-level ``df_behav``.

    Returns
    -------
    list
        One count per date in ascending date order, with the first date
        left out (the charts built on top of this plot the remaining days).

    Raises
    ------
    KeyError
        If ``col`` never occurs in the ``Behavior`` column.
    '''
    source = df_behav if data is None else data

    # Count rows per (date, behaviour) and select the requested behaviour by name,
    # rather than pivoting every column and relying on the position of the result.
    counts = (
        source.groupby(['Date', 'Behavior'], observed=False)
        .size()
        .unstack('Behavior', fill_value=0)
    )
    return counts[col].to_list()[1:]
# Preview the per-day page-view counts returned by the helper
week_behav('pv')

In [None]:

def week_chart() -> Grid:
    '''
    Build a two-panel chart of user behaviour across the plotted days.

    The upper panel is a stacked bar chart of purchases, cart additions and
    favourites; the lower panel is a line chart of page views. Every series
    comes from ``week_behav`` and is plotted against eight fixed day labels.
    '''
    day_labels = ['Sat (11/25)', 'Sun (11/26)', 'Mon (11/27)', 'Tues (11/28)', 'Wed (11/29)', 'Thur (11/30)', 'Fri (12/01)', 'Sat (12/02)']

    # (legend name, behaviour label) for each stacked bar series, in drawing order
    stacked_series = [
        ("購買量", "buy"),
        ("購物車數量", "cart"),
        ("我的最愛", "fav"),
    ]

    bar = Bar()
    bar.add_xaxis(day_labels)
    for legend_name, behavior in stacked_series:
        bar.add_yaxis(legend_name, week_behav(behavior), stack="stack_1", category_gap=50)
    bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True))
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title="用戶行為變化週期曲線"),
        legend_opts=opts.LegendOpts(pos_top="5%"),
    )

    line = Line()
    line.add_xaxis(day_labels)
    line.add_yaxis("瀏覽人次", week_behav("pv"))
    line.set_global_opts(
        title_opts=opts.TitleOpts(title="用戶瀏覽人次", pos_top="48%"),
        legend_opts=opts.LegendOpts(pos_top="50%"),
    )

    # Stack the two charts vertically in a single grid
    grid = Grid(init_opts=opts.InitOpts(width='1050px'))
    grid.add(bar, grid_opts=opts.GridOpts(pos_bottom="60%"))
    grid.add(line, grid_opts=opts.GridOpts(pos_top="60%"))
    return grid

week_chart().render_notebook()

#### 銷售排行

In [None]:
# 引入必要套件
import json

# Frame searched by the item rankings: one row per event, item id plus behaviour type
df_item = df.loc[:, ['Item_Id', 'Behavior']]

def search_item_sales(behavior_Type, top_n=10, items=None) -> list:
    '''
    Return the most frequent items for one behaviour type, as TreeMap nodes.

    Parameters
    ----------
    behavior_Type : str
        Behaviour label to rank by (e.g. 'buy', 'cart', 'fav', 'pv').
    top_n : int, optional
        Maximum number of items to return (default 10).
    items : pandas.DataFrame, optional
        Frame with ``Item_Id`` and ``Behavior`` columns. Defaults to the
        module-level ``df_item``.

    Returns
    -------
    list of dict
        ``{'value': <event count>, 'name': <item id>}`` per item, ordered by
        count descending. The list is shorter than ``top_n`` when fewer
        distinct items exist, and empty when the behaviour never occurs.
    '''
    source = df_item if items is None else items
    matching = source[source['Behavior'] == behavior_Type]

    # Events per item, largest first, truncated to the requested number of rows
    top = (
        matching.groupby('Item_Id')['Behavior']
        .count()
        .sort_values(ascending=False)
        .head(top_n)
    )

    # Iterate over the rows that actually exist instead of a fixed rank range,
    # so the loop can never index past the end of the ranking.
    # tolist() yields plain Python scalars rather than numpy ones.
    return [
        {'value': count, 'name': item_id}
        for item_id, count in zip(top.index.tolist(), top.tolist())
    ]

In [None]:

def item_ranking(Behavior_Type, title) -> TreeMap:
    '''
    Build a tree map of the top-ranked items for one behaviour type.

    Parameters
    ----------
    Behavior_Type : str
        Behaviour label passed to ``search_item_sales``; also used as the
        name of the root node.
    title : str
        Chart title.
    '''
    # Single root node whose children are the ranked items
    root_node = {
        "name": Behavior_Type,
        "children": search_item_sales(Behavior_Type),
    }

    chart = TreeMap(init_opts=opts.InitOpts(theme="light"))
    chart.add(
        "數量",
        [root_node],
        is_selected=True,
        roam='move',
        label_opts=opts.LabelOpts(position='inside'),
    )
    chart.set_global_opts(title_opts=opts.TitleOpts(title=title))
    return chart
item_ranking('buy', '商品銷售排行').render_notebook()

In [None]:
# Render the ranking of items most often added to the cart
item_ranking('cart', '購物車商品排行').render_notebook()

In [None]:
# Render the ranking of items most often marked as a favourite
item_ranking('fav', '我的最愛商品排行').render_notebook()

In [None]:
# Render the ranking of items with the most page views
item_ranking('pv', '商品瀏覽排行').render_notebook()