In [57]:
from odps_client import get_odps_sql_result_as_df, create_directory_if_not_exists
import pandas as pd

# Output directory for this analysis; the CSV export at the end of the notebook
# is written under it. NOTE(review): the helper lives in odps_client (not shown
# here) -- its name suggests it creates the directory when missing and returns
# its path; confirm.
data_path = create_directory_if_not_exists("商城快捷登录用户转化分析")

In [None]:
# Load the merchant cohort from the latest partition of ods_merchant_df:
# merchants registered on/after 2024-09-01 that are not locked (islock = 0),
# have operate_status 0/2/3, and are either size '单店' or admin_id 1058104.
# Two left joins enrich each merchant:
#   b: number of 'view' log rows plus first/last view time since ds 20240901
#   c: earliest contact create_time from the latest ods_contact_df partition
# 注册日期 is the earlier of register_time and that earliest contact time.
# NOTE(review): `envent_type` looks like a misspelling of `event_type` --
# confirm against the dwd_log_mall_di schema before touching it.
merchant_df = get_odps_sql_result_as_df(
    """
SELECT  a.m_id AS cust_id
        ,CASE   WHEN invitecode = 'seeegj' THEN '自主注册'
                ELSE 'BD邀请'
        END 是否自主注册
        ,case when min_contact_create_time is null or register_time<min_contact_create_time then register_time 
                else min_contact_create_time end 注册日期
        ,CASE   WHEN operate_status = 0 THEN '已审核通过'
                WHEN operate_status = 2 THEN '待提交信息'
                WHEN operate_status = 3 THEN '已提交信息'
        END AS 审核状态
        ,CASE   WHEN b.cnt > 0 THEN '活跃用户'
                ELSE '未活跃'
        END 是否活跃
        ,size 是否单店,首次访问日期,最后访问日期
FROM    summerfarm_tech.ods_merchant_df a
LEFT JOIN   (
                SELECT  cust_id
                        ,COUNT(1) cnt
                        ,min(time) as 首次访问日期
                        ,max(time) as 最后访问日期
                FROM    summerfarm_tech.dwd_log_mall_di
                WHERE   ds >= '20240901'
                AND     envent_type = 'view'
                GROUP BY cust_id
            ) b ON      a.m_id = b.cust_id
LEFT JOIN   (   select m_id,min(create_time) as min_contact_create_time
                from summerfarm_tech.ods_contact_df
                where ds=max_pt('summerfarm_tech.ods_contact_df') group by m_id
            )c on c.m_id = a.m_id
WHERE   ds = MAX_PT("summerfarm_tech.ods_merchant_df")
AND     (size = '单店' or admin_id = 1058104)
AND     register_time >= '2024-09-01 00:00:00'
AND     islock = 0
AND     operate_status IN (0,2,3) -- 0:已审核通过，2:待提交信息，3:已提交信息
;
"""
)

# The SQL cutoff is on register_time, but 注册日期 may be an earlier contact
# creation time, so the same 2024-09-01 cutoff is re-applied to the derived column.
merchant_df = merchant_df[merchant_df["注册日期"] >= "2024-09-01 00:00:00"]

def is_after_optimization(
    register_time: pd.Timestamp,
    release_day: pd.Timestamp = pd.Timestamp("2024-09-20"),
) -> "str | None":
    """Classify a registration time relative to the release day of the optimization.

    Args:
        register_time: Registration moment. Anything ``pd.Timestamp`` can parse
            is accepted (Timestamp, datetime/date, ISO-like string).
        release_day: Calendar day the optimization shipped; only its date part
            is used. Defaults to 2024-09-20.

    Returns:
        "优化前" when the registration is before the release day,
        "发布当日" when it falls on the release day,
        "优化后注册" when it is on any later day,
        or ``None`` when ``register_time`` is missing (None / NaN / NaT).
    """
    # Comparisons against NaT are always False, which would otherwise fall
    # through to the release-day label; report missing input explicitly.
    if pd.isna(register_time):
        return None

    registered_at = pd.Timestamp(register_time)
    release_start = pd.Timestamp(release_day).normalize()
    release_end = release_start + pd.Timedelta(days=1)

    if registered_at >= release_end:
        return "优化后注册"
    if registered_at < release_start:
        return "优化前"
    return "发布当日"


# Add the "是否优化后注册" column: one before/on/after-release label per merchant,
# derived from its registration date.
merchant_df["是否优化后注册"] = [
    is_after_optimization(registered_at) for registered_at in merchant_df["注册日期"]
]

In [None]:
# Truncate the registration, first-visit and last-visit timestamps to calendar dates.
merchant_df = merchant_df.assign(
    **{
        date_col: pd.to_datetime(merchant_df[date_col]).dt.date
        for date_col in ("注册日期", "首次访问日期", "最后访问日期")
    }
)

# Quick check of which review statuses are present in the cohort.
print(merchant_df["审核状态"].unique())

merchant_df.head(5)

In [None]:
from datetime import datetime,timedelta

# Load orders placed on/after 2024-09-01 (latest ods_orders_df partition) by
# merchants whose register_time is on/after 2024-09-01, keeping order status 2/3/6.
# NOTE(review): the meaning of status codes 2, 3 and 6 is not visible here -- confirm.
# The merchant subquery filters on register_time only; the narrower cohort filters
# take effect when this frame is left-merged onto merchant_df later.
user_orders_df=get_odps_sql_result_as_df("""
SELECT  m_id
        ,total_price
        ,order_no
        ,order_time 下单日期
FROM    summerfarm_tech.ods_orders_df
WHERE   ds = MAX_PT("summerfarm_tech.ods_orders_df")
AND     m_id in (select m_id FROM summerfarm_tech.ods_merchant_df where ds=max_pt('summerfarm_tech.ods_merchant_df') and register_time >= '2024-09-01 00:00:00')
AND     order_time >= '2024-09-01 00:00:00'
AND     status in (2,3,6);
""")

# Keep only the calendar date of each order.
user_orders_df['下单日期'] = pd.to_datetime(user_orders_df['下单日期']).dt.date

user_orders_df.head(5)

In [None]:
import numpy as np
# Left-join every merchant in the cohort to its orders (merchants without an
# order keep a single row with empty order columns), then drop rows that have
# no value in "是否优化后注册".
merged_df = pd.merge(
    merchant_df,
    user_orders_df,
    how="left",
    left_on=["cust_id"],
    right_on=["m_id"],
    suffixes=("", "_b"),
).dropna(subset=["是否优化后注册"])

# Derived columns, evaluated in order (later entries see the earlier ones):
#   - order / registration dates back to datetime64 so they can be subtracted
#   - whole days between registration and the order
#   - "是否活跃" recomputed: '活跃' only when the last visit date is later than the first
#   - total_price as float
merged_df = merged_df.assign(
    **{
        "下单日期": lambda frame: pd.to_datetime(frame["下单日期"]),
        "注册日期": lambda frame: pd.to_datetime(frame["注册日期"]),
        "下单日距注册日天数": lambda frame: (frame["下单日期"] - frame["注册日期"]).dt.days,
        "是否活跃": lambda frame: np.where(
            frame["最后访问日期"] > frame["首次访问日期"], "活跃", "不活跃"
        ),
        "total_price": lambda frame: frame["total_price"].astype(float),
    }
)

merged_df.head(5)


In [None]:
# Summary statistics of the merged merchant/order frame (pandas' default column
# selection) as a sanity check before aggregating.
merged_df.describe()

In [None]:
import pandasql

# Give the grouping / metric columns English names; the SQL aggregation that
# follows refers to them by these identifiers.
merged_df = merged_df.rename(
    columns={
        "是否自主注册": "is_self_register",
        "审核状态": "audit_status",
        "是否优化后注册": "is_optimized_register",
        "下单日距注册日天数": "days_since_register",
        "是否活跃": "is_active",
    }
)

# Aggregate merged_df per (is_self_register, audit_status, is_optimized_register):
# distinct merchants, merchants / orders / GMV with an order at most 14 days after
# registration, active vs inactive merchants, the 14-day UV conversion rate, and
# the covered order / registration date ranges.
# Only merchants registered at least 14 days before the run date are kept, so the
# result depends on when the cell is executed (date('now')).
# NOTE(review): 注册日期 is a datetime column at this point and the later cleanup
# strips a ' 00:00:00.000000' suffix, so it is presumably compared as text against
# a 'YYYY-MM-DD' value; in that case a registration exactly 14 days ago does NOT
# satisfy `<=` -- confirm that excluding the boundary day is intended.
# NOTE(review): scalar `concat()` is only available in recent SQLite builds --
# confirm the SQLite used by pandasql in this environment supports it.
df = pandasql.sqldf(
    """
SELECT is_self_register, audit_status, is_optimized_register,
       COUNT(DISTINCT cust_id) AS total_users,
       COUNT(DISTINCT CASE WHEN days_since_register <= 14 THEN cust_id END) AS users_ordered_within_14days,
       COUNT(DISTINCT CASE WHEN is_active = '活跃' THEN cust_id END) AS active_users,
       COUNT(DISTINCT CASE WHEN is_active = '不活跃' THEN cust_id END) AS inactive_users,
       SUM(CASE WHEN days_since_register <= 14 THEN total_price ELSE 0 END) AS gmv_within_14days,
       COUNT(DISTINCT CASE WHEN days_since_register <= 14 THEN order_no END) AS orders_within_14days,
       ROUND(1.0 * COUNT(DISTINCT CASE WHEN days_since_register <= 14 THEN cust_id END) / COUNT(DISTINCT cust_id), 2) AS uv_conversion_rate_within_14days,
       concat(min(CASE WHEN days_since_register <= 14 THEN `下单日期` END),'~',max(CASE WHEN days_since_register <= 14 THEN `下单日期` END)) as 下单日期范围,
       concat(min(`注册日期`),'~',max(`注册日期`)) as 注册日期范围,
       count(distinct `注册日期`) as `注册日期跨度(天)`
FROM merged_df
WHERE `注册日期` <= date('now', '-14 days')
GROUP BY is_self_register, audit_status, is_optimized_register
ORDER BY is_self_register, audit_status, is_optimized_register
"""
)

# Restore the Chinese display headers. The list is positional, so it must stay
# in the same order as the SELECT list of the aggregation query.
df.columns = [
    "是否自主注册",
    "审核状态",
    "是否优化后注册",
    "用户数",
    "14天内下单用户数",
    "活跃用户数",
    "不活跃用户数",
    "14天内GMV",
    "14天内订单数",
    "14天内下单UV转化率",
    "下单日期范围",
    "注册日期范围",
    "注册日期跨度(天)",
]

# Strip the midnight time suffix from both range columns. regex=False makes this
# a literal replacement on every pandas version (older releases default to
# regex=True, where each "." would match any character).
df["下单日期范围"] = df["下单日期范围"].str.replace(
    " 00:00:00.000000", "", regex=False
)
df["注册日期范围"] = df["注册日期范围"].str.replace(
    " 00:00:00.000000", "", regex=False
)

# Conversion rate relative to active users. Groups with zero active users get
# NaN instead of inf from a division by zero.
df["14天内下单UV转化率(活跃用户)"] = (
    df["14天内下单用户数"] / df["活跃用户数"].where(df["活跃用户数"] > 0)
).round(2)

# Show the summary with the user counts first, then the 14-day order metrics,
# the two conversion rates, and finally the date-range columns.
df.loc[
    :,
    [
        "是否自主注册", "审核状态", "是否优化后注册",
        "用户数", "活跃用户数", "不活跃用户数",
        "14天内下单用户数", "14天内GMV", "14天内订单数",
        "14天内下单UV转化率", "14天内下单UV转化率(活跃用户)",
        "下单日期范围", "注册日期范围", "注册日期跨度(天)",
    ],
]

In [58]:
# Name the export after the overall registration window: earliest range start
# and latest range end across ALL groups. Picking fixed rows 0 and 1 breaks when
# there is a single group and depends on how the groups happen to sort.
# The "start~end" values are ISO-formatted, so string min/max is chronological.
date_range = (
    df["注册日期范围"].str.split("~").str[0].min()
    + "~"
    + df["注册日期范围"].str.split("~").str[1].max()
)
df.to_csv(f"{data_path}/客户群分层分析_{date_range}.csv", index=False)