In [None]:
import os
from typing import List

from alibabacloud_sls20201230.client import Client as Sls20201230Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_sls20201230 import models as sls_20201230_models
from alibabacloud_tea_util import models as util_models
from alibabacloud_tea_util.client import Client as UtilClient
import os
import pandas as pd
from datetime import datetime, timedelta

import logging

# Root logger: INFO level, lines formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Credentials are read from the environment; a missing variable raises KeyError here.
ALIBABA_CLOUD_ACCESS_KEY_ID = os.environ["ALIBABA_CLOUD_ACCESS_KEY_ID"]
ALIBABA_CLOUD_ACCESS_KEY_SECRET = os.environ["ALIBABA_CLOUD_ACCESS_KEY_SECRET"]

# Timeout values are presumably milliseconds (120 s read / 10 s connect) — TODO confirm.
config = open_api_models.Config(
    access_key_id=ALIBABA_CLOUD_ACCESS_KEY_ID,
    access_key_secret=ALIBABA_CLOUD_ACCESS_KEY_SECRET,
    read_timeout=120_000,
    connect_timeout=10_000,
    no_proxy="cn-hangzhou.log.aliyuncs.com",
)
# For the list of endpoints see https://api.aliyun.com/product/Sls
config.endpoint = "cn-hangzhou.log.aliyuncs.com"

# Shared client used by every query helper in this notebook.
sls_client = Sls20201230Client(config)
sls_client._read_timeout = 120_000

In [None]:
# Fetch query results from SLS logs
import time

# Common HTTP headers for SLS requests; get_sls_data_by_query passes this dict
# as `common_headers` when it builds GetLogsV2Headers.
headers = {
    "accept": "application/json",
    "user-agent": "AlibabaCloud API Workbench",
    "x-log-apiversion": "0.6.0",
    "x-log-bodyrawsize": "0",
    "x-log-signaturemethod": "hmac-sha256",
    "content-type": "application/json",
}


def get_sls_data_by_query(
    from_time: datetime,
    to_time: datetime,
    query: str = "",
    project: str = "xianmu-front-end-log",
    logstore: str = "xm-mall",
    retry_time: int = 1,
    line: int = 100,
    offset: int = 0,
) -> "pd.DataFrame | None":
    """Run a GetLogsV2 query against SLS and return the rows as a DataFrame.

    Args:
        from_time: Start of the query window. Converted with ``.timestamp()``,
            so a naive datetime is interpreted in the machine's local timezone.
        to_time: End of the query window (same conversion as ``from_time``).
        query: SLS query / analytic statement, forwarded unchanged.
        project: SLS project name.
        logstore: SLS logstore name.
        retry_time: Number of retries allowed after the first attempt
            (``1`` means at most two attempts). A negative value makes the
            function return ``None`` without querying.
        line: Forwarded as ``line`` on the request (presumably the maximum
            number of rows returned — confirm against the GetLogsV2 docs).
        offset: Forwarded as ``offset`` on the request (presumably the start
            row for pagination — confirm against the GetLogsV2 docs).

    Returns:
        A DataFrame built from ``response.body.data`` (empty when the response
        carries no rows), or ``None`` when every attempt failed.
    """
    if retry_time < 0:
        logging.error("超过了最多重试次数")
        return None

    from_ = int(from_time.timestamp())
    to_ = int(to_time.timestamp())

    # The request objects do not change between attempts, so build them once.
    get_logs_v2headers = sls_20201230_models.GetLogsV2Headers(
        common_headers=headers, accept_encoding="lz4"
    )
    get_logs_v2request = sls_20201230_models.GetLogsV2Request(
        from_=from_,
        to=to_,
        query=query,
        line=line,
        offset=offset,
    )
    runtime = util_models.RuntimeOptions(
        connect_timeout=10 * 1000,
        read_timeout=120 * 1000,
        no_proxy="cn-hangzhou.log.aliyuncs.com",
        max_attempts=2,
    )

    # Every attempt reuses the same request, so `line` and `offset` survive a
    # retry. The previous recursive retry dropped both, which made a failed page
    # come back as page 0 and corrupted paginated reads.
    for retries_left in range(retry_time, -1, -1):
        logging.info(
            f"即将获取数据: =====>from_time:{from_time}, to_time:{to_time}, logstore:{logstore}, query:{query[0:150]}",
        )
        try:
            response = sls_client.get_logs_v2with_options(
                project=project,
                logstore=logstore,
                request=get_logs_v2request,
                headers=get_logs_v2headers,
                runtime=runtime,
            )
            # Treat a missing body payload as "no rows" instead of an error.
            product_view_data = response.body.data or []
            logging.info(f">=====数据条数:{len(product_view_data)}")
            return pd.DataFrame(product_view_data)
        except Exception as error:
            logging.error(
                f"查询SLS失败了,重试:{retries_left},project:{project},logstore:{logstore}, 错误:{error}"
            )
            if retries_left > 0:
                # Wait 5 seconds before the next attempt; no point sleeping
                # after the last one.
                time.sleep(5)

    logging.error("超过了最多重试次数")
    return None

In [None]:
# Query SLS logs:
# fetch each user's A/B bucket (variant) assignment


def get_mid_variant_info(from_time: datetime, to_time: datetime) -> "pd.DataFrame | None":
    """Fetch per-user A/B bucket info for the "new-home" experiment from SLS.

    Runs an analytic query over ``/abStrategy/userExperiments`` log entries,
    grouped by phone, uid, variant id and experiment id, with the request
    count and first/last request time for each group.

    Args:
        from_time: Start of the query window.
        to_time: End of the query window.

    Returns:
        The DataFrame returned by ``get_sls_data_by_query``, or ``None`` when
        that helper returned ``None`` (an error is logged in that case).
    """
    query = """
    ap:/abStrategy/userExperiments | select phone as cust_phone,uid as cust_id,
        json_extract_scalar(json_extract_scalar(ai, '$.rt'),'$.data["new-home"].variantId') variant_id,
        json_extract_scalar(json_extract_scalar(ai, '$.rt'),'$.data["new-home"].experimentId') experiment_id,
        count(1) request_cnt,
        date_format(min(__time__),'%Y-%m-%d %H:%i:%s') min_time,
        date_format(max(__time__),'%Y-%m-%d %H:%i:%s') max_time
    from log 
    where phone is not null and length(phone)>1 group by 1,2,3,4 
    having variant_id is not null limit 1000000
    """

    user_ab_test_info_df = get_sls_data_by_query(
        from_time=from_time,
        to_time=to_time,
        query=query,
    )
    if user_ab_test_info_df is None:
        # The old message was copy-pasted and talked about product exposure data.
        logging.error(f"没有获取到用户的分桶数据:{from_time}~{to_time}")
        return None

    return user_ab_test_info_df

In [None]:
# Fetch user click events (clicks include product-detail-page clicks, add-to-cart clicks, and buy-now clicks)


def get_user_click_data(
    from_time: "datetime | None" = None,
    to_time: "datetime | None" = None,
    page_name: str = "/home",
    page_size: int = 100,
) -> pd.DataFrame:
    """Page through SLS click logs for one page and return all rows.

    Args:
        from_time: Start of the query window. Defaults to one hour before
            ``to_time``, resolved at call time.
        to_time: End of the query window. Defaults to ``datetime.now()``,
            resolved at call time (not when the function is defined).
        page_name: Value matched against the ``pageName`` field in the query.
        page_size: Rows requested per call; a batch shorter than this ends
            the pagination.

    Returns:
        All fetched batches concatenated with a fresh index. An empty
        DataFrame when nothing was fetched. If a later page fails, the rows
        fetched so far are returned and a warning is logged.
    """
    # Resolve the defaults per call; `datetime.now()` in the signature would be
    # frozen at definition time and go stale in a long-running kernel.
    if to_time is None:
        to_time = datetime.now()
    if from_time is None:
        from_time = to_time - timedelta(hours=1)

    query = f"type:cl and pageName:{page_name} and sku"
    batches = []
    offset = 0

    while True:
        batch_df = get_sls_data_by_query(
            from_time=from_time,
            to_time=to_time,
            query=query,
            line=page_size,
            offset=offset,
        )

        if batch_df is None or len(batch_df) == 0:
            if offset == 0:
                logging.error(f"没有获取到用户的点击数据:{from_time}~{to_time}")
            elif batch_df is None:
                # A later page failed: make the truncation visible.
                logging.warning(
                    f"分页查询中断,仅返回已获取的{offset}条数据:{from_time}~{to_time}"
                )
            break

        batches.append(batch_df)
        offset += len(batch_df)

        if len(batch_df) < page_size:
            break

        logging.info("获取更多数据...")

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not batches:
        return pd.DataFrame()
    return pd.concat(batches, ignore_index=True)

In [None]:
# Fetch the users' A/B bucket data and save it

from datetime import datetime, timedelta
from odps_client import write_pandas_df_into_odps

# Back-fill the daily A/B bucket assignments into ODPS, one partition per day,
# starting with today and walking backwards.
N = 2  # Number of days to loop over
end_date = datetime.now().date()
table_name = "summerfarm_ds.temp_mall_new_home_ab_info_di"

for i in range(N):
    current_date = end_date - timedelta(days=i)
    from_time = datetime.combine(current_date, datetime.min.time())
    to_time = from_time + timedelta(hours=24)
    ds = from_time.strftime("%Y%m%d")

    print(f"Processing data for {from_time} to {to_time}, ds:{ds}")

    df = get_mid_variant_info(from_time=from_time, to_time=to_time)
    if df is None or df.empty:
        # The fetch failed or returned nothing. Skip the day rather than crash
        # on `.drop`; this also leaves the existing partition untouched.
        print(f"no data fetched for ds:{ds}, skip writing table:{table_name}")
        continue

    # `errors="ignore"` tolerates results that lack the SLS meta columns.
    df = df.drop(columns=["__source__", "__time__"], errors="ignore")

    partition_spec = f"ds={ds}"
    # NOTE(review): the return value is not checked; presumably a success
    # flag — confirm against odps_client and handle failures if so.
    is_ok = write_pandas_df_into_odps(
        df=df,
        table_name=table_name,
        partition_spec=partition_spec,
        overwrite=True,
    )

    print(f"save {len(df)} records into table:{table_name}, ds:{ds}")

In [None]:
# Fetch the users' click data and save it

from datetime import datetime, timedelta
from odps_client import write_pandas_df_into_odps

# Collect click logs for every tracked page over the last N days into one frame.
N = 8  # Number of days to loop over
end_date = datetime.now().date()

daily_click_frames = []
collected_rows = 0

for page_name in ["/goods", "/search/goods", "/home", "/goods/category"]:
    for i in range(N):
        current_date = end_date - timedelta(days=i)
        from_time = datetime.combine(current_date, datetime.min.time())
        to_time = from_time + timedelta(hours=24)
        ds = from_time.strftime("%Y%m%d")

        print(f"Processing data for {from_time} to {to_time}, ds:{ds}")

        df = get_user_click_data(
            from_time=from_time, to_time=to_time, page_name=page_name
        )
        # A page/day without clicks comes back as a column-less empty frame;
        # `errors="ignore"` keeps the drop from raising KeyError on it.
        df = df.drop(columns=["__source__", "__time__"], errors="ignore")

        daily_click_frames.append(df)
        collected_rows += len(df)

        print(
            f"save {len(df)} records into user_click_all_df:{collected_rows}, ds:{ds}"
        )

# Concatenate once at the end instead of re-copying the frame on every iteration.
user_click_all_df = (
    pd.concat(daily_click_frames, ignore_index=True)
    if daily_click_frames
    else pd.DataFrame()
)

In [None]:
# Rows that agree on all of these columns are treated as the same click event.
click_dedup_keys = [
    "uid",
    "cid",
    "sid",
    "__tag__:__receive_time__",
    "__time_ns_part__",
]
user_click_all_df.drop_duplicates(subset=click_dedup_keys, inplace=True)

# Keep a local snapshot of the de-duplicated clicks.
user_click_all_df.to_csv("./user_click_all_df.csv", index=False)


In [None]:
from odps_client import get_odps_sql_result_as_df

# ODPS query: for each (ds, cust_id, variant_id, page_name), count the rows of
# summerfarm_tech.dwd_log_mall_di with a non-null spu_id and cust_id, from ds
# 20240910 onwards, restricted to the four listed pages (output: sku_view_cnt).
# variant_id comes from a LEFT JOIN on the minimum variant_id per cust_id in
# summerfarm_ds.temp_mall_new_home_ab_info_di, so it is NULL for customers
# that have no row in that table.
user_view_sql="""
SELECT  ds
        ,cust_id
        ,b.variant_id
        ,page_name
        ,COUNT(1) as sku_view_cnt
FROM    summerfarm_tech.dwd_log_mall_di a
LEFT JOIN(
        SELECT cust_id as mid,min(variant_id) variant_id
        FROM summerfarm_ds.temp_mall_new_home_ab_info_di
        WHERE ds>='20240910'
        GROUP BY cust_id
) b ON b.mid=a.cust_id
WHERE   a.ds >= '20240910'
AND     a.spu_id IS NOT NULL
and     a.cust_id is not null
AND     a.page_name in ('/goods','/search/goods','/home','/goods/category')
group by a.ds
        ,a.cust_id
        ,b.variant_id
        ,a.page_name
;
"""

# Run the query through the project's ODPS helper; the result is used as a
# DataFrame by the merge further down.
user_view_data_df=get_odps_sql_result_as_df(sql=user_view_sql)

In [None]:
# Flag clicks that opened the cart card: either `bid` contains
# "name:加入购物车" or `pid` equals "唤起购买".
opened_cart_card = user_click_all_df["bid"].str.contains(
    "name:加入购物车", na=False
) | (user_click_all_df["pid"] == "唤起购买")

# Default every row to '否', then overwrite the flagged rows with '是'.
user_click_all_df["是否点开购物车卡片"] = "否"
user_click_all_df.loc[opened_cart_card, "是否点开购物车卡片"] = "是"

In [None]:
# Keep only the columns needed for the click aggregation and give the SLS
# receive-time tag a plain column name.
user_click_all_clean_df = user_click_all_df[
    ["uid", "phone", "__tag__:__receive_time__", "是否点开购物车卡片", "pageName"]
].rename(columns={"__tag__:__receive_time__": "receive_time"})

# Convert receive_time (epoch seconds) to datetime. Cast to a numeric dtype
# first: parsing numeric *strings* together with `unit=` is deprecated in pandas.
user_click_all_clean_df["receive_time"] = pd.to_datetime(
    pd.to_numeric(user_click_all_clean_df["receive_time"]), unit="s"
)

# Create ds column as the date of receive_time in yyyyMMdd format.
# NOTE(review): `unit="s"` yields naive UTC timestamps, so `ds` is a UTC date.
# If the ODPS `ds` partitions follow local time (e.g. UTC+8), early-morning
# clicks land on the previous day — confirm before joining on `ds`.
user_click_all_clean_df["ds"] = user_click_all_clean_df["receive_time"].dt.strftime(
    "%Y%m%d"
)


In [None]:
import pandasql
import pandasql.sqldf

# Aggregate clicks per (uid, ds, pageName) with pandasql, which reads
# `user_click_all_clean_df` by name from the calling scope:
#   sku_click_cnt          - rows where the cart-card flag is '否'
#   sku_add_cart_click_cnt - rows where the cart-card flag is '是'
#   total_sku_click_cnt    - all rows in the group
user_click_analytics_df = pandasql.sqldf("""
                                       select uid,ds,pageName
                                        ,count(case when `是否点开购物车卡片` = '否' then 1 end) as sku_click_cnt
                                        ,count(case when `是否点开购物车卡片` = '是' then 1 end) as sku_add_cart_click_cnt
                                        ,count(1) as total_sku_click_cnt
                                       from user_click_all_clean_df
                                       group by uid,ds,pageName
                                       """)

In [None]:
# Cast the join keys on both sides to str so the merge compares like with like.
for _view_key in ("cust_id", "ds", "page_name"):
    user_view_data_df[_view_key] = user_view_data_df[_view_key].astype(str)

for _click_key in ("uid", "ds", "pageName"):
    user_click_analytics_df[_click_key] = user_click_analytics_df[_click_key].astype(str)

# Left join: keep every view row and attach click counts where
# (customer, day, page) matches.
merged_df = pd.merge(
    user_view_data_df,
    user_click_analytics_df,
    how="left",
    left_on=["cust_id", "ds", "page_name"],
    right_on=["uid", "ds", "pageName"],
    suffixes=("", "_b"),
)
