In [None]:
import pandas as pd

# Lift pandas' display limits so every column, row and full cell text is shown.
for display_option in ("display.max_columns", "display.max_colwidth", "display.max_rows"):
    pd.set_option(display_option, None)

# Load the visit-record CSV export into the working frame.
visit_records_csv = "./8月召回任务销售拜访记录 8.13~8.20 共3931条.csv"
all_df = pd.read_csv(visit_records_csv)

print(all_df.columns)

In [None]:
# Import the base64 encoding library.
import base64, os, time
import logging

# Configure the root logger: timestamped lines at DEBUG level.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
)

# Proxy mapping for HTTP and HTTPS, both pointing at the same local endpoint.
_LOCAL_PROXY_URL = "http://127.0.0.1:8001"
proxy_object = {"http": _LOCAL_PROXY_URL, "https": _LOCAL_PROXY_URL}


from openai import AzureOpenAI

_AZURE_API_VERSION = "2024-03-01-preview"


def _build_azure_client(endpoint, api_key_env):
    """Create an AzureOpenAI client; the key is read from ``api_key_env`` ("" if unset)."""
    return AzureOpenAI(
        api_version=_AZURE_API_VERSION,
        azure_endpoint=endpoint,
        api_key=os.getenv(api_key_env, ""),
    )


# One client per Azure resource / deployment.
client_gpt4o = _build_azure_client(
    "https://xm-ai-us2.openai.azure.com", "AZURE_GPT4O_API_KEY"
)

client_gpt4o_mini = _build_azure_client(
    "https://xm-ai-us.openai.azure.com", "AZURE_GPT4O_MINI_API_KEY"
)


def call_azure_openai(messages=None, retrying=1, is_gpt4o=False) -> (str, bool):
    """Send a chat-completion request to Azure OpenAI and return the reply text.

    The request asks for a JSON-object response (``response_format``).

    Args:
        messages: Chat messages in the OpenAI chat format. ``None`` (the
            default) is treated as an empty list.
        retrying: Number of retries still allowed. A negative value returns a
            failure immediately without calling the API.
        is_gpt4o: When True use the ``gpt-4o`` model/client, otherwise
            ``gpt-4o-mini``.

    Returns:
        ``(text, ok)``. On success ``text`` is the message content and ``ok``
        is True. On failure ``ok`` is False and ``text`` describes the problem:
        retries exhausted, request filtered / no choices returned, or the text
        of the raised exception.
    """
    # A None default avoids sharing one mutable list between calls.
    if messages is None:
        messages = []
    if retrying < 0:
        return "超过了最大重试次数", False
    completion = None
    # Known model names:
    #   gpt-3.5:     gpt-35-turbo-16k
    #   gpt-4o:      gpt-4o
    #   gpt-4o-mini: gpt-4o-mini
    model = "gpt-4o-mini"
    client_to_use = client_gpt4o_mini
    if is_gpt4o:
        logging.info(f"using GPT-4o...:{messages}")
        model = "gpt-4o"
        client_to_use = client_gpt4o
    try:
        completion = client_to_use.chat.completions.create(
            model=model,
            temperature=0.1,
            max_tokens=4095,
            messages=messages,
            response_format={"type": "json_object"},
        )
        # The choice list must be checked before it is indexed; indexing an
        # empty list would raise IndexError and be reported as a generic
        # request error by the except branch below.
        if not completion.choices:
            return f"azure过滤了本次请求:{completion.to_dict()}", False
        first_choice = completion.choices[0]
        if f"{first_choice.finish_reason}" == "content_filter":
            return f"azure过滤了本次请求:{first_choice.to_dict()}", False
        response = first_choice.message.content
        if response is None:
            # No content came back: wait, then retry with one fewer attempt left.
            logging.info(f"azure API返回了异常:{completion.to_dict()}")
            time.sleep(10)
            return call_azure_openai(
                messages=messages,
                retrying=retrying - 1,
                is_gpt4o=is_gpt4o,
            )
        logging.info(f"total usage:{completion.usage}")
        return response, True
    except Exception as e:
        logging.info(
            f"请求azure接口报错了:{e}\n messages:{messages}, completion:{completion}"
        )
        # Give up when no retries remain or the error text reports HTTP 400.
        if retrying <= 0 or "Error code: 400" in f"{e}":
            return f"{e}", False
        logging.info(f"重试中...{retrying}, messages:{messages}")
        return call_azure_openai(
            messages=messages,
            retrying=retrying - 1,
            is_gpt4o=is_gpt4o,
        )


def call_ai_api_to_get_insigns(visit_text=""):
    """Ask the model to answer a fixed questionnaire about one visit record.

    The system prompt lists the questions and instructs the model to answer in
    JSON, using each question title as a key. The request is sent through
    ``call_azure_openai`` with ``is_gpt4o=False``.

    Args:
        visit_text: Free-text visit record, sent as the user message.

    Returns:
        A JSON string. On success it is the reply text from
        ``call_azure_openai``. On failure it is ``{"error": "<failure text>"}``
        so that callers which ``json.loads`` the result do not crash on a
        plain-text error message.
    """
    import json  # local import: only needed to encode the failure payload

    system_prompt = """
用户会给发给你一系列销售员对客户的拜访记录，请你用JSON回答以下几个问题：
- 客户是否长时间未下单？
- 是否见到核心KP？
- 客户长时间未下单的原因？
- 本次拜访销售做了什么准备？
- 销售向客户推荐了哪些具体的活动？
- 销售向客户推荐了哪些具体的商品？
- 客户的主要采买渠道是？
- 客户不愿意当场下单的原因？
- 拜访记录完整性打分？（0-100分，100分表示非常完整，0分表示非常不完整）

**请注意，‘安佳’，‘铁塔’一般来说是商品名字，而不太可能是活动名字，活动名字一般带有‘专享’、‘清仓’、‘特价’、‘活动’等字样**
**请你完全基于销售员的拜访记录内容来回答以上问题，如果拜访内容中找不到问题的答案，请回答‘无’**
**请你用问题的标题做JSON的key，答案做value，比如：{"客户是否长时间未下单": "是"}**
"""
    json_text, is_ok = call_azure_openai(
        is_gpt4o=False,
        messages=[
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": visit_text}],
            },
        ],
    )

    logging.info(f"json_text:{json_text}, visit_text:{visit_text}")
    if not is_ok:
        # json_text holds an error description here, not model output; wrap it
        # so the stored value is still a parseable JSON object.
        logging.warning(f"AI analysis failed: {json_text}")
        return json.dumps({"error": json_text}, ensure_ascii=False)
    return json_text


# top2_df = all_df.head(2).copy()
from datetime import datetime

# Today's date as YYYY-MM-DD. In the visible code it is referenced only by the
# commented-out export below (as a filename suffix).
date_of_now = format(datetime.now(), "%Y-%m-%d")

# NOTE(review): the next cell reads all_df["AI分析"]. With the lines below
# commented out, that column has to already exist in the CSV loaded above or in
# leftover kernel state -- confirm before a Restart & Run All.
# all_df["AI分析"] = all_df["跟进情况描述"].apply(call_ai_api_to_get_insigns)
# all_df.to_csv(f"./8月召回任务销售拜访记录8.13~8.20共3931条_GPT4o_mini_{date_of_now}.csv", index=False)
# all_df=pd.read_csv(f"./8月召回任务销售拜访记录8.13~8.20共3931条_GPT4o_mini_2024-08-26.csv")
# all_df.head(10)[["客户名称", "AI分析", "跟进情况描述"]]

# top100_df = all_df.head(100).copy()
# top100_df["AI分析"] = top100_df["跟进情况描述"].apply(call_ai_api_to_get_insigns)
# top100_df.to_csv("./8月召回任务销售拜访记录8.13~8.20top100条_GPT4o_mini.csv")
# top100_df.head(100)[["客户名称", "AI分析", "跟进情况描述"]]

In [None]:
import json

# Column names to expand: the keys of the FIRST row's parsed "AI分析" JSON.
# Stays empty when the frame has no rows.
keys = []

for first_raw_analysis in all_df["AI分析"].head(1):
    keys = list(json.loads(first_raw_analysis).keys())
    logging.info(f"keys: {keys}")


def extract_ai_result(ai_result, key):
    """Return the value stored under ``key`` in a JSON-object string.

    Args:
        ai_result: Text expected to hold a JSON object.
        key: Key to look up in the parsed object.

    Returns:
        The value for ``key``, or "未知" when the key is missing or when
        ``ai_result`` is not a string containing a JSON object (for example
        NaN from an empty cell, or a plain-text error message).
    """
    try:
        parsed = json.loads(ai_result)
    except (TypeError, ValueError):
        # TypeError: not str/bytes (e.g. NaN); ValueError: invalid JSON text.
        return "未知"
    if not isinstance(parsed, dict):
        return "未知"
    return parsed.get(key, "未知")


# Add one column per discovered key, filled from each row's "AI分析" JSON.
for key in keys:
    all_df[key] = all_df["AI分析"].apply(extract_ai_result, key=key)

# Preview the identifying columns next to the expanded answers.
display_keys = ["客户ID", "客户名称", "AI分析", "跟进情况描述", *keys]
all_df[display_keys].head(10)

In [None]:
# Log the current column index of all_df.
logging.info("columns:%s", all_df.columns)

In [None]:
import pandasql

# String-typed copy of the frame.
# NOTE(review): astype(str) turns missing values into the literal text "nan",
# so such rows end up with lv1 == lv2 == "nan" rather than a null -- confirm
# this is intended for the later aggregation.
all_str_df = all_df.astype(str)


def _split_reason_levels(reason_series):
    """Split "lv1-lv2" reason text on the first "-" into exactly two columns.

    ``str.split(..., expand=True)`` returns a single column when no value
    contains "-". Reindexing to columns 0 and 1 guarantees both levels exist,
    so the assignments below cannot fail on such data.

    Returns:
        DataFrame with column 0 (level 1) and column 1 (level 2); level 2
        falls back to level 1 where the text had no "-".
    """
    levels = (
        reason_series.str.split("-", n=1, expand=True)
        .reindex(columns=[0, 1])
        .astype(object)
    )
    levels[1] = levels[1].fillna(levels[0])
    return levels


# For each reason column, add <prefix>_lv1 / <prefix>_lv2 columns.
for reason_prefix, reason_column in (
    ("no_purchase_reason", "客户长时间未下单原因"),
    ("wont_purchase_reason", "客户不愿意拜访期间下单的原因"),
):
    reason_levels = _split_reason_levels(all_str_df[reason_column])
    all_str_df[f"{reason_prefix}_lv1"] = reason_levels[0]
    all_str_df[f"{reason_prefix}_lv2"] = reason_levels[1]


# # SQL query with COALESCE to set default value
# static_df = pandasql.sqldf(
#     """
#     SELECT 
#         no_purchase_reason_lv2 长时间不下单具体原因,
#         COUNT(1) AS 出现次数,
#         COUNT(DISTINCT `实际拜访BD`) AS 出现bd个数
#     FROM all_str_df
#     GROUP BY 1
#     ORDER BY 出现次数 DESC, 出现bd个数 DESC;
#     """
# )

# static_df.head(500)

In [None]:
import pandas as pd

# Reason taxonomy: main category -> its specific reasons, in row order.
_REASONS_BY_CATEGORY = {
    '价格因素': ['价格高/贵', '价格没有优势', '其他平台/供应商更便宜', '需要特价/优惠'],
    '库存和需求': ['还有库存/货', '用量少/小', '暂时不需要/不缺货', '刚进货/补货', '季节性需求减少'],
    '供应商选择': ['有固定/其他供应商', '在其他渠道购买', '公司/总部统一采购', '转换供应商/平台'],
    '营业状况': ['生意不好/淡季', '暂停营业', '放假', '装修', '搬迁', '倒闭/关店', '转让'],
    '产品质量': ['品质差/不稳定', '不新鲜'],
    '配送问题': ['配送不方便/不及时', '运费问题', '配送门槛高'],
    '采购决策': ['老板/负责人不在', '换人负责采购', 'KP不在/联系不上'],
    '操作和习惯': ['不习惯使用平台', '觉得操作麻烦', '忘记平台有某些产品'],
    '产品匹配度': ['需要的产品平台没有', '规格不合适', '品类调整'],
    '客户服务': ['售后服务不好', '客诉问题未解决'],
    '个人原因': ['出差/旅游', '怀孕/生育', '照顾家人'],
    '公司政策': ['公司管控严格', '不允许外采'],
    '其他': ['刚开业/新店筹备', '账号问题', '天气原因', '资金问题'],
}

# Flatten into two parallel lists: one entry per specific reason, with its
# main category repeated alongside.
reasons = {
    '主类别': [
        category
        for category, specifics in _REASONS_BY_CATEGORY.items()
        for _ in specifics
    ],
    '具体原因': [
        specific
        for specifics in _REASONS_BY_CATEGORY.values()
        for specific in specifics
    ],
}

reasons_df = pd.DataFrame(reasons)
reasons_df

In [None]:
# Load the feedback-to-reason mapping file and preview its first two rows.
normalized_reason_csv = "./客户反馈-原因归类.csv"
normalized_reason_df = pd.read_csv(normalized_reason_csv)
normalized_reason_df.head(2)

In [None]:
# Two chained left joins:
#   1. level-2 no-purchase reason -> mapping file (on its "客户反馈" column)
#   2. mapped "原因归类"          -> taxonomy (on its "具体原因" column)
joined_df = (
    all_str_df
    .merge(
        normalized_reason_df,
        how="left",
        left_on="no_purchase_reason_lv2",
        right_on="客户反馈",
    )
    .merge(reasons_df, how="left", left_on="原因归类", right_on="具体原因")
)

# Preview the join result.
joined_preview_columns = ["客户反馈", "原因归类", "跟进情况描述", "客户ID", "客户名称", "主类别"]
joined_df[joined_preview_columns].head(2)

In [None]:
# Denominators for the percentage columns in the query below.
total_rows = joined_df.shape[0]
total_bd_count = joined_df['实际拜访BD'].nunique(dropna=False)

# Per-reason counts; COALESCE falls back to the raw lv1/lv2 text when the
# joins produced no category / normalized reason.
reason_stats_sql = f"""
    SELECT 
        COALESCE(主类别,no_purchase_reason_lv1)主类别,
        COALESCE(原因归类,no_purchase_reason_lv2) as 原因归类,
        {total_rows} AS 总拜访记录数,
        COUNT(1) AS 出现次数,
        ROUND(COUNT(1)*100.00/{total_rows}, 2) AS 原因占比pt,
        COUNT(DISTINCT `实际拜访BD`) AS 反馈BD人数,
        {total_bd_count} AS 总BD人数,
        ROUND(100.00*COUNT(DISTINCT `实际拜访BD`)/{total_bd_count},2)||'%' AS 反馈BD人数pt
    FROM joined_df
    GROUP BY 1, 2
    ORDER BY 出现次数 DESC, 反馈BD人数 DESC;
    """
static2_df = pandasql.sqldf(reason_stats_sql)

# Running total of the per-reason percentage, in the query's sort order.
static2_df['累计原因百分比'] = static2_df['原因占比pt'].cumsum()
static2_df.head(100)

In [None]:
# Columns kept in the exported result, in output order.
clean_export_columns = [
    "任务类型", "任务ID", "任务名称", "任务生效开始时间", "任务失效时间",
    "客户ID", "客户名称", "客户业态", "客户分组",
    "注册省份", "行政市",
    "最新归属BD_ID", "最新归属bd_name", "最新M1", "最新M2", "最新M3",
    "销售区域",
    "是否完成拜访", "实际拜访BD", "拜访方式", "拜访时间",
    "拜访后客户首次登录时间", "拜访后首次点击时间", "拜访后首次加购时间",
    "拜访后7天合计实付GMV", "活动期间累计实付GMV",
    "活动期间下单天数", "活动期间交易超过30的天数",
    "跟进情况描述",
    "客户长时间未下单原因",
    "本次拜访前销售做了什么准备",
    "销售向客户推荐了平台的什么活动",
    "销售向客户推荐了哪些具体的商品",
    "客户主要采买渠道",
    "客户不愿意拜访期间下单的原因",
]
joined_df_clean = joined_df.loc[:, clean_export_columns]
# Not the cell's last expression, so this preview is evaluated but not rendered.
joined_df_clean.head(2)

joined_df_clean.to_csv("./8月召回任务销售拜访记录8.13~8.20-AI分析结果.csv", index=False)

In [None]:
import sys,os
# Make the local checkout importable: expand `~` to the home directory and
# append the resulting path to the module search path.
devops_checkout = '~/Documents/github/aliyun-devops'
full_path = os.path.expanduser(devops_checkout)
sys.path.append(full_path)

from odps_client import get_odps_sql_result_as_df
from datetime import datetime, timedelta

# Partition date: yesterday, formatted as YYYYMMDD.
yesterday = datetime.now() - timedelta(days=1)
ds = yesterday.strftime("%Y%m%d")

# Query text for the follow-up (visit) records of partition `ds`.
# - Reads summerfarm_tech.ods_follow_up_record_di (alias a) filtered to a.ds = ds.
# - Inner-joins summerfarm_tech.dim_bd_df (alias b) on a.admin_id = b.bd_id,
#   using that table's MAX_PT partition, to pick up the M1/M2/M3 names and zone.
# - Decodes the numeric `status`, `visit_objective` and `visit_type` codes into
#   labels via CASE expressions, with a fallback label for unlisted codes.
# - Prefixes follow_up_pic with a fixed host to form the picture URL.
# `ds` is the only interpolated value and is computed in this cell.
sql = f"""
SELECT  m_id
        ,admin_id
        ,admin_name AS 拜访人
        ,b.m1_name as M1负责人
        ,b.m2_name as M2负责人
        ,b.m3_name as M3负责人
        ,b.zone_name as 销售区域
        ,follow_up_way as 拜访方式
        ,condition as 拜访内容
        ,CASE   WHEN `status` = 0 THEN '未跟进'
                WHEN `status` = 1 THEN '已跟进'
                WHEN `status` = 2 THEN '已跟进且下单'
                WHEN `status` = 3 THEN '联系不上'
                WHEN `status` = 4 THEN '放弃跟进'
                WHEN `status` = 9 THEN '重置'
                ELSE '未知状态'
        END AS 拜访状态
        ,add_time as 拜访时间
        ,CASE   WHEN visit_objective = 0 THEN '拉新'
                WHEN visit_objective = 1 THEN '催月活'
                WHEN visit_objective = 2 THEN '客户维护'
                WHEN visit_objective = 3 THEN '拓品'
                WHEN visit_objective = 4 THEN '售后处理'
                WHEN visit_objective = 5 THEN '催省心送'
                ELSE '未知目的'
        END AS 拜访目的
        ,CASE   WHEN visit_type = 0 THEN '普通拜访'
                WHEN visit_type = 1 THEN '陪访'
                ELSE '未知类型'
        END AS 拜访类型
        ,'https://azure.summerfarm.net/'|| a.follow_up_pic as 拜访图片
        ,a.ds
FROM    summerfarm_tech.ods_follow_up_record_di a
INNER JOIN summerfarm_tech.dim_bd_df b
ON      b.ds = MAX_PT("summerfarm_tech.dim_bd_df")
AND     a.admin_id = b.bd_id
WHERE   a.ds = '{ds}'
;
"""

# Run the query through the project helper and keep the result as a DataFrame.
bd_follow_up_record_df = get_odps_sql_result_as_df(sql=sql)

# Export a subset of columns; the filename carries the partition date and row count.
# NOTE(review): the SQL aliases are "M1负责人"/"M2负责人"/"M3负责人" but the
# lowercase forms are selected here -- presumably the helper returns lowercased
# column names; confirm.
bd_export_columns = [
    "拜访人", "拜访内容", "拜访状态", "拜访目的", "拜访类型", "拜访时间",
    "销售区域", "m1负责人", "m2负责人", "m3负责人", "拜访图片",
]
bd_export_path = f"./{ds}_BD拜访记录_全部_{len(bd_follow_up_record_df)}条.csv"
bd_follow_up_record_df[bd_export_columns].to_csv(bd_export_path, index=False)
bd_follow_up_record_df.head(2)

In [None]:
# Extract and analyze key information from the visit records
# Parsing the "拜访内容" column for key details

def extract_info(record):
    """Pull the labelled fields out of one visit-content text.

    Each field is looked up as ``<label>-<value>``, where the value runs up to
    the next full-width semicolon "；" or, if there is none, to the end of the
    text.

    Args:
        record: Visit-content text. A non-string value (e.g. None or NaN)
            yields a result with every field empty.

    Returns:
        dict with the keys '是否见到核心KP', '客户长时间未下单原因',
        '不愿当场下单原因', '竞对信息', '拜访前准备' and
        '向客户推荐什么活动及商品'. A field is '' when its label (followed by
        "-") does not occur in ``record``. '是否见到核心KP' is "是"/"否" when
        its label occurs.
    """
    info = {
        '是否见到核心KP': '',
        '客户长时间未下单原因': '',
        '不愿当场下单原因': '',
        '竞对信息': '',
        '拜访前准备': '',
        '向客户推荐什么活动及商品': ''
    }
    if not isinstance(record, str):
        return info

    def _field_value(label):
        """Return the text between "<label>-" and the next "；" (or end of text)."""
        marker = f"{label}-"
        marker_pos = record.find(marker)
        if marker_pos == -1:
            return ''
        start_idx = marker_pos + len(marker)
        end_idx = record.find("；", start_idx)
        # find() returns -1 when no separator follows; slicing with -1 would
        # silently drop the last character, so take the rest of the text.
        return record[start_idx:] if end_idx == -1 else record[start_idx:end_idx]

    if "是否见到核心KP" in record:
        info['是否见到核心KP'] = "是" if "是否见到核心KP-#是#" in record else "否"

    # (result key, label as written in the record)
    labelled_fields = (
        ('客户长时间未下单原因', "客户长时间未下单原因"),
        ('不愿当场下单原因', "不愿当场下单原因"),
        ('竞对信息', "竞对信息"),
        ('拜访前准备', "本次拜访前做了什么准备"),
        ('向客户推荐什么活动及商品', "向客户推荐什么活动及商品"),
    )
    for info_key, label in labelled_fields:
        info[info_key] = _field_value(label)
    return info

# Parse every visit-content text into its dict of labelled fields.
visit_contents = bd_follow_up_record_df['拜访内容']
bd_follow_up_record_df['拜访内容JSON'] = visit_contents.apply(extract_info)

bd_follow_up_record_df.head(20)

In [None]:
def _to_safe_filename_part(name):
    """Replace every character that is not alphanumeric, '-' or '_' with '_'."""
    return ''.join(ch if ch.isalnum() or ch in ('-', '_') else '_' for ch in name)


city_export_columns = ['拜访人', '拜访目的', '拜访类型', '拜访内容']

# Write one CSV per distinct "销售区域" value.
for city in bd_follow_up_record_df['销售区域'].unique():
    # Rows belonging to the current region
    in_city = bd_follow_up_record_df['销售区域'] == city
    city_df = bd_follow_up_record_df[in_city]

    filename = f"{_to_safe_filename_part(city)}_{ds}_拜访记录.csv"
    # 'utf-8-sig' writes a UTF-8 byte-order mark at the start of the file.
    city_df[city_export_columns].to_csv(filename, index=False, encoding='utf-8-sig')

    print(f"Saved {len(city_df)} records for {city} to {filename}")

In [None]:
# Records of the '浦东' sales region. Take an explicit copy so that adding the
# "AI分析" column below writes to an independent frame instead of a filtered
# slice of bd_follow_up_record_df (which triggers SettingWithCopyWarning).
shanghai_pudong_df = bd_follow_up_record_df[
    bd_follow_up_record_df['销售区域'] == '浦东'
].copy()

# One model call per row.
shanghai_pudong_df["AI分析"] = shanghai_pudong_df["拜访内容"].apply(call_ai_api_to_get_insigns)

pudong_columns = ["销售区域", "拜访人", "拜访内容", "AI分析"]
# NOTE(review): the date in this filename is hard-coded, while the data comes
# from the `ds` partition queried above -- confirm the two are meant to match.
shanghai_pudong_df[pudong_columns].to_csv("./上海浦东_20240826_拜访记录_AI分析.csv", index=False)
shanghai_pudong_df[pudong_columns].head(10)

In [None]:
import json

# Column names to expand: the keys of the FIRST row's parsed "AI分析" JSON.
# Stays empty when the frame has no rows.
keys = []

# Work on an independent copy holding only the columns needed here.
ai_analysis_df = shanghai_pudong_df.loc[:, ["销售区域","拜访人","拜访内容","AI分析"]].copy()

for first_raw_analysis in ai_analysis_df["AI分析"].head(1):
    keys = list(json.loads(first_raw_analysis).keys())
    logging.info(f"keys: {keys}")


def extract_ai_result(ai_result, key):
    """Return the value stored under ``key`` in a JSON-object string.

    Args:
        ai_result: Text expected to hold a JSON object.
        key: Key to look up in the parsed object.

    Returns:
        The value for ``key``, or "未知" when the key is missing or when
        ``ai_result`` is not a string containing a JSON object (for example
        NaN from an empty cell, or a plain-text error message).
    """
    try:
        parsed = json.loads(ai_result)
    except (TypeError, ValueError):
        # TypeError: not str/bytes (e.g. NaN); ValueError: invalid JSON text.
        return "未知"
    if not isinstance(parsed, dict):
        return "未知"
    return parsed.get(key, "未知")


# Add one column per discovered key, filled from each row's "AI分析" JSON.
for key in keys:
    ai_analysis_df[key] = ai_analysis_df["AI分析"].apply(extract_ai_result, key=key)

display_keys = ["销售区域", "拜访人", "拜访内容", *keys]
# Not the cell's last expression, so this preview is evaluated but not rendered.
ai_analysis_df[display_keys].head(10)
ai_analysis_df[display_keys].to_csv("./上海浦东_20240826_拜访记录_AI分析_展开.csv", index=False)