In [None]:
from multiprocessing import process
from nt import error
import pandas as pd
import logging
from pathlib import Path

# Configure the root logger once: INFO level and above, timestamped records,
# written both to the file etl.log (UTF-8 encoded) and to a default StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.FileHandler("etl.log", encoding='utf-8'), logging.StreamHandler()]
)

# Input CSV locations and the output directory; all are relative paths,
# so they resolve against the current working directory.
order_data_locate = Path('../raw/order_table_cleaned.csv')
review_data_locate = Path('../raw/olist_order_reviews_dataset.csv')
save_cleaned = Path('../processed')
# Create the output directory (and any missing parents) up front; a no-op if it already exists.
save_cleaned.mkdir(parents=True,exist_ok=True)

def data_quality_report(df, stage_name):
    """
    Log and return a data-quality summary (missing rate, duplicate rate) for a DataFrame.

    :param df: DataFrame to inspect
    :param stage_name: label of the pipeline stage, e.g. '原始数据' or '清洗后的数据'
    :return: dict with the keys 'stage', 'rows', 'cols', 'missing_count',
             'missing_rate', 'duplicated_count' and 'duplicated_rate'; both rates
             are percentages rounded to 2 decimals, and are 0 when there is
             nothing to divide by (no rows, or no cells)
    """
    total_rows, total_cols = df.shape
    total_cells = total_rows * total_cols

    # Cast the pandas/numpy scalars to plain ints so the returned dict holds
    # ordinary Python values regardless of how it is stored later.
    missing_count = int(df.isnull().sum().sum())
    duplicated_count = int(df.duplicated().sum())

    # Guard on the number of cells, not just rows: a frame with rows but no
    # columns has zero cells and would otherwise divide by zero.
    missing_rate = float(missing_count / total_cells * 100) if total_cells > 0 else 0.0
    duplicated_rate = float(duplicated_count / total_rows * 100) if total_rows > 0 else 0.0

    logging.info(f'[DATA QUALITY] {stage_name} - 行数: {total_rows} | 列数: {total_cols} | 缺失率: {missing_rate:.2f}% | 重复率: {duplicated_rate:.2f}%')

    # Structured result so the caller can collect several stages into one table
    return {
        'stage': stage_name,
        'rows': total_rows,
        'cols': total_cols,
        'missing_count': missing_count,
        'missing_rate': round(missing_rate, 2),
        'duplicated_count': duplicated_count,
        'duplicated_rate': round(duplicated_rate, 2)
    }


def extract_data(path_list):
    """
    Load the order and review CSV files and join them into one wide table.

    Only orders whose 'order_status' is 'delivered' are kept, reduced to the
    'order_id' and 'days' columns, and inner-joined onto the reviews by 'order_id'.

    :param path_list: CSV sources; element 0 is the order table, element 1 the review table
    :return: the joined review + order DataFrame, or an empty DataFrame when a file is not found
    """
    logging.info(f'[EXTRACT] 正在加载数据: {path_list}')
    try:
        orders = pd.read_csv(path_list[0])
        reviews = pd.read_csv(path_list[1])
    except FileNotFoundError:
        logging.error(f'文件未找到: {path_list}')
        return pd.DataFrame()

    # Keep delivered orders only, and just the columns needed downstream
    delivered = orders.loc[orders['order_status'].eq('delivered'), ['order_id', 'days']]

    wide = pd.merge(reviews, delivered, on='order_id', how='inner')
    n_rows, n_cols = wide.shape
    logging.info(f'加载完成，共{n_rows}行，{n_cols}列')
    return wide

def fill_date(time_col_data, data_col):
    """
    Fill missing values of a time column with the placeholder '1970-01-01 00:00:00'.

    The placeholder only stands in for NULL and has no real meaning.
    If filling fails, the error is logged and the input is returned unchanged,
    so a caller that assigns the result back to the column does not replace it with None.

    :param time_col_data: time column data (an object exposing ``fillna``, e.g. a pandas Series)
    :param data_col: name of the time column; used in log messages only
    :return: the column with missing values filled, or the original data if filling failed
    """
    try:
        filled = time_col_data.fillna('1970-01-01 00:00:00')
    except Exception as e:
        # Best effort: report the failure at ERROR level and hand back the
        # untouched input instead of an implicit None.
        logging.error(f'对{data_col}列填充失败,失败原因{e}')
        return time_col_data
    logging.info(f'对{data_col}列填充完成')
    return filled
def drop_duplicated(order_review_wide):
    """
    Return the DataFrame with fully duplicated rows removed.

    If de-duplication fails, the error is logged and the input is returned
    unchanged, so later pipeline steps never receive None.

    :param order_review_wide: input DataFrame
    :return: a de-duplicated DataFrame, or the original input if de-duplication failed
    """
    try:
        return order_review_wide.drop_duplicates()
    except Exception as e:
        # Best effort: report at ERROR level and keep the data flowing.
        logging.error(f'删除重复值失败，失败原因{e}')
        return order_review_wide

def feature_engineering(order_review_wide):
    """
    Add a 'bucket' column that groups the 'days' column into labelled ranges.

    Ranges are [0, 7], (7, 14], (14, 30] and (30, +inf). Negative or missing
    'days' values get a missing bucket. The column is added to the passed
    DataFrame in place, and that same DataFrame is returned.

    :param order_review_wide: DataFrame with a numeric 'days' column
    :return: the same DataFrame with the categorical 'bucket' column added
    """
    # The top edge is open-ended so the '>30天' label really covers every value
    # above 30; a finite upper edge would leave larger values without a bucket.
    bins = [0, 7, 14, 30, float('inf')]
    labels = ['0-7天', '8-14天', '15-30天', '>30天']

    order_review_wide['bucket'] = pd.cut(
        order_review_wide['days'],
        bins=bins,
        labels=labels,
        right=True,
        include_lowest=True  # makes the first range include 0 itself
    )
    return order_review_wide

def transform_data(order_review_wide):
    """
    Clean the review + order wide table: fill missing values, drop duplicate rows, add features.

    Missing values are filled in the passed DataFrame itself:
    columns whose name ends with '_date' or '_timestamp' go through fill_date,
    other non-numeric columns are filled with 'U' (unknown), and numeric columns
    are left as missing so they keep a numeric dtype.

    :param order_review_wide: review + order wide table
    :return: the transformed table (the input itself when it is empty)
    """
    # 1. Nothing to do for an empty table
    if order_review_wide.empty:
        logging.warning('输入数据为空，跳过 transform 阶段')
        return order_review_wide

    # 2. Missing-value handling
    logging.info('[TRANSFORM] 查看缺失值......')
    loss_data = order_review_wide.isnull().sum()
    loss_data = loss_data[loss_data > 0]  # keep only the columns that have gaps
    if not loss_data.empty:
        logging.info(f'缺失列情况: \n{loss_data}')

        for index in loss_data.index:
            if index.endswith(('_date','_timestamp')):
                logging.info(f'{index} 为时间列,缺失数:{loss_data[index]}进行时间填充')
                order_review_wide[index] = fill_date(order_review_wide[index], index)
            elif pd.api.types.is_numeric_dtype(order_review_wide[index]):
                # Writing the string 'U' into a numeric column would turn it into
                # a mixed object column; for 'days' that breaks the bucketing
                # step below, so numeric gaps are kept as missing values.
                logging.warning(f'{index} 为数值列，缺失数:{loss_data[index]}，保留为缺失值，不填充“U”')
            else:
                logging.info(f'{index} 为普通列，缺失数:{loss_data[index]}，填充为“U” 表示未知')
                order_review_wide[index] = order_review_wide[index].fillna('U')
        logging.info('缺失值填充完成')
    else:
        logging.info('无缺失值')

    # 3. Duplicate removal
    duplicated_count = order_review_wide.duplicated().sum()
    if duplicated_count != 0:
        logging.info(f'检测到{duplicated_count}条重复值，执行删除...')
        order_review_wide = drop_duplicated(order_review_wide)
        logging.info('重复值删除完成')
    else:
        logging.info('无重复值')

    # 4. Feature engineering
    order_review_wide = feature_engineering(order_review_wide)

    return order_review_wide

def load_data(order_review_wide, output_path=None):
    """
    Save the DataFrame as a CSV file (without the index column).

    :param order_review_wide: input DataFrame
    :param output_path: output file path; when omitted, the file
                        'order_review_wide.csv' inside ``save_cleaned`` is used
    :return: None
    """
    if output_path is None:
        path = save_cleaned / 'order_review_wide.csv'
    else:
        path = Path(output_path)
    # Make sure the target directory exists so a caller-supplied path does not fail
    path.parent.mkdir(parents=True, exist_ok=True)
    order_review_wide.to_csv(path, index=False)
    logging.info(f'[LOAD] 数据已保存至{path}')

def etl_pipeline(input_path):
    """
    Run the full ETL flow: extract, quality report, transform, quality report, load.

    Writes the cleaned table through load_data and a two-row quality summary
    (before / after cleaning) to 'data_quality_report.csv' in ``save_cleaned``.
    If extraction yields no data, the run stops before anything is written.

    :param input_path: list of CSV paths passed to extract_data (orders first, reviews second)
    :return: None
    """
    order_review_wide_raw = extract_data(input_path)

    # extract_data returns an empty frame when a file is missing; continuing
    # would overwrite earlier outputs with empty files, so stop here instead.
    if order_review_wide_raw.empty:
        logging.error('[ETL] 未加载到任何数据，终止流程，不覆盖已有输出文件')
        return

    quality_before = data_quality_report(order_review_wide_raw, '原始数据')

    order_review_wide_processed = transform_data(order_review_wide_raw)
    quality_after = data_quality_report(order_review_wide_processed, '清洗后数据')

    load_data(order_review_wide_processed)

    # Build the before/after quality summary table
    report_df = pd.DataFrame([quality_before, quality_after])
    report_path = save_cleaned / 'data_quality_report.csv'
    report_df.to_csv(report_path, index=False, encoding='utf-8-sig')
    logging.info(f' 数据质量报告已保存至: {report_path}')


if __name__ == '__main__':
    # Script entry point: run the pipeline with the order CSV first, then the review CSV
    etl_pipeline([order_data_locate, review_data_locate])

    

2025-10-16 15:42:08,277 [INFO] [EXTRACT] 正在加载数据: [WindowsPath('../raw/order_table_cleaned.csv'), WindowsPath('../raw/olist_order_reviews_dataset.csv')]
2025-10-16 15:42:09,031 [INFO] 加载完成，共96095行，8列
2025-10-16 15:42:09,107 [INFO] [DATA QUALITY] 原始数据 - 行数: 96095 | 列数: 8 | 缺失率: 18.48% | 重复率: 0.00%
2025-10-16 15:42:09,107 [INFO] [TRANSFORM] 查看缺失值......
2025-10-16 15:42:09,131 [INFO] 缺失列情况: 
review_comment_title      84887
review_comment_message    57183
dtype: int64
2025-10-16 15:42:09,131 [INFO] review_comment_title 为普通列，缺失数:84887，填充为“U” 表示未知
2025-10-16 15:42:09,137 [INFO] review_comment_message 为普通列，缺失数:57183，填充为“U” 表示未知
2025-10-16 15:42:09,147 [INFO] 缺失值填充完成
2025-10-16 15:42:09,213 [INFO] 无重复值
2025-10-16 15:42:09,292 [INFO] [DATA QUALITY] 清洗后数据 - 行数: 96095 | 列数: 9 | 缺失率: 0.00% | 重复率: 0.00%
2025-10-16 15:42:09,534 [INFO] [LOAD] 数据已保存至..\processed\order_review_wide.csv
2025-10-16 15:42:09,536 [INFO]  数据质量报告已保存至: ..\processed\data_quality_report.csv
