In [None]:
from xinghe.spark import *
from app.common.json_util import *
from xinghe.s3 import *
from xinghe.s3.read import *
from xinghe.ops.spark import spark_resize_file

# Shared options dict: passed to new_spark_session() in create_spark() and to
# write_any_path() in write_by_two(); write_by_two() also adds the
# "skip_output_version" / "skip_output_check" keys to it before writing.
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
}

import base64
import re
import random
import time
import sys
import pickle
import zlib
import uuid
import traceback
import numpy as np
import pandas as pd
from typing import List, Dict, Union
from copy import deepcopy, copy
from urllib.parse import quote, unquote, urlparse, parse_qs
from datetime import datetime
from lxml import html
from collections import defaultdict
from func_timeout import FunctionTimedOut, func_timeout

from pyspark.sql import Row, DataFrame
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col, collect_list, struct, expr, rand, count, pandas_udf, PandasUDFType, \
    round as _round, lit, to_json, sum as _sum, collect_list, first
from pyspark.sql.types import StructType, StructField, IntegerType, BinaryType, StringType, MapType, LongType

# NOTE(review): the three tuning constants below are not referenced by any of
# the visible cells — confirm whether they are still needed.
TIMEOUT_SECONDS = 3600 * 5  # 5 hours expressed in seconds
MAX_OUTPUT_ROW_SIZE = 1024 * 1024 * 1024 * 1.5  # 1.5 * 1024**3 (a float because of the 1.5 factor); presumably bytes
SIMILARITY_THRESHOLD = 0.95
# S3 locations. get_s3_doctor() appends directly to ERROR_PATH without adding a
# separator, so that prefix is expected to end with "/".
ERROR_PATH = "s3://xxx/"  # root under which per-run error logs are written
INPUT_PATH = "s3://xxx/"  # listed by main() to discover the input batches
OUTPUT_PATH = "s3://xxx/"  # destination handed to write_any_path() in write_by_two()

# utils

In [None]:
def get_s3_doctor(target_theme):
    """Return an ``S3DocWriter`` bound to a fresh error-log object.

    The object path is ``{ERROR_PATH}{target_theme}/{YYYYMMDD}/{uuid4}.jsonl``:
    the date is today's local date and the random UUID keeps concurrent callers
    from writing to the same object.

    Args:
        target_theme: Sub-folder name placed directly under ``ERROR_PATH``.

    Returns:
        The newly constructed ``S3DocWriter``.
    """
    log_name = uuid.uuid4()
    day_stamp = f"{datetime.now():%Y%m%d}"
    return S3DocWriter(path=f"{ERROR_PATH}{target_theme}/{day_stamp}/{log_name}.jsonl")


def _layout_index_entry(layout_id, url_host_name, fpath, start_offset, layout_length, record_count):
    """Build the index record for one run of consecutive rows sharing a layout_id."""
    return {
        "layout_id": layout_id,
        "url_host_name": url_host_name,
        "count": record_count,
        "file": {
            "filepath": fpath,
            "offset": start_offset,
            "length": layout_length,
            "record_count": record_count,
            "timestamp": int(time.time()),
        },
    }


def read_to_index(_iter):
    """Yield one index entry per run of consecutive rows with the same layout_id.

    Intended for ``RDD.mapPartitions``: ``_iter`` yields file paths, each file is
    streamed with ``read_s3_rows`` and every maximal run of adjacent rows that
    share a ``layout_id`` is collapsed into a single entry holding the byte
    offset of the run's first row, the summed byte length of its rows and the
    number of rows.

    Behaviour notes:
      * ``url_host_name`` is taken from the first row of the run being emitted
        (not from the row that starts the following run).
      * The trailing run of *every* file is emitted, and an empty ``_iter``
        simply yields nothing.
      * A row that cannot be parsed is written to the error log and skipped; it
        contributes to neither ``count`` nor ``length``.
      * As before, a run is only tracked when its ``layout_id`` is not ``None``.

    NOTE(review): ``length`` is the sum of the indexed rows' lengths, so
    ``offset``/``length`` describe a contiguous byte range only when no row
    inside the run was skipped — confirm how consumers use the range.

    Args:
        _iter: Iterable of file paths accepted by ``read_s3_rows``.

    Yields:
        dict with keys ``layout_id``, ``url_host_name``, ``count`` and ``file``
        (``filepath``, ``offset``, ``length``, ``record_count``, ``timestamp``).
    """
    s3_doc_writer = get_s3_doctor("layout_index")
    has_error = False
    for fpath in _iter:
        # State of the run currently being accumulated for this file.
        current_layout_id = None
        current_host = None
        start_offset = None
        layout_length = 0
        record_count = 0
        for row in read_s3_rows(fpath):
            try:
                detail_data = json_loads(row.value)
                layout_id = detail_data["layout_id"]
                # row.loc ends with "bytes=<offset>,<length>".
                offset, length = map(int, row.loc.split("bytes=")[-1].split(","))
                starts_new_run = layout_id != current_layout_id
                if starts_new_run:
                    # Read eagerly so a missing key is logged against this row.
                    host = detail_data["url_host_name"]
            except Exception as e:
                has_error = True
                s3_doc_writer.write({
                    "error_type": type(e).__name__,
                    "error_message": str(e),
                    "traceback": traceback.format_exc(),
                    "input_data": row.value if hasattr(row, 'value') else str(row),
                    "timestamp": datetime.now().isoformat()
                })
                continue

            if not starts_new_run:
                layout_length += length
                record_count += 1
                continue

            # A different layout begins: close the previous run, if any.
            if current_layout_id is not None:
                yield _layout_index_entry(
                    current_layout_id, current_host, fpath,
                    start_offset, layout_length, record_count,
                )
            current_layout_id = layout_id
            current_host = host
            start_offset = offset
            layout_length = length
            record_count = 1

        # Close the last run of this file before moving on to the next file.
        if current_layout_id is not None:
            yield _layout_index_entry(
                current_layout_id, current_host, fpath,
                start_offset, layout_length, record_count,
            )
    if has_error:
        s3_doc_writer.flush()


# main func

In [None]:
def create_spark(spark_name: str):
    """Start the Spark session for this job and publish it as the global ``spark``.

    The application name is ``layout.index.<spark_name>`` and the session is
    built from the module-level ``config``. The context's log level is set to
    ``ERROR``.

    Args:
        spark_name: Suffix appended to the Spark application name.
    """
    global spark
    app_name = f"layout.index.{spark_name}"
    spark = new_spark_session(app_name, config)
    spark.sparkContext.setLogLevel("ERROR")


def parse_input_path(input_path: str):
    """List the batches under ``input_path`` that have not been indexed yet.

    Completed batches are read from the local checkpoint file
    ``./is_index_complated.txt`` (comma-separated paths appended by ``main``).
    A missing or unreadable checkpoint is treated as "nothing completed yet";
    only I/O and decoding errors are tolerated, so interrupts and programming
    errors are no longer swallowed.

    Args:
        input_path: S3 prefix whose direct children are the candidate batches.

    Returns:
        list of paths returned by ``list_s3_objects(input_path, recursive=False)``
        that are absent from the checkpoint, in listing order.
    """
    try:
        with open("./is_index_complated.txt", "r", encoding="utf-8") as f:
            already_exist = set(f.read().split(","))
    except (OSError, UnicodeDecodeError):
        already_exist = set()
    # Splitting leaves an empty string after the trailing comma.
    already_exist.discard("")
    return [path for path in list_s3_objects(input_path, recursive=False) if path not in already_exist]


def create_index_df(input_path_lst: List):
    """Index the given files in parallel and pass the result to ``merge_index``.

    Each path becomes its own RDD partition, ``read_to_index`` turns every file
    into per-layout index entries, and the entries are converted to a DataFrame
    with an explicit schema before being merged and written.

    The Spark context is taken from the global ``spark`` session set by
    ``create_spark`` (that function keeps its ``sc`` variable local, so a bare
    ``sc`` is not defined here). An empty list is a no-op, because it would
    otherwise request an RDD with zero partitions.

    Args:
        input_path_lst: Paths of the ``.jsonl`` files to index.
    """
    if not input_path_lst:
        return
    schema = StructType([
        StructField("layout_id", StringType(), True),
        StructField("url_host_name", StringType(), True),
        StructField("count", LongType(), True),
        StructField("file", StructType([
            StructField("filepath", StringType(), True),
            StructField("offset", LongType(), True),
            StructField("length", LongType(), True),
            StructField("record_count", LongType(), True),
            StructField("timestamp", IntegerType(), True),
        ]), True),
    ])
    # One partition per file, so every read_to_index call handles a single file.
    page_content = spark.sparkContext.parallelize(input_path_lst, len(input_path_lst))
    dump_html_df = page_content.mapPartitions(read_to_index).toDF(schema)
    merge_index(dump_html_df)


def merge_index(dump_html_df: DataFrame):
    """Collapse per-file index entries into one row per ``layout_id`` and write it.

    For every ``layout_id`` the ``count`` values are summed, all ``file``
    structs are gathered into a ``files`` array and one ``url_host_name`` is
    kept via ``first`` (which of the group's values is kept depends on row
    order). The aggregated frame is handed to ``write_by_two``.

    Args:
        dump_html_df: DataFrame of index entries with columns ``layout_id``,
            ``url_host_name``, ``count`` and ``file``.
    """
    per_layout = dump_html_df.groupBy("layout_id")
    aggregations = (
        _sum("count").alias("count"),
        collect_list("file").alias("files"),
        first("url_host_name").alias("url_host_name"),
    )
    write_by_two(per_layout.agg(*aggregations))


def write_by_two(result_df: DataFrame):
    """Serialise each merged index row to JSON and write it to ``OUTPUT_PATH``.

    The four columns ``layout_id``, ``count``, ``files`` and ``url_host_name``
    are packed into a struct and rendered with ``to_json`` into a single
    ``value`` column. The frame is then passed through the function returned by
    ``spark_resize_file(2)`` and written with ``write_any_path``.

    Side effect: sets ``skip_output_version`` and ``skip_output_check`` to
    ``True`` on the shared module-level ``config`` before writing.

    Args:
        result_df: Aggregated DataFrame produced by ``merge_index``.
    """
    packed = struct(*(result_df[name] for name in ("layout_id", "count", "files", "url_host_name")))
    json_df = result_df.withColumn("value", to_json(packed)).select("value")

    # The argument 2 is the original's output_file_size_gb.
    resized_df = spark_resize_file(2)(json_df)

    config.update({"skip_output_version": True, "skip_output_check": True})
    write_any_path(resized_df, OUTPUT_PATH, config)


def close_spark():
    """Stop the global ``spark`` session that ``create_spark`` assigned."""
    spark.stop()


def main():
    """Index every pending batch under ``INPUT_PATH``.

    Steps:
      1. Start Spark, naming the app after the last non-empty segment of
         ``INPUT_PATH`` (a trailing ``/`` is ignored so the name is not empty).
      2. For each batch not yet listed in ``./is_index_complated.txt``, index
         all of its ``.jsonl`` files with ``create_index_df``.
      3. Append the batch to the checkpoint file only after it succeeded, so an
         interrupted run resumes with the first unfinished batch.

    The Spark session is stopped even when a batch raises; the exception still
    propagates to the caller.
    """
    spark_name = INPUT_PATH.rstrip("/").split("/")[-1]
    create_spark(spark_name)
    try:
        for batch in parse_input_path(INPUT_PATH):
            batch_lst = [path for path in list_s3_objects(batch, recursive=True) if path.endswith(".jsonl")]
            create_index_df(batch_lst)
            with open("./is_index_complated.txt", "a", encoding="utf-8") as f:
                f.write(batch + ",")
    finally:
        close_spark()

In [None]:
# Run the job: start Spark, index every pending batch, then stop Spark.
main()