In [None]:
from xinghe.spark import *
from app.common.json_util import *
from xinghe.s3 import *
from xinghe.s3.read import *

# Settings passed as the second argument to new_spark_session() in
# create_spark(). Keys starting with "spark." look like regular Spark
# properties; the other keys are presumably interpreted by the xinghe
# wrapper itself -- TODO confirm against xinghe.spark.
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
    # Exposes LLM_WEB_KIT_CFG_PATH as an environment variable on the executors.
    "spark.executorEnv.LLM_WEB_KIT_CFG_PATH": "/share/xxx.jsonc",
    "spark.yarn.queue": "pipeline.ehtml",
    "spark.sql.shuffle.partitions": 100000,
}

from llm_web_kit.libs.standard_utils import compress_and_decompress_str
from llm_web_kit.html_layout.html_layout_cosin import cluster_html_struct, get_feature, similarity, sum_tags

import base64
import re
import random
import time
import sys
import pickle
import zlib
import uuid
import traceback
import numpy as np
import pandas as pd
from typing import List, Dict, Union
from copy import deepcopy, copy
from urllib.parse import quote, unquote, urlparse, parse_qs
from datetime import datetime
from lxml import html
from collections import defaultdict
from func_timeout import FunctionTimedOut, func_timeout

from pyspark.sql import Row, DataFrame
from pyspark.sql.functions import row_number, col, collect_list, struct, expr, rand, count, pandas_udf, PandasUDFType, \
    round as _round, lit, to_json, from_json, explode
from pyspark.sql.types import StructType, StructField, IntegerType, BinaryType, StringType, LongType

import os

# Sets LLM_WEB_KIT_CFG_PATH for the current (driver/notebook) process.
os.environ["LLM_WEB_KIT_CFG_PATH"] = "/share/renpengli/.llm-web-kit.jsonc"

# Time limit, in seconds, given to func_timeout() for one
# calculating_similarity() call in parse_similarity().
TIMEOUT_SECONDS = 60 * 2
# Upper bound compared against len() of the serialized JSON string in
# parse_output_data(); longer rows are dropped. Note this is a float.
MAX_OUTPUT_ROW_SIZE = 1024 * 1024 * 1024 * 1.7
# Once the running total of S3DocWriter.write() return values exceeds this,
# save_s3_by_layout() flushes the current file and starts a new one.
MAX_OUTPUT_FILE_SIZE = 1024 * 1024 * 1024 * 10
# Minimum similarity() score for a page to be assigned an existing layout.
SIMILARITY_THRESHOLD = 0.95
# Partition count used before running parse_similarity().
NUM_PARTITIONS = 100000
# Partition count used when repartitioning by layout_id before writing.
WRITE_NUM_PARTITIONS = 20000
# Prefix under which get_s3_doctor() places error logs.
ERROR_PATH = "s3://xxx/"
# Location listed by parse_input_path() to discover batches.
INPUT_PATH = "s3://xxx/"
# Prefix from which parse_path() derives the per-batch OUTPUT_PATH.
BASE_OUTPUT_PATH = "s3://xxx/"


# utils

In [None]:
def get_s3_doctor(target_theme):
    """Create an ``S3DocWriter`` that targets a fresh, uniquely named error log.

    The log path has the form
    ``{ERROR_PATH}{target_theme}/{YYYYMMDD}/{uuid4}.jsonl``.

    Args:
        target_theme: Path segment that groups the error logs of one stage.

    Returns:
        The newly constructed ``S3DocWriter``.
    """
    file_stem = str(uuid.uuid4())
    day_stamp = datetime.now().strftime("%Y%m%d")
    return S3DocWriter(path=f"{ERROR_PATH}{target_theme}/{day_stamp}/{file_stem}.jsonl")


def parse_output_data(row_data):
    """Qualify the row's layout id with its host name and serialize the row.

    ``row_data`` is modified in place: its ``"layout_id"`` becomes
    ``"<url_host_name>_<layout_id>"``.

    Args:
        row_data: Dict holding at least ``"url_host_name"`` and ``"layout_id"``.

    Returns:
        ``{"value": <json string>, "layout_id": <qualified id>}``, or ``None``
        when the serialized string is not shorter than ``MAX_OUTPUT_ROW_SIZE``.
    """
    row_data["layout_id"] = '_'.join((row_data["url_host_name"], str(row_data["layout_id"])))
    serialized = json_dumps(row_data)
    # Oversized rows are dropped rather than emitted.
    if len(serialized) >= MAX_OUTPUT_ROW_SIZE:
        return None
    return {"value": serialized, "layout_id": row_data["layout_id"]}


def calculating_similarity(feature_dict, feature, max_layer_n):
    """Find the first layout whose stored features match ``feature``.

    Args:
        feature_dict: Mapping of layout key to a list of entries, each with a
            ``"feature"`` item.
        feature: Feature of the page being classified.
        max_layer_n: Passed through to ``similarity``.

    Returns:
        The matching layout key converted to ``int``, or ``-2`` when no entry
        reaches ``SIMILARITY_THRESHOLD``.
    """
    for layout_key, candidates in feature_dict.items():
        for candidate in candidates:
            score = similarity(feature, candidate["feature"], max_layer_n)
            if score >= SIMILARITY_THRESHOLD:
                return int(layout_key)
    return -2


def _build_error_info(exc, input_data):
    """Build the error-log record for ``exc``; call only from an ``except`` block."""
    return {
        "error_type": type(exc).__name__,
        "error_message": str(exc),
        "traceback": traceback.format_exc(),
        "input_data": str(input_data),
        "timestamp": datetime.now().isoformat()
    }


def parse_similarity(_iter):
    """Assign a layout id to every record referenced by the rows of a partition.

    Each row carries a JSON ``layout_dict`` (with optional ``layout_list``,
    ``feature_dict`` and ``max_layer_n``), a ``domain`` and a ``file``
    descriptor (``filepath``, ``offset``, ``length``, ``record_count``). The
    byte range of the file is read line by line, at most ``record_count``
    lines are processed, and each record receives a layout id:

    * ``-1`` when the row's ``layout_list`` is exactly ``[-1]``;
    * ``-3`` when ``get_feature`` returns nothing or a feature without tags;
    * otherwise the result of ``calculating_similarity`` (``-2`` = no match),
      limited to ``TIMEOUT_SECONDS``.

    Failures are written to an S3 error log and the offending record (or the
    rest of the row, for read errors) is skipped.

    Args:
        _iter: Iterator over the rows of one partition.

    Yields:
        The dicts returned by ``parse_output_data``; records for which it
        returns ``None`` are skipped.
    """
    s3_doc_writer = get_s3_doctor("parse_similarity")
    has_errors = False
    for row in _iter:
        layout_dict = json_loads(row.layout_dict)
        layout_list = layout_dict.get("layout_list", [])
        # Evaluated per row: the flag must not leak from one row into the
        # following rows of the same partition.
        is_no_layout_id = len(layout_list) == 1 and layout_list[0] == -1
        feature_dict = layout_dict.get("feature_dict", {})
        max_layer_n = layout_dict.get("max_layer_n", 5)
        domain = row.domain
        file_d = row.file
        offset = file_d.offset
        length = file_d.length
        record_count = file_d.record_count
        try:
            raw_lines = read_s3_lines_with_range(file_d["filepath"], use_stream=True,
                                                 bytes_range=(offset, offset + length))
            for idx, raw_line in enumerate(raw_lines, start=1):
                # The byte range may contain more lines than belong to this row.
                if idx > record_count:
                    break
                detail_data = json_loads(raw_line)
                if is_no_layout_id:
                    layout_id = -1
                else:
                    try:
                        feature = get_feature(detail_data["html"])
                        if feature is None or not feature.get("tags"):
                            layout_id = -3
                        else:
                            layout_id = func_timeout(TIMEOUT_SECONDS, calculating_similarity,
                                                     (feature_dict, feature, max_layer_n,))
                    # FunctionTimedOut is listed explicitly because it may not
                    # derive from Exception.
                    except (FunctionTimedOut, Exception) as e:
                        has_errors = True
                        s3_doc_writer.write(_build_error_info(e, detail_data))
                        continue
                line = {
                    "track_id": detail_data["track_id"],
                    "html": detail_data["html"],
                    "url": detail_data["url"],
                    "layout_id": layout_id,
                    "max_layer_n": max_layer_n,
                    "url_host_name": domain,
                    "raw_warc_path": detail_data["raw_warc_path"]
                }
                json_line = parse_output_data(line)
                if json_line is not None:
                    yield json_line
        except Exception as e:
            # Reading or decoding failed: log it and move on to the next row.
            has_errors = True
            s3_doc_writer.write(_build_error_info(e, row))

    # The error log is only flushed when something was written to it.
    if has_errors:
        s3_doc_writer.flush()


def save_s3_by_layout(outdata_list):
    """Write the rows of one partition to gzip JSONL files under ``OUTPUT_PATH``.

    Each row's ``value`` is decoded with ``json_loads`` and written through an
    ``S3DocWriter``. Once the running total of ``write()`` return values
    exceeds ``MAX_OUTPUT_FILE_SIZE`` the current file is flushed and the next
    record starts a new, uuid-named file. Rows that fail are recorded in an
    S3 error log and skipped.

    ``OUTPUT_PATH`` is a module global assigned by ``parse_path``.

    Args:
        outdata_list: Iterator over rows exposing a ``value`` attribute.

    Yields:
        A single dict ``{"write_size": <number of records written>}``.
    """
    s3_doc_writer = get_s3_doctor("similarity_write")
    has_errors = False
    s3_writer = None
    written = 0
    output_file_size = 0
    for row in outdata_list:
        try:
            # Rotate based on the writer itself, not on whether a previous
            # record happened to be truthy.
            if s3_writer is not None and output_file_size > MAX_OUTPUT_FILE_SIZE:
                s3_writer.flush()
                s3_writer = None
                output_file_size = 0
            json_line = json_loads(row.value)
            if s3_writer is None:
                # Opened lazily so an empty partition creates no file.
                s3_writer = S3DocWriter(f"{OUTPUT_PATH}{uuid.uuid4()}.jsonl.gz")
            output_file_size += s3_writer.write(json_line)
            written += 1
        except Exception as e:
            has_errors = True
            s3_doc_writer.write({
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
                "input_data": row.value,
                "timestamp": datetime.now().isoformat()
            })
            continue

    # s3_writer can be None here (empty partition, rotation followed by a
    # failing row, or a failed writer construction), so guard on it directly.
    if s3_writer is not None:
        s3_writer.flush()
    if has_errors:
        s3_doc_writer.flush()
    yield {"write_size": written}

# main func

In [None]:
def parse_input_path(input_path):
    """List the batches under ``input_path`` that have not been processed yet.

    Completed batches are read from the comma-separated marker file
    ``./is_similarity_complated.txt`` (appended to by ``main``). Listed paths
    have their ``s3://`` scheme rewritten to ``s3a://``; only the scheme is
    touched, so an ``s3`` substring inside a bucket or key name is preserved.

    NOTE(review): markers written by an earlier version that replaced every
    ``s3`` occurrence will not match paths whose bucket/key contains ``s3``.

    Args:
        input_path: S3 location whose direct children are the batches.

    Returns:
        List of ``s3a://`` paths, in listing order, excluding completed ones.
    """
    try:
        with open("./is_similarity_complated.txt", "r", encoding="utf-8") as f:
            already_exist = {entry for entry in f.read().split(",") if entry}
    except OSError:
        # No readable marker file: treat every batch as pending.
        already_exist = set()

    scheme = "s3://"
    input_path_lst = []
    for listed in list_s3_objects(input_path, recursive=False):
        if listed.startswith(scheme):
            listed = "s3a://" + listed[len(scheme):]
        if listed not in already_exist:
            input_path_lst.append(listed)
    return input_path_lst


def parse_path(batch):
    """Split ``batch`` on ``/`` and set the global ``OUTPUT_PATH`` for it.

    ``OUTPUT_PATH`` becomes ``BASE_OUTPUT_PATH`` followed by the third-to-last
    and second-to-last path components, each terminated by ``/``.

    Args:
        batch: Path of the batch being processed.

    Returns:
        The list of ``/``-separated components of ``batch``.
    """
    global OUTPUT_PATH
    segments = batch.split('/')
    parent_dir, leaf_dir = segments[-3], segments[-2]
    OUTPUT_PATH = f"{BASE_OUTPUT_PATH}{parent_dir}/{leaf_dir}/"
    return segments


def create_spark(spark_name: str):
    """Create the global ``spark`` session and set its log level to ERROR.

    Args:
        spark_name: Suffix appended to ``layout.similarity.`` to form the
            name passed to ``new_spark_session``.
    """
    global spark
    app_name = f"layout.similarity.{spark_name}"
    spark = new_spark_session(app_name, config)
    spark.sparkContext.setLogLevel("ERROR")


def get_domain_df(batch):
    """Load one batch and hand its per-file rows to ``similarity_every_domain``.

    The batch is read as JSON, rows with a null ``layout_dict`` are dropped,
    and the ``files`` array is exploded into one row per element (column
    ``file``), with ``files`` removed.

    Args:
        batch: Path of the batch to load.
    """
    reader = spark.read.format("json")
    with_layout_df = reader.load(batch).filter(col("layout_dict").isNotNull())
    exploded_df = with_layout_df.withColumn("file", explode(col("files")))
    similarity_every_domain(exploded_df.drop("files"))


def similarity_every_domain(input_df: DataFrame):
    """Run ``parse_similarity`` over ``input_df`` and pass the result on for writing.

    The input is repartitioned into ``NUM_PARTITIONS`` partitions, mapped
    partition-wise, and converted back into a DataFrame with two nullable
    string columns, ``value`` and ``layout_id``.

    Args:
        input_df: DataFrame with one row per file to process.
    """
    result_schema = StructType(
        [StructField(column, StringType(), True) for column in ('value', 'layout_id')]
    )
    spread_rdd = input_df.repartition(NUM_PARTITIONS).rdd
    labelled_df = spread_rdd.mapPartitions(parse_similarity).toDF(result_schema)
    write_by_layoutid(labelled_df)


def write_by_layoutid(all_domain_df: DataFrame):
    """Group rows by ``layout_id`` and write every partition to S3.

    Rows are repartitioned into ``WRITE_NUM_PARTITIONS`` partitions keyed on
    ``layout_id`` and sorted by ``layout_id`` within each partition, then
    each partition is handed to ``save_s3_by_layout``.

    Args:
        all_domain_df: DataFrame with ``value`` and ``layout_id`` columns.
    """
    final_df = all_domain_df.repartition(WRITE_NUM_PARTITIONS, col("layout_id")) \
        .sortWithinPartitions(col("layout_id"))
    # mapPartitions is lazy; count() is the action that actually runs the
    # writes. Its result is not used.
    final_df.rdd.mapPartitions(save_s3_by_layout).count()


def close_spark():
    """Stop the global ``spark`` session created by ``create_spark``."""
    spark.stop()


def main():
    """Process every pending batch under ``INPUT_PATH``, one Spark session each.

    For each batch the output path is derived, a session is created, the
    batch is processed and the session is stopped. The batch is appended to
    ``./is_similarity_complated.txt`` only after it finished without raising,
    so a failed batch is picked up again on the next run.
    """
    for batch in parse_input_path(INPUT_PATH):
        path_list = parse_path(batch)
        create_spark('_'.join([path_list[-3], path_list[-2]]))
        try:
            get_domain_df(batch)
        finally:
            # Stop the session even when processing fails, so it is not leaked.
            close_spark()
        with open("./is_similarity_complated.txt", "a", encoding="utf-8") as f:
            f.write(batch + ",")

In [None]:
# Run the whole pipeline when this cell is executed.
main()