In [None]:
from pyspark.sql import Row
from xinghe.spark import *
from app.common.json_util import *
from xinghe.s3 import *

# Shared options dict handed to new_spark_session, read_any_path and write_any_path
# in this notebook; the final cell adds output options to this same dict.
# NOTE(review): the meaning of each key is defined by the xinghe helpers, not here — confirm there.
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
}

from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType
import zlib
import base64
from typing import Union

# 1.5 * 1024**3; the 1.5 factor makes this a float. Presumably a byte count — TODO confirm.
# Not referenced by any other cell visible in this notebook.
MAX_OUTPUT_ROW_SIZE = 1024 * 1024 * 1024 * 1.5
# Dump identifiers; each entry is appended to CC_WARC and to base_unique_path to build
# the per-dump input prefixes. NOTE(review): the list body is elided (`...`) in this copy.
DUMPS = [
    ...
]
# Root prefix under which each dump's objects are listed (only keys containing "/warc/" are kept).
CC_WARC = 's3://xx/'
# Root prefix of the per-dump inputs whose `value` column carries JSON with a track_id.
base_unique_path = "s3://xxx/"
# Destination passed to write_any_path in the last cell.
output_path = "s3://xxx/"

# NOTE(review): the first argument looks like the Spark application name — confirm in xinghe.spark.
spark = new_spark_session("cc_dumps.dedup.thr", config)
sc = spark.sparkContext

In [None]:
def compress_and_decompress_str(input_data: Union[str, bytes, bytearray], compress: bool = True, base: bool = False) -> Union[str, bytes]:
    """Zlib-compress text/bytes, or decompress zlib data back into a UTF-8 string.

    Args:
        input_data: When compressing, a ``str`` (encoded as UTF-8 first) or a
            ``bytes``/``bytearray`` payload. When decompressing, either a
            Base64 ``str`` wrapping zlib data, or raw zlib ``bytes``/``bytearray``.
        compress: ``True`` to compress, ``False`` to decompress.
        base: Only consulted when compressing. If ``True`` the compressed bytes
            are returned as a Base64 ``str``; otherwise raw ``bytes`` are returned.
            When decompressing, the input type alone decides whether Base64
            decoding is applied.

    Returns:
        Compressed ``bytes`` (or a Base64 ``str`` when ``base`` is true) when
        compressing; the decompressed text as ``str`` when decompressing.

    Raises:
        TypeError: If ``input_data`` is not one of the accepted types.
        ValueError: If Base64 decoding, zlib (de)compression, or UTF-8 decoding
            of the decompressed payload fails. The original exception is
            attached as ``__cause__``.
    """
    # Imported here so the except clause does not rely on the undocumented
    # ``base64.binascii`` attribute.
    import binascii

    try:
        if compress:
            # Normalise the input to an immutable byte string.
            if isinstance(input_data, str):
                input_bytes = input_data.encode('utf-8')
            elif isinstance(input_data, (bytes, bytearray)):
                # bytearray is accepted here for symmetry with the decompress path.
                input_bytes = bytes(input_data)
            else:
                raise TypeError("Input must be a string or bytes object.")

            compressed_bytes = zlib.compress(input_bytes)
            if base:
                # Wrap the compressed payload as a Base64 text string.
                return base64.b64encode(compressed_bytes).decode('utf-8')
            return compressed_bytes

        # Decompression: a str is treated as Base64 text, bytes-like input as raw zlib data.
        if isinstance(input_data, str):
            compressed_bytes = base64.b64decode(input_data)
        elif isinstance(input_data, (bytes, bytearray)):
            compressed_bytes = bytes(input_data)
        else:
            raise TypeError("Input must be a Base64 encoded string or bytes object.")

        # The original payload is assumed to be UTF-8 encoded text.
        return zlib.decompress(compressed_bytes).decode('utf-8')

    except (zlib.error, binascii.Error, UnicodeDecodeError) as e:
        raise ValueError(f"Error during compression/decompression: {e}") from e



# HTML source

In [None]:
# Collect, across every dump, the listed object paths that contain a "/warc/" segment.
warc_paths = [
    object_path
    for dump in DUMPS
    for object_path in list_s3_objects(f'{CC_WARC}{dump}/', recursive=True)
    if "/warc/" in object_path
]


In [None]:
def parse_path_to_html(iter):
    """Yield one Row per record with a non-empty "html" field in the given files.

    Intended for ``RDD.mapPartitions``: ``iter`` is an iterable of file paths.
    Every row returned by ``read_s3_rows`` for a path is parsed with
    ``json_loads``; rows that fail to parse, or whose "html" entry is missing
    or falsy, are skipped. Each kept record gains two keys before it is
    serialised again with ``json_dumps``:

    * ``raw_warc_path`` - the row's ``loc`` attribute.
    * ``sub_path`` - ``<component 4 of the '/'-split path>/<file name without '.warc.gz'>``.

    Yields:
        Row(track_id, value): ``track_id`` taken from the record, ``value`` the
        re-serialised JSON. Field order matches ``html_schema``.

    Raises:
        KeyError: If a kept record has no "track_id" key.
        IndexError: If a path has fewer than five '/'-separated components.
    """
    for fpath in iter:
        # Depends only on the file path, so compute it once per file rather than per row.
        path_parts = fpath.split('/')
        sub_path = f"{path_parts[4]}/{path_parts[-1].replace('.warc.gz', '')}"

        for zz in read_s3_rows(fpath):
            try:
                detail_datas = json_loads(zz.value)
            except Exception:
                # Best effort: skip rows that cannot be parsed. `Exception` (not a bare
                # except) so KeyboardInterrupt/SystemExit still propagate.
                continue

            if not detail_datas.get("html", ""):
                continue

            detail_datas["raw_warc_path"] = zz.loc
            detail_datas["sub_path"] = sub_path
            # Keyword order deliberately follows html_schema (track_id, value): since
            # Spark 3.0 a Row keeps the order its keywords were given in, and a Row that
            # disagrees with an explicit schema can end up with its columns swapped.
            yield Row(track_id=detail_datas["track_id"], value=json_dumps(detail_datas))

In [None]:
# Explicit schema for the rows produced by parse_path_to_html: two nullable string columns.
html_schema = StructType([
    StructField("track_id", StringType(), True),
    StructField("value", StringType(), True),
])
# One slice per input path. numSlices is clamped to at least 1 because
# SparkContext.parallelize divides by it and raises ZeroDivisionError when
# warc_paths is empty; with the clamp an empty listing yields an empty DataFrame.
page_content = sc.parallelize(warc_paths, max(1, len(warc_paths)))
dump_html_df = page_content.mapPartitions(parse_path_to_html).toDF(html_schema)

# Unique IDs

In [None]:
# One input prefix per dump; they are handed to read_any_path as a single
# comma-separated path string.
input_paths = [f"{base_unique_path}{dump}/" for dump in DUMPS]
unique_id_df = read_any_path(spark, ",".join(input_paths), config)

In [None]:
# Only track_id is extracted from the JSON held in each row's `value` column.
unique_schema = StructType([
    StructField("track_id", StringType(), True),
])

# The struct column created here must carry the same name that the select below
# expands; the previous spelling ("jsocn_strut") left "json_struct.*" unresolved.
dump_ods_df_with_struct = unique_id_df.withColumn("json_struct", from_json(unique_id_df.value, unique_schema))
unique_id_v_df = dump_ods_df_with_struct.select("json_struct.*")

In [None]:
# Keep only the HTML rows whose track_id also appears in unique_id_v_df.
inner_df = dump_html_df.join(unique_id_v_df, "track_id", "inner")

# Write gzip output

In [None]:
# Only the serialised JSON payload is kept for output; the join key column is dropped.
output_df = inner_df.select(inner_df["value"])

In [None]:
# Add the output options to the shared config dict in place, then write the result.
config.update({"skip_output_version": True, "output_compression": "gz"})
write_any_path(output_df, output_path, config)