# 1. 导入依赖

In [1]:
from ccnet_spark import open_read, parse_warc_file,compute_hashes,NaiveHashSet
from pathlib import Path
import numpy as np
import time
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType,IntegerType,StructType, StructField
from pyspark.sql.functions import udf, explode

# Initialise the SparkSession for this notebook (app name "CCNETSpark");
# getOrCreate() returns the already-active session when one exists.
spark = (
    SparkSession.builder
    .appName("CCNETSpark")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/29 11:43:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/29 11:43:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# 2. 读取文件数据，处理成pandas DataFrame

## 2.1 获取cache文件路径

In [2]:
# Default directory holding the locally cached WET segments.
cache_data="/Users/zz/github/cache_data/2019-09/"
def getWETURL(segment: int, cache_dir=None):
    """Build the location of one cached WET segment file.

    Args:
        segment: Segment index; it is rendered as a zero-padded 5-character
            string (e.g. 3 -> "00003").
        cache_dir: Directory (or URL prefix) that contains the segment files.
            Defaults to the module-level ``cache_data``. A missing trailing
            "/" is added automatically so the result is never glued onto the
            last directory component.

    Returns:
        The directory prefix followed by
        "CC-MAIN-20190215183319-20190215205319-<segment>.warc.wet.gz", as a str.
    """
    base_dir = str(cache_data if cache_dir is None else cache_dir)
    # Plain concatenation silently produced ".../2019-09CC-MAIN-..." when the
    # directory lacked a trailing slash; normalise it here instead.
    if base_dir and not base_dir.endswith("/"):
        base_dir += "/"
    cache_file_prefix = "CC-MAIN-20190215183319-20190215205319-"
    cache_file_suffix = ".warc.wet.gz"
    segment_str = str(segment).zfill(5)  # Pad with leading zeros
    return f"{base_dir}{cache_file_prefix}{segment_str}{cache_file_suffix}"
url = getWETURL(3)
print(url)  # Output: <cache_data>CC-MAIN-20190215183319-20190215205319-00003.warc.wet.gz

/Users/zz/github/cache_data/2019-09/CC-MAIN-20190215183319-20190215205319-00003.warc.wet.gz


## 2.2 处理文件，存入pandas DataFrame

In [3]:
# Open the cached WET segment with index 0.
file_path=Path(getWETURL(0))
file=open_read(file_path)
# The timer starts after the file has been opened, so opening is not measured.
s=time.time()
# NOTE(review): the second argument (30) is presumably a minimum-length
# threshold for documents — confirm against parse_warc_file's signature.
pandas_df = parse_warc_file(file, 30)

# Keep only random_save_n rows, sampled with a fixed random_state so the
# selection is repeatable between runs.
random_save_n=5
pandas_df = pandas_df.sample(n=random_save_n, random_state=1)
# The timer stops after sampling, so the printed duration covers both the
# parsing and the sampling step.
e=time.time()
print(f"parse one seg to pd_df consume:{e-s} s")

2024-03-29 11:44 INFO 61632:ccnet_spark.load - Opening /Users/zz/github/cache_data/2019-09/CC-MAIN-20190215183319-20190215205319-00000.warc.wet.gz with mode 'rt'
2024-03-29 11:44 INFO 61632:root - Created DataFrame with 43855 documents


parse one seg to pd_df consume:3.609127998352051 s


# 3. 将pandas DataFrame 转换成spark DataFrame

In [4]:
# Convert the pandas DataFrame into a Spark DataFrame and print a preview of it
spark_df = spark.createDataFrame(pandas_df)
spark_df.show()

                                                                                

+--------------------+--------------------+--------------------+------+------+--------------------+--------------------+--------------------+
|                 url|       date_download|              digest|length|nlines|       source_domain|               title|         raw_content|
+--------------------+--------------------+--------------------+------+------+--------------------+--------------------+--------------------+
|https://www.telel...|2019-02-15T19:35:48Z|sha1:VZYTYZZ7EH6E...|  4758|   111|    www.telelynx.com|sean, Author at T...|English\tEnglish\...|
|http://www.ma.hu/...|2019-02-15T19:26:14Z|sha1:FA3DLWLJZKFI...|  4180|    70|           www.ma.hu|Kiégett a Hűvösvö...|hirdetés\nma.hu n...|
|http://angagement...|2019-02-15T18:57:03Z|sha1:EMAN4TLHXTXM...|  1326|    65|   angagement.com.ua|Ballroom and lati...|→ по-русски\nCost...|
|http://resistther...|2019-02-15T19:55:31Z|sha1:UWDJJTE42LC7...|   912|    23|resisttherock.uco...|Comments or Sugge...|Unwanted Resistan...|
|http:

# 4. hash计算

## 4.1 定义 UDF，将 doc 分割成 paragraph

In [5]:
# Helper that splits a document's text into numbered lines
def split_raw_content(content):
    """Split a document into (line_id, line) pairs.

    Args:
        content: Full text of one document, lines separated by '\\n'.
            May be None (e.g. a null ``raw_content`` cell).

    Returns:
        A list of ``(raw_line_id, raw_line)`` tuples with ids starting at 1.
        An empty string yields ``[(1, '')]``; ``None`` yields ``[]`` so that
        a null cell does not raise inside the UDF and abort the whole job.
    """
    if content is None:
        return []
    # enumerate(..., start=1) yields the 1-based line number alongside each line
    return list(enumerate(content.split('\n'), start=1))

# Register split_raw_content as a Spark UDF whose return type is
# array<struct<raw_line_id: int, raw_line: string>>, both fields non-nullable.
split_udf = udf(
    split_raw_content,
    returnType=ArrayType(
        StructType(
            [
                StructField("raw_line_id", IntegerType(), nullable=False),
                StructField("raw_line", StringType(), nullable=False),
            ]
        )
    ),
)


## 4.2 udf 处理添加新字段

In [6]:
# spark_df is the document-level DataFrame built earlier.
# Run the splitting UDF over its raw_content column and store the result
# in a new "split_content" column.
split_result = spark_df.withColumn(
    "split_content",
    split_udf(spark_df.raw_content),
)

## 4.3 将新字段展开，获取 paragraph 级别的 row

In [8]:
# Document-level columns carried over unchanged onto every line-level row
doc_columns = ["url", "digest", "length", "nlines", "source_domain", "title"]

# First select: explode split_content so each (raw_line_id, raw_line) struct
# becomes its own row. Second select: pull the two struct fields out as
# top-level columns, which also leaves the temporary exploded_content behind.
exploded_df = (
    split_result
    .select(*doc_columns, explode(split_result.split_content).alias("exploded_content"))
    .select(*doc_columns, "exploded_content.raw_line_id", "exploded_content.raw_line")
)

# Show the resulting dataframe
exploded_df.show()

+--------------------+--------------------+------+------+----------------+--------------------+-----------+------------------------+
|                 url|              digest|length|nlines|   source_domain|               title|raw_line_id|                raw_line|
+--------------------+--------------------+------+------+----------------+--------------------+-----------+------------------------+
|https://www.telel...|sha1:VZYTYZZ7EH6E...|  4758|   111|www.telelynx.com|sean, Author at T...|          1|    English\tEnglish\ten|
|https://www.telel...|sha1:VZYTYZZ7EH6E...|  4758|   111|www.telelynx.com|sean, Author at T...|          2|繁體中文\tChinese (Tr...|
|https://www.telel...|sha1:VZYTYZZ7EH6E...|  4758|   111|www.telelynx.com|sean, Author at T...|          3|                    Home|
|https://www.telel...|sha1:VZYTYZZ7EH6E...|  4758|   111|www.telelynx.com|sean, Author at T...|          4|                Products|
|https://www.telel...|sha1:VZYTYZZ7EH6E...|  4758|   111|www.telelynx.com