# 1. 导入依赖

In [1]:
from ccnet_spark import open_read, parse_warc_file,compute_hashes,NaiveHashSet
from pathlib import Path
import numpy as np
import time
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType,IntegerType,StructType, StructField
from pyspark.sql.functions import udf, explode

# Create (or reuse) the SparkSession used by every cell below.
session_builder = SparkSession.builder.appName("CCNETSpark")
spark = session_builder.getOrCreate()

24/03/29 13:36:42 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.110.96 instead (on interface en0)
24/03/29 13:36:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/29 13:36:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# 2. 读取文件数据，处理成pandas DataFrame

## 2.1 获取cache文件路径

In [2]:
# NOTE(review): absolute, machine-specific directory; consider making it configurable.
cache_data="/Users/zz/github/cache_data/2019-09/"
def getWETURL(segment: int, cache_dir=None) -> str:
    """Build the local path of one cached WET segment file.

    Args:
        segment: Segment index; it is rendered as a 5-character,
            zero-padded string (e.g. 3 -> "00003").
        cache_dir: Directory prefix (including the trailing separator) that is
            prepended to the file name. Defaults to the module-level
            ``cache_data`` value, looked up at call time.

    Returns:
        ``cache_dir`` + fixed crawl prefix + padded segment + ".warc.wet.gz".
    """
    if cache_dir is None:
        cache_dir = cache_data
    cache_file_prefix = "CC-MAIN-20190215183319-20190215205319-"
    cache_file_suffix = ".warc.wet.gz"
    segment_str = str(segment).zfill(5)  # Pad with leading zeros
    return cache_dir + cache_file_prefix + segment_str + cache_file_suffix
url = getWETURL(3)
print(url)  # Prints the full local path, ending in ...-00003.warc.wet.gz

/Users/zz/github/cache_data/2019-09/CC-MAIN-20190215183319-20190215205319-00003.warc.wet.gz


## 2.2 处理文件，存入pandas DataFrame

In [3]:
# Parse segment 0 into a pandas DataFrame and keep a reproducible random sample.
file_path=Path(getWETURL(0))
file=open_read(file_path)
s=time.time()
# NOTE(review): the meaning of the second argument (30) is not visible here;
# presumably a minimum-length threshold - confirm against parse_warc_file.
pandas_df = parse_warc_file(file, 30)

random_save_n=5000
# Clamp the sample size: DataFrame.sample raises ValueError when n exceeds the
# number of rows (with the default replace=False), e.g. for a small segment.
pandas_df = pandas_df.sample(n=min(random_save_n, len(pandas_df)), random_state=1)
e=time.time()
# The reported duration covers both parsing and sampling.
print(f"parse one seg to pd_df consume:{e-s} s")

2024-03-29 13:36 INFO 96498:ccnet_spark.load - Opening /Users/zz/github/cache_data/2019-09/CC-MAIN-20190215183319-20190215205319-00000.warc.wet.gz with mode 'rt'
2024-03-29 13:36 INFO 96498:root - Created DataFrame with 43855 documents


parse one seg to pd_df consume:3.516249895095825 s


# 3. 将pandas DataFrame 转换成spark DataFrame

In [4]:
# Convert the sampled pandas DataFrame into a Spark DataFrame and preview it.
spark_df = spark.createDataFrame(data=pandas_df)
spark_df.show(n=20, truncate=True)

24/03/29 13:37:03 WARN TaskSetManager: Stage 0 contains a task of very large size (4244 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+--------------------+--------------------+--------------------+------+------+--------------------+-----------------------------+-----------------------------+
|                 url|       date_download|              digest|length|nlines|       source_domain|                        title|                  raw_content|
+--------------------+--------------------+--------------------+------+------+--------------------+-----------------------------+-----------------------------+
|https://www.telel...|2019-02-15T19:35:48Z|sha1:VZYTYZZ7EH6E...|  4758|   111|    www.telelynx.com|         sean, Author at T...|         English\tEnglish\...|
|http://www.ma.hu/...|2019-02-15T19:26:14Z|sha1:FA3DLWLJZKFI...|  4180|    70|           www.ma.hu|         Kiégett a Hűvösvö...|         hirdetés\nma.hu n...|
|http://angagement...|2019-02-15T18:57:03Z|sha1:EMAN4TLHXTXM...|  1326|    65|   angagement.com.ua|         Ballroom and lati...|         → по-русски\nCost...|
|http://resistther...|2019-02-15T19:55:3

# 4. hash计算

## 4.1 定义UDF,将doc 分割成paragraph 

In [5]:
# Split a document's text into numbered lines.
def split_raw_content(content):
    """Split ``content`` on newline characters and number the pieces from 1.

    Args:
        content: Document text, or None for a missing value.

    Returns:
        A list of ``(line_id, line)`` tuples in document order. A None input
        yields an empty list instead of raising, so a null value does not
        fail the caller. An empty string yields ``[(1, "")]``.
    """
    if content is None:
        return []
    return list(enumerate(content.split('\n'), start=1))

# Register the splitter as a Spark UDF returning an array of
# (raw_line_id, raw_line) structs; both fields are declared non-nullable.
line_struct = StructType([
    StructField("raw_line_id", IntegerType(), nullable=False),
    StructField("raw_line", StringType(), nullable=False),
])
split_udf = udf(split_raw_content, returnType=ArrayType(line_struct))


## 4.2 udf 处理添加新字段

In [6]:
# Apply the splitting UDF to the raw_content column of spark_df and store the
# resulting array of line structs in a new "split_content" column.
raw_content_col = spark_df["raw_content"]
split_result = spark_df.withColumn("split_content", split_udf(raw_content_col))

## 4.3 将新字段展开获取paragraph级别row

In [7]:
# Produce one row per line: explode the array of line structs, then pull the
# struct's two fields out as top-level columns (the struct column itself is
# not kept).
exploded_df = (
    split_result
    .select(
        "url",
        "length",
        "nlines",
        "title",
        explode("split_content").alias("exploded_content"),
    )
    .select(
        "url",
        "length",
        "nlines",
        "title",
        "exploded_content.raw_line_id",
        "exploded_content.raw_line",
    )
)

# Preview the line-level frame.
exploded_df.show()

24/03/29 13:37:06 WARN TaskSetManager: Stage 1 contains a task of very large size (4244 KiB). The maximum recommended task size is 1000 KiB.
[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+------+------+--------------------+-----------+------------------------+
|                 url|length|nlines|               title|raw_line_id|                raw_line|
+--------------------+------+------+--------------------+-----------+------------------------+
|https://www.telel...|  4758|   111|sean, Author at T...|          1|    English\tEnglish\ten|
|https://www.telel...|  4758|   111|sean, Author at T...|          2|繁體中文\tChinese (Tr...|
|https://www.telel...|  4758|   111|sean, Author at T...|          3|                    Home|
|https://www.telel...|  4758|   111|sean, Author at T...|          4|                Products|
|https://www.telel...|  4758|   111|sean, Author at T...|          5|              Digital TV|
|https://www.telel...|  4758|   111|sean, Author at T...|          6|    DVB / ISDB-T TV S...|
|https://www.telel...|  4758|   111|sean, Author at T...|          7|            AddTV System|
|https://www.telel...|  4758|   111|sean, Author at T.

                                                                                

## 4.4 查看第[100，100+10)行

In [8]:
# Look at a window of 10 rows starting at the 100th row (1-based).
first_row, row_count = 100, 10
selected_rows = exploded_df.offset(first_row - 1).limit(row_count)
# Display the selected window.
selected_rows.show()

24/03/29 13:37:07 WARN TaskSetManager: Stage 2 contains a task of very large size (4244 KiB). The maximum recommended task size is 1000 KiB.
[Stage 2:>                                                          (0 + 8) / 8]

+--------------------+------+------+--------------------+-----------+--------------------+
|                 url|length|nlines|               title|raw_line_id|            raw_line|
+--------------------+------+------+--------------------+-----------+--------------------+
|https://www.telel...|  4758|   111|sean, Author at T...|        100|              Search|
|https://www.telel...|  4758|   111|sean, Author at T...|        101|         Latest News|
|https://www.telel...|  4758|   111|sean, Author at T...|        102|IBC2019, RAI Amst...|
|https://www.telel...|  4758|   111|sean, Author at T...|        103|CABSAT Dubai 2019...|
|https://www.telel...|  4758|   111|sean, Author at T...|        104|UHD/HD DVB-T2 STB...|
|https://www.telel...|  4758|   111|sean, Author at T...|        105|            Facebook|
|https://www.telel...|  4758|   111|sean, Author at T...|        106|© 2018 Copyright ...|
|https://www.telel...|  4758|   111|sean, Author at T...|        107|   - made by bouncin|

                                                                                

## 4.5 添加hash 列

In [9]:
import hashlib
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from ccnet_spark import normalize_for_dedup
HASH_SIZE = 10  # Number of leading SHA-1 digest bytes kept as the line hash

@udf(returnType=BinaryType())
def compute_hashes(line):
    """Return a truncated SHA-1 hash of the normalized line.

    The line is passed through ``normalize_for_dedup`` (imported from
    ccnet_spark), encoded as UTF-8, hashed with SHA-1, and the first
    ``HASH_SIZE`` bytes of the digest are returned. Falsy input (None or an
    empty string) yields None.
    """
    if line:
        normalized = normalize_for_dedup(line)
        digest = hashlib.sha1(bytes(normalized, "utf-8")).digest()
        return digest[:HASH_SIZE]
    return None

# Attach the truncated hash of each raw_line to the line-level frame.
raw_line_col = exploded_df["raw_line"]
hash_df = exploded_df.withColumn("hash_value", compute_hashes(raw_line_col))

# Preview the frame with the new hash_value column.
hash_df.show()

24/03/29 13:37:12 WARN TaskSetManager: Stage 5 contains a task of very large size (4244 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+------+------+--------------------+-----------+------------------------+--------------------+
|                 url|length|nlines|               title|raw_line_id|                raw_line|          hash_value|
+--------------------+------+------+--------------------+-----------+------------------------+--------------------+
|https://www.telel...|  4758|   111|sean, Author at T...|          1|    English\tEnglish\ten|[A7 E1 3C F2 70 F...|
|https://www.telel...|  4758|   111|sean, Author at T...|          2|繁體中文\tChinese (Tr...|[3E DB 9E EF B5 2...|
|https://www.telel...|  4758|   111|sean, Author at T...|          3|                    Home|[E8 32 49 BD 3B A...|
|https://www.telel...|  4758|   111|sean, Author at T...|          4|                Products|[FB DC 4F 23 F9 3...|
|https://www.telel...|  4758|   111|sean, Author at T...|          5|              Digital TV|[59 44 27 AB 00 F...|
|https://www.telel...|  4758|   111|sean, Author at T...|          6|    DVB

## 4.6 根据 hash 去重

In [10]:
# Keep a single row per distinct hash_value.
dedup_keys = ["hash_value"]
deduplicated_df = hash_df.dropDuplicates(dedup_keys)

In [15]:
# Compare row counts before and after de-duplication; each count() is a Spark action.
origin_count = hash_df.count()
remain_count = deduplicated_df.count()
# Guard against an empty input frame, which would otherwise raise ZeroDivisionError.
keep_percent = remain_count/origin_count*100 if origin_count else 0.0
print(f"Number of origin rows:{origin_count},remain_count:{remain_count} keep:{keep_percent} %")

24/03/29 13:41:58 WARN TaskSetManager: Stage 33 contains a task of very large size (4244 KiB). The maximum recommended task size is 1000 KiB.
24/03/29 13:42:01 WARN TaskSetManager: Stage 36 contains a task of very large size (4244 KiB). The maximum recommended task size is 1000 KiB.
[Stage 36:>                                                         (0 + 8) / 8]

Number of origin rows:798437,remain_count:442901 keep:55.471001469120296%


                                                                                