# ccnet spark pipeline 实现

## 1. 导入依赖

In [26]:
from ccnet_spark.text_normalizer import normalize
from ccnet_spark.pipe_preprocess import load_segments
from ccnet_spark.pipe_hash import compute_hashes,split_doc2para
from ccnet_spark.pipe_lid import predictLang,predictScore
from ccnet_spark.pipe_tokenized import doSentencePiece
from ccnet_spark.pipe_perplexity import doDocLM
from ccnet_spark.pipe_ppbucket import doPPBucket
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import explode
from pyspark.sql.functions import sum as spark_sum

# 初始化 SparkSession
spark = SparkSession.builder.appName("CCNETSpark")  \
                    .config("spark.executor.memory", "100g") \
                    .config("spark.driver.memory", "32g") \
                    .config("spark.driver.maxResultSize", "32g") \
                    .config('spark.sql.execution.arrow.pyspark.enabled', 'true') \
                    .getOrCreate()
sc = spark.sparkContext

## 2. 参数配置

In [2]:
def getModePara(mode):
    if(mode=="test"):
        cache_folder="/root/wxl_folder/cache_data/"
        date="2019-09" ## hardcode ,现在只能是这个
        segments=[i for i in range(10)]
        min_len=300
        isSample=True
        sampleRate=0.01
    else:
        cache_folder="/root/wxl_folder/cache_data/"
        date="2019-09" ## hardcode ,现在只能是这个
        segments=[i for i in range(10)]
        min_len=300
        isSample=True
        sampleRate=0.01
    return [cache_folder,date,segments,min_len,isSample,sampleRate]

In [3]:
mode="test"
cache_folder,date,segments,min_len,isSample,sampleRate=getModePara(mode)

## 2.1 读取文件数据，处理成spark DataFrame

In [4]:
s=time.time()
spark_df=load_segments(spark,segments,cache_folder,date=date,isSample=isSample,sampleRate=sampleRate,min_len=min_len)
e=time.time()
print(f"load {len(segments)} segments,time consume:{e-s}s")
if(mode=="test"):
    doc_count=spark_df.count()
    print(f"load {doc_count} docs from:{len(segments)} segments")

2024-04-02 12:36 INFO 679731:root - load segment 0, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 12:36 INFO 679731:root - load segment 1, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 12:36 INFO 679731:root - load segment 2, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 12:36 INFO 679731:root - load segment 3, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 12:36 INFO 679731:root - load segment 4, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 12:36 INFO 679731:root - load segment 5, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 12:36 INFO 679731:root - load segment 6, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 12:36 INFO 679731:root - load segment 7, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 12:36 INFO 679731:root - load segment 8, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 12:36 INFO 679731:root - load segment 9, with sampleRate:1.0%,

load 10 segments,time consume:3.18243408203125s
load 4243 docs from:10 segments


## 3 字段分析
1. wet 文件本身带有长度："length": length,这个是从wet的"Content-Length:"读出来的，和我计算len(raw_content）有出入。考虑原因是原先的length不只是说raw_content，还包括title等。

In [5]:
if(mode=="test"):
    print("=== TestMode Log:")
    s=time.time()
    print(spark_df.summary())
    tmp_df = spark_df.withColumn("compute_length", F.length(spark_df["raw_content"]))
    tmp_df.select("url","length","raw_content","title","nlines","compute_length").show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:
DataFrame[summary: string, url: string, date_download: string, digest: string, length: string, nlines: string, source_domain: string, title: string, raw_content: string]
+--------------------+------+---------------------------------+-------------------------------------+------+--------------+
|                 url|length|                      raw_content|                                title|nlines|compute_length|
+--------------------+------+---------------------------------+-------------------------------------+------+--------------+
|http://www.drperi...|  9959|OA登录 人力资源系统 协同办公系...|宋志平到山东济南、烟台和青州交流调...|    94|          3474|
|http://www.lojapl...|  4502|             JavaScript parece...|                 Suporte Celular A...|   260|          4314|
|https://www.sinto...|   978|             Encontro Sintonia...|                 Bandinhas em Desf...|    30|           823|
|https://www.birot...| 10050|             biroto Startseite...|                 Saint-Jacques-de-.

### 3.1 修改length

In [6]:
spark_df=spark_df.withColumn("length", F.length(spark_df["raw_content"]))

## 4. hash计算

### 4.2 udf 处理添加新字段

In [9]:
# 假设spark_df是您的DataFrame
# 使用UDF对raw_content字段进行处理
split_result = spark_df.withColumn("split_content", split_doc2para(spark_df["raw_content"]))
if(mode=="test"):
    print("=== TestMode Log:")
    s=time.time()
    print(split_result.summary())
    split_result.select("url","length","nlines","raw_content","split_content").show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:
DataFrame[summary: string, url: string, date_download: string, digest: string, length: string, nlines: string, source_domain: string, title: string, raw_content: string]
+--------------------+------+------+---------------------------------+----------------------------+
|                 url|length|nlines|                      raw_content|               split_content|
+--------------------+------+------+---------------------------------+----------------------------+
|http://www.drperi...|  3474|    94|OA登录 人力资源系统 协同办公系...|[{0, OA登录 人力资源系统 ...|
|http://www.lojapl...|  4314|   260|             JavaScript parece...|        [{0, JavaScript p...|
|https://www.sinto...|   823|    30|             Encontro Sintonia...|        [{0, Encontro Sin...|
|https://www.birot...|  9823|   226|             biroto Startseite...|        [{0, biroto Start...|
|http://e-66.ru/de...|  2053|    87|             В каталоге компан...|        [{0, В каталоге к...|
+--------------------+------+----

### 4.3 将新字段展开获取paragraph级别row

In [10]:
# Explode the split_content column and select the desired columns
exploded_df = split_result.select("url","date_download","digest","length","nlines","source_domain","title","raw_content", explode(split_result.split_content).alias("exploded_content"))

# Split the exploded_content struct into separate columns
exploded_df = exploded_df.withColumn("raw_line_id", exploded_df.exploded_content.raw_line_id)
exploded_df = exploded_df.withColumn("raw_line", exploded_df.exploded_content.raw_line)

# Drop the exploded_content column if needed
exploded_df = exploded_df.drop("exploded_content")

if(mode=="test"):
    exploded_df.cache()
    print("=== TestMode Log:")
    s=time.time()
    print(exploded_df.summary())
    exploded_df.select("url","raw_content","raw_line_id","raw_line").show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:
DataFrame[summary: string, url: string, date_download: string, digest: string, length: string, nlines: string, source_domain: string, title: string, raw_content: string, raw_line_id: string, raw_line: string]
+--------------------+---------------------------------+-----------+----------------------------------+
|                 url|                      raw_content|raw_line_id|                          raw_line|
+--------------------+---------------------------------+-----------+----------------------------------+
|http://www.drperi...|OA登录 人力资源系统 协同办公系...|          0|OA登录 人力资源系统 协同办公系统 ||
|http://www.drperi...|OA登录 人力资源系统 协同办公系...|          1|                         邮箱登录||
|http://www.drperi...|OA登录 人力资源系统 协同办公系...|          2|                         下载专区||
|http://www.drperi...|OA登录 人力资源系统 协同办公系...|          3|                         信息报送||
|http://www.drperi...|OA登录 人力资源系统 协同办公系...|          4|                         档案系统||
+--------------------+--------------

### 4.4 添加hash 列

In [11]:
# Assuming you have a dataframe named 'df' with a 'raw_line' column
hash_df = exploded_df.withColumn("hash_value", compute_hashes(exploded_df.raw_line))

# Show the resulting dataframe
if(mode=="test"):
    print("=== TestMode Log:")
    s=time.time()
    print(hash_df.summary())
    hash_df.show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:
DataFrame[summary: string, url: string, date_download: string, digest: string, length: string, nlines: string, source_domain: string, title: string, raw_content: string, raw_line_id: string, raw_line: string]
+--------------------+--------------------+--------------------+------+------+--------------+-------------------------------------+---------------------------------+-----------+----------------------------------+--------------------+
|                 url|       date_download|              digest|length|nlines| source_domain|                                title|                      raw_content|raw_line_id|                          raw_line|          hash_value|
+--------------------+--------------------+--------------------+------+------+--------------+-------------------------------------+---------------------------------+-----------+----------------------------------+--------------------+
|http://www.drperi...|2019-02-15T20:06:54Z|sha1:TETCOD7OGM66...|  3474|

### 4.5根据 hash 去重

In [12]:
deduplicated_df = hash_df.dropDuplicates(['hash_value'])
# Show the resulting dataframe
if(mode=="test"):
    print("=== TestMode Log:")
    deduplicated_df.cache()
    s=time.time()
    print(deduplicated_df.summary())
    deduplicated_df.select("url","length","nlines","raw_content","raw_line_id","hash_value").show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:
DataFrame[summary: string, url: string, date_download: string, digest: string, length: string, nlines: string, source_domain: string, title: string, raw_content: string, raw_line_id: string, raw_line: string]


[Stage 18:>                                                         (0 + 1) / 1]

+--------------------+------+------+---------------------------------+-----------+--------------------+
|                 url|length|nlines|                      raw_content|raw_line_id|          hash_value|
+--------------------+------+------+---------------------------------+-----------+--------------------+
|http://provincia....|  1545|    29|             Questo sito usa c...|          6|[00 00 B7 6B E5 F...|
|https://www.lean....| 10704|   204|             X\nmenu\nMaking t...|        184|[00 02 FD 23 BA F...|
|https://www.vinod...| 16756|   887|             en\nit\nArea Clie...|        104|[00 06 57 8E 98 6...|
|http://www.xinxia...|  2580|   109|自贸区公司注册 | 崇明公司注册 ...|        107|[00 06 D4 BA 9B E...|
|https://www.healt...|  1481|    51|             Health Chronicle\...|         43|[00 07 84 57 41 6...|
+--------------------+------+------+---------------------------------+-----------+--------------------+
only showing top 5 rows

time consume:22.999247789382935s


                                                                                

### 4.6 聚合
将段落重新聚合为doc

In [13]:
from pyspark.sql import functions as F

"url","date_download","digest","length","nlines","source_domain","title","raw_content",
group_df = deduplicated_df.groupBy("digest").agg(
    F.first("url").alias("url"),
    F.first("date_download").alias("date_download"),
    F.first("source_domain").alias("source_domain"),
    F.first("length").alias("original_length"),
    F.first("nlines").alias("original_nlines"),
    F.first("title").alias("title"),
    F.concat_ws("\n", F.collect_list("raw_line").alias("raw_content")).alias("raw_content"),
    F.count("raw_line_id").alias("nlines"),
    F.collect_list("raw_line_id").alias("line_ids"),
)
group_df=group_df.withColumn("length", F.length(group_df["raw_content"]))
if(mode=="test"):
    print("=== TestMode Log:")
    group_df.cache()
    s=time.time()
    group_df.select("url","original_length","original_nlines","raw_content","length","nlines").show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:




+--------------------+---------------+---------------+--------------------+------+------+
|                 url|original_length|original_nlines|         raw_content|length|nlines|
+--------------------+---------------+---------------+--------------------+------+------+
|https://www.anime...|           1194|             20|No Bootleg Policy...|  1149|    18|
|https://www.campi...|           6957|            362|Réfrigérateurs fi...|  6481|   318|
|http://www.petfoo...|           9091|            257|How to Satisfy Ca...|  7841|   151|
|https://tipsavenu...|          13346|             97|Oficjalna strona ...| 12642|    73|
|https://gitlab.co...|           1295|             98|Snippets\nClone w...|   858|    48|
+--------------------+---------------+---------------+--------------------+------+------+
only showing top 5 rows

time consume:10.880084037780762s


                                                                                

### 4.7 计算留存比例

In [15]:
if(mode=="test"):
    print("=== TestMode Log:")
    s = time.time()
    origin_chars = spark_df.agg(spark_sum("length")).collect()[0][0]
    remain_chars = group_df.agg(spark_sum("length")).collect()[0][0]
    e = time.time()
    print(f"origin chars:{origin_chars/1000/1000}M,remain_chars:{remain_chars/1000/1000}M \n \
            keep chars:{round(remain_chars/origin_chars*100,3)} % time consume:{e-s}")
else:
    print("=== DevMode Log:")
    s = time.time()
    origin_chars = spark_df.agg(spark_sum("length")).collect()[0][0]
    remain_chars = group_df.agg(spark_sum("length")).collect()[0][0]
    e = time.time()
    print(f"origin chars:{origin_chars/1000/1000}M,remain_chars:{remain_chars/1000/1000}M \n \
            keep chars:{round(remain_chars/origin_chars*100,3)} % time consume:{e-s}s")

=== TestMode Log:
origin chars:26.911109M,remain_chars:22.360021M 
             keep chars:83.088 % time consume:0.6906998157501221


## 5. 语言识别导入

In [20]:
lang_df = group_df.withColumn("lang", predictLang("raw_content"))
lang_df = lang_df.withColumn("score", predictScore("raw_content"))

if(mode=="test"):
    print("=== TestMode Log:")
    s = time.time()
    lang_df.select("url","raw_content","lang","score").show(5)
    e = time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:
+--------------------+--------------------+----+-----+
|                 url|         raw_content|lang|score|
+--------------------+--------------------+----+-----+
|https://www.anime...|No Bootleg Policy...|  en| 0.76|
|https://www.campi...|Réfrigérateurs fi...|  fr| 0.93|
|http://www.petfoo...|How to Satisfy Ca...|  en| 0.62|
|https://tipsavenu...|Oficjalna strona ...|  pl|  1.0|
|https://gitlab.co...|Snippets\nClone w...|null| null|
+--------------------+--------------------+----+-----+
only showing top 5 rows

time consume:0.17756080627441406s




## 6. MultiSentencePiece 分词

In [23]:
lm_df = lang_df.withColumn("tokenized", doSentencePiece("raw_content","lang"))
if(mode=="test"):
    print("=== TestMode Log:")
    s = time.time()
    lm_df.select("url","raw_content","lang","score","tokenized").show(5)
    e = time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:




+--------------------+--------------------+----+-----+--------------------+
|                 url|         raw_content|lang|score|           tokenized|
+--------------------+--------------------+----+-----+--------------------+
|https://www.anime...|No Bootleg Policy...|  en| 0.76|▁no ▁bootleg ▁pol...|
|https://www.campi...|Réfrigérateurs fi...|  fr| 0.93|▁refrigerateur s ...|
|http://www.petfoo...|How to Satisfy Ca...|  en| 0.62|▁how ▁to ▁satisfy...|
|https://tipsavenu...|Oficjalna strona ...|  pl|  1.0|▁oficjaln a ▁stro...|
|https://gitlab.co...|Snippets\nClone w...|null| null|                null|
+--------------------+--------------------+----+-----+--------------------+
only showing top 5 rows

time consume:0.4362320899963379s


## 7. 困惑度

In [25]:
doclm_df = lm_df.withColumn("perplexity", doDocLM("tokenized","lang"))
if(mode=="test"):
    print("=== TestMode Log:")
    s = time.time()
    doclm_df.select("url","raw_content","lang","score","tokenized","perplexity").show(5)
    e = time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:




+--------------------+--------------------+----+-----+--------------------+----------+
|                 url|         raw_content|lang|score|           tokenized|perplexity|
+--------------------+--------------------+----+-----+--------------------+----------+
|https://www.anime...|No Bootleg Policy...|  en| 0.76|▁no ▁bootleg ▁pol...|    1466.7|
|https://www.campi...|Réfrigérateurs fi...|  fr| 0.93|▁refrigerateur s ...|    1108.2|
|http://www.petfoo...|How to Satisfy Ca...|  en| 0.62|▁how ▁to ▁satisfy...|    1324.0|
|https://tipsavenu...|Oficjalna strona ...|  pl|  1.0|▁oficjaln a ▁stro...|     210.4|
|https://gitlab.co...|Snippets\nClone w...|null| null|                null|      null|
+--------------------+--------------------+----+-----+--------------------+----------+
only showing top 5 rows

time consume:0.7668919563293457s


                                                                                

## 8. PerplexityBucket

In [27]:
bucket_df = doclm_df.withColumn("bucket", doPPBucket("perplexity","lang"))
if(mode=="test"):
    print("=== TestMode Log:")
    s = time.time()
    bucket_df.select("url","raw_content","lang","score","tokenized","perplexity","bucket").show(50)
    e = time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:


[Stage 46:>                                                         (0 + 1) / 1]

+--------------------+---------------------------------+----+-----+-----------------------------+----------+------+
|                 url|                      raw_content|lang|score|                    tokenized|perplexity|bucket|
+--------------------+---------------------------------+----+-----+-----------------------------+----------+------+
|https://www.anime...|             No Bootleg Policy...|  en| 0.76|         ▁no ▁bootleg ▁pol...|    1466.7|  tail|
|https://www.campi...|             Réfrigérateurs fi...|  fr| 0.93|         ▁refrigerateur s ...|    1108.2|  tail|
|http://www.petfoo...|             How to Satisfy Ca...|  en| 0.62|         ▁how ▁to ▁satisfy...|    1324.0|  tail|
|https://tipsavenu...|             Oficjalna strona ...|  pl|  1.0|         ▁oficjaln a ▁stro...|     210.4|middle|
|https://gitlab.co...|             Snippets\nClone w...|null| null|                         null|      null|   all|
|http://www.palosv...|             Calculator source...|  en| 0.77|     

                                                                                

## 9. dropKeys

In [None]:
drop_df = bucket_df.drop("tokenized")
if(mode=="test"):
    print("=== TestMode Log:")
    s = time.time()
    print(drop_df.summary())
    e = time.time()
    print(f"time consume:{e-s}s")

## 10. split by lang

In [None]:
if(mode=="test"):
    print("=== TestMode Log:")
    s = time.time()
    selected_df = drop_df.filter((drop_df.lang == "en") & (drop_df.bucket == "head"))
    # selected_df.select("url","raw_content","lang","bucket").show(50)
    result_pdf = selected_df.select("*").toPandas()
    e = time.time()
    print(f"time consume:{e-s}s")


## 11. save

In [None]:
result_pdf

In [None]:
drop_df.show()
