In [1]:
from ccnet_spark.text_normalizer import normalize
from ccnet_spark.pipe_preprocess import load_segments
from ccnet_spark.pipe_hash import compute_hashes,split_doc2para
from ccnet_spark.pipe_lid import predictLang,predictScore
from ccnet_spark.pipe_tokenized import doSentencePiece
from ccnet_spark.pipe_perplexity import doDocLM
from ccnet_spark.pipe_ppbucket import doPPBucket
from ccnet_spark.pipe_save import save_partation,load_partation
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import explode
from pyspark.sql.functions import sum as spark_sum

# Initialize the SparkSession.
# The builder sets the application name, executor/driver memory settings,
# the driver result-size limit, and switches on the Arrow PySpark setting.
spark = (
    SparkSession.builder
    .appName("CCNETSpark")
    .config("spark.executor.memory", "100g")
    .config("spark.driver.memory", "32g")
    .config("spark.driver.maxResultSize", "32g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)
# Keep a handle to the session's SparkContext.
sc = spark.sparkContext

24/04/02 14:26:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
def getModePara(mode):
    """Return the pipeline settings for the requested run mode.

    Args:
        mode: Run-mode name. ``"test"`` selects the test settings; any other
            value selects the default settings. Both sets currently hold the
            same values.

    Returns:
        A list ``[cache_folder, date, segments, min_len, isSample, sampleRate]``.
    """
    test_settings = {
        "cache_folder": "/root/wxl_folder/cache_data/",
        "date": "2019-09",  # hardcoded; currently the only supported value
        "segments": list(range(10)),
        "min_len": 300,
        "isSample": True,
        "sampleRate": 0.01,
    }
    default_settings = {
        "cache_folder": "/root/wxl_folder/cache_data/",
        "date": "2019-09",  # hardcoded; currently the only supported value
        "segments": list(range(10)),
        "min_len": 300,
        "isSample": True,
        "sampleRate": 0.01,
    }
    chosen = test_settings if mode == "test" else default_settings
    ordered_keys = ("cache_folder", "date", "segments", "min_len", "isSample", "sampleRate")
    return [chosen[key] for key in ordered_keys]
# Pick the run mode and unpack its settings into the names used by later cells.
mode = "test"
(
    cache_folder,
    date,
    segments,
    min_len,
    isSample,
    sampleRate,
) = getModePara(mode)

In [3]:
# Load the selected segments and record the character length of each document.
spark_df = load_segments(
    spark,
    segments,
    cache_folder,
    date=date,
    isSample=isSample,
    sampleRate=sampleRate,
    min_len=min_len,
)
spark_df = spark_df.withColumn("length", F.length(spark_df["raw_content"]))

# Split each document with split_doc2para and emit one row per element of the result.
split_result = spark_df.withColumn("split_content", split_doc2para(spark_df["raw_content"]))
exploded_df = split_result.withColumn("exploded_content", F.explode(split_result["split_content"]))

# Lift the two fields of each exploded element into top-level columns.
exploded_df = (
    exploded_df
    .withColumn("raw_line_id", exploded_df["exploded_content"]["raw_line_id"])
    .withColumn("raw_line", exploded_df["exploded_content"]["raw_line"])
    .drop("exploded_content")
)

# Hash every line and keep a single row per distinct hash value.
hash_df = exploded_df.withColumn("hash_value", compute_hashes(exploded_df["raw_line"]))
deduplicated_df = hash_df.dropDuplicates(["hash_value"])

2024-04-02 14:26 INFO 708667:ccnet_spark.pipe_preprocess - Opening /root/wxl_folder/cache_data/2019-09/CC-MAIN-20190215183319-20190215205319-00000.warc.wet.gz with mode 'rt'
2024-04-02 14:26 INFO 708667:root - Created DataFrame with 42618 documents
2024-04-02 14:26 INFO 708667:root - load segment 0, 426 docs, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 14:27 INFO 708667:root - load segment 0, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 14:27 INFO 708667:ccnet_spark.pipe_preprocess - Opening /root/wxl_folder/cache_data/2019-09/CC-MAIN-20190215183319-20190215205319-00001.warc.wet.gz with mode 'rt'
2024-04-02 14:27 INFO 708667:root - Created DataFrame with 41939 documents
2024-04-02 14:27 INFO 708667:root - load segment 1, 419 docs, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 14:27 INFO 708667:root - load segment 1, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-02 14:27 INFO 708667:ccnet_spark.pipe_preprocess - Opening /ro

In [4]:
# Rebuild one row per document (keyed by "digest") from the lines that
# survived deduplication.
#
# collect_list gives no ordering guarantee after a shuffle, so joining its raw
# output could scramble the lines of a document. The lines are therefore
# collected together with their raw_line_id and sorted before being joined,
# and line_ids is sorted the same way so both stay in ascending id order.
# NOTE(review): assumes raw_line_id has a type whose sort order matches the
# original line order (e.g. an integer) — confirm against split_doc2para.
group_df = deduplicated_df.groupBy("digest").agg(
    F.first("url").alias("url"),
    F.first("date_download").alias("date_download"),
    F.first("source_domain").alias("source_domain"),
    F.first("cc_segment").alias("cc_segment"),
    # Document-level values computed before the split are kept as "original_*".
    F.first("length").alias("original_length"),
    F.first("nlines").alias("original_nlines"),
    F.first("title").alias("title"),
    # Sort (raw_line_id, raw_line) pairs by id, then take the text field of
    # each pair and join the texts with newlines.
    F.concat_ws(
        "\n",
        F.sort_array(F.collect_list(F.struct("raw_line_id", "raw_line")))["raw_line"],
    ).alias("raw_content"),
    F.count("raw_line_id").alias("nlines"),
    F.sort_array(F.collect_list("raw_line_id")).alias("line_ids"),
)
# Length of the rebuilt (deduplicated) content.
group_df = group_df.withColumn("length", F.length(group_df["raw_content"]))

In [5]:
# Add the "lang" and "score" columns, both computed from "raw_content".
lang_df = (
    group_df
    .withColumn("lang", predictLang("raw_content"))
    .withColumn("score", predictScore("raw_content"))
)

# Add "tokenized" from the content and its language.
lm_df = lang_df.withColumn("tokenized", doSentencePiece("raw_content", "lang"))

# Add "perplexity" from the tokenized text and its language.
doclm_df = lm_df.withColumn("perplexity", doDocLM("tokenized", "lang"))

# Add "bucket" from the perplexity and language.
bucket_df = doclm_df.withColumn("bucket", doPPBucket("perplexity", "lang"))

# The tokenized text is not kept in the final frame.
drop_df = bucket_df.drop("tokenized")

In [6]:
# Hand the final DataFrame and the run settings to save_partation
# (presumably writes the partitioned output — see ccnet_spark.pipe_save).
save_partation(
    drop_df,
    cache_folder,
    date,
    isSample,
    sampleRate,
    min_len,
)

24/04/02 14:28:25 WARN TaskSetManager: Stage 10 contains a task of very large size (1223 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [7]:
# Choose one language/bucket pair and load it back through load_partation,
# passing the same run settings used earlier in the notebook.
selected_bucket = "head"
selected_lang = "en"
df_en_head = load_partation(
    spark,
    selected_lang,
    selected_bucket,
    cache_folder,
    date,
    isSample,
    sampleRate,
    min_len,
)

In [9]:
# Preview a few columns of the loaded partition, then print its row count.
df_en_head.select(
    "url",
    "raw_content",
    "perplexity",
    "length",
    "cc_segment",
).show()
print(df_en_head.count())

+--------------------+--------------------+----------+------+----------+
|                 url|         raw_content|perplexity|length|cc_segment|
+--------------------+--------------------+----------+------+----------+
|https://www.filmf...|Other festival fi...|      90.6|  2658|         5|
|https://phiquyenc...|Tham Khảo\nAt thi...|     282.8| 16135|         1|
|http://icetulip.c...|Man orders Kindle...|     225.7|376028|         8|
|http://www.djvinc...|DJVI has speciali...|     338.4|  1810|         7|
|https://mybeachge...|Property Inquiry\...|     207.0|  2701|         2|
|https://brightkit...|The English Law o...|     317.9| 10923|         9|
|https://arousingg...|Repeat forever.\n...|     336.5| 11391|         6|
|http://tatteredan...|I have not found ...|     307.5| 27630|         9|
|https://scrappera...|About Me!\nMy Cur...|     328.1|  2818|         7|
|https://ppmforums...|Blood color probl...|     189.1| 23599|         9|
|http://www.georgi...|This Agreement is...|     324