In [1]:
from ccnet_spark.text_normalizer import normalize
from ccnet_spark.pipe_preprocess import load_segments
from ccnet_spark.pipe_hash import compute_hashes,split_doc2para
from ccnet_spark.pipe_lid import predictLang
from ccnet_spark.pipe_tokenized import doSentencePiece
from ccnet_spark.pipe_perplexity import doDocLM
from ccnet_spark.pipe_ppbucket import doPPBucket
from ccnet_spark.pipe_save import save_partation,load_partation
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import explode
from pyspark.sql.functions import sum as spark_sum

# Initialize the SparkSession (reuses an already-running session if one exists).
spark = (
    SparkSession.builder.appName("CCNETSpark")
    # Memory sizing for the executors, the driver, and results collected to the driver.
    .config("spark.executor.memory", "100g")
    .config("spark.driver.memory", "32g")
    .config("spark.driver.maxResultSize", "32g")
    # Turn on Arrow-based data transfer for PySpark.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

24/04/03 10:49:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/04/03 10:49:49 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
def getModePara(mode):
    """Return the pipeline settings for the requested run mode.

    ``"test"`` selects the sampled run (10 segments, sample rate 0.01, one
    partition). Any other value selects the unsampled run (4 segments,
    sample rate 1, four partitions).

    Args:
        mode: Run mode name; only ``"test"`` is treated specially.

    Returns:
        list: ``[cache_folder, date, segments, min_len, isSample, sampleRate,
        num_partitions]``, in that order.
    """
    # Values shared by both modes.
    cache_folder = "/root/wxl_folder/cache_data/"
    date = "2019-09"  # hardcoded; currently this is the only supported value
    min_len = 300

    # Mode-specific values.
    if mode == "test":
        n_segments, isSample, sampleRate, num_partitions = 10, True, 0.01, 1
    else:
        n_segments, isSample, sampleRate, num_partitions = 4, False, 1, 4

    segments = list(range(n_segments))
    return [cache_folder, date, segments, min_len, isSample, sampleRate, num_partitions]
# Pick the run mode and unpack its settings into module-level names used by later cells.
mode = "test"
(
    cache_folder,
    date,
    segments,
    min_len,
    isSample,
    sampleRate,
    num_partitions,
) = getModePara(mode)

In [3]:
# Load the configured segments; sampling and minimum-length options are passed straight through.
spark_df = load_segments(
    spark,
    segments,
    cache_folder,
    date=date,
    isSample=isSample,
    sampleRate=sampleRate,
    min_len=min_len,
)
# Record the character length of the original document text.
spark_df = spark_df.withColumn("length", F.length("raw_content"))

# Split each document with split_doc2para, then emit one row per resulting element.
split_result = spark_df.withColumn("split_content", split_doc2para(spark_df["raw_content"]))
exploded_df = split_result.withColumn(
    "exploded_content", explode(split_result["split_content"])
).drop("split_content")

# Hash every raw_line and keep a single row per distinct hash value.
hash_df = exploded_df.withColumn(
    "hash_value", compute_hashes(exploded_df["exploded_content"]["raw_line"])
)
deduplicated_df = hash_df.dropDuplicates(["hash_value"])

2024-04-03 10:49 INFO 852135:root - load segment 0, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-03 10:49 INFO 852135:root - load segment 1, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-03 10:49 INFO 852135:root - load segment 2, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-03 10:49 INFO 852135:root - load segment 3, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-03 10:49 INFO 852135:root - load segment 4, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-03 10:49 INFO 852135:root - load segment 5, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-03 10:49 INFO 852135:root - load segment 6, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-03 10:49 INFO 852135:root - load segment 7, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-03 10:49 INFO 852135:root - load segment 8, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-03 10:49 INFO 852135:root - load segment 9, with sampleRate:1.0%,

In [4]:
# Rebuild one row per document (keyed by digest) from the lines that survived deduplication.
group_df = (
    deduplicated_df.groupBy("digest")
    .agg(
        # Document-level metadata: take the first value seen within each group.
        F.first("url").alias("url"),
        F.first("date_download").alias("date_download"),
        F.first("source_domain").alias("source_domain"),
        F.first("cc_segment").alias("cc_segment"),
        # Pre-deduplication size, kept under "original_*" names.
        F.first("length").alias("original_length"),
        F.first("nlines").alias("original_nlines"),
        F.first("title").alias("title"),
        # Number of lines remaining after deduplication.
        F.count("exploded_content.raw_line_id").alias("nlines"),
        # NOTE(review): sort_array orders the structs by their fields; presumably
        # raw_line_id is the leading field so original line order is restored -- confirm.
        F.sort_array(F.collect_list("exploded_content")).alias("exploded_content"),
    )
    # Join the remaining lines back into a single text blob.
    .withColumn("raw_content", F.concat_ws("\n", "exploded_content.raw_line"))
    # Keep the ids of the lines that were retained.
    .withColumn("raw_line_id", F.col("exploded_content")["raw_line_id"])
    # Length of the rebuilt text; the struct array is no longer needed afterwards.
    .withColumn("length", F.length("raw_content"))
    .drop("exploded_content")
)

In [5]:
# Language identification: predictLang yields a struct that is flattened into "lang" and "score".
lang_df = group_df.withColumn("lang_score", predictLang("raw_content"))
lang_df = (
    lang_df.withColumn("lang", lang_df["lang_score"]["lang"])
    .withColumn("score", lang_df["lang_score"]["score"])
    .drop("lang_score")
)

# Tokenize per language, score the tokenized text, then assign a bucket from the perplexity.
lm_df = lang_df.withColumn("tokenized", doSentencePiece("raw_content", "lang"))
doclm_df = lm_df.withColumn("perplexity", doDocLM("tokenized", "lang"))
bucket_df = doclm_df.withColumn("bucket", doPPBucket("perplexity", "lang"))

# The tokenized text is only an intermediate; drop it from the final frame.
drop_df = bucket_df.drop("tokenized")

In [6]:
# Persist the processed frame; the run settings are passed along to save_partation.
save_partation(
    drop_df,
    cache_folder,
    date,
    isSample,
    sampleRate,
    min_len,
)

                                                                                

In [7]:
# Read back one saved slice: language "en", bucket "head".
selected_bucket = "head"
selected_lang = "en"
df_en_head = load_partation(
    spark,
    selected_lang,
    selected_bucket,
    cache_folder,
    date,
    isSample,
    sampleRate,
    min_len,
)

In [9]:
# Preview the loaded slice (first 20 rows, long cell values truncated -- the show() defaults).
df_en_head.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+----------+---------------+---------------+--------------------+------+--------------------+--------------------+------+-----+----------+
|              digest|                 url|       date_download|       source_domain|cc_segment|original_length|original_nlines|               title|nlines|         raw_content|         raw_line_id|length|score|perplexity|
+--------------------+--------------------+--------------------+--------------------+----------+---------------+---------------+--------------------+------+--------------------+--------------------+------+-----+----------+
|sha1:6G2HUH7IWO3Z...|https://www.filmf...|2019-02-15T18:56:31Z| www.filmfestival.be|         5|           2987|             73|Archived: Black j...|    44|8 > 18 Oct 2019\n...|[0, 1, 5, 6, 12, ...|  2658| 0.66|      89.4|
|sha1:7WDMW36WBB53...|https://phiquyenc...|2019-02-15T19:46:35Z|   phiquyenchinh.org|         1|          18

In [8]:
# Narrow preview of the most relevant columns, followed by the total row count.
df_en_head.select(
    "url",
    "raw_content",
    "perplexity",
    "length",
    "cc_segment",
).show(n=20, truncate=True)
print(df_en_head.count())

+--------------------+--------------------+----------+------+----------+
|                 url|         raw_content|perplexity|length|cc_segment|
+--------------------+--------------------+----------+------+----------+
|https://www.filmf...|8 > 18 Oct 2019\n...|      89.4|  2658|         5|
|https://phiquyenc...|Phi Quyền Chính -...|     281.2| 16135|         1|
|http://icetulip.c...|Client Services\n...|     225.5|376034|         8|
|http://www.djvinc...|Home / Our Team /...|     333.7|  1820|         7|
|https://mybeachge...|Orange Beach\nGul...|     199.5|  2701|         2|
|https://brightkit...|This website uses...|     317.2| 10923|         9|
|https://arousingg...|← Civil War: Mr. ...|     335.6| 11391|         6|
|http://www.news-r...|China Lake\nPolic...|     335.2|   647|         6|
|http://tatteredan...|CHRISTMESS GIFT I...|     306.3| 27630|         9|
|https://ppmforums...|:: Home :: Get Ho...|     189.9| 23599|         9|
|http://www.georgi...|Call us now: 678-...|     323