In [1]:
from ccnet_spark.text_normalizer import normalize
from ccnet_spark.pipe_preprocess import load_segments
from ccnet_spark.pipe_hash import compute_hashes,split_doc2para
from ccnet_spark.pipe_lid import predictLang
from ccnet_spark.pipe_tokenized import doSentencePiece
from ccnet_spark.pipe_perplexity import doDocLM
from ccnet_spark.pipe_ppbucket import doPPBucket
from ccnet_spark.pipe_save import save_partation,load_partation
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import explode
from pyspark.sql.functions import sum as spark_sum
from pyspark.sql.functions import col
# Initialise the SparkSession used by every later cell.
spark = (
    SparkSession.builder.appName("CCNETSpark")
    .config("spark.executor.memory", "100g")
    .config("spark.driver.memory", "32g")
    .config("spark.driver.maxResultSize", "32g")
    # Enable Arrow for pandas <-> Spark conversions.
    .config('spark.sql.execution.arrow.pyspark.enabled', 'true')
    .getOrCreate()
)

ModuleNotFoundError: No module named 'ccnet_spark'

In [2]:
def getModePara(mode, cache_folder="/root/wxl_folder/cache_data/"):
    """Return the run settings for the requested mode.

    Args:
        mode: ``"test"`` selects the sampled configuration (10 segments,
            ``sampleRate`` 0.01, 1 partition). Any other value selects the
            unsampled configuration (4 segments, ``sampleRate`` 1,
            4 partitions).
        cache_folder: Cache directory returned as the first element. It
            defaults to the path that used to be hard-coded in both
            branches, so existing one-argument calls behave exactly as
            before.

    Returns:
        list: ``[cache_folder, date, segments, min_len, isSample,
        sampleRate, num_partitions]`` in that order, where ``segments`` is
        ``[0, 1, ..., n_segments - 1]``.
    """
    # Values shared by every mode.
    date = "2019-09"  # hard-coded: currently the only supported value
    min_len = 300

    # Only these four settings actually differ between the modes.
    if mode == "test":
        n_segments, isSample, sampleRate, num_partitions = 10, True, 0.01, 1
    else:
        n_segments, isSample, sampleRate, num_partitions = 4, False, 1, 4

    segments = list(range(n_segments))
    return [cache_folder, date, segments, min_len, isSample, sampleRate, num_partitions]
# Choose the run mode and unpack its settings into notebook-level names.
mode = "test"
(
    cache_folder,
    date,
    segments,
    min_len,
    isSample,
    sampleRate,
    num_partitions,
) = getModePara(mode)

In [3]:
# Load the selected segments and record the character length of each document.
spark_df = load_segments(
    spark,
    segments,
    cache_folder,
    date=date,
    isSample=isSample,
    sampleRate=sampleRate,
    min_len=min_len,
).withColumn("length", F.length("raw_content"))

# Split each document and emit one row per element of the split result.
# NOTE(review): split_doc2para presumably returns an array of structs with
# raw_line_id / raw_line fields (both are referenced below) -- confirm.
split_result = spark_df.withColumn(
    "split_content", split_doc2para(F.col("raw_content"))
)
exploded_df = split_result.withColumn(
    "exploded_content", F.explode("split_content")
).drop("split_content")

# Hash the raw_line field of every exploded row.
hash_df = exploded_df.withColumn(
    "hash_value", compute_hashes(F.col("exploded_content.raw_line"))
)


2024-04-07 09:29 INFO 981237:root - load segment 0, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-07 09:29 INFO 981237:root - load segment 1, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-07 09:29 INFO 981237:root - load segment 2, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-07 09:29 INFO 981237:root - load segment 3, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-07 09:29 INFO 981237:root - load segment 4, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-07 09:29 INFO 981237:root - load segment 5, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-07 09:29 INFO 981237:root - load segment 6, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-07 09:29 INFO 981237:root - load segment 7, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-07 09:29 INFO 981237:root - load segment 8, with sampleRate:1.0%,min_len:300,with date:2019-09
2024-04-07 09:29 INFO 981237:root - load segment 9, with sampleRate:1.0%,

In [4]:
# Option 1 (disabled): keep a single copy of each duplicated row.
# deduplicated_df = hash_df.dropDuplicates(['hash_value'])

# Option 2 (active): keep no copy at all of a duplicated row.
# Count rows per hash and retain only the hashes that occur more than once.
hash_counts = hash_df.groupBy("hash_value").count()
duplicate_counts = hash_counts.filter(F.col("count") > 1)
# The anti-join drops every row whose hash appears in the duplicated set.
deduplicated_df = hash_df.join(duplicate_counts, on="hash_value", how="left_anti")

In [5]:
# Preview the de-duplicated rows (defaults spelled out: 20 rows, truncated cells).
deduplicated_df.show(n=20, truncate=True)

                                                                                

+--------------------+--------------------+--------------------+--------------------+------+------+--------------+-------------------------------------+---------------------------------+----------+------------------------------+
|          hash_value|                 url|       date_download|              digest|length|nlines| source_domain|                                title|                      raw_content|cc_segment|              exploded_content|
+--------------------+--------------------+--------------------+--------------------+------+------+--------------+-------------------------------------+---------------------------------+----------+------------------------------+
|[51 1F 35 D0 30 B...|http://www.drperi...|2019-02-15T20:06:54Z|sha1:TETCOD7OGM66...|  3474|    94|www.drperi.com|宋志平到山东济南、烟台和青州交流调...|OA登录 人力资源系统 协同办公系...|         0| {0, OA登录 人力资源系统 协...|
|[14 29 D1 A8 69 C...|http://www.drperi...|2019-02-15T20:06:54Z|sha1:TETCOD7OGM66...|  3474|    94|www.drperi.com|宋志平到山东济南、

In [6]:
# Rebuild one row per document (keyed by digest) from the surviving rows.
group_df = (
    deduplicated_df.groupBy("digest")
    .agg(
        # Document-level metadata: take the first value seen in each group.
        F.first("url").alias("url"),
        F.first("date_download").alias("date_download"),
        F.first("source_domain").alias("source_domain"),
        F.first("cc_segment").alias("cc_segment"),
        # Keep the pre-deduplication length / line count under new names.
        F.first("length").alias("original_length"),
        F.first("nlines").alias("original_nlines"),
        F.first("title").alias("title"),
        # Number of rows that remain for this digest.
        F.count("exploded_content.raw_line_id").alias("nlines"),
        # Collect the structs and sort them ascending.
        # NOTE(review): ordering presumably follows raw_line_id as the
        # leading struct field -- confirm against split_doc2para's schema.
        F.sort_array(F.collect_list("exploded_content")).alias("exploded_content"),
    )
    # Join the remaining lines back into a newline-separated document.
    .withColumn("raw_content", F.concat_ws("\n", "exploded_content.raw_line"))
    # Keep the ids of the lines that survived.
    .withColumn("raw_line_id", F.col("exploded_content.raw_line_id"))
    # Recompute the length on the rebuilt text, then drop the helper column.
    .withColumn("length", F.length("raw_content"))
    .drop("exploded_content")
)

In [7]:
# Run predictLang on the text, then flatten its lang / score fields into
# top-level columns and discard the intermediate struct column.
lang_df = (
    group_df.withColumn("lang_score", predictLang("raw_content"))
    .withColumn("lang", F.col("lang_score.lang"))
    .withColumn("score", F.col("lang_score.score"))
    .drop("lang_score")
)

# Each stage below adds one column derived from the previous stage and "lang".
lm_df = lang_df.withColumn("tokenized", doSentencePiece("raw_content", "lang"))
doclm_df = lm_df.withColumn("perplexity", doDocLM("tokenized", "lang"))
bucket_df = doclm_df.withColumn("bucket", doPPBucket("perplexity", "lang"))

# The tokenized column is only needed to compute perplexity; drop it afterwards.
drop_df = bucket_df.drop("tokenized")

In [8]:
# Hand the final DataFrame to save_partation together with the run settings
# (the same settings are later passed to load_partation to read it back).
# NOTE(review): presumably writes under cache_folder, partitioned by
# lang / bucket -- confirm in ccnet_spark.pipe_save.
save_partation(drop_df,cache_folder,date,isSample,sampleRate,min_len)

                                                                                

In [9]:
# Choose which language / bucket slice to read back from the saved output.
selected_lang = "en"
selected_bucket = "head"
df_en_head = load_partation(
    spark,
    selected_lang,
    selected_bucket,
    cache_folder,
    date,
    isSample,
    sampleRate,
    min_len,
)

In [10]:
# Preview the loaded slice (defaults spelled out: 20 rows, truncated cells).
df_en_head.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+----------+---------------+---------------+--------------------+------+--------------------+--------------------+------+-----+----------+
|              digest|                 url|       date_download|       source_domain|cc_segment|original_length|original_nlines|               title|nlines|         raw_content|         raw_line_id|length|score|perplexity|
+--------------------+--------------------+--------------------+--------------------+----------+---------------+---------------+--------------------+------+--------------------+--------------------+------+-----+----------+
|sha1:3MY7PKVVTUN2...|http://www.cactus...|2019-02-15T19:12:15Z| www.cactusranch.com|         8|           1542|             25|          On Gilling|    19|On Gilling\nPoste...|[0, 2, 3, 4, 5, 6...|  1381| 0.99|     339.1|
|sha1:6G2HUH7IWO3Z...|https://www.filmf...|2019-02-15T18:56:31Z| www.filmfestival.be|         5|           2

In [11]:
# Show only the columns of interest, then print the total row count.
preview_columns = ["url", "raw_content", "perplexity", "length", "cc_segment"]
df_en_head.select(*preview_columns).show()
print(df_en_head.count())

+--------------------+--------------------+----------+------+----------+
|                 url|         raw_content|perplexity|length|cc_segment|
+--------------------+--------------------+----------+------+----------+
|http://www.cactus...|On Gilling\nPoste...|     339.1|  1381|         8|
|https://www.filmf...|8 > 18 Oct 2019\n...|      72.4|  2526|         5|
|https://phiquyenc...|Phi Quyền Chính -...|     279.6| 15952|         1|
|http://icetulip.c...|Client Services\n...|     225.5|376028|         8|
|http://www.djvinc...|Home / Our Team /...|     326.8|  1785|         7|
|https://mybeachge...|Orange Beach\nGul...|     188.9|  2650|         2|
|https://brightkit...|This website uses...|     320.9| 10836|         9|
|https://arousingg...|Civil War: Mr. Fa...|     335.3| 11257|         6|
|https://www.waiko...|Sunday Worship Op...|      78.6|   317|         7|
|http://dedicatedd...|Dedicated Dental ...|     312.3|  4250|         3|
|https://www.killc...|Military Issue\nH...|     280