# ccnet spark pipeline 实现

## 1. 导入依赖

In [1]:
from ccnet_spark import open_read, parse_warc_file,compute_hashes,NaiveHashSet, text_normalizer
from pathlib import Path
import numpy as np
import time
import pandas as pd
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType,IntegerType,StructType, StructField
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, explode
from pyspark.sql.functions import sum as spark_sum
from cachetools import cached ### model 缓存

# 初始化 SparkSession
spark = SparkSession.builder.appName("CCNETSpark")  \
                    .config("spark.executor.memory", "100g") \
                    .config("spark.driver.memory", "32g") \
                    .config("spark.driver.maxResultSize", "32g") \
                    .config('spark.sql.execution.arrow.enabled', 'true') \
                    .getOrCreate()
sc = spark.sparkContext

24/04/01 17:40:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## 2. 读取文件数据，处理成pandas DataFrame

### 2.1 获取cache文件路径

In [2]:
cache_data="../cache_data/2019-09/"
def getWETURL(segment: int):
    cache_file_prefix = "CC-MAIN-20190215183319-20190215205319-"
    cache_file_sufix = ".warc.wet.gz"
    segment_str = str(segment).zfill(5)  # Pad with leading zeros
    return cache_data+cache_file_prefix + segment_str + cache_file_sufix
url = getWETURL(3)
print(url)  # Output: CC-MAIN-20190215183319-20190215205319-00003.warc.wet.gz

../cache_data/2019-09/CC-MAIN-20190215183319-20190215205319-00003.warc.wet.gz


### 2.2 处理文件，存入pandas DataFrame

In [3]:
def getpdf(segment,isPart:bool):
    file_path=Path(getWETURL(segment))
    file=open_read(file_path)
    s=time.time()
    pandas_df = parse_warc_file(file, 30)
    if(isPart):
        random_save_n=100
        pandas_df = pandas_df.sample(n=random_save_n, random_state=1)
    e=time.time()
    print(f"====== parse segment:{segment} to pd_df consume:{e-s} s")
    return pandas_df

## 3. 读取 spark dataframe 文件

In [4]:
def getsdf(segment,isPart:bool):
    inner_path = "_part" if isPart else "_all"
    output_path = cache_data+"cache_parquet/"+str(segment)+  inner_path +".parquet"  # 设置输出路径
    # 检查本地文件是否存在
    if not os.path.exists(output_path):
        print(f"======process to parquet of segment {segment}{inner_path}")
        # 处理文件并生成 Spark DataFrame
        pdf = getpdf(segment,isPart=isPart)
        pdf.to_parquet(output_path)  # 保存为 parquet 文件
        spark_df = spark.createDataFrame(pdf)
    else:
        print(f"======read parquet of segment {segment}{inner_path} from cache")
        pdf = pd.read_parquet(output_path)
        spark_df = spark.createDataFrame(pdf)
    return spark_df
def getsdfs(segments,isPart:bool = False):
    merged_sdf=None
    for seg in segments:
        if(merged_sdf):
            merged_sdf = merged_sdf.unionAll(getsdf(seg,isPart)) # Merge DataFrames
        else:
            merged_sdf = getsdf(seg,isPart)
    return merged_sdf

### 3.1 load spark DataFrame

In [5]:
def getModePara(mode):
    if(mode=="test"):
        para={
            "isTest":True,
            "isPart":True,
            "segments":5,
        }
        return para
    else:
        para={
            "isTest":False,
            "isPart":False,
            "segments":40,
        }
        return para

In [6]:
mode="test"
mode_para=getModePara(mode)
segments=[i for i in range(mode_para["segments"])]
isPart=mode_para["isPart"]

In [7]:
s=time.time()
spark_df = getsdfs(segments,isPart=isPart)
num_docs=spark_df.count()
e=time.time()
print(f"load {len(segments)} segments,with {num_docs} docs,comsume:{e-s}s")



24/04/01 17:40:30 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
24/04/01 17:40:30 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
24/04/01 17:40:30 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


load 5 segments,with 5000 docs,comsume:3.980591297149658s


### 3.2 字段分析
1. wet 文件本身带有长度："length": length,这个是从wet的"Content-Length:"读出来的，和我计算len(raw_content）有出入。考虑原因是原先的length不只是说raw_content，还包括title等。

In [8]:
if(mode_para["isTest"]):
    print("=== TestMode Log:")
    s=time.time()
    print(spark_df.summary())
    tmp_df = spark_df.withColumn("compute_length", F.length(spark_df["raw_content"]))
    tmp_df.select("url","length","nlines","compute_length").show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:
DataFrame[summary: string, url: string, date_download: string, digest: string, length: string, nlines: string, source_domain: string, title: string, raw_content: string]
+--------------------+------+------+--------------+
|                 url|length|nlines|compute_length|
+--------------------+------+------+--------------+
|https://www.telel...|  4758|   111|          4669|
|http://www.ma.hu/...|  4180|    70|          3716|
|http://angagement...|  1326|    65|          1231|
|http://resistther...|   912|    23|           868|
|http://klimadiagr...|  1918|    51|          1851|
+--------------------+------+------+--------------+
only showing top 5 rows

time consume:0.6367359161376953s


### 3.3 修改length

In [9]:
spark_df=spark_df.withColumn("length", F.length(spark_df["raw_content"]))

## 4. hash计算

### 4.1 定义UDF,将doc 分割成paragraph 

In [10]:
# 定义一个函数，用于分割文本
def split_raw_content(content):
    lines = content.split('\n')
    line_ids = range(0, len(lines))  # 生成行号
    return list(zip(line_ids, lines))

# 注册为UDF
split_udf = udf(split_raw_content, ArrayType(StructType([
    StructField("raw_line_id", IntegerType(), False),
    StructField("raw_line", StringType(), False)
])))

### 4.2 udf 处理添加新字段

In [11]:
# 假设spark_df是您的DataFrame
# 使用UDF对raw_content字段进行处理
split_result = spark_df.withColumn("split_content", split_udf(spark_df["raw_content"]))
if(mode_para["isTest"]):
    print("=== TestMode Log:")
    s=time.time()
    print(split_result.summary())
    split_result.select("url","length","nlines","raw_content","split_content").show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:
DataFrame[summary: string, url: string, date_download: string, digest: string, length: string, nlines: string, source_domain: string, title: string, raw_content: string]
+--------------------+------+------+--------------------+--------------------+
|                 url|length|nlines|         raw_content|       split_content|
+--------------------+------+------+--------------------+--------------------+
|https://www.telel...|  4669|   111|English\tEnglish\...|[{0, English\tEng...|
|http://www.ma.hu/...|  3716|    70|hirdetés\nma.hu n...|[{0, hirdetés}, {...|
|http://angagement...|  1231|    65|→ по-русски\nCost...|[{0, → по-русски}...|
|http://resistther...|   868|    23|Unwanted Resistan...|[{0, Unwanted Res...|
|http://klimadiagr...|  1851|    51|Das Klima in Karl...|[{0, Das Klima in...|
+--------------------+------+------+--------------------+--------------------+
only showing top 5 rows

time consume:1.0078210830688477s


### 4.3 将新字段展开获取paragraph级别row

In [12]:
# Explode the split_content column and select the desired columns
exploded_df = split_result.select("url","date_download","digest","length","nlines","source_domain","title","raw_content", explode(split_result.split_content).alias("exploded_content"))

# Split the exploded_content struct into separate columns
exploded_df = exploded_df.withColumn("raw_line_id", exploded_df.exploded_content.raw_line_id)
exploded_df = exploded_df.withColumn("raw_line", exploded_df.exploded_content.raw_line)

# Drop the exploded_content column if needed
exploded_df = exploded_df.drop("exploded_content")

if(mode_para["isTest"]):
    exploded_df.cache()
    print("=== TestMode Log:")
    s=time.time()
    print(exploded_df.summary())
    exploded_df.select("url","raw_content","raw_line_id","raw_line").show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

24/04/01 17:40:35 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


=== TestMode Log:
DataFrame[summary: string, url: string, date_download: string, digest: string, length: string, nlines: string, source_domain: string, title: string, raw_content: string, raw_line_id: string, raw_line: string]
+--------------------+--------------------+-----------+------------------------+
|                 url|         raw_content|raw_line_id|                raw_line|
+--------------------+--------------------+-----------+------------------------+
|https://www.telel...|English\tEnglish\...|          0|    English\tEnglish\ten|
|https://www.telel...|English\tEnglish\...|          1|繁體中文\tChinese (Tr...|
|https://www.telel...|English\tEnglish\...|          2|                    Home|
|https://www.telel...|English\tEnglish\...|          3|                Products|
|https://www.telel...|English\tEnglish\...|          4|              Digital TV|
+--------------------+--------------------+-----------+------------------------+
only showing top 5 rows

time consume:1.00459671

### 4.4 添加hash 列

In [13]:
import hashlib
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from ccnet_spark import normalize_for_dedup
from typing import Iterable, Iterator, Sequence, Sized, Tuple, Type
HASH_TYPE: Type[np.uint64] = np.uint64
HASH_SIZE = HASH_TYPE(0).nbytes 
print(f"HASH_SIZE:{HASH_SIZE}") # 8 Byte ==> 64bit
@udf(returnType=BinaryType())
def compute_hashes(line):
    if not line:
        return None
    normalized_line = normalize_for_dedup(line)  # Assuming normalize_for_dedup is defined
    line_hash = hashlib.sha1(bytes(normalized_line, encoding="utf-8")).digest()[:HASH_SIZE]
    return line_hash

# Assuming you have a dataframe named 'df' with a 'raw_line' column
hash_df = exploded_df.withColumn("hash_value", compute_hashes(exploded_df.raw_line))

# Show the resulting dataframe
if(mode_para["isTest"]):
    print("=== TestMode Log:")
    s=time.time()
    print(hash_df.summary())
    hash_df.show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

HASH_SIZE:8
=== TestMode Log:
DataFrame[summary: string, url: string, date_download: string, digest: string, length: string, nlines: string, source_domain: string, title: string, raw_content: string, raw_line_id: string, raw_line: string]
+--------------------+--------------------+--------------------+------+------+----------------+--------------------+--------------------+-----------+------------------------+--------------------+
|                 url|       date_download|              digest|length|nlines|   source_domain|               title|         raw_content|raw_line_id|                raw_line|          hash_value|
+--------------------+--------------------+--------------------+------+------+----------------+--------------------+--------------------+-----------+------------------------+--------------------+
|https://www.telel...|2019-02-15T19:35:48Z|sha1:VZYTYZZ7EH6E...|  4669|   111|www.telelynx.com|sean, Author at T...|English\tEnglish\...|          0|    English\tEnglish\ten

### 4.5根据 hash 去重

In [14]:
deduplicated_df = hash_df.dropDuplicates(['hash_value'])
# Show the resulting dataframe
if(mode_para["isTest"]):
    print("=== TestMode Log:")
    deduplicated_df.cache()
    s=time.time()
    print(deduplicated_df.summary())
    deduplicated_df.select("url","length","nlines","raw_content","raw_line_id","hash_value").show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:
DataFrame[summary: string, url: string, date_download: string, digest: string, length: string, nlines: string, source_domain: string, title: string, raw_content: string, raw_line_id: string, raw_line: string]


24/04/01 17:40:37 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
24/04/01 17:40:37 WARN TaskSetManager: Stage 7 contains a task of very large size (1316 KiB). The maximum recommended task size is 1000 KiB.
[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+------+------+------------------------------+-----------+--------------------+
|                 url|length|nlines|                   raw_content|raw_line_id|          hash_value|
+--------------------+------+------+------------------------------+-----------+--------------------+
|http://www.region...|  5765|   213|          Salta al contenut...|         22|[00 00 B7 6B E5 F...|
|http://www.darulh...| 28049|   839|          Slå på/av meny\nD...|        738|[00 02 2B BA 78 8...|
|http://outdoormag...|  5695|   156|          O nas\nRedakcja\n...|        128|[00 02 FD 23 BA F...|
|http://stk1031.bl...|  4265|   246|きまぐれあれやこれ\n2018年5...|        167|[00 06 62 88 AC 5...|
|https://jakandjil...|  3258|   305|          A moda está em tu...|         21|[00 08 83 89 3F 2...|
+--------------------+------+------+------------------------------+-----------+--------------------+
only showing top 5 rows

time consume:35.5981240272522s


                                                                                

### 4.6 聚合
将段落重新聚合为doc

In [15]:
from pyspark.sql import functions as F

"url","date_download","digest","length","nlines","source_domain","title","raw_content",
group_df = deduplicated_df.groupBy("digest").agg(
    F.first("url").alias("url"),
    F.first("date_download").alias("date_download"),
    F.first("source_domain").alias("source_domain"),
    F.first("length").alias("original_length"),
    F.first("nlines").alias("original_nlines"),
    F.first("title").alias("title"),
    F.concat_ws("\n", F.collect_list("raw_line").alias("raw_content")).alias("raw_content"),
    F.count("raw_line_id").alias("nlines"),
    F.collect_list("raw_line_id").alias("line_ids"),
)
group_df=group_df.withColumn("length", F.length(group_df["raw_content"]))
if(mode_para["isTest"]):
    print("=== TestMode Log:")
    group_df.cache()
    s=time.time()
    group_df.select("url","original_length","original_nlines","raw_content","length","nlines").show(5)
    e=time.time()
    print(f"time consume:{e-s}s")

24/04/01 17:41:13 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


=== TestMode Log:


24/04/01 17:41:21 WARN MemoryStore: Not enough space to cache rdd_124_7 in memory! (computed 670.4 MiB so far)
24/04/01 17:41:21 WARN MemoryStore: Not enough space to cache rdd_124_5 in memory! (computed 681.7 MiB so far)
24/04/01 17:41:21 WARN BlockManager: Persisting block rdd_124_5 to disk instead.
24/04/01 17:41:21 WARN BlockManager: Persisting block rdd_124_7 to disk instead.
24/04/01 17:41:21 WARN MemoryStore: Not enough space to cache rdd_124_9 in memory! (computed 674.1 MiB so far)
24/04/01 17:41:21 WARN BlockManager: Persisting block rdd_124_9 to disk instead.

+--------------------+---------------+---------------+---------------------------+------+------+
|                 url|original_length|original_nlines|                raw_content|length|nlines|
+--------------------+---------------+---------------+---------------------------+------+------+
|http://mylandia.r...|           1671|             88|       Отдушка Лунный цв...|  1143|    42|
|http://www.recenz...|           4757|             46|       Tym, co niesamowi...|  4600|    35|
|http://kuraruk.ha...|           1435|            100|9月の旅：東京編\nKYOTO (...|  1062|    66|
|https://www.govtj...|          15891|            386|       AP Fire Departmen...| 14328|   316|
|http://www.millyb...|          23563|           1007|       View Shopping Car...| 15909|   250|
+--------------------+---------------+---------------+---------------------------+------+------+
only showing top 5 rows

time consume:11.720622539520264s


                                                                                

### 4.7 计算留存比例

In [16]:
if mode_para["isTest"]:
    print("=== TestMode Log:")
    s = time.time()
    origin_chars = spark_df.agg(spark_sum("length")).collect()[0][0]
    remain_chars = group_df.agg(spark_sum("length")).collect()[0][0]
    e = time.time()
    print(f"origin chars:{origin_chars/1000/1000}M,remain_chars:{remain_chars/1000/1000}M \n \
            keep chars:{round(remain_chars/origin_chars*100,3)} % time consume:{e-s}")
else:
    print("=== DevMode Log:")
    s = time.time()
    origin_chars = spark_df.agg(spark_sum("length")).collect()[0][0]
    remain_chars = group_df.agg(spark_sum("length")).collect()[0][0]
    e = time.time()
    print(f"origin chars:{origin_chars/1000/1000}M,remain_chars:{remain_chars/1000/1000}M \n \
            keep chars:{round(remain_chars/origin_chars*100,3)} % time consume:{e-s}s")

=== TestMode Log:
origin chars:30.588446M,remain_chars:24.302366999999997M 
             keep chars:79.449 % time consume:0.6132962703704834


## 5. 语言识别导入

In [17]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, FloatType

import fasttext  # type: ignore

@cached(cache={})
def getFastTextModel():
    model_path = "models/fasttext/lid.bin"
    fasttext_model = fasttext.load_model(model_path)
    return fasttext_model
def predict(model, text: str, k: int = 1):
    labels, scores = model.predict(text, k=k)
    labels = [l.replace("__label__", "") for l in labels]
    return labels, scores

@udf(returnType=StringType())
def predictLang(text):
    if not text:
        return None
    labels, scores = predict(getFastTextModel(), text.replace("\n", ""), k=1)
    scores.round(2, out=scores)
    lang = labels[0]
    score = scores[0]
    if score < 0.5:
        return None
    return lang
@udf(returnType=FloatType())
def predictScore(text):
    if not text:
        return None
    labels, scores = predict(getFastTextModel(), text.replace("\n", ""), k=1)
    scores.round(2, out=scores)
    lang = labels[0]
    score = scores[0]
    if score < 0.5:
        return None
    return float(score)
lang_df = group_df.withColumn("lang", predictLang("raw_content"))
lang_df = lang_df.withColumn("score", predictScore("raw_content"))

if mode_para["isTest"]:
    print("=== TestMode Log:")
    s = time.time()
    lang_df.select("url","raw_content","lang","score").show(5)
    e = time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:


[Stage 24:>                                                         (0 + 1) / 1]

+--------------------+---------------------------+----+-----+
|                 url|                raw_content|lang|score|
+--------------------+---------------------------+----+-----+
|http://mylandia.r...|       Отдушка Лунный цв...|  ru| 0.98|
|http://www.recenz...|       Tym, co niesamowi...|  pl|  1.0|
|http://kuraruk.ha...|9月の旅：東京編\nKYOTO (...|  ja|  1.0|
|https://www.govtj...|       AP Fire Departmen...|  en| 0.73|
|http://www.millyb...|       View Shopping Car...|  en| 0.59|
+--------------------+---------------------------+----+-----+
only showing top 5 rows

time consume:3.5569612979888916s


                                                                                

## 6. MultiSentencePiece 分词

In [18]:
from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Tuple
import sentencepiece  # type: ignore
lm_dir: Path = Path("../cc_net/data/lm_sp")

def get_lm_languages() -> Sequence[str]:
    languages = [m.name.split(".")[0] for m in lm_dir.glob("*.arpa.bin")]
    return languages

@cached(cache={})
def getLMModel(lang):
    models={l: lm_dir / f"{l}.sp.model" for l in get_lm_languages()}
    lms=get_lm_languages()
    if(lms is None or lang not in lms):
        return None
    sp = sentencepiece.SentencePieceProcessor()
    sp.load(str(models[lang]))
    return sp

@udf(returnType=StringType())
def doSentencePiece(text,lang):
    if text is None or lang is None:
        return None
    text = text_normalizer.normalize(text)
    sp = getLMModel(lang)
    if sp is None:
        return None
    tokenized = sp.encode_as_pieces(text)
    return " ".join(tokenized)

In [29]:
lm_df = lang_df.withColumn("tokenized", doSentencePiece("raw_content","lang"))
if mode_para["isTest"]:
    print("=== TestMode Log:")
    s = time.time()
    lm_df.select("url","raw_content","lang","score","tokenized").show(5)
    e = time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:




+--------------------+---------------------------+----+-----+--------------------------+
|                 url|                raw_content|lang|score|                 tokenized|
+--------------------+---------------------------+----+-----+--------------------------+
|http://mylandia.r...|       Отдушка Лунный цв...|  ru| 0.98|      ▁от душ ка ▁лунны...|
|http://www.recenz...|       Tym, co niesamowi...|  pl|  1.0|      ▁tym , ▁co ▁nie s...|
|http://kuraruk.ha...|9月の旅：東京編\nKYOTO (...|  ja|  1.0|▁ 0 月 の旅 : 東京 編 k...|
|https://www.govtj...|       AP Fire Departmen...|  en| 0.73|      ▁ap ▁fire ▁depart...|
|http://www.millyb...|       View Shopping Car...|  en| 0.59|      ▁view ▁shopping ▁...|
+--------------------+---------------------------+----+-----+--------------------------+
only showing top 5 rows

time consume:0.6076529026031494s


## 7. 困惑度

In [20]:
lm_dir: Path = Path("../cc_net/data/lm_sp")
import kenlm  # type: ignore

@cached(cache={})
def getDocLMModel(lang):
    models={l: lm_dir / f"{l}.arpa.bin" for l in get_lm_languages()}
    lms=get_lm_languages()
    if(lms is None or lang not in lms):
        return None
    lm_config = kenlm.Config()
    lm_config.load_method = 2
    lm = kenlm.Model(str(models[lang]), lm_config)
    return lm
def pp(log_score, length):
    return 10.0 ** (-log_score / length)
@udf(returnType=FloatType())
def doDocLM(text,lang):
    if text is None or lang is None:
        return None
    model = getDocLMModel(lang)
    if model is None:
        return None
    lines = text.split("\n")

    doc_log_score, doc_length = 0, 0
    for line in lines:
        log_score = model.score(line)
        length = len(line.split()) + 1
        doc_log_score += log_score
        doc_length += length
    return round(pp(doc_log_score, doc_length), 1)
doclm=getDocLMModel("en")

In [27]:
doclm_df = lm_df.withColumn("perplexity", doDocLM("tokenized","lang"))
if mode_para["isTest"]:
    print("=== TestMode Log:")
    s = time.time()
    doclm_df.select("url","raw_content","lang","score","tokenized","perplexity").show(5)
    e = time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:


[Stage 48:>                                                         (0 + 1) / 1]

+--------------------+---------------------------+----+-----+--------------------------+----------+
|                 url|                raw_content|lang|score|                 tokenized|perplexity|
+--------------------+---------------------------+----+-----+--------------------------+----------+
|http://mylandia.r...|       Отдушка Лунный цв...|  ru| 0.98|      ▁от душ ка ▁лунны...|     719.4|
|http://www.recenz...|       Tym, co niesamowi...|  pl|  1.0|      ▁tym , ▁co ▁nie s...|     238.7|
|http://kuraruk.ha...|9月の旅：東京編\nKYOTO (...|  ja|  1.0|▁ 0 月 の旅 : 東京 編 k...|    1426.7|
|https://www.govtj...|       AP Fire Departmen...|  en| 0.73|      ▁ap ▁fire ▁depart...|    3504.9|
|http://www.millyb...|       View Shopping Car...|  en| 0.59|      ▁view ▁shopping ▁...|    1817.6|
+--------------------+---------------------------+----+-----+--------------------------+----------+
only showing top 5 rows

time consume:0.940995454788208s


                                                                                

## 8. PerplexityBucket

In [22]:
cutoff_csv = "../cc_net/cc_net/" + "data/" + "cutoff.csv"
percentile_head: int = 30
percentile_tail: int = 60
cutoffs = pd.read_csv(cutoff_csv, index_col=0)
cutoffs = {
    l: (cutoffs[l][percentile_head], cutoffs[l][percentile_tail])
    for l in cutoffs.columns
}

@udf(returnType=StringType())
def doPPBucket(perplexity,lang):
    if (perplexity is None):
        perplexity = -1
    if lang not in cutoffs or perplexity < 0:
        return "all"
    pp_head, pp_tail = cutoffs[lang]
    if perplexity < pp_head:
        return "head"
    if perplexity < pp_tail:
        return "middle"
    return "tail"

In [23]:
bucket_df = doclm_df.withColumn("bucket", doPPBucket("perplexity","lang"))
if mode_para["isTest"]:
    print("=== TestMode Log:")
    s = time.time()
    bucket_df.select("url","raw_content","lang","score","tokenized","perplexity","bucket").show(50)
    e = time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:


[Stage 33:>                                                         (0 + 1) / 1]

+--------------------+-------------------------------------+----+-----+------------------------------+----------+------+
|                 url|                          raw_content|lang|score|                     tokenized|perplexity|bucket|
+--------------------+-------------------------------------+----+-----+------------------------------+----------+------+
|http://mylandia.r...|                 Отдушка Лунный цв...|  ru| 0.98|          ▁от душ ка ▁лунны...|     719.4|  tail|
|http://www.recenz...|                 Tym, co niesamowi...|  pl|  1.0|          ▁tym , ▁co ▁nie s...|     238.7|middle|
|http://kuraruk.ha...|          9月の旅：東京編\nKYOTO (...|  ja|  1.0|    ▁ 0 月 の旅 : 東京 編 k...|    1426.7|middle|
|https://www.govtj...|                 AP Fire Departmen...|  en| 0.73|          ▁ap ▁fire ▁depart...|    3504.9|  tail|
|http://www.millyb...|                 View Shopping Car...|  en| 0.59|          ▁view ▁shopping ▁...|    1817.6|  tail|
|https://www.notti...|                 Nottin

                                                                                

## 9. dropKeys

In [24]:
drop_df = bucket_df.drop("tokenized")
if mode_para["isTest"]:
    print("=== TestMode Log:")
    s = time.time()
    print(drop_df.summary())
    e = time.time()
    print(f"time consume:{e-s}s")

=== TestMode Log:
DataFrame[summary: string, digest: string, url: string, date_download: string, source_domain: string, original_length: string, original_nlines: string, title: string, raw_content: string, nlines: string, length: string, lang: string, score: string, perplexity: string, bucket: string]
time consume:0.04165530204772949s


## 10. split by lang

In [25]:
if mode_para["isTest"]:
    print("=== TestMode Log:")
    s = time.time()
    selected_df = drop_df.filter((drop_df.lang == "en") & (drop_df.bucket == "head"))
    selected_df.select("url","raw_content","lang","bucket").show(50)
    e = time.time()
    print(f"time consume:{e-s}s")


=== TestMode Log:




+--------------------+--------------------+----+------+
|                 url|         raw_content|lang|bucket|
+--------------------+--------------------+----+------+
|https://www.stbal...|Grant Types\nThe ...|  en|  head|
|http://resourcere...|About Disease Inf...|  en|  head|
|https://gusto.com...|1. These R&D Term...|  en|  head|
|https://myownhome...|Reset Filter\nCal...|  en|  head|
|https://www.cultu...|Tagged with: #Mua...|  en|  head|
|https://wanttokno...|How Many Countrie...|  en|  head|
|http://www.honeys...|Changing World\nP...|  en|  head|
|http://www.kraken...|The Truth About t...|  en|  head|
|https://livinwith...|bandera endurance...|  en|  head|
|https://www.homet...|“We removed the s...|  en|  head|
|http://boxerlove....|Boxer early genea...|  en|  head|
|https://www.odesd...|ODES INDUSTRIES m...|  en|  head|
|https://www.bizka...|© Bizkaia:talent\...|  en|  head|
|http://www.nature...|Serious Themes\nT...|  en|  head|
|https://clbfundy....|SPARKS OF INSPIRA...|  en|

                                                                                