In [1]:
import re
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session for this job.
# Executors get 2g of memory and 3 cores each; dynamic allocation starts with
# 3 executors and is bounded to the range 3..9.
spark = (
    SparkSession.builder
    .appName("Process dataset with final configv2")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "3")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "3")
    .config("spark.dynamicAllocation.maxExecutors", "9")
    .config("spark.dynamicAllocation.initialExecutors", "3")
    .getOrCreate()
)

# Low-level context, used below for the RDD-based text read.
sc = spark.sparkContext

In [5]:
# Raw Reddit dump on HDFS, read line by line.
input_path = "hdfs://namenode:9000/data/reddit_input.txt"

# minPartitions asks Spark to split the input into at least 24 partitions.
raw_rdd = sc.textFile(
    input_path,
    minPartitions=24,
)

In [6]:
def extract_link_info(link):
    """Pull the subreddit, post id and comment id out of a comment permalink.

    Only links that start with ``https://old.reddit.com/r/<sub>/comments/<post>/<slug>/<comment>``
    are recognised; the match is anchored at the start of the string only, so
    anything following the comment segment's optional trailing slash is ignored.

    Args:
        link: Permalink string to parse.

    Returns:
        A 3-tuple ``(sub_reddit, post_id, comment_id)`` of strings, or
        ``(None, None, None)`` when the link does not have the expected shape.
    """
    found = re.match(
        r"https://old\.reddit\.com/r/([^/]+)/comments/([^/]+)/[^/]+/([^/]+)/?",
        link,
    )
    if found is None:
        return None, None, None
    # The three capture groups are, in order: subreddit, post id, comment id.
    return found.groups()

In [7]:
def clean_comment_body(body):
    """Normalise a raw comment body into lowercase, lightly punctuated text.

    Steps, in order:
      1. Lowercase the text.
      2. Delete HTML entities: named (``&amp;``), decimal (``&#39;``) and
         hexadecimal (``&#x200b;``) character references.
      3. Delete every character that is not an ASCII letter, a digit,
         whitespace, or one of ``. ! ?``.
      4. Collapse all whitespace runs to single spaces and trim the ends.

    Args:
        body: Raw comment text (``str``).

    Returns:
        The cleaned string; may be empty if nothing survives the filtering.
    """
    text = body.lower()

    # Numeric references must be removed here as well: otherwise the next
    # substitution strips only the '&', '#' and ';' and leaves the digits
    # behind (e.g. "it&#39;s" would become "it39s").
    text = re.sub(r'&(?:#[0-9]+|#x[0-9a-f]+|[a-z]+);', '', text)

    # The text is already lowercase, so only a-z needs to be allowed.
    text = re.sub(r'[^a-z0-9\s.!?]', '', text)

    return ' '.join(text.split())

In [8]:
def map_to_comment_blocks(lines):
    """Group the tagged lines of one partition into comment records.

    A record starts at a ``link:`` line and collects the following
    ``created_utc:``, ``score:`` and ``body:`` lines. Untagged lines are
    treated as body continuation. Lines tagged with any other
    ``<letters_or_underscores>:`` prefix are ignored.

    A record is emitted only when link, created_utc, score and at least one
    body line were all seen. The record still open when the input ends is
    emitted too, if complete.

    NOTE(review): when used with ``mapPartitions`` each partition is parsed
    independently, so a record whose lines straddle a partition boundary may
    be dropped or emitted with a truncated body -- confirm whether that
    matters for this dataset.

    Args:
        lines: Iterable of text lines.

    Yields:
        Tuples ``(link, created_utc, score, sub_reddit, post_id, comment_id, body)``
        where ``created_utc`` and ``score`` are ``int`` and ``body`` has been
        passed through ``clean_comment_body``.

    Raises:
        ValueError: If a ``created_utc`` or ``score`` value is not an integer literal.
    """
    link = created_utc = score = ""
    body_lines = []

    for line in lines:
        line = line.strip()

        # Drop tagged metadata lines for fields this job does not use.
        if re.match(r'^[a-zA-Z_]+:', line) and not line.startswith(_COMMENT_FIELD_TAGS):
            continue

        if line.startswith("link:"):
            # A new link closes the record collected so far.
            record = _build_comment_record(link, created_utc, score, body_lines)
            if record is not None:
                yield record
            link = _tag_value(line, "link:")
            # Reset every per-record field so nothing leaks into the next
            # record (score used to be carried over from the previous one).
            created_utc = ""
            score = ""
            body_lines = []
        elif line.startswith("created_utc:"):
            created_utc = _tag_value(line, "created_utc:")
        elif line.startswith("score:"):
            score = _tag_value(line, "score:")
        elif line.startswith("body:"):
            body_lines.append(_tag_value(line, "body:"))
        else:
            body_lines.append(line)

    # Flush the last record; there is no following "link:" line to trigger it.
    record = _build_comment_record(link, created_utc, score, body_lines)
    if record is not None:
        yield record


# Tags of the fields that map_to_comment_blocks consumes.
_COMMENT_FIELD_TAGS = ("link:", "created_utc:", "score:", "body:")


def _tag_value(line, tag):
    """Return the text after the leading ``tag``, trimmed; later occurrences of the tag are kept."""
    return line[len(tag):].strip()


def _build_comment_record(link, created_utc, score, body_lines):
    """Assemble one output tuple, or return None when any required field is missing."""
    if not (link and created_utc and score and body_lines):
        return None
    body = clean_comment_body(" ".join(body_lines).strip())
    sub_reddit, post_id, comment_id = extract_link_info(link)
    return (link, int(created_utc), int(score), sub_reddit, post_id, comment_id, body)

In [9]:
def _is_complete_record(record):
    """Return True when no field of the record is None or an empty string.

    The test is explicit rather than truthiness-based so that legitimate
    zero values (e.g. a comment whose score is 0) are kept.
    """
    return all(field is not None and field != "" for field in record)


# Parse each partition into comment tuples, then drop records whose link could
# not be parsed (None ids) or whose body is empty after cleaning.
comments_rdd = (
    raw_rdd
    .mapPartitions(map_to_comment_blocks)
    .filter(_is_complete_record)
)

comments_df = comments_rdd.toDF(
    ["link", "created_utc", "score", "sub_reddit", "post_id", "comment_id", "body"]
)

                                                                                

In [None]:
# Write the cleaned comments as Parquet, replacing any existing output at this path.
output_path = "hdfs://namenode:9000/data/cleaned_dataset.parquet"
comments_df.write.parquet(output_path, mode="overwrite")

[Stage 1:=>                                                        (1 + 2) / 46]

In [None]:
# Shut down the Spark session started earlier in the notebook.
spark.stop()