In [1]:
import glob
import json
import os
import csv
import zstandard as zstd
import io
import pandas as pd

# Comment fields copied into the output CSV, in column order.
SELECTED_COLUMNS = [
    "id",
    "author",
    "created_utc",
    "body",
    "score",
    "parent_id",
    "subreddit",
    "permalink",
    "author_flair_text",
]

# Only comments posted in one of these subreddits are kept.
SUBREDDITS = [
    "politics", "Ask_Politics", "OccupyWallStreet", "socialjustice", "conspiracy",
]

# Account names that are always treated as bots (exact, case-sensitive match).
known_bots = {
    "AutoModerator",
    "remindmebot",
    "TotesMessengerBot",
    "news_fetcher",
    "COVID19_bot",
    "karmawhore_bot",
    "WikiSummarizerBot",
    "GPT3_CommentBot",
}

In [2]:
# Function to check if an author is a bot
def is_bot(author: str) -> bool:
    """Return True when ``author`` looks like a bot account.

    An author counts as a bot when the name contains the substring "bot"
    (case-insensitive) or is listed verbatim in ``known_bots``. The substring
    rule is a broad heuristic: any name containing "bot" matches.

    Args:
        author: Account name taken from a comment record. Non-string values
            (for example a JSON ``null`` decoded to ``None``) are tolerated
            and reported as "not a bot" instead of raising AttributeError.

    Returns:
        True if the name matches either rule, otherwise False.
    """
    if not isinstance(author, str):
        return False
    # The substring test runs first; the result of the `or` is the same as
    # checking the explicit list first.
    return "bot" in author.lower() or author in known_bots

def load_submissions(submissions_file):
    """Load the set of submission IDs from a zstd-compressed CSV.

    Args:
        submissions_file: Path to a zstd-compressed CSV that has an ``id``
            column. Only that column is read.

    Returns:
        A set of submission IDs as strings, with missing (empty) IDs removed.
    """
    submissions = pd.read_csv(
        submissions_file,
        compression="zstd",
        usecols=["id"],
        # IDs are later compared against string fragments of `parent_id`, so
        # they must stay strings: without an explicit dtype an all-digit ID
        # would be parsed as a number and never match.
        dtype={"id": str},
        # Treat only the empty field as missing; the default NA markers
        # ("null", "nan", "NA", ...) would otherwise swallow IDs spelled
        # that way.
        keep_default_na=False,
        na_values=[""],
    )
    return set(submissions["id"].dropna())

def _iter_json_objects(zst_file):
    """Yield the parsed JSON value of each line in a zstd-compressed text file.

    Lines that are not valid JSON are reported on stdout and skipped.
    """
    with open(zst_file, 'rb') as f:
        # 2147483648 == 2**31 bytes: upper bound on the decompression window.
        # NOTE(review): presumably needed because the dump files were written
        # with a large window — confirm against the data source.
        dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
        with dctx.stream_reader(f) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            for line in text_stream:
                try:
                    obj = json.loads(line)
                except ValueError as e:  # json.JSONDecodeError is a ValueError
                    print(f"Error decoding line in {zst_file}: {e}")
                    continue
                yield obj


def _is_wanted_comment(obj, submission_ids, comment_ids):
    """Return True when ``obj`` is a comment that belongs in the output.

    A comment is wanted when it has a usable author (a string, not
    "[deleted]", not a bot), was posted in one of ``SUBREDDITS``, has an
    ``id``, and its parent is either a known submission (``t3_`` prefix) or
    a comment that was already accepted (``t1_`` prefix).
    """
    if not isinstance(obj, dict):
        return False

    author = obj.get("author")
    if not isinstance(author, str) or author == "[deleted]" or is_bot(author):
        return False

    if obj.get("subreddit") not in SUBREDDITS:
        return False

    # The id is needed both for the CSV row and for tracking reply chains.
    if obj.get("id") is None:
        return False

    # parent_id must look like "<prefix>_<id>"; anything else (missing,
    # empty, non-string, no underscore) cannot be resolved to a parent.
    parent_id = obj.get("parent_id")
    if not isinstance(parent_id, str) or "_" not in parent_id:
        return False
    prefix, pid = parent_id.split("_", 1)

    if prefix == "t3":
        # Direct reply to a submission.
        return pid in submission_ids
    if prefix == "t1":
        # Reply to a comment that was kept earlier in the scan.
        return pid in comment_ids
    return False


def process_zst_files(input_folder, submissions_file, output_csv_zst):
    """Filter zstd-compressed comment dumps into one zstd-compressed CSV.

    Every ``*.zst`` file in ``input_folder`` is read line by line as JSON.
    A comment is kept when it passes the author/subreddit filters and its
    parent is a submission listed in ``submissions_file`` or a comment that
    was already kept. Kept comments are written with the columns in
    ``SELECTED_COLUMNS`` to a temporary CSV, which is then compressed to
    ``output_csv_zst`` and deleted.

    Args:
        input_folder: Directory containing the ``*.zst`` comment files.
        submissions_file: zstd-compressed CSV with the submission ``id``s.
        output_csv_zst: Destination path of the compressed CSV. The
            temporary file is created next to it as ``<output_csv_zst>.tmp``.

    Returns:
        None. Prints progress messages; returns early if no input file
        is found.
    """
    # glob returns files in arbitrary order, but a reply is only kept when
    # its parent comment was seen earlier, so the scan order matters.
    # Sorting by name makes the run deterministic.
    # NOTE(review): assumes file names sort chronologically (e.g.
    # RC_2011-08 < RC_2011-09) and that parents precede replies — confirm.
    zst_files = sorted(glob.glob(os.path.join(input_folder, '*.zst')))
    if not zst_files:
        print("No .zst files found in the folder.")
        return

    submission_ids = load_submissions(submissions_file)
    temp_csv = output_csv_zst + '.tmp'
    comment_ids = set()  # ids of comments accepted so far

    try:
        with open(temp_csv, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=SELECTED_COLUMNS, extrasaction='ignore')
            writer.writeheader()  # Write header once

            for zst_file in zst_files:
                print(f"Processing file: {zst_file}")
                for obj in _iter_json_objects(zst_file):
                    if not _is_wanted_comment(obj, submission_ids, comment_ids):
                        continue
                    # Remember the id so replies to this comment are kept too.
                    comment_ids.add(obj["id"])
                    writer.writerow({col: obj.get(col) for col in SELECTED_COLUMNS})

        with open(temp_csv, 'rb') as input_f, open(output_csv_zst, 'wb') as output_f:
            cctx = zstd.ZstdCompressor()
            cctx.copy_stream(input_f, output_f)
    finally:
        # Do not leave the (potentially large) temp file behind on failure.
        if os.path.exists(temp_csv):
            os.remove(temp_csv)

    print(f"Compressed CSV written to {output_csv_zst}")

In [3]:
# Inputs: folder of monthly comment dumps and the submissions ID list.
input_folder = 'comments'
submissions_file = 'submissions.csv.zst'
# Output: filtered comments as a zstd-compressed CSV.
output_csv_zst = 'comments.csv.zst'

process_zst_files(
    input_folder=input_folder,
    submissions_file=submissions_file,
    output_csv_zst=output_csv_zst,
)

Processing file: comments/RC_2011-08.zst
Processing file: comments/RC_2011-09.zst
Processing file: comments/RC_2011-10.zst
Compressed CSV written to comments.csv.zst
