In [3]:
import glob
import json
import os
import csv
import zstandard as zstd
import io

# Terms searched for (case-insensitively, as plain substrings) in each record's
# text fields by is_relevant().
KEYWORDS = [
    "covid",
    "coronavirus",
    "pandemic",
    "vaccine",
    "vaccination",
    "lockdown",
    "quarantine",
    "mask mandate",
    "anti-vaccine",
    "bioweapon",
    "misinformation",
    "fake news",
]

# Record fields kept in the output CSV; also used as the CSV header, in this order.
SELECTED_COLUMNS = [
    "id",
    "author",
    "created_utc",
    "title",
    "selftext",
    "score",
    "num_comments",
    "subreddit",
    "permalink",
    "author_flair_text",
]

def is_relevant(obj, keywords=KEYWORDS):
    """Return True when any keyword occurs in the record's text fields.

    The ``title``, ``selftext`` and ``body`` values that are present and truthy
    are concatenated (each preceded by a space), lower-cased, and searched for
    every keyword as a plain case-insensitive substring, so ``"vaccine"`` also
    matches ``"vaccines"``.

    Args:
        obj: A decoded JSON record. Values that are not dicts (for example a
            line containing ``null`` or a bare number) are treated as not
            relevant instead of raising ``TypeError``.
        keywords: Iterable of keyword strings; defaults to ``KEYWORDS``.

    Returns:
        bool: True if at least one keyword is found, otherwise False.
    """
    # json.loads may legitimately return None, numbers, strings or lists;
    # only JSON objects carry the fields inspected below.
    if not isinstance(obj, dict):
        return False

    text = "".join(
        " " + str(obj[field])
        for field in ("title", "selftext", "body")
        if obj.get(field)
    ).lower()
    return any(keyword.lower() in text for keyword in keywords)

def process_zst_files(input_folder, output_csv_zst):
    """Filter zstd-compressed JSON-lines files by keyword into one compressed CSV.

    Every ``*.zst`` file directly inside ``input_folder`` is decompressed as a
    stream and parsed one JSON value per line. Records accepted by
    ``is_relevant`` are reduced to ``SELECTED_COLUMNS`` and appended to a
    temporary CSV (``output_csv_zst + '.tmp'``). That CSV is then
    zstd-compressed into ``output_csv_zst``; the temporary file is removed
    whether or not the run succeeds.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.zst`` files.
        output_csv_zst: Path of the compressed CSV to create or overwrite. If
            it lives inside ``input_folder`` it is not treated as an input.

    Returns:
        None. Prints a message and returns early when no input file is found.
    """
    output_path = os.path.abspath(output_csv_zst)
    # glob order is filesystem-dependent; sorting makes the row order
    # reproducible. The output of a previous run must not be re-read as input.
    zst_files = sorted(
        path
        for path in glob.glob(os.path.join(input_folder, '*.zst'))
        if os.path.abspath(path) != output_path
    )
    if not zst_files:
        print("No .zst files found in the folder.")
        return

    temp_csv = output_csv_zst + '.tmp'
    # Allow frames that need up to a 2 GiB decompression window
    # (presumably the input dumps use a long window -- confirm).
    dctx = zstd.ZstdDecompressor(max_window_size=2**31)

    try:
        with open(temp_csv, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=SELECTED_COLUMNS, extrasaction='ignore')
            writer.writeheader()  # Write header once

            for zst_file in zst_files:
                print(f"Processing file: {zst_file}")
                with open(zst_file, 'rb') as f, dctx.stream_reader(f) as reader:
                    text_stream = io.TextIOWrapper(reader, encoding='utf-8')
                    for line in text_stream:
                        if not line.strip():
                            continue  # blank separator line, not a decode error

                        try:
                            obj = json.loads(line)
                        except (ValueError, RecursionError) as e:
                            print(f"Error decoding line in {zst_file}: {e}")
                            continue

                        # Valid JSON that is not an object has no fields to filter on.
                        if not isinstance(obj, dict) or not is_relevant(obj):
                            continue

                        writer.writerow({col: obj.get(col) for col in SELECTED_COLUMNS})

        with open(temp_csv, 'rb') as input_f, open(output_csv_zst, 'wb') as output_f:
            cctx = zstd.ZstdCompressor()
            cctx.copy_stream(input_f, output_f)
    finally:
        # The intermediate CSV can be very large; never leave it behind.
        if os.path.exists(temp_csv):
            os.remove(temp_csv)

    print(f"Compressed CSV written to {output_csv_zst}")

In [4]:
# Build the keyword-filtered CSV from every dump found in the input folder.
output_csv_zst = 'submissions.csv.zst'
input_folder = 'submissions'
process_zst_files(input_folder=input_folder, output_csv_zst=output_csv_zst)

Processing file: submissions/RS_2020-02.zst
Processing file: submissions/RS_2020-03.zst
Processing file: submissions/RS_2020-04.zst
Compressed CSV written to submissions.csv.zst


In [5]:
import pandas as pd
import zstandard as zstd
import io

def read_csv_zst(file_path):
    """Load a zstd-compressed, UTF-8 encoded CSV file into a DataFrame.

    The decompressed bytes are decoded and handed to pandas as a stream, so
    the whole decompressed file is never held in memory next to the resulting
    DataFrame.

    Args:
        file_path: Path to the ``.csv.zst`` file.

    Returns:
        pandas.DataFrame: The parsed CSV contents.
    """
    with open(file_path, 'rb') as compressed_file:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(compressed_file) as reader:
            # newline='' leaves line endings inside quoted fields untouched,
            # which is what pandas does when it wraps a binary handle itself.
            text_stream = io.TextIOWrapper(reader, encoding='utf-8', newline='')
            return pd.read_csv(text_stream)

# Load the keyword-filtered submissions written by process_zst_files.
df = read_csv_zst(file_path="submissions.csv.zst")

In [6]:
# Preview the first rows of the loaded submissions.
df.head()

Unnamed: 0,id,author,created_utc,title,selftext,score,num_comments,subreddit,permalink,author_flair_text
0,ewxgyv,wndragonlord,1580515205,[x-post from /r/askscience] Have a question ab...,,7,1,korea,/r/korea/comments/ewxgyv/xpost_from_raskscienc...,88년생 뉴저지
1,ewxh1g,farklinkbot,1580515213,Two coronavirus cases confirmed in UK on Brexi...,,1,0,fark,/r/fark/comments/ewxh1g/two_coronavirus_cases_...,
2,ewxh4e,TheGruesomeTwosome,1580515221,How did they know about coronavirus before it ...,,1,1,oldpeoplefacebook,/r/oldpeoplefacebook/comments/ewxh4e/how_did_t...,
3,ewxhfb,EliasMLG,1580515261,What are the chances of someone purchasing the...,[removed],0,5,AskReddit,/r/AskReddit/comments/ewxhfb/what_are_the_chan...,
4,ewxhl3,nofeenews,1580515278,White House dramatically scales up U.S. respon...,,1,0,nofeenews,/r/nofeenews/comments/ewxhl3/white_house_drama...,:doge: admin


In [7]:
import pandas as pd

# Communities of interest for the analysis.
subreddits = [
    "COVID19", "Coronavirus", "AntiVaxxers", "QAnon", "CoronavirusConspiracy", "Ask_Politics",
    "conspiracy", "The_Donald", "AltRight", "Conservative", "liberal"
]

# Subreddit names are case-insensitive on Reddit, but isin() compares exact
# strings, so an entry whose capitalisation differs from the dump's would
# silently match nothing. Compare lower-cased names on both sides instead.
filtered_df = df[
    df["subreddit"].astype(str).str.lower().isin([name.lower() for name in subreddits])
]

filtered_df.head()

Unnamed: 0,id,author,created_utc,title,selftext,score,num_comments,subreddit,permalink,author_flair_text
6,ewxhs7,jackt-up,1580515303,What’s in store for us this year?,"So far we’ve had Kobe, Coronavirus, and Trump:...",7,9,conspiracy,/r/conspiracy/comments/ewxhs7/whats_in_store_f...,
69,ewxne6,sebast13,1580516027,An analysis of the recent paper about HIV prot...,"I am talking about this article, which is not ...",57,24,Coronavirus,/r/Coronavirus/comments/ewxne6/an_analysis_of_...,
86,ewxomt,tikicyn,1580516189,FEMA camps are good for quarantining coronavir...,[removed],1,2,conspiracy,/r/conspiracy/comments/ewxomt/fema_camps_are_g...,
96,ewxpl2,imNutsandAbolt,1580516293,Lots of questions answered pertaining to the C...,,17,4,Coronavirus,/r/Coronavirus/comments/ewxpl2/lots_of_questio...,
108,ewxq7t,[deleted],1580516366,Trump declares coronavirus outbreak a public e...,[deleted],29,1,The_Donald,/r/The_Donald/comments/ewxq7t/trump_declares_c...,


In [8]:
# Drop rows whose author is recorded as the literal "[deleted]".
filtered_df = filtered_df.loc[~filtered_df["author"].eq("[deleted]")]

filtered_df.head()

Unnamed: 0,id,author,created_utc,title,selftext,score,num_comments,subreddit,permalink,author_flair_text
6,ewxhs7,jackt-up,1580515303,What’s in store for us this year?,"So far we’ve had Kobe, Coronavirus, and Trump:...",7,9,conspiracy,/r/conspiracy/comments/ewxhs7/whats_in_store_f...,
69,ewxne6,sebast13,1580516027,An analysis of the recent paper about HIV prot...,"I am talking about this article, which is not ...",57,24,Coronavirus,/r/Coronavirus/comments/ewxne6/an_analysis_of_...,
86,ewxomt,tikicyn,1580516189,FEMA camps are good for quarantining coronavir...,[removed],1,2,conspiracy,/r/conspiracy/comments/ewxomt/fema_camps_are_g...,
96,ewxpl2,imNutsandAbolt,1580516293,Lots of questions answered pertaining to the C...,,17,4,Coronavirus,/r/Coronavirus/comments/ewxpl2/lots_of_questio...,
109,ewxqeo,Smooth_Imagination,1580516389,Coronavirus Attacks Immune System + Various Notes,[removed],1,1,conspiracy,/r/conspiracy/comments/ewxqeo/coronavirus_atta...,


In [9]:
import pandas as pd
import zstandard as zstd
import os

# NOTE(review): this is the same path the keyword-filtered CSV was written to
# earlier in this notebook, so that file is replaced by the smaller
# subreddit-filtered subset. Confirm that the overwrite is intended.
output_csv_zst = "submissions.csv.zst"

temp_csv = "filtered_subreddits.csv"
try:
    filtered_df.to_csv(temp_csv, index=False, encoding="utf-8")

    with open(temp_csv, 'rb') as input_f, open(output_csv_zst, 'wb') as output_f:
        cctx = zstd.ZstdCompressor()
        cctx.copy_stream(input_f, output_f)
finally:
    # Remove the temporary CSV file, even when writing or compressing failed.
    if os.path.exists(temp_csv):
        os.remove(temp_csv)

print(f"Compressed CSV saved as {output_csv_zst}")

Compressed CSV saved as submissions.csv.zst
