In [1]:
import glob
import json
import os
import csv
import zstandard as zstd
import io

# Search terms for the relevance filter. `is_relevant` lower-cases the record
# text and tests each term with a plain substring check, so short terms such
# as "rally" also match inside longer words (e.g. "generally").
KEYWORDS = [
    "charlottesville", "unite the right",
    "nazi", "alt-right", "white supremacist",
    "racist", "white nationalist", "fascist",
    "kkk", "racism", "rally",
]

# Record fields copied into the output CSV; the list order is the CSV column order.
SELECTED_COLUMNS = [
    "id",
    "author",
    "created_utc",
    "title",
    "selftext",
    "score",
    "num_comments",
    "subreddit",
    "permalink",
    "author_flair_text",
]

In [2]:
def is_relevant(obj, keywords=None):
    """Return True if any keyword occurs in the record's text fields.

    The ``title``, ``selftext`` and ``body`` values of ``obj`` (whichever are
    present and truthy) are concatenated, lower-cased and searched for each
    keyword as a plain, case-insensitive substring.

    Parameters
    ----------
    obj : dict
        Decoded JSON record. Any value that is not a dict (for example a
        ``null`` or numeric line) is reported as not relevant instead of
        raising ``TypeError``.
    keywords : iterable of str, optional
        Terms to look for. When omitted, the module-level ``KEYWORDS`` list is
        used; it is looked up at call time so that re-running the cell that
        defines ``KEYWORDS`` takes effect without redefining this function.

    Returns
    -------
    bool
        True when at least one keyword is found, False otherwise.
    """
    if not isinstance(obj, dict):
        return False
    if keywords is None:
        keywords = KEYWORDS

    # Each field is prefixed with a space so adjacent fields cannot fuse into
    # a spurious match across the boundary.
    text = "".join(
        " " + str(obj[field])
        for field in ("title", "selftext", "body")
        if obj.get(field)
    ).lower()
    return any(keyword.lower() in text for keyword in keywords)

def _iter_json_records(zst_file):
    """Yield one decoded JSON value per non-blank line of a zstd-compressed file."""
    # 2147483648 bytes = 2 GiB window ceiling.
    # NOTE(review): presumably needed because the dumps were compressed with a
    # long window; confirm against the data source.
    dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
    with open(zst_file, 'rb') as f, dctx.stream_reader(f) as reader:
        text_stream = io.TextIOWrapper(reader, encoding='utf-8')
        for line in text_stream:
            if not line.strip():
                # A blank (often trailing) line is not a decoding error.
                continue
            try:
                record = json.loads(line)
            except (ValueError, RecursionError) as e:
                # json.JSONDecodeError is a ValueError; keep going on bad lines.
                print(f"Error decoding line in {zst_file}: {e}")
                continue
            yield record


def process_zst_files(input_folder, output_csv_zst):
    """Filter zstd-compressed JSON-lines files by keyword into one compressed CSV.

    Every ``*.zst`` file directly inside ``input_folder`` is read line by
    line. Records accepted by ``is_relevant`` are reduced to
    ``SELECTED_COLUMNS`` and appended to a temporary CSV, which is then
    zstd-compressed to ``output_csv_zst``.

    Parameters
    ----------
    input_folder : str
        Directory containing the ``.zst`` input files.
    output_csv_zst : str
        Path of the compressed CSV to write. ``output_csv_zst + '.tmp'`` is
        used as scratch space and removed afterwards, including on failure.

    Returns
    -------
    None
        Progress and errors are reported with ``print``.
    """
    output_abs = os.path.abspath(output_csv_zst)
    # Sorted so row order is reproducible between runs (glob order is
    # arbitrary). A previous output that lives in the input folder is skipped
    # so it is never fed back in as input.
    zst_files = sorted(
        path
        for path in glob.glob(os.path.join(input_folder, '*.zst'))
        if os.path.abspath(path) != output_abs
    )
    if not zst_files:
        print("No .zst files found in the folder.")
        return

    temp_csv = output_csv_zst + '.tmp'

    try:
        with open(temp_csv, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=SELECTED_COLUMNS, extrasaction='ignore')
            writer.writeheader()  # Write header once

            for zst_file in zst_files:
                print(f"Processing file: {zst_file}")
                for obj in _iter_json_records(zst_file):
                    # Valid JSON that is not an object (null, number, list)
                    # has no columns to extract.
                    if not isinstance(obj, dict) or not is_relevant(obj):
                        continue
                    writer.writerow({col: obj.get(col) for col in SELECTED_COLUMNS})

        with open(temp_csv, 'rb') as input_f, open(output_csv_zst, 'wb') as output_f:
            cctx = zstd.ZstdCompressor()
            cctx.copy_stream(input_f, output_f)
    finally:
        # Never leave the (potentially very large) uncompressed CSV behind.
        if os.path.exists(temp_csv):
            os.remove(temp_csv)

    print(f"Compressed CSV written to {output_csv_zst}")

In [3]:
# Keyword-filter every .zst file in `submissions/` and write the matching
# rows to a single zstd-compressed CSV in the working directory.
input_folder = 'submissions'
output_csv_zst = 'submissions.csv.zst'
process_zst_files(input_folder, output_csv_zst)

Processing file: submissions/RS_2017-07.zst
Processing file: submissions/RS_2017-08.zst
Processing file: submissions/RS_2017-09.zst
Compressed CSV written to submissions.csv.zst


In [4]:
import pandas as pd
import zstandard as zstd
import io

def read_csv_zst(file_path, **read_csv_kwargs):
    """Load a zstd-compressed CSV file into a pandas DataFrame.

    The file is decompressed as a stream and handed to ``pandas.read_csv``
    through a text wrapper, so the full decompressed payload is never held in
    memory as one bytes object.

    Parameters
    ----------
    file_path : str
        Path to the ``.csv.zst`` file.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv`` (for example
        ``usecols`` or ``dtype``). ``encoding`` is applied to the text wrapper
        and defaults to UTF-8.

    Returns
    -------
    pandas.DataFrame
    """
    encoding = read_csv_kwargs.pop("encoding", None) or "utf-8"
    with open(file_path, 'rb') as compressed_file:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(compressed_file) as reader:
            # newline='' disables newline translation so "\r\n" inside quoted
            # fields reaches the CSV parser unchanged.
            text_stream = io.TextIOWrapper(reader, encoding=encoding, newline='')
            try:
                return pd.read_csv(text_stream, **read_csv_kwargs)
            finally:
                # Detach so the wrapper does not close `reader` a second time
                # when it is garbage-collected; the `with` block owns it.
                text_stream.detach()

# Load the compressed CSV from the working directory into `df`; later cells
# filter this frame.
df = read_csv_zst("submissions.csv.zst")

In [5]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,id,author,created_utc,title,selftext,score,num_comments,subreddit,permalink,author_flair_text
0,6kjsy7,SomeFosterKid,1498867208,"[USA-AL][H] HP Pavilion 17"" Laptop X7N62AV w/ ...",Reposting w/ lower price/without added ssd unl...,7,12,hardwareswap,/r/hardwareswap/comments/6kjsy7/usaalh_hp_pavi...,
1,6kjt2w,Penguava,1498867257,Getting overrun by goblins on turn 4? Try this...,[i.imgur.com/peQtNAa.png](https://i.imgur.com/...,3,0,Shadowverse,/r/Shadowverse/comments/6kjt2w/getting_overrun...,And so our memes become victory!
2,6kjt69,alwaysdoe,1498867282,The Types of People You’ll Meet in Your House:...,"Part Three (congrats, you read the title). Huf...",38,13,harrypotter,/r/harrypotter/comments/6kjt69/the_types_of_pe...,Yew with Pheonix Feather/ Eagle / Thunderbird
3,6kjtjb,tubbem,1498867413,How do i counter this argument?,In some debates i have faced the argument that...,5,11,AskLibertarians,/r/AskLibertarians/comments/6kjtjb/how_do_i_co...,
4,6kjtk6,Shark_Bot,1498867420,[fakehistoryporn] First Jewish men to be permi...,,1,1,DankMemeArchive,/r/DankMemeArchive/comments/6kjtk6/fakehistory...,Mecha Meme Librarian


In [6]:
import pandas as pd

# Subreddits to keep; compared against the `subreddit` column of `df`.
subreddits = [
    "politics", "Ask_Politics", "The_Donald",
    "AltRight", "Conservative", "liberal",
]

In [7]:
# Keep only rows from the selected subreddits.
# `isin` compares exactly, and the `subreddits` list mixes capitalisation
# styles ("liberal" vs "The_Donald"), so both sides are lower-cased first;
# otherwise a subreddit stored with different casing is silently dropped.
# NOTE(review): assumes subreddit names differing only by case refer to the
# same community; confirm against the data.
filtered_df = df[
    df["subreddit"].str.lower().isin([name.lower() for name in subreddits])
]

filtered_df.head()

Unnamed: 0,id,author,created_utc,title,selftext,score,num_comments,subreddit,permalink,author_flair_text
12,6kju6t,[deleted],1498867638,"#BlackLivesMatter, you racist!",[deleted],102,1,The_Donald,/r/The_Donald/comments/6kju6t/blacklivesmatter...,
66,6kjy5l,TomFreeSpeech,1498868847,"Al Sharpton, RACIST RACE BAITER EXTRAORDINAIRE...",,16,0,The_Donald,/r/The_Donald/comments/6kjy5l/al_sharpton_raci...,
77,6kjyh3,mainstream_lurker,1498868954,@sean_spicier: Boy? That's racist https://t.co...,,16,0,The_Donald,/r/The_Donald/comments/6kjyh3/sean_spicier_boy...,KEK
181,6kk6aw,WHYitBoy,1498871637,BLACK YOUTUBER MINNESOTABOYY PRETENDS TO BE A ...,,10,0,The_Donald,/r/The_Donald/comments/6kk6aw/black_youtuber_m...,
206,6kk88f,[deleted],1498872283,"Kamala Harris, Feinstein, Susan Collins, Marco...",[deleted],5,2,The_Donald,/r/The_Donald/comments/6kk88f/kamala_harris_fe...,


In [8]:
# Drop submissions whose author field is the literal placeholder "[deleted]".
filtered_df = filtered_df.loc[filtered_df["author"].ne("[deleted]")]

filtered_df.head()

Unnamed: 0,id,author,created_utc,title,selftext,score,num_comments,subreddit,permalink,author_flair_text
66,6kjy5l,TomFreeSpeech,1498868847,"Al Sharpton, RACIST RACE BAITER EXTRAORDINAIRE...",,16,0,The_Donald,/r/The_Donald/comments/6kjy5l/al_sharpton_raci...,
77,6kjyh3,mainstream_lurker,1498868954,@sean_spicier: Boy? That's racist https://t.co...,,16,0,The_Donald,/r/The_Donald/comments/6kjyh3/sean_spicier_boy...,KEK
181,6kk6aw,WHYitBoy,1498871637,BLACK YOUTUBER MINNESOTABOYY PRETENDS TO BE A ...,,10,0,The_Donald,/r/The_Donald/comments/6kk6aw/black_youtuber_m...,
213,6kk8lw,kingpepesadfrog,1498872427,"Michael Grunwald, senior writer at Times, lite...",,228,5,The_Donald,/r/The_Donald/comments/6kk8lw/michael_grunwald...,
239,6kkama,RickyMissile,1498873158,How dare this fucking White cis male bring his...,,15,2,The_Donald,/r/The_Donald/comments/6kkama/how_dare_this_fu...,PEPE!


In [9]:
import pandas as pd
import zstandard as zstd
import os

# NOTE(review): this is the same path `df` was loaded from, so this cell
# replaces the keyword-filtered dataset on disk with the subreddit-filtered
# one. Confirm that overwriting is intended.
output_csv_zst = "submissions.csv.zst"

temp_csv = "filtered_subreddits.csv"
# Compress to a sibling file first and swap it in only on success, so a
# failure part-way through cannot leave a truncated `output_csv_zst`.
partial_csv_zst = output_csv_zst + ".part"

try:
    filtered_df.to_csv(temp_csv, index=False, encoding="utf-8")

    with open(temp_csv, 'rb') as input_f, open(partial_csv_zst, 'wb') as output_f:
        cctx = zstd.ZstdCompressor()
        cctx.copy_stream(input_f, output_f)

    os.replace(partial_csv_zst, output_csv_zst)
finally:
    # Remove the temporary files even if writing or compressing failed
    for leftover in (temp_csv, partial_csv_zst):
        if os.path.exists(leftover):
            os.remove(leftover)

print(f"Compressed CSV saved as {output_csv_zst}")

Compressed CSV saved as submissions.csv.zst
