# Subset Data

Select data from 2019 through 2022 for the appropriate subreddits. Remove unnecessary columns and save the data to a zst file.

In [None]:
from collections import defaultdict
from datetime import datetime
import json, os, zstandard

In [None]:
# Subreddits whose submission and comment dumps are subset below
subreddits = [
    "Conservative",
    "progressive",
    "democrats",
    "Republican",
    "NeutralPolitics",
    "PoliticalDiscussion",
    "politics",
]

In [None]:
# Set the start and end dates to subset the data (both bounds are inclusive)
start_date = datetime(2019, 1, 1)
# Use the last second of 2022: a midnight bound on December 31 would drop
# nearly every post made during that final day.
end_date = datetime(2022, 12, 31, 23, 59, 59)

### Subset submissions

In [None]:
def subset_submissions(input_path: str,
                       output_path: str,
                       start_date: datetime=None,
                       end_date: datetime=None,
                       keys: list=None):
    """Filter a zst submissions dump and write the kept rows to a new zst file.

    Every line yielded by ``read_lines_zst`` is parsed as JSON. A submission
    is kept when it has a ``created_utc`` field that falls inside the date
    window and its ``selftext`` is neither empty nor ``"[deleted]"``. Kept
    submissions are reduced to ``keys``, given an extra ``link_id`` field
    derived from ``id``, and written out with ``write_line_zst``.

    ``read_lines_zst`` and ``write_line_zst`` are helpers defined outside
    this block.

    Args:
        input_path: Path of the zst file to read.
        output_path: Path of the zst file to create (truncated if it exists).
        start_date: Earliest creation time to keep (inclusive). ``None``
            disables the lower bound.
        end_date: Latest creation time to keep (inclusive). ``None`` disables
            the upper bound.
        keys: Fields to keep; fields missing from a submission are filled
            with ``""``. ``None`` keeps every field of the submission.
    """
    # Open the output exactly once. Leaving the compressor's context closes
    # the writer, which finishes the zstd frame; without that the output can
    # be truncated. The outer context closes the file even on errors.
    with open(output_path, 'wb') as raw_file, \
            zstandard.ZstdCompressor().stream_writer(raw_file) as handle:

        # The second element of each pair is not needed here.
        for line, _ in read_lines_zst(input_path):
            record = json.loads(line)

            # Filter out submissions that don't have a created_utc field
            if "created_utc" not in record:
                continue
            # NOTE(review): fromtimestamp() converts to the machine's local
            # time zone, so the naive bounds are compared in local time, not
            # UTC -- confirm this is intended.
            created_utc = datetime.fromtimestamp(int(record["created_utc"]))

            # Filter by date (a bound of None means "unbounded")
            if start_date is not None and created_utc < start_date:
                continue
            if end_date is not None and created_utc > end_date:
                continue

            # Skip if selftext is empty or deleted. Checked on the raw record
            # so the filter also works when "selftext" is not one of the keys.
            if record.get("selftext", "") in ("", "[deleted]"):
                continue

            # Select the keys (missing ones default to an empty string)
            if keys is None:
                selected = dict(record)
            else:
                selected = {key: record.get(key, "") for key in keys}

            # When id is kept, also expose it as link_id (the text after the
            # last underscore); id itself stays in the output.
            if "id" in selected:
                submission_id = selected["id"]
                if isinstance(submission_id, str):
                    selected["link_id"] = submission_id.split("_")[-1]
                else:
                    # Non-string ids cannot be split; keep the empty fallback
                    selected["link_id"] = ""

            # Write the data to the zst file
            write_line_zst(handle, json.dumps(selected))

    print(f"Data written to {output_path}")

In [None]:
# Fields retained for every submission
keys_submissions = [
    "id", "author", "downs", "ups", "title",
    "num_comments", "created_utc", "selftext", "score",
]

# Write one subset file next to each subreddit's raw submissions dump
for subreddit in subreddits:
    input_path = f"data/{subreddit}/{subreddit}_submissions.zst"
    output_path = f"data/{subreddit}/{subreddit}_submissions_subset.zst"

    subset_submissions(
        input_path=input_path,
        output_path=output_path,
        start_date=start_date,
        end_date=end_date,
        keys=keys_submissions,
    )

### Subset comments

In [None]:
def subset_comments(input_path: str,
                    output_path: str,
                    start_date: datetime=None,
                    end_date: datetime=None,
                    keys: list=None):
    """Filter a zst comments dump and write the kept rows to a new zst file.

    Every line yielded by ``read_lines_zst`` is parsed as JSON. A comment is
    kept when it has a ``created_utc`` field that falls inside the date
    window and its ``body`` is neither empty nor ``"[deleted]"``. Kept
    comments are reduced to ``keys`` and written out with ``write_line_zst``.

    ``read_lines_zst`` and ``write_line_zst`` are helpers defined outside
    this block.

    Args:
        input_path: Path of the zst file to read.
        output_path: Path of the zst file to create (truncated if it exists).
        start_date: Earliest creation time to keep (inclusive). ``None``
            disables the lower bound.
        end_date: Latest creation time to keep (inclusive). ``None`` disables
            the upper bound.
        keys: Fields to keep; fields missing from a comment are filled with
            ``""``. ``None`` keeps every field of the comment.
    """
    # Open the output exactly once. Leaving the compressor's context closes
    # the writer, which finishes the zstd frame; without that the output can
    # be truncated. The outer context closes the file even on errors.
    with open(output_path, 'wb') as raw_file, \
            zstandard.ZstdCompressor().stream_writer(raw_file) as handle:

        # The second element of each pair is not needed here.
        for line, _ in read_lines_zst(input_path):
            record = json.loads(line)

            # Filter out comments that don't have a created_utc field
            if "created_utc" not in record:
                continue
            # NOTE(review): fromtimestamp() converts to the machine's local
            # time zone, so the naive bounds are compared in local time, not
            # UTC -- confirm this is intended.
            created_utc = datetime.fromtimestamp(int(record["created_utc"]))

            # Filter by date (a bound of None means "unbounded")
            if start_date is not None and created_utc < start_date:
                continue
            if end_date is not None and created_utc > end_date:
                continue

            # Skip if body is empty or deleted. Checked on the raw record so
            # the filter also works when "body" is not one of the keys.
            if record.get("body", "") in ("", "[deleted]"):
                continue

            # Select the keys (missing ones default to an empty string)
            if keys is None:
                selected = dict(record)
            else:
                selected = {key: record.get(key, "") for key in keys}

            # Write the data to the zst file
            write_line_zst(handle, json.dumps(selected))

    print(f"Data written to {output_path}")


In [None]:
# Fields retained for every comment
keys_comments = [
    "link_id", "author", "created_utc", "body", "score",
    "ups", "downs", "controversiality", "gilded",
]

# Write one subset file next to each subreddit's raw comments dump
for subreddit in subreddits:
    input_path = f"data/{subreddit}/{subreddit}_comments.zst"
    output_path = f"data/{subreddit}/{subreddit}_comments_subset.zst"

    subset_comments(
        input_path=input_path,
        output_path=output_path,
        start_date=start_date,
        end_date=end_date,
        keys=keys_comments,
    )