This Python script is designed to process a Zstandard-compressed JSON file containing political comments. The script filters the comments based on their creation date and writes the relevant data to a CSV file for each year within a specified range.


To use the script, follow these steps:

1. Ensure that the script and the input file are in the same directory.
2. Specify the input file name and the fields to be extracted from each JSON object in the script.
3. Set the date range by specifying the start year and end year in the script.
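4. Run the script in a Python environment where the `datetime`, `zstandard`, `os`, `orjson`, `csv`, and `logging.handlers` libraries are available. Only `zstandard` and `orjson` are third-party packages; if they are missing, install them using pip. The other modules are part of the Python standard library.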


In [None]:
# Use this command to install the necessary third-party packages, if needed.
# (datetime, os, csv and logging ship with Python and must not be installed with pip.)
#!pip install zstandard orjson

<h1>Import necessary libraries</h1>

In [None]:
import csv
import logging.handlers
import os
from datetime import datetime, timezone

import orjson
import zstandard

<h1>Define the variables</h1>

In [None]:
# Name of the Zstandard-compressed input file
input_file_name = "politics_comments.zst"

# JSON keys copied into each CSV row, in column order
fields = [
    "author",
    "body",
    "created_utc",
    "parent_id",
    "id",
    "score",
    "author_fullname",
    "ups",
]

# '__file__' is a string literal here, so it is resolved against the current
# working directory; the result is normally the working directory itself
# (with symlinks resolved), which is where the input file is looked up and
# where the CSV files are written.
input_file_directory = os.path.dirname(os.path.realpath('__file__'))

# First and last year to export (both inclusive)
start_year, end_year = 2007, 2022

<h1>Unzip the zst file and convert it to CSV</h1>

In [None]:
# Set up logging: progress and errors are written to the console.
log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object for the same name, so
# re-running this cell would otherwise stack another handler on it and every
# message would be printed once per run. Attach the console handler only once.
if not log.handlers:
    log.addHandler(logging.StreamHandler())

# Define a function to read and decode chunks from the Zstandard-compressed file
def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
    """Read from *reader* until the accumulated bytes decode as text.

    A read boundary can fall in the middle of a multi-byte character, which
    makes decoding fail; in that case further chunks are appended and the
    decode is retried.

    Args:
        reader: Object whose ``read(n)`` returns bytes.
        chunk_size: Number of bytes requested per read.
        max_window_size: Give up once more than this many bytes have been
            requested without a successful decode.
        previous_chunk: Bytes already read that are prepended to the next read.
        bytes_read: Number of bytes already counted towards *max_window_size*.

    Returns:
        The decoded text (an empty string once the reader is exhausted).

    Raises:
        UnicodeError: If decoding still fails after more than
            *max_window_size* bytes have been requested.
    """
    while True:
        data = reader.read(chunk_size)
        # The requested size is counted, not the number of bytes actually returned
        bytes_read += chunk_size
        if previous_chunk is not None:
            data = previous_chunk + data
        try:
            return data.decode()
        except UnicodeDecodeError:
            if bytes_read > max_window_size:
                raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
            # Keep what has been read so far and extend it with the next chunk
            previous_chunk = data

# Define a function to read lines from the Zstandard-compressed file
def read_lines_zst(file_name, start_date, end_date):
    """Yield the lines of a Zstandard-compressed text file one at a time.

    Args:
        file_name: Path of the compressed file.
        start_date: Not used by this function; kept so existing callers
            keep working.
        end_date: Not used by this function; kept so existing callers
            keep working.

    Yields:
        ``(line, position)`` tuples, where *line* is the decoded text without
        its trailing newline and *position* is the current offset in the
        compressed file (usable as a progress indicator).

    Raises:
        UnicodeError: Propagated from ``read_and_decode`` when a chunk cannot
            be decoded.
    """
    with open(file_name, 'rb') as file_handle:
        buffer = ''
        decompressor = zstandard.ZstdDecompressor(max_window_size=2**31)
        # The context manager closes the reader even when the consumer stops
        # iterating early or an error propagates out of the loop.
        with decompressor.stream_reader(file_handle) as reader:
            while True:
                chunk = read_and_decode(reader, 2**28, (2**30) * 2)  # Read and decode a chunk
                if not chunk:
                    break
                lines = (buffer + chunk).split("\n")  # Split the chunk into lines

                for line in lines[:-1]:  # Yield each complete line
                    yield line, file_handle.tell()

                buffer = lines[-1]  # Save the last line (which may be incomplete) for the next chunk

            # A file that does not end with a newline leaves its final record
            # in the buffer; emit it instead of silently dropping it.
            if buffer:
                yield buffer, file_handle.tell()

# Process each year in the date range: one CSV file is written per year.
# NOTE: the compressed input is decompressed and scanned once for every year.
for year in range(start_year, end_year + 1):
    # created_utc holds a UTC Unix timestamp, so the year boundaries are computed
    # in UTC rather than in the machine's local time zone. The upper bound is the
    # first second of the following year and is exclusive, so comments made at any
    # time on 31 December are included.
    start_date = int(datetime(year, 1, 1, tzinfo=timezone.utc).timestamp())
    end_date = int(datetime(year + 1, 1, 1, tzinfo=timezone.utc).timestamp())
    input_file_path = os.path.join(input_file_directory, input_file_name)  # Construct the input and output file paths
    output_file_path = os.path.join(input_file_directory, f"{input_file_name}_{year}.csv")
    file_size = os.stat(input_file_path).st_size  # Size of the compressed input, used for the progress percentage
    file_lines = 0
    file_bytes_processed = 0
    line = None
    created = None
    bad_lines = 0
    # The with-statement guarantees the CSV file is flushed and closed on every path
    with open(output_file_path, "w", encoding='utf-8', newline="") as output_file:
        writer = csv.writer(output_file)  # Create a CSV writer
        try:
            for line, file_bytes_processed in read_lines_zst(input_file_path, start_date, end_date):  # Read lines from the input file
                # A malformed record is counted and skipped; it must not stop
                # the processing of the remaining lines.
                try:
                    obj = orjson.loads(line)  # Parse the line as a JSON object
                    created = int(obj['created_utc'])  # Extract the creation date
                except orjson.JSONDecodeError:
                    bad_lines += 1  # The line is not valid JSON
                except KeyError as err:
                    bad_lines += 1  # The object has no creation date
                    log.info(f"Object has no key: {err}")
                    log.info(line)
                except (TypeError, ValueError) as err:
                    bad_lines += 1  # Not a JSON object, or the creation date is not a number
                    log.info(err)
                    log.info(line)
                else:
                    # Keep the record only if it was created within the current year
                    if start_date <= created < end_date:
                        # Encoding with errors='replace' substitutes characters that
                        # cannot be written as UTF-8 (such as lone surrogates)
                        output_obj = [
                            str(obj.get(field, '')).encode("utf-8", errors='replace').decode()
                            for field in fields
                        ]
                        writer.writerow(output_obj)  # Write the fields to the output file
                file_lines += 1
                if file_lines % 100000 == 0:  # Log progress information every 100,000 lines
                    # created stays None until the first record has been parsed successfully
                    if created is None:
                        last_seen = "unknown"
                    else:
                        last_seen = datetime.fromtimestamp(created, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
                    log.info(f"{last_seen} : {file_lines:,} : {bad_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
        except Exception as err:  # Failure while reading the input or writing the output: log it together with the last line read
            log.info(err)
            log.info(line)

    log.info(f"Complete : {file_lines:,} : {bad_lines:,}")  # Log a completion message

2007-10-26 00:21:06 : 100,000 : 0 : 0%
2007-11-30 23:22:12 : 200,000 : 0 : 0%
2007-12-28 05:10:14 : 300,000 : 0 : 0%
2008-01-18 20:16:57 : 400,000 : 0 : 0%
2008-02-10 13:59:59 : 500,000 : 0 : 1%
2008-03-11 03:02:14 : 600,000 : 0 : 1%
2008-04-24 12:30:57 : 700,000 : 0 : 1%
2008-06-04 15:57:50 : 800,000 : 0 : 1%
2008-07-17 01:04:22 : 900,000 : 0 : 1%
2008-08-29 12:19:44 : 1,000,000 : 0 : 1%
2008-09-24 00:08:52 : 1,100,000 : 0 : 1%
2008-10-16 01:22:56 : 1,200,000 : 0 : 1%
2008-11-05 03:43:08 : 1,300,000 : 0 : 1%
2008-12-28 03:03:27 : 1,400,000 : 0 : 1%
2009-01-20 11:54:39 : 1,500,000 : 0 : 1%
2009-02-25 05:42:04 : 1,600,000 : 0 : 1%
2009-04-03 14:55:18 : 1,700,000 : 0 : 1%
2009-05-10 22:28:08 : 1,800,000 : 0 : 1%
2009-06-14 02:42:21 : 1,900,000 : 0 : 1%
2009-07-17 03:49:19 : 2,000,000 : 0 : 1%
2009-08-13 06:16:33 : 2,100,000 : 0 : 1%
2009-09-05 03:41:44 : 2,200,000 : 0 : 2%
2009-09-26 02:35:52 : 2,300,000 : 0 : 2%
2009-10-21 23:44:20 : 2,400,000 : 0 : 2%
2009-11-16 01:31:05 : 2,500,000 : 