In [1]:
# Subreddits to process. Each name is interpolated into the
# data/<subreddit>/<subreddit>_*.zst paths built by the loops in this notebook.
subreddits = ["Conservative", "progressive",
                "democrats", "Republican",
                "NeutralPolitics", "PoliticalDiscussion", "politics"]

In [2]:
import csv
import json, os, re, zstandard
from collections import defaultdict
from datetime import datetime
from zst_reader import read_lines_zst, write_line_zst

### Get users

In [None]:
def find_unique_keys(input_path: str):
    '''Collect every top-level key that appears in the JSON lines of a file.

    Args:
        input_path: Path passed to ``read_lines_zst``.

    Returns:
        A ``set`` holding the union of the keys of all decoded lines.
    '''
    seen_keys = set()

    # read_lines_zst yields two-item tuples; only the first item (the line
    # text) is needed here.
    for raw_line, _bytes_processed in read_lines_zst(input_path):
        record = json.loads(raw_line)
        seen_keys.update(record.keys())

    return seen_keys

    
def write_to_csv(input_path: str,
                 output_path: str,
                 columns: list=None,
                 start_date: datetime=None,
                 end_date: datetime=None):
    '''Export the JSON lines of a zst file to a CSV file.

    Args:
        input_path: Path passed to ``read_lines_zst``.
        output_path: CSV file to create (overwritten if it exists).
        columns: Keys to export, in column order. When ``None`` the keys are
            discovered with ``find_unique_keys`` and sorted so the column
            order is the same on every run.
        start_date: If given, rows created before this datetime are skipped.
        end_date: If given, rows created after this datetime are skipped.

    Rows without a usable ``created_utc`` value are skipped. Keys missing from
    a row are written as empty strings.

    NOTE(review): ``datetime.fromtimestamp`` without a tz argument returns a
    naive datetime in the machine's local timezone, so ``start_date`` and
    ``end_date`` are compared in local time - confirm that is intended.
    '''
    # Discover the columns when the caller did not supply any. A set has no
    # stable order across interpreter runs, hence the sort.
    if columns is None:
        print("Finding unique keys...")
        columns = sorted(find_unique_keys(input_path))
    else:
        # Materialise once so the header and every row see the same sequence.
        columns = list(columns)

    # Header and rows are written through a single handle; utf-8 is explicit
    # because the platform default encoding may not cover Reddit text.
    print("Writing to csv file...")
    with open(output_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(columns)

        for line, _bytes_processed in read_lines_zst(input_path):
            obj = json.loads(line)

            # Skip rows whose created_utc is missing or cannot be converted
            try:
                created_utc = datetime.fromtimestamp(int(obj["created_utc"]))
            except (KeyError, TypeError, ValueError, OverflowError, OSError):
                continue

            # Filter by date
            if start_date is not None and created_utc < start_date:
                continue
            if end_date is not None and created_utc > end_date:
                continue

            # Write the row, padding absent keys with an empty string
            writer.writerow([obj.get(key, "") for key in columns])

    print(f"Data written to {output_path}")


In [None]:
# Export each subreddit's comment dump to a CSV file in the same directory.
# NOTE(review): col_comments, start_date and end_date are not defined in any
# cell visible in this notebook - confirm they are assigned before this cell
# runs, otherwise it raises NameError on a fresh kernel.
for subreddit in subreddits:
    print(f"Writing {subreddit} comments...")
    write_to_csv(f"data/{subreddit}/{subreddit}_comments.zst",
                 f"data/{subreddit}/{subreddit}_comments_users.csv",
                 col_comments,
                 start_date,
                 end_date)

### Clean submissions

In [3]:
def clean_text(body: str):
    '''Clean the body text

    Strips markdown links (keeping the link text), URLs, HTML tags,
    ``:emoji:`` codes, subreddit mentions and every character that is not an
    ASCII letter, a digit, ``!``, ``?`` or a space, then collapses whitespace.

    Args:
        body: Raw text of a submission or comment.

    Returns:
        The cleaned text; an empty string when nothing is left.
    '''

    # Turn every whitespace run (newlines, tabs, carriage returns, ...) into a
    # single space first. Otherwise the special-character step below deletes
    # tabs outright and fuses the neighbouring words together.
    body = re.sub(r"\s+", " ", body)

    # Unwrap markdown links, keeping the link text. This has to run before URL
    # removal: stripping the URL first also eats the closing ")" and the link
    # pattern can then swallow unrelated text up to the next ")".
    body = re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", body)

    # Remove URLs
    body = re.sub(r"http\S+", "", body)

    # Remove HTML
    body = re.sub(r"<(.*?)>", "", body)

    # Remove emojis
    body = re.sub(r":[^:\s]*(?:::[^:\s]*)*:", "", body)

    # Remove subreddit mentions. The lookbehind keeps the pattern from firing
    # inside ordinary words such as "for/against" or "or/and".
    body = re.sub(r"(?<!\w)\/?r\/\w+", "", body)

    # Remove special characters (except for spaces, exclamation marks and
    # question marks)
    body = re.sub(r"[^a-zA-Z0-9!? ]", "", body)

    # Remove extra spaces left behind by the removals above
    body = re.sub(r"\s+", " ", body)

    # Remove leading and trailing spaces
    return body.strip()

In [18]:
def clean_submissions(input_path: str,
                      output_path: str,
                      cols: list,
                      min_abs_score: int = 1,
                      min_comments: int = 1):
    '''Filter and clean the submissions of a zst file into a new zst file.

    Args:
        input_path: Path passed to ``read_lines_zst``.
        output_path: zst file to create (overwritten if it exists).
        cols: Keys whose text is replaced by ``clean_text`` of that text.
        min_abs_score: Submissions whose absolute ``score`` is below this are
            dropped (the default of 1 drops a score of 0).
        min_comments: Submissions whose ``num_comments`` is below this are
            dropped.

    A submission is dropped when ``created_utc``, ``selftext`` or ``title`` is
    missing or empty, when ``selftext`` is ``[deleted]`` or ``[removed]``, or
    when any column in ``cols`` is not a string or is empty after cleaning.
    Submissions whose ``score`` / ``num_comments`` is missing or not numeric
    are kept.
    '''
    # Both the raw file and the compressor are context-managed so the zstd
    # frame is finalised and the file is closed even if a line fails to parse.
    with open(output_path, "wb") as raw_file, \
            zstandard.ZstdCompressor().stream_writer(raw_file) as handle:

        for line, _bytes_processed in read_lines_zst(input_path):

            # Convert the line to a json object
            obj = json.loads(line)

            # Filter out submissions without a creation time, body or title
            if obj.get("created_utc") in ("", None): continue
            selftext = obj.get("selftext")
            if not selftext or selftext in ("[deleted]", "[removed]"): continue
            if not obj.get("title"): continue

            # Drop low-engagement posts; an unreadable score keeps the post
            try:
                if abs(int(obj["score"])) < min_abs_score:
                    continue
            except (KeyError, TypeError, ValueError):
                pass

            # Same rule for the number of comments
            try:
                if int(obj["num_comments"]) < min_comments:
                    continue
            except (KeyError, TypeError, ValueError):
                pass

            # Clean the text in each column
            cleaned = {}
            for col in cols:
                value = obj.get(col)
                cleaned[col] = clean_text(value) if isinstance(value, str) else ""

            # A column with nothing left after cleaning drops the whole
            # submission instead of letting the uncleaned text through
            if not all(cleaned.values()):
                continue
            obj.update(cleaned)

            # Write the data to the zst file
            write_line_zst(handle, json.dumps(obj))

    print(f"Data written to {output_path}")

In [19]:
# Clean the title and selftext of the submissions subset of each listed subreddit.
# NOTE(review): the loop is currently restricted to 'progressive'; the trailing
# comment suggests the full `subreddits` list was the intended iterable - confirm.
for subreddit in ['progressive']: #subreddits:

    file = f"data/{subreddit}/{subreddit}_submissions_subset.zst"
    output = f"data/{subreddit}/{subreddit}_submissions_clean.zst"

    clean_submissions(file, output, ["title", "selftext"])
    

Data written to data/progressive/progressive_submissions_clean.zst


### Clean comments

In [5]:
def clean_comments(input_path: str,
                   output_path: str):
    '''Filter and clean the comments of a zst file into a new zst file.

    Args:
        input_path: Path passed to ``read_lines_zst``.
        output_path: zst file to create (overwritten if it exists).

    A comment is dropped when its ``body`` is missing, is not a string, is
    ``[deleted]`` or ``[removed]``, or is empty after ``clean_text``. For every
    other comment ``body`` is replaced by the cleaned text and the object is
    written out as one JSON string via ``write_line_zst``.
    '''
    # Both the raw file and the compressor are context-managed so the zstd
    # frame is finalised and the file is closed even if a line fails to parse.
    with open(output_path, "wb") as raw_file, \
            zstandard.ZstdCompressor().stream_writer(raw_file) as handle:

        for line, _bytes_processed in read_lines_zst(input_path):

            # Convert the line to a json object
            obj = json.loads(line)
            text = obj.get("body")

            # Filter out comments without a usable body
            if not isinstance(text, str) or text in ("[deleted]", "[removed]"):
                continue

            # Clean the body and drop the comment if nothing is left
            text = clean_text(text)
            if text == "": continue
            obj["body"] = text

            # Write the data to the zst file
            write_line_zst(handle, json.dumps(obj))

    print(f"Data written to {output_path}")

In [6]:
# Clean the comments subset of every subreddit, writing the result next to the
# input file as <subreddit>_comments_clean.zst.
for subreddit in subreddits:

    input_comments = f"data/{subreddit}/{subreddit}_comments_subset.zst"
    output_comments = f"data/{subreddit}/{subreddit}_comments_clean.zst"

    clean_comments(input_comments, output_comments)

Data written to data/Conservative/Conservative_comments_clean.zst
Data written to data/progressive/progressive_comments_clean.zst
Data written to data/democrats/democrats_comments_clean.zst
Data written to data/Republican/Republican_comments_clean.zst
Data written to data/NeutralPolitics/NeutralPolitics_comments_clean.zst
Data written to data/PoliticalDiscussion/PoliticalDiscussion_comments_clean.zst
Data written to data/politics/politics_comments_clean.zst
