## 01: Preprocess Reddit data

This script reads data extracted from the [Reddit comments corpus](https://files.pushshift.io/reddit/comments/), cleans it, and converts it to JSONL in which each record has a `"text"` property and `"meta"` data (by default, the name of the subreddit and the comment's UTC timestamp). The resulting JSONL can be used for annotation with [Prodigy](https://prodi.gy).

In [None]:
# Source of the raw data: one .gz archive, or a directory of archives.
INPUT_DATA = "./raw_data"
# Destination path of the JSONL file produced by this notebook.
OUTPUT_FILE = "./reddit.jsonl"

In [None]:
!pip install srsly

In [None]:
import re
from pathlib import Path
import gzip
import srsly

In [None]:
class Reddit(object):
    """Stream cleaned comments from Reddit.

    Iterating over an instance yields one dict per valid comment, with the
    cleaned comment body under "text" and the selected meta data under "meta".
    """

    # A single Markdown formatting character (`, * or ~) at the very start /
    # very end of the text.
    pre_format_re = re.compile(r"^[\`\*\~]")
    post_format_re = re.compile(r"[\`\*\~]$")
    # Markdown link whose target is the literal placeholder "%%URL". Not used
    # by the methods of this class; kept as a public class attribute.
    url_re = re.compile(r"\[([^]]+)\]\(%%URL\)")
    # Markdown link with an http(s) target; group 1 captures the link text.
    link_re = re.compile(r"\[([^]]+)\]\(https?://[^\)]+\)")

    def __init__(
        self, file_path, meta_keys={"subreddit": "section", "created_utc": "utc"}
    ):
        """
        file_path (unicode / Path): Path to archive or directory of archives.
        meta_keys (dict): Meta data key included in the Reddit corpus, mapped
            to display name in Prodigy meta.
        RETURNS (Reddit): The Reddit loader.
        RAISES (IOError): If file_path does not exist.
        """
        # Store a copy: the default dict is shared between all calls, so
        # keeping a reference would let a change to one loader's meta leak
        # into every loader created afterwards.
        self.meta = dict(meta_keys)
        self.file_path = Path(file_path)
        if not self.file_path.exists():
            raise IOError(f"Can't find file path: {self.file_path}")

    def __iter__(self):
        """
        YIELDS (dict): One {"text": ..., "meta": ...} dict per valid comment,
            read line by line from every archive returned by iter_files.
        """
        for file_path in self.iter_files():
            with gzip.open(str(file_path), "rb") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Skip blank lines instead of failing to parse them.
                        continue
                    comment = srsly.json_loads(line)
                    if self.is_valid(comment):
                        text = self.strip_tags(comment["body"])
                        yield {"text": text, "meta": self.get_meta(comment)}

    def get_meta(self, item):
        """
        item (dict): A parsed comment.
        RETURNS (dict): The configured meta values keyed by display name,
            with "n/a" for keys missing from the comment.
        """
        return {name: item.get(key, "n/a") for key, name in self.meta.items()}

    def iter_files(self):
        """
        YIELDS (Path): The path itself if it is not a directory, otherwise
            every "*.gz" file found recursively below it.
        """
        if not self.file_path.is_dir():
            # This method is a generator, so the path has to be yielded:
            # a `return [path]` here would end the iteration without
            # producing anything and a single archive would be ignored.
            yield self.file_path
            return
        yield from self.file_path.glob("**/*.gz")

    def strip_tags(self, text):
        """
        text (unicode): The raw comment body.
        RETURNS (unicode): The text with http(s) Markdown links replaced by
            their link text, "&gt;" / "&lt;" unescaped, one leading and one
            trailing formatting character removed, and whitespace runs
            collapsed to single spaces.
        """
        text = self.link_re.sub(r"\1", text)
        text = text.replace("&gt;", ">").replace("&lt;", "<")
        text = self.pre_format_re.sub("", text)
        text = self.post_format_re.sub("", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    def is_valid(self, comment):
        """
        comment (dict): A parsed comment.
        RETURNS (bool): False if the body is missing, None, "[deleted]" or
            "[removed]", True otherwise.
        """
        # .get() so that a record without a "body" key is skipped rather
        # than aborting the whole stream with a KeyError.
        body = comment.get("body")
        return body is not None and body not in ("[deleted]", "[removed]")

In [None]:
# Set up the comment loader for the configured input; nothing is read until
# the stream is iterated.
stream = Reddit(file_path=INPUT_DATA)
# Consume the stream and write it to the output path as JSONL.
srsly.write_jsonl(OUTPUT_FILE, stream)