In [12]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whichever `pip` is first on the shell PATH.
%pip install zstandard pandas tqdm

Collecting tqdm
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tqdm
Successfully installed tqdm-4.65.0
[0m

In [20]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Register tqdm's pandas integration; `progress_apply` is called on a DataFrame
# further down in this notebook.
tqdm.pandas()

In [7]:
# Adapted from https://github.com/Watchful1/PushshiftDumps/blob/master/scripts/single_file.py

import zstandard
import os
import json
import logging.handlers


log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
# The named logger outlives a single execution of this cell, so re-running the
# cell would stack another StreamHandler and print every message once per run.
# Attach the handler only when none is present yet.
if not log.handlers:
	log.addHandler(logging.StreamHandler())

def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	"""Read from ``reader`` until the accumulated bytes decode cleanly.

	Each attempt reads ``chunk_size`` bytes, appends them to whatever has been
	accumulated so far (starting from ``previous_chunk`` when given) and tries
	to decode the result with the default codec. If decoding fails, another
	chunk is read and the attempt is repeated.

	Args:
		reader: Object exposing ``read(size)`` that returns bytes.
		chunk_size: Number of bytes requested per read.
		max_window_size: Once the running byte count exceeds this value a
			failed decode is treated as fatal.
		previous_chunk: Bytes carried over from an earlier failed attempt.
		bytes_read: Running byte count to start from.

	Returns:
		The decoded text (an empty string when nothing was read).

	Raises:
		UnicodeError: If the data still cannot be decoded after the running
			byte count has passed ``max_window_size``.
	"""
	pending = previous_chunk
	total = bytes_read
	while True:
		fresh = reader.read(chunk_size)
		# The counter advances by the requested size, not by len(fresh).
		total += chunk_size
		pending = fresh if pending is None else pending + fresh
		try:
			return pending.decode()
		except UnicodeDecodeError:
			if total > max_window_size:
				raise UnicodeError(f"Unable to decode frame after reading {total:,} bytes")
			log.info(f"Decoding error with {total:,} bytes, reading another chunk")


def read_lines_zst(file_name):
	"""Yield the text lines of a zstandard-compressed file one at a time.

	The file is decompressed as a stream in chunks of 2**27 bytes; text is
	split on "\\n" and any partial line at the end of a chunk is carried over
	and prepended to the next chunk.

	Args:
		file_name: Path of the ``.zst`` file to read.

	Yields:
		Tuples ``(line, position)`` where ``line`` is one line of text without
		its newline and ``position`` is ``file_handle.tell()`` on the compressed
		file at the time the line is yielded (used by the caller for progress).

	Raises:
		UnicodeError: Propagated from ``read_and_decode`` when a chunk cannot
			be decoded.
	"""
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		# try/finally so the reader is closed even when the consumer stops
		# iterating early or an exception escapes the loop.
		try:
			while True:
				chunk = read_and_decode(reader, 2**27, (2**29) * 2)

				if not chunk:
					break
				lines = (buffer + chunk).split("\n")

				for line in lines[:-1]:
					yield line, file_handle.tell()

				# The last element is either '' or an incomplete line.
				buffer = lines[-1]

			# A file that does not end with a newline leaves its last line in
			# the buffer; emit it instead of silently dropping it.
			if buffer:
				yield buffer, file_handle.tell()
		finally:
			reader.close()


# Compressed monthly submissions dump (input) and the trimmed JSON Lines file (output).
in_file = '/workspace/data/reddit/submissions/RS_2023-01.zst'
out_file = '/workspace/data/reddit/submissions/RS_2023-01.jsonl'

# Keys copied from each parsed record into the output; a record that lacks any
# of them raises KeyError below and is counted as failed.
fields = [
  'id',
  'author',
  'subreddit',
  'title',
  'selftext',
  'created_utc',
  'score',
  'upvote_ratio',
  'removed_by_category',
  'num_comments',
]

file_size = os.stat(in_file).st_size
file_lines = 0
file_bytes_processed = 0
created = None  # not used in this cell; left in place in case another cell reads it
bad_lines = 0

with open(out_file, 'w') as out:
  for line, file_bytes_processed in read_lines_zst(in_file):
    file_lines += 1
    if file_lines % 100000 == 0:
      # Progress is the position in the compressed file relative to its size.
      log.info(f"Processed {file_lines} lines ({(file_bytes_processed / file_size) * 100:.0f}%) ({bad_lines} failed)")

    try:
      parsed = json.loads(line)

      # Skip submissions whose body is missing/empty or shorter than 10 characters.
      if len(parsed['selftext'] or '') < 10:
        continue

      # Only keep the fields we want
      obj = {k: parsed[k] for k in fields}
    except (KeyError, json.JSONDecodeError) as err:
      # Count the failure so the progress and summary messages report a real
      # number instead of a constant 0.
      bad_lines += 1
      log.warning(f"Skipping line {file_lines:,}: {err!r}")
      continue

    out.write(json.dumps(obj) + '\n')

log.info(f"Complete : {file_lines:,} : {bad_lines:,}")

Processed 100000 lines (2%) (0 failed)
Processed 100000 lines (2%) (0 failed)
Processed 100000 lines (2%) (0 failed)
Processed 100000 lines (2%) (0 failed)
Processed 100000 lines (2%) (0 failed)
Processed 100000 lines (2%) (0 failed)
Processed 200000 lines (3%) (0 failed)
Processed 200000 lines (3%) (0 failed)
Processed 200000 lines (3%) (0 failed)
Processed 200000 lines (3%) (0 failed)
Processed 200000 lines (3%) (0 failed)
Processed 200000 lines (3%) (0 failed)
Processed 300000 lines (5%) (0 failed)
Processed 300000 lines (5%) (0 failed)
Processed 300000 lines (5%) (0 failed)
Processed 300000 lines (5%) (0 failed)
Processed 300000 lines (5%) (0 failed)
Processed 300000 lines (5%) (0 failed)
Processed 400000 lines (6%) (0 failed)
Processed 400000 lines (6%) (0 failed)
Processed 400000 lines (6%) (0 failed)
Processed 400000 lines (6%) (0 failed)
Processed 400000 lines (6%) (0 failed)
Processed 400000 lines (6%) (0 failed)
Processed 500000 lines (8%) (0 failed)
Processed 500000 lines (8

In [8]:
import pandas as pd

# Load the JSON Lines file at `out_file` (one JSON object per line) into a
# DataFrame and preview the first rows.
df = pd.read_json(out_file, lines=True)
df.head()

In [9]:
# Store subreddit names as a categorical column and turn the epoch-second
# `created_utc` values into pandas datetimes.
df = df.assign(
  subreddit=df['subreddit'].astype('category'),
  created_utc=pd.to_datetime(df['created_utc'], unit='s'),
)

In [10]:
# Persist the typed frame in Feather format; a later cell reloads it from this path.
df.to_feather('/workspace/data/reddit/submissions/RS_2023-01.arrow')

In [14]:
# Reload the frame from the Feather file, replacing whatever `df` currently holds.
df = pd.read_feather('/workspace/data/reddit/submissions/RS_2023-01.arrow')

In [18]:
# Limit to only subreddits with more than 1000 submissions.
# The size of each row's subreddit group is computed in one vectorised pass
# instead of calling a Python lambda once per group; observed=True restricts
# grouping to categories that actually occur when the column is categorical.
subreddit_sizes = df.groupby('subreddit', observed=True)['subreddit'].transform('size')
df = df[subreddit_sizes > 1000]

df.shape

In [21]:
# Base-10 log of (score + 1); the +1 maps a score of 0 to 0 instead of -inf.
# NOTE(review): a score of -1 would still produce -inf — confirm scores are never negative.
df['log_score'] = np.log10(df['score'] + 1)

In [22]:
def format_text(row):
  """Render one submission as a multi-line text block.

  Args:
    row: Object exposing ``title``, ``subreddit``, ``author``, ``selftext``
      and a ``created_utc`` attribute that supports ``strftime``.

  Returns:
    A string with Title/Subreddit/Author/Posted header lines, a blank line,
    and then the submission body prefixed with "Text: ".
  """
  posted = row.created_utc.strftime('%A, %B %d, %I:%M %p')
  header_lines = [
    f"Title: {row.title}",
    f"Subreddit: /r/{row.subreddit}",
    f"Author: /u/{row.author}",
    f"Posted: {posted}",
  ]
  return "\n".join(header_lines) + f"\n\nText: {row.selftext}"

# Row-wise apply returns one formatted string per row. Assign the whole Series:
# indexing it with [0] would pick the single element labelled 0 and broadcast
# that one string to every row (or raise KeyError if label 0 is absent).
df['formatted_text'] = df.progress_apply(format_text, axis=1)

100%|██████████| 2380634/2380634 [03:23<00:00, 11700.96it/s]


In [23]:
# Chronological split: rows are ordered by `created_utc` and the timestamp found
# 80% of the way through becomes the boundary. Rows strictly before it form the
# training set; rows at or after it (including ties on the boundary) form the test set.

df = df.sort_values(by='created_utc')

split_date = df['created_utc'].iloc[int(len(df) * 0.8)]

train_df = df.loc[df['created_utc'] < split_date]
test_df = df.loc[df['created_utc'] >= split_date]

train_df.shape, test_df.shape

((1904507, 11), (476127, 11))

In [27]:
# Write both splits to Feather. The index is reset to a fresh 0..n-1 range first
# because the frames still carry the row labels of the frame they were sliced from.
train_df.reset_index(drop=True).to_feather('/workspace/data/reddit/submissions/RS_2023-01-train.arrow')
test_df.reset_index(drop=True).to_feather('/workspace/data/reddit/submissions/RS_2023-01-test.arrow')