In [5]:
%pip install zstandard pandas tqdm swifter rich

[0mNote: you may need to restart the kernel to use updated packages.


In [17]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import swifter

# Register tqdm's pandas integration (adds `.progress_apply` and friends);
# none of the visible cells call those methods directly.
tqdm.pandas()

In [7]:
# Adapted from https://github.com/Watchful1/PushshiftDumps/blob/master/scripts/single_file.py

import zstandard
import os
import json
import logging.handlers


# Shared logger for the extraction cell; DEBUG level so info/progress messages show.
log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
# getLogger("bot") returns the same object on every call, so re-running this
# cell would otherwise stack another StreamHandler and print each message
# once per run. Attach the handler only the first time.
if not log.handlers:
  log.addHandler(logging.StreamHandler())

def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	"""Read from `reader` until the accumulated bytes decode as text, then return that text.

	A chunk boundary can fall in the middle of a multi-byte character. When
	decoding fails, another chunk is read and appended, and decoding is retried
	on the combined bytes.

	Args:
		reader: object with a `read(size)` method returning bytes.
		chunk_size: number of bytes requested per read.
		max_window_size: once `bytes_read` exceeds this after a failed decode, give up.
		previous_chunk: undecodable bytes carried over from an earlier attempt, if any.
		bytes_read: running total of bytes *requested* so far (not bytes actually returned).

	Returns:
		The decoded string (empty when the reader is exhausted and nothing is pending).

	Raises:
		UnicodeError: if decoding still fails after more than `max_window_size` bytes.
	"""
	pending = previous_chunk
	total_requested = bytes_read
	while True:
		data = reader.read(chunk_size)
		total_requested += chunk_size
		if pending is not None:
			data = pending + data
		try:
			return data.decode()
		except UnicodeDecodeError:
			if total_requested > max_window_size:
				raise UnicodeError(f"Unable to decode frame after reading {total_requested:,} bytes")
			log.info(f"Decoding error with {total_requested:,} bytes, reading another chunk")
			# Keep everything read so far and retry with one more chunk appended.
			pending = data


def read_lines_zst(file_name):
	"""Stream a zstd-compressed, newline-delimited text file one line at a time.

	Args:
		file_name: path of the .zst file to read.

	Yields:
		(line, offset) tuples, where `line` is the text without its trailing
		newline and `offset` is `file_handle.tell()` on the compressed file at
		the time of the yield (used by the caller as a progress indicator).

	Raises:
		UnicodeError: propagated from `read_and_decode` when a chunk cannot be decoded.
	"""
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		decompressor = zstandard.ZstdDecompressor(max_window_size=2**31)
		# Context manager closes the reader even when the consumer stops
		# iterating early or decoding raises; the old trailing reader.close()
		# was skipped in both cases.
		with decompressor.stream_reader(file_handle) as reader:
			while True:
				# Request 2**27 bytes per read; give up decoding past 2**30 bytes.
				chunk = read_and_decode(reader, 2**27, (2**29) * 2)

				if not chunk:
					break
				lines = (buffer + chunk).split("\n")

				for line in lines[:-1]:
					yield line, file_handle.tell()

				# The last piece may be a partial line; hold it for the next chunk.
				buffer = lines[-1]

			# A final record without a trailing newline used to be dropped.
			# Emitted here, before the reader (and possibly the file) is closed,
			# so file_handle.tell() is still valid.
			if buffer:
				yield buffer, file_handle.tell()


# Compressed monthly submissions dump to read, and the filtered JSONL to write.
in_file = '/workspace/data/reddit/submissions/RS_2023-01.zst'
out_file = '/workspace/data/reddit/submissions/RS_2023-01.jsonl'

# Keys copied from each submission into the output. A record missing any of
# them raises KeyError in the extraction loop and is not written.
fields = [
  # identity
  'id', 'author', 'subreddit',
  # content
  'title', 'selftext',
  # timing and engagement
  'created_utc', 'score', 'upvote_ratio', 'removed_by_category', 'num_comments',
]

# Stream the dump, keep submissions that have a real text body, and write the
# selected fields as one JSON object per line.
file_size = os.stat(in_file).st_size  # compressed size, denominator for the progress %
file_lines = 0
file_bytes_processed = 0
created = None  # not used by the visible cells; kept in case another cell reads it
bad_lines = 0

with open(out_file, 'w') as out:
  for line, file_bytes_processed in read_lines_zst(in_file):
    file_lines += 1
    if file_lines % 100000 == 0:
      log.info(f"Processed {file_lines} lines ({(file_bytes_processed / file_size) * 100:.0f}%) ({bad_lines} failed)")

    try:
      parsed = json.loads(line)

      # Skip posts whose body is missing/None/empty or shorter than 10 characters
      if len(parsed['selftext'] or '') < 10:
        continue

      # Only keep the fields we want
      obj = {k: parsed[k] for k in fields}

      out.write(json.dumps(obj) + '\n')
    except (KeyError, json.JSONDecodeError) as err:
      # Count the failure: bad_lines was never incremented before, so the
      # "(N failed)" progress figure and the final summary always showed 0.
      bad_lines += 1
      log.warning(f"Skipping line {file_lines:,}: {err!r}")

log.info(f"Complete : {file_lines:,} : {bad_lines:,}")

Processed 100000 lines (0%) (0 failed)
Processed 200000 lines (1%) (0 failed)
Processed 300000 lines (1%) (0 failed)
Processed 400000 lines (1%) (0 failed)
Processed 500000 lines (1%) (0 failed)


KeyboardInterrupt: 

In [None]:
# Load the filtered JSONL written by the extraction cell; lines=True parses
# one JSON object per line.
df = pd.read_json(out_file, lines=True)
df.head()

In [None]:
# Interpret created_utc as Unix epoch seconds (unit='s') and convert it to
# pandas datetimes; format_text later calls .strftime on these values.
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [None]:
# Cache the parsed frame as Feather so it can be reloaded with read_feather
# instead of re-parsing the JSONL.
df.to_feather('/workspace/data/reddit/submissions/RS_2023-01.arrow')

In [12]:
# Reload the cached frame written by the to_feather call above.
df = pd.read_feather('/workspace/data/reddit/submissions/RS_2023-01.arrow')

In [13]:
# Limit to only subreddits with more than 5000 submissions. I think it would
# work with smaller ones but I need a convenient way to decrease my dataset
# size anyway.
#
# Each row is mapped to its subreddit's row count and filtered with a boolean
# mask. This keeps the same rows, in the same order, as
# groupby('subreddit').filter(lambda x: len(x) > 5000), without calling a
# Python function once per subreddit.
subreddit_sizes = df['subreddit'].map(df['subreddit'].value_counts())
df = df[subreddit_sizes > 5000]

df.shape

(921426, 9)

In [15]:
# Log-scaled score, later renamed to `labels`; the +1 keeps a score of 0
# finite (log10(1) == 0).
# NOTE(review): a score of -1 would give -inf and anything lower NaN;
# presumably scores in the dump are non-negative, confirm.
df['log_score'] = np.log10(df['score'] + 1)

In [18]:
def format_text(row):
  """Render one submission as the plain-text block that is later tokenized.

  Args:
    row: object exposing `title`, `subreddit`, `author`, `selftext` and a
      `created_utc` value that supports `.strftime`.

  Returns:
    A string with four header lines (title, subreddit, author, posting time),
    a blank line, then the post body prefixed with "Text: ".
  """
  parts = (
    f"Title: {row.title}",
    f"Subreddit: /r/{row.subreddit}",
    f"Author: /u/{row.author}",
    # Weekday, month, day and 12-hour time only; the year is not included.
    f"Posted: {row.created_utc.strftime('%A, %B %d, %I:%M %p')}",
    "",
    f"Text: {row.selftext}",
  )
  return "\n".join(parts)

# Build the model input text row by row (axis=1) through the swifter accessor
# registered by `import swifter`.
df['formatted_text'] = df.swifter.apply(format_text, axis=1)

Pandas Apply:   0%|          | 0/921426 [00:00<?, ?it/s]

In [19]:
# The subreddit filter kept the original row labels, so reset to a default
# 0..n-1 index before writing (pandas' Feather writer requires a default index).
df.reset_index(drop=True).to_feather('/workspace/data/reddit/submissions/RS_2023-01-formatted.arrow')

In [4]:
from transformers import AutoTokenizer
from math import ceil
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Reload the formatted frame cached by the earlier to_feather call.
df = pd.read_feather('/workspace/data/reddit/submissions/RS_2023-01-formatted.arrow')

# Tokenizer for microsoft/deberta-v3-base; process_chunk below reads it as a global.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

def process_chunk(chunk):
    """Tokenize one batch of rows and return it with the tokenizer outputs as new columns.

    Args:
        chunk: DataFrame slice containing a 'formatted_text' column.

    Returns:
        A new DataFrame with the same rows plus one column per key returned by
        the module-level `tokenizer`. Sequences are truncated to 512 tokens and
        left unpadded, so each cell holds a list whose length varies per row.
    """
    tokenizer_columns = tokenizer(chunk['formatted_text'].tolist(), padding=False, truncation=True, max_length=512)
    # assign() builds a new frame instead of writing columns onto a slice of
    # the parent DataFrame, which mutated the caller's object and is the
    # pattern that triggers SettingWithCopyWarning.
    return chunk.assign(**{key: tokenizer_columns[key] for key in tokenizer_columns})

# Split the dataset into batches of up to 1000 rows and apply the tokenizer
# columns to each batch. Plain iloc slices are used because
# np.array_split(df, ...) goes through DataFrame.swapaxes, which pandas 2.1+
# deprecates. Tokenization is per-row and unpadded, so the slightly different
# batch boundaries do not change the result.
batch_size = 1000
chunks = [df.iloc[start:start + batch_size] for start in range(0, len(df), batch_size)]

df = pd.concat([process_chunk(chunk) for chunk in tqdm(chunks)])

df.columns

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/922 [00:00<?, ?it/s]

Index(['id', 'author', 'subreddit', 'title', 'selftext', 'created_utc',
       'score', 'upvote_ratio', 'removed_by_category', 'log_score',
       'formatted_text', 'input_ids', 'token_type_ids', 'attention_mask'],
      dtype='object')

In [5]:
# Cache the tokenized frame; the dataset-building cell reloads this file.
df.to_feather('/workspace/data/reddit/submissions/RS_2023-01-tokenized.arrow')

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Reload the tokenized frame written by the tokenization step.
df = pd.read_feather('/workspace/data/reddit/submissions/RS_2023-01-tokenized.arrow')

# Row/column count before the frame is trimmed to the training columns.
print(df.shape)

# Keep only the columns used downstream; the log-scaled score becomes `labels`.
model_columns = ['id', 'created_utc', 'input_ids', 'attention_mask', 'log_score']
df = df[model_columns].rename(columns={'log_score': 'labels'})

# Chronological split: order by posting time and take the timestamp found at
# the 95% position as the boundary between train and test.
df = df.sort_values('created_utc')
split_position = int(len(df) * 0.95)
split_date = df.iloc[split_position]['created_utc']

is_before_split = df['created_utc'] < split_date
is_on_or_after_split = df['created_utc'] >= split_date
train_df = df[is_before_split]
test_df = df[is_on_or_after_split]

# Carve a fixed-seed random sample of 5000 held-out rows off as the
# validation set and remove those rows from the test set.
eval_df = test_df.sample(n=5000, random_state=42)
test_df = test_df.drop(index=eval_df.index)

# Wrap each split in a Dataset, leaving the pandas index out of the features.
dataset = DatasetDict({
  split_name: Dataset.from_pandas(split_df, preserve_index=False)
  for split_name, split_df in (
    ('train', train_df),
    ('eval', eval_df),
    ('test', test_df),
  )
})

dataset


(921426, 14)


DatasetDict({
    train: Dataset({
        features: ['id', 'created_utc', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 875354
    })
    eval: Dataset({
        features: ['id', 'created_utc', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'created_utc', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 41072
    })
})

In [2]:
# Persist the three splits (train/eval/test) to a dataset directory on disk.
dataset.save_to_disk('/workspace/data/reddit/submissions/RS_2023-01-dataset')

Saving the dataset (0/15 shards):   0%|          | 0/875354 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/41072 [00:00<?, ? examples/s]