In [30]:
import pandas as pd
import swifter

# Load the raw stories dump and display its (rows, columns) shape.
stories_dump_path = '/workspace/data/hn/stories_dump.feather'
df = pd.read_feather(stories_dump_path)
df.shape

(4920000, 12)

In [31]:
# Expose the `descendants` column under the name `comments`.
df = df.rename(columns={'descendants': 'comments'})

# `time` is parsed as seconds since the epoch; a missing `dead` flag becomes False.
df = df.assign(
    time=pd.to_datetime(df['time'], unit='s'),
    dead=df['dead'].fillna(0).astype(bool),
)

# Keep stories from 2018 onward in case community tastes have changed
is_recent = df['time'].dt.year >= 2018
df = df.loc[is_recent]
df.shape

(2126850, 12)

In [32]:

# Only keep stories without text for now: the text body must be missing
# and a URL must be present.
has_no_text = df['text'].isna()
has_url = df['url'].notna()
df = df.loc[has_no_text & has_url]
df.shape

(1810290, 12)

In [33]:
# Deduplicate stories based on the URL. Keep the one with the highest score.

print(df.shape)

# Sort into a new frame rather than with `inplace=True`: `df` was produced by
# boolean filtering in the previous cells, and in-place operations on such a
# frame can trigger pandas' SettingWithCopyWarning. After the descending sort,
# the first occurrence of each URL is its highest-scoring story.
df = (
    df.sort_values(by='score', ascending=False)
      .drop_duplicates(subset='url', keep='first')
)

print(df.shape)

(1810290, 12)
(1518276, 12)


In [34]:
# Flag a story as `frontpage` when it has at least 20 points or at least
# 10 comments. Missing values compare as False.
min_score, min_comments = 20, 10
df['frontpage'] = df['score'].ge(min_score) | df['comments'].ge(min_comments)

In [35]:
# Display the current column index as a sanity check on the frame's schema.
df.columns

Index(['by', 'comments', 'id', 'score', 'time', 'title', 'type', 'url', 'dead',
       'text', 'kids', 'deleted', 'frontpage'],
      dtype='object')

In [37]:
def format_text(row):
  """Render one story as a four-line text block.

  Args:
    row: Any object exposing `title`, `url`, `by` and `time` attributes;
      `time` must provide `strftime`.

  Returns:
    A string with `Title:`, `URL:`, `Poster:` and `Date:` lines separated
    by newlines (no trailing newline).
  """
  # The date shows weekday, month, day and 12-hour time; the year is not included.
  lines = [
    f"Title: {row.title}",
    f"URL: {row.url}",
    f"Poster: {row.by}",
    f"Date: {row.time.strftime('%A, %B %d, %I:%M %p')}",
  ]
  return "\n".join(lines)

# Build the text for every story with a row-wise apply through the swifter
# accessor, then store it as a new column.
formatted_text = df.swifter.apply(format_text, axis=1)
df['formatted_text'] = formatted_text

Pandas Apply:   0%|          | 0/1518276 [00:00<?, ?it/s]

In [38]:
from transformers import AutoTokenizer
from math import ceil
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Tokenizer for the checkpoint named below; `from_pretrained` resolves it by name.
tokenizer_checkpoint = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def process_chunk(chunk):
    """Tokenize the `formatted_text` column of one batch of rows.

    Args:
        chunk: DataFrame (typically a slice of the full frame) that has a
            `formatted_text` column.

    Returns:
        A copy of `chunk` with one additional column per key returned by the
        module-level `tokenizer`. The input frame is not modified.
    """
    # Work on a copy: the batches handed in are slices of a larger frame, and
    # adding columns to a slice mutates the caller's object and can trigger
    # pandas' SettingWithCopyWarning.
    chunk = chunk.copy()
    # No padding is applied here; texts longer than 512 tokens are truncated.
    encoded = tokenizer(chunk['formatted_text'].tolist(), padding=False, truncation=True, max_length=512)
    for key in encoded:
        chunk[key] = encoded[key]
    return chunk

# Split the dataset into batches of 1000 rows and apply the tokenizer columns to each batch.
# Positional `iloc` slicing is used instead of `np.array_split(df, ...)`: splitting a
# DataFrame through NumPy relies on `DataFrame.swapaxes`, which pandas has deprecated.
# The number of batches (and the concatenated result) is the same as before.
batch_size = 1000
chunks = [df.iloc[start:start + batch_size] for start in range(0, df.shape[0], batch_size)]

df = pd.concat([process_chunk(chunk) for chunk in tqdm(chunks)])

df.columns

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1519 [00:00<?, ?it/s]

Index(['by', 'comments', 'id', 'score', 'time', 'title', 'type', 'url', 'dead',
       'text', 'kids', 'deleted', 'frontpage', 'formatted_text', 'input_ids',
       'token_type_ids', 'attention_mask'],
      dtype='object')

In [40]:
# Persist the tokenized frame; the current index labels are dropped in favour
# of a fresh 0..n-1 index before writing.
tokenized_path = '/workspace/data/hn/stories-tokenized.feather'
df.reset_index(drop=True).to_feather(tokenized_path)

In [50]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Reload the tokenized stories from disk.
df = pd.read_feather('/workspace/data/hn/stories-tokenized.feather')

# Labels need to be a float for RMSE calculation
df = df.assign(labels=df['frontpage'].astype(float))

# Get just the columns we need, ordered chronologically so the split below
# is based on publication date.
model_columns = ['id', 'time', 'input_ids', 'attention_mask', 'labels']
df = df[model_columns].sort_values('time')

# The timestamp of the row at the 95% position is the boundary: strictly
# earlier rows go to train, rows at or after it go to test.
cutoff_position = int(len(df) * 0.95)
split_date = df['time'].iloc[cutoff_position]

train_df = df[df['time'] < split_date]
test_df = df[df['time'] >= split_date]

dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (('train', train_df), ('test', test_df))
})

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'time', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1442362
    })
    test: Dataset({
        features: ['id', 'time', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 75914
    })
})

In [52]:
# Write the train/test splits to disk.
dataset_dir = '/workspace/data/hn/stories-dataset'
dataset.save_to_disk(dataset_dir)

Saving the dataset (0/3 shards):   0%|          | 0/1442362 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/75914 [00:00<?, ? examples/s]