In [127]:
import pandas as pd
import swifter

# Read the raw stories dump from disk and show its (rows, columns) shape.
raw_dump_path = '/workspace/data/hn/stories_dump.feather'
df = pd.read_feather(raw_dump_path)
df.shape

(4920000, 12)

In [128]:
# Use 'comments' as the name of the 'descendants' column.
df = df.rename(columns={'descendants': 'comments'})

# Parse the epoch-second timestamps and turn the 'dead' flag into a real
# boolean (missing values count as not dead).
df = df.assign(
    time=pd.to_datetime(df['time'], unit='s'),
    dead=df['dead'].fillna(0).astype(bool),
)

# Community tastes may have changed over time, so only stories from 2018
# onward are kept.
posted_year = df['time'].dt.year
df = df[posted_year >= 2018]
df.shape

(2126850, 12)

In [129]:

# For now, restrict the data to stories that carry a URL and no text body.
is_link_post = df['url'].notna() & df['text'].isna()
df = df.loc[is_link_post]
df.shape

(1810290, 12)

In [130]:
# The question here is whether a "real" story is likely to reach the front
# page, so dead stories are dropped; there are probably easier ways to
# detect those.
df = df.loc[~df['dead']]
df.shape

(1544214, 12)

In [131]:
# Collapse repeated submissions of the same URL to a single row. Rows are
# ordered by score, highest first, so the first occurrence kept for each URL
# is its top-scoring submission.

print(df.shape)

df = df.sort_values(by='score', ascending=False).drop_duplicates(subset='url', keep='first')

print(df.shape)

(1544214, 12)
(1293918, 12)


In [132]:
# Flag a story as 'frontpage' when it has at least 20 points or at least
# 10 comments.
df['frontpage'] = df['score'].ge(20) | df['comments'].ge(10)

In [133]:
# Display the frame's column index; as the cell's last expression it is
# rendered as the cell output.
df.columns

Index(['by', 'comments', 'id', 'score', 'time', 'title', 'type', 'url', 'dead',
       'text', 'kids', 'deleted', 'frontpage'],
      dtype='object')

In [134]:
def format_text(row):
  """Render one story row as a four-line text block.

  Reads ``row.title``, ``row.url``, ``row.by`` and ``row.time`` (the latter
  must support ``strftime``) and returns them as ``Title:``, ``URL:``,
  ``Poster:`` and ``Date:`` lines joined by newlines, with no trailing
  newline. The date line shows weekday, month, day and 12-hour time; the
  year is not part of the format.
  """
  posted_at = row.time.strftime('%A, %B %d, %I:%M %p')
  lines = (
    f"Title: {row.title}",
    f"URL: {row.url}",
    f"Poster: {row.by}",
    f"Date: {posted_at}",
  )
  return "\n".join(lines)

# Build the text for every story: axis=1 hands each row to format_text, and
# the call goes through the swifter accessor rather than plain DataFrame.apply.
df['formatted_text'] = df.swifter.apply(format_text, axis=1)

Pandas Apply:   0%|          | 0/1293918 [00:00<?, ?it/s]

In [135]:
# Spot-check ten randomly sampled stories, each followed by a blank line
for story_text in df.sample(10)['formatted_text']:
  print(story_text)
  print()

Title: How to set junior employees up for success in remote
URL: https://slite.com/blog/micromanagement-is-not-a-bad-word
Poster: melanieb421
Date: Monday, June 27, 12:10 PM

Title: Sewage Is Helping Cities Flush Out the Opioid Crisis
URL: https://www.scientificamerican.com/article/sewage-is-helping-cities-flush-out-the-opioid-crisis/
Poster: dustfinger
Date: Monday, May 28, 10:24 AM

Title: Raspberry Pi Zero W Surveillance Camera
URL: https://www.youtube.com/watch?v=rhIzfRmKHnQ
Poster: fortran77
Date: Friday, June 05, 04:57 PM

Title: Date World: The Army’s Authoritative Training Environment
URL: https://www.army.mil/article/242997/decisive_action_training_environment_world_the_armys_authoritative_training_environment
Poster: openasocket
Date: Wednesday, April 14, 06:33 PM

Title: SQL 3D engine (interactive preview)
URL: https://observablehq.com/@pallada-92/sql-3d-engine
Poster: duck
Date: Thursday, February 27, 09:59 PM

Title: How do you explain technical subjects in a non condescen

In [136]:
from transformers import AutoTokenizer
from math import ceil
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Load the tokenizer for the 'microsoft/deberta-v3-base' checkpoint.
# process_chunk reads this module-level name when it is called.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

def process_chunk(chunk):
    """Tokenize one batch of stories and return it with the tokenizer columns attached.

    Args:
        chunk: DataFrame holding a 'formatted_text' column of strings.

    Returns:
        A new DataFrame with the same rows and index as ``chunk`` plus one
        column per key returned by the module-level ``tokenizer``. ``chunk``
        itself is left unmodified.
    """
    # padding=False keeps one variable-length token list per row;
    # truncation=True with max_length=512 caps each sequence at 512 tokens.
    tokenizer_columns = tokenizer(chunk['formatted_text'].tolist(), padding=False, truncation=True, max_length=512)
    # Work on a copy: the batches handed in are slices of the full frame, and
    # assigning columns directly onto a slice mutates the caller's object and
    # raises SettingWithCopyWarning.
    tokenized = chunk.copy()
    for key in tokenizer_columns:
        tokenized[key] = tokenizer_columns[key]
    return tokenized

# Split the dataset into batches of 1000 and apply the tokenizer columns to each batch.
# The batches are positional iloc slices rather than np.array_split(df, ...):
# on a DataFrame that call goes through DataFrame.swapaxes, which pandas has
# deprecated. The number of batches is unchanged; the last one simply holds
# whatever rows remain.
chunks = [df.iloc[start:start + 1000] for start in range(0, df.shape[0], 1000)]

# Re-assemble the tokenized batches into one frame, in the original row order.
df = pd.concat([process_chunk(chunk) for chunk in tqdm(chunks)])

df.columns

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1294 [00:00<?, ?it/s]

Index(['by', 'comments', 'id', 'score', 'time', 'title', 'type', 'url', 'dead',
       'text', 'kids', 'deleted', 'frontpage', 'formatted_text', 'input_ids',
       'token_type_ids', 'attention_mask'],
      dtype='object')

In [137]:
# Assign 5% of the data to the test set randomly. The draw comes from a seeded
# generator so that re-running the notebook reproduces the same train/test
# membership (the shuffle below was already seeded). Exact group sizes will
# differ from those of earlier, unseeded runs.
split_rng = np.random.default_rng(1318)
df['split'] = split_rng.choice(['train', 'test'], p=[0.95, 0.05], size=(df.shape[0],))

# Shuffle the rows with a fixed seed.
df = df.sample(frac=1, random_state=1318)

df.groupby('split').size()

split
test       64728
train    1229190
dtype: int64

In [141]:
# NOTE(review): this cell's execution count (141) is out of sequence with its
# neighbours (137 before, 138 after), and the column list shown for it lacks
# the 'split' column added just above. The output appears to come from a
# different kernel state; re-run the notebook top to bottom to refresh it.
df.columns

Index(['id', 'time', 'input_ids', 'attention_mask', 'labels'], dtype='object')

In [138]:
# The RMSE calculation needs float labels, so the boolean 'frontpage' flag is
# stored as a float column.
df['labels'] = df['frontpage'].astype('float64')

In [139]:
# Persist the prepared frame as a feather file. The index is discarded and
# renumbered first (presumably because to_feather rejects the non-default
# index left behind by filtering and shuffling; confirm against the pandas docs).
df.reset_index(drop=True).to_feather('/workspace/data/hn/stories-dataset.feather')

In [145]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Reload the prepared frame that was written to disk earlier.
df = pd.read_feather('/workspace/data/hn/stories-dataset.feather')

# Narrow the frame to the columns that go into the dataset.
wanted_columns = ['id', 'input_ids', 'attention_mask', 'labels', 'split']
df = df[wanted_columns]

# One Dataset per split, keyed by split name; the pandas index is not carried over.
dataset = DatasetDict({
  split_name: Dataset.from_pandas(df[df['split'] == split_name], preserve_index=False)
  for split_name in ('train', 'test')
})

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels', 'split'],
        num_rows: 1229190
    })
    test: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels', 'split'],
        num_rows: 64728
    })
})

In [148]:
# Write the train/test DatasetDict to the given directory on disk.
dataset.save_to_disk('/workspace/data/hn/stories-dataset')

Saving the dataset (0/3 shards):   0%|          | 0/1229190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/64728 [00:00<?, ? examples/s]