In [1]:
import pandas as pd
import swifter

# Load the raw stories dump from feather and show its (rows, columns) shape.
df = pd.read_feather('/workspace/data/hn/stories_dump.feather')
df.shape

(4920000, 12)

In [2]:
# Expose the `descendants` column under the name `comments`.
df = df.rename(columns={'descendants': 'comments'})

# `time` is parsed as seconds since the epoch; a missing `dead` value is treated as False.
df['time'] = pd.to_datetime(df['time'], unit='s')
df['dead'] = df['dead'].fillna(0).astype(bool)

# Keep stories from 2018 onward in case community tastes have changed
df = df[df['time'].dt.year >= 2018]
df.shape

(2126850, 12)

In [3]:

# Only keep stories without text for now (and only those that do have a URL)
has_no_text = df['text'].isna()
has_url = df['url'].notna()
df = df[has_no_text & has_url]
df.shape

(1810290, 12)

In [4]:
# This analysis asks whether a "real" story is likely to make it to the front page,
# so dead stories are removed: there are probably easier ways to detect those.
# `dead` was cast to bool earlier, so negating the column selects the live rows.
df = df[~df['dead']]
df.shape

(1544214, 12)

In [5]:
# Deduplicate stories based on the URL. Keep the one with the highest score.

print(df.shape)

# Highest score first, so the first row kept for each URL is its top-scoring submission.
df = df.sort_values(by='score', ascending=False)
df = df.drop_duplicates(subset='url', keep='first')

print(df.shape)

(1544214, 12)
(1293918, 12)


In [6]:
# A story is labelled `frontpage` when its score is at least 20 or it has at least 10 comments.
# NOTE(review): the 20 / 10 thresholds look like a heuristic -- confirm where they come from.
df['frontpage'] = df['score'].ge(20) | df['comments'].ge(10)

In [7]:
# Show the column names now that `frontpage` has been added.
df.columns

Index(['by', 'comments', 'id', 'score', 'time', 'title', 'type', 'url', 'dead',
       'text', 'kids', 'deleted', 'frontpage'],
      dtype='object')

In [8]:
def format_text(row):
  """Render one story row as a four-line text block.

  Reads `row.title`, `row.url`, `row.by` and `row.time` and returns a single
  string with the lines "Title: ...", "URL: ...", "Poster: ..." and
  "Date: ..." separated by newlines (no trailing newline).
  """
  # NOTE(review): the date format has weekday, month, day and time but no year -- confirm that is intended.
  lines = (
    f"Title: {row.title}",
    f"URL: {row.url}",
    f"Poster: {row.by}",
    f"Date: {row.time.strftime('%A, %B %d, %I:%M %p')}",
  )
  return "\n".join(lines)

# Build the `formatted_text` column by applying format_text to every row (axis=1) through swifter.
df['formatted_text'] = df.swifter.apply(format_text, axis=1)

Pandas Apply:   0%|          | 0/1293918 [00:00<?, ?it/s]

In [9]:
# Print 10 random stories, each followed by a blank line
for story_text in df['formatted_text'].sample(10):
  print(story_text, end='\n\n')

Title: Amazon makes education push in India with JEE preparation app
URL: https://techcrunch.com/2021/01/12/amazon-makes-education-push-in-india-with-jee-preparation-app/
Poster: jmsflknr
Date: Wednesday, January 13, 07:50 AM

Title: If I could teach my younger programmer self a few things...
URL: https://medium.com/@iluga/i-wish-i-knew-this-when-i-began-programming-7381b576c699
Poster: danielwbean
Date: Friday, August 28, 10:09 PM

Title: Trealla – A compact, efficient Prolog interpreter written in plain-old C
URL: https://github.com/trealla-prolog/trealla
Poster: nikolay
Date: Monday, August 08, 05:02 AM

Title: Govt blocks Yahoo, Steam, PayPal for failing to comply with licensing policy
URL: https://www.thejakartapost.com/indonesia/2022/07/30/govt-blocks-yahoo-steam-paypal-for-failing-to-comply-with-licensing-policy.html
Poster: mfcc64
Date: Saturday, July 30, 06:41 AM

Title: What a hangover taught me about fake meritocracy
URL: https://edmarferreira.com/archive/what-a-hangover-tau

In [11]:
from transformers import AutoTokenizer
from math import ceil
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Load the tokenizer for the 'xlm-roberta-large' checkpoint; process_chunk below calls it.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')

def process_chunk(chunk):
    """Tokenize `chunk['formatted_text']` and return the chunk with the tokenizer outputs as columns.

    The module-level `tokenizer` is called on the list of texts with
    `padding=False, truncation=True, max_length=512`; every key of its result
    becomes a column holding one entry per row.

    The input frame is left untouched: the columns are added to a copy. The
    chunks passed in are slices of a larger frame, and writing into such a
    slice raises SettingWithCopyWarning on pandas < 3 and mutates the caller's
    object.

    Args:
        chunk: DataFrame with a `formatted_text` column.

    Returns:
        A new DataFrame with the rows and columns of `chunk` plus one column
        per tokenizer output key.
    """
    tokenizer_columns = tokenizer(chunk['formatted_text'].tolist(), padding=False, truncation=True, max_length=512)
    result = chunk.copy()
    for key in tokenizer_columns:
        result[key] = tokenizer_columns[key]
    return result

# Split the dataset into batches of at most 1000 rows and apply the tokenizer columns to each batch.
# Positional slicing is used instead of np.array_split: on a DataFrame, array_split goes through
# DataFrame.swapaxes, which is deprecated (FutureWarning since pandas 2.1).
BATCH_SIZE = 1000
chunks = [df.iloc[start:start + BATCH_SIZE] for start in range(0, df.shape[0], BATCH_SIZE)]

# Row order is preserved: batches are consecutive slices and are concatenated in order.
df = pd.concat([process_chunk(chunk) for chunk in tqdm(chunks)])

df.columns

  0%|          | 0/1294 [00:00<?, ?it/s]

Index(['by', 'comments', 'id', 'score', 'time', 'title', 'type', 'url', 'dead',
       'text', 'kids', 'deleted', 'frontpage', 'formatted_text', 'input_ids',
       'attention_mask'],
      dtype='object')

In [12]:
# Assign roughly 5% of the rows to the test set at random (each row is drawn independently).
# A seeded generator is used so the train/test assignment is the same on every run;
# the unseeded global np.random state would give a different split each time.
split_rng = np.random.default_rng(1318)
df['split'] = split_rng.choice(['train', 'test'], p=[0.95, 0.05], size=df.shape[0])

# Shuffle the row order (also seeded).
df = df.sample(frac=1, random_state=1318)

df.groupby('split').size()

split
test       64746
train    1229172
dtype: int64

In [13]:
# Show the column names now that `split` has been added.
df.columns

Index(['by', 'comments', 'id', 'score', 'time', 'title', 'type', 'url', 'dead',
       'text', 'kids', 'deleted', 'frontpage', 'formatted_text', 'input_ids',
       'attention_mask', 'split'],
      dtype='object')

In [14]:
# Labels need to be a float for RMSE calculation, so the boolean `frontpage`
# flag is stored as 0.0 / 1.0.
df['labels'] = df['frontpage'].astype('float64')

In [15]:
# Write the frame to feather with a fresh 0..n-1 index; the shuffled index is dropped.
# NOTE(review): presumably needed because feather only accepts a default index -- confirm.
df.reset_index(drop=True).to_feather('/workspace/data/hn/stories-roberta.feather')

In [16]:
import pandas as pd
from datasets import Dataset, DatasetDict

df = pd.read_feather('/workspace/data/hn/stories-roberta.feather')

# Get just the columns we need
needed_columns = ['id', 'input_ids', 'attention_mask', 'labels', 'split']
df = df[needed_columns]

# One Dataset per split value, built from the rows tagged with that value.
dataset = DatasetDict({
  split_name: Dataset.from_pandas(df[df['split'] == split_name], preserve_index=False)
  for split_name in ('train', 'test')
})

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels', 'split'],
        num_rows: 1229172
    })
    test: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels', 'split'],
        num_rows: 64746
    })
})

In [18]:
# Persist the train/test DatasetDict to disk.
dataset.save_to_disk('/workspace/data/hn/stories-roberta-dataset')

Saving the dataset (0/3 shards):   0%|          | 0/1229172 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/64746 [00:00<?, ? examples/s]