# Hacker News Submission Score Predictor w/ Keras and TensorFlow

by Max Woolf ([@minimaxir](https://minimaxir.com))

In [1]:
import pandas as pd
import numpy as np
import hopsworks
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F

In [39]:
# project = hopsworks.login(project='id2223_enric')
# fs = project.get_feature_store()
# hackernews_fg = fs.get_feature_group("hackernews_fg", 2)
# query = hackernews_fg.select_all()
# feature_view = fs.get_or_create_feature_view(name="hackernews_fv",
#                                   version=2,
#                                   description="Hackernews feature view",
#                                   labels=["score"],
#                                   query=query)
from sklearn.model_selection import train_test_split
from urllib.parse import urlparse

def url_to_domain(url):
    parsed_url = urlparse(url)

    domain = parsed_url.netloc
    return domain

feature_view = pd.read_csv('../data/pd_combined.csv')

train_df, test_df = train_test_split(feature_view, test_size=0.05)

df = train_df.sample(frac=1, random_state=123).dropna().reset_index(drop=True)
df['domain'] = df['url'].apply(url_to_domain)

df.head(10)

Unnamed: 0,id,title,url,score,time,descendants,by,karma,domain
0,8535571.0,5 Guidelines for Recovery Drills Within the AW...,http://www.n2ws.com/blog/5-guidelines-recovery...,2.0,1414700000.0,0.0,iamondemand,83.0,www.n2ws.com
1,38122977.0,Why do we allow ourselves to hold ungrounded a...,https://meltingasphalt.com/crony-beliefs/,3.0,1698974000.0,0.0,TheIronYuppie,1547.0,meltingasphalt.com
2,7349975.0,Japan Said to Be Ready to Impose Bitcoin Rules,http://dealbook.nytimes.com/2014/03/05/japan-s...,2.0,1394054000.0,0.0,JumpCrisscross,129974.0,dealbook.nytimes.com
3,19972463.0,“Maslow’s pyramid” is based on an elitist misr...,https://qz.com/work/1588491/maslow-didnt-make-...,2.0,1558460000.0,0.0,wjSgoWPm5bWAhXB,2456.0,qz.com
4,21521793.0,The Girl Who Never Came Back (1960),https://www.americanheritage.com/girl-who-neve...,2.0,1573630000.0,0.0,smacktoward,54886.0,www.americanheritage.com
5,3471206.0,Pasadena Cheeseburger Week - We Went There,http://aloneinaforest.com/cheeseburger-in-para...,1.0,1326731000.0,-1.0,darlingalice,2.0,aloneinaforest.com
6,3346871.0,Recreating the original Macintosh boot beep in...,http://romulusetrem.us/bootbeep/,2.0,1323764000.0,0.0,pom,73.0,romulusetrem.us
7,17079306.0,Google One is coming soon,https://one.google.com,81.0,1526434000.0,86.0,tvvocold,1946.0,one.google.com
8,4388709.0,Let's Build a Tesla Museum,http://theoatmeal.com/blog/tesla_museum,1.0,1345071000.0,-1.0,cjdavis,174.0,theoatmeal.com
9,11422153.0,Fair Source licensing is the worst thing to ha...,http://www.techrepublic.com/article/fair-sourc...,2.0,1459780000.0,0.0,alxsanchez,450.0,www.techrepublic.com


In [40]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

def pad_sequences(seq, max_len=15):
    seq =  torch.tensor(seq)
    seq = nn.ConstantPad1d((0, max_len - len(seq)), 0)(seq)
    return seq

tokenizer = get_tokenizer("basic_english")

In [41]:
from collections import Counter
from functools import partial

counter = Counter()
for title in df['title']:
    counter.update(tokenizer(title))
vocab = Vocab(counter)
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
df['title'] = df['title'].apply(text_pipeline)
df['title'] = df['title'].apply(pad_sequences)

### Top Domains

Identify the top *n* domains by count (in this case *n* = 100), then transform it to a *n*D vector for each post.

In [42]:
num_domains = 100

domain_counts = df['domain'].value_counts()[0:num_domains]

print(domain_counts)

domain
github.com                4773
medium.com                3468
www.youtube.com           2783
www.nytimes.com           2114
techcrunch.com            1542
                          ... 
bit.ly                     104
apnews.com                 103
blogs.wsj.com              103
www.usatoday.com           103
bits.blogs.nytimes.com     102
Name: count, Length: 100, dtype: int64


In [43]:
titles = torch.stack([title for title in df['title'].values])

In [47]:
from sklearn.preprocessing import LabelBinarizer

top_domains = np.array(domain_counts.index, dtype=object)

domain_encoder = LabelBinarizer()
domain_encoder.fit(top_domains)

domains = domain_encoder.transform(df['domain'].values.astype(str))
domains[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Day-of-Week and Hour

Convert day-of-week to a 7D vector and hours to a 24D vector. Both pandas and keras have useful functions for this workflow.

In [54]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    return np.eye(num_classes, dtype='uint8')[y]

In [55]:
# from keras.utils import to_categorical

dayofweeks = to_categorical(pd.to_datetime(df['time']).dt.dayofweek, 7)
hours = to_categorical(pd.to_datetime(df['time']).dt.hour, 24)

print(dayofweeks[0:5])
print(hours[0:5])

[[0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]]
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


## Sample Weights

Weight `score=1` samples lower so model places a higher importance on atypical submissions.

In [56]:
weights = np.where(df['score'].values == 1, 0.5, 1.0)
print(weights[0:5])

[1. 1. 1. 1. 1.]


## Trend and Time on New

Unused in final model, but kept here for reference.

In [57]:
from sklearn.preprocessing import MinMaxScaler

trend_encoder = MinMaxScaler()
trends = trend_encoder.fit_transform(pd.to_datetime(df['time']).values.reshape(-1, 1))
trends[0:5]

array([[0.46796003],
       [0.99111293],
       [0.42996415],
       [0.73252349],
       [0.76044037]])

In [58]:
# newtime_encoder = MinMaxScaler()
# newtimes = trend_encoder.fit_transform(df['time_on_new'].values.reshape(-1, 1))
# newtimes[0:5]

## Build the Model Prototype

Add R^2 as a performance metric: https://jmlb.github.io/ml/2017/03/20/CoeffDetermination_CustomMetric4Keras/

In [116]:
# from keras import backend as K
epsilon = 1e-7
def r_2(y_true, y_pred):
    SS_res =  torch.sum(torch.square( y_true - y_pred )) 
    SS_tot = torch.sum(torch.square( y_true - torch.mean(y_true) ) ) 
    return ( 1 - SS_res/(SS_tot + epsilon) )

Minimizing `mse` loss as typical for regression problems will not work, as the model will realize that selecting 1 unilaterally accomplishes this task the best.

Instead, create a hybrid loss of `mae`, `msle`, and `poisson` (see Keras's docs for more info: https://github.com/keras-team/keras/blob/master/keras/losses.py) The latter two losses can account for very high values much better; perfect for the hyper-skewed data.

In [120]:
def hybrid_loss(y_true, y_pred):
    weight_mae = 0.1
    weight_msle = 1.
    weight_poisson = 0.1
    
    mae_loss = weight_mae * torch.mean(torch.abs(y_pred - y_true), axis=-1)
    
    first_log = torch.log(torch.clip(y_pred, 1, None) + 1.)
    second_log = torch.log(torch.clip(y_true, epsilon, None) + 1.)
    msle_loss = weight_msle * torch.mean(torch.square(first_log - second_log), axis=-1)
    
    poisson_loss = weight_poisson * torch.mean(y_pred - y_true * torch.log(y_pred + epsilon), axis=-1)
    return torch.mean(mae_loss + msle_loss + poisson_loss)

In [74]:
domains = torch.tensor(domains)
dayofweeks = torch.tensor(dayofweeks)
hours = torch.tensor(hours)



In [75]:
titles.shape, domains.shape, dayofweeks.shape, hours.shape

(torch.Size([129802, 15]),
 torch.Size([129802, 100]),
 torch.Size([129802, 7]),
 torch.Size([129802, 24]))

In [109]:
class Model(nn.Module):
    def __init__(self, num_words=15, num_hidden_layers=5):
        super().__init__()

        self.embedding_titles = nn.Embedding(31001, 50)
        self.spatial_dropout = nn.Dropout2d(0.2)
        self.rnn_titles = nn.LSTM(50, 128)

        self.hidden_layers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(259, 259),
                nn.ReLU(),
                nn.BatchNorm1d(259),
                nn.Dropout(0.5)
            )
            for _ in range(num_hidden_layers)
        ])

        self.output_layer = nn.Linear(259, 1)

    def forward(self, input_titles, input_domains, input_dayofweeks, input_hours):
        embedding_titles = self.embedding_titles(input_titles)
        spatial_dropout = self.spatial_dropout(embedding_titles)
        rnn_titles, _ = self.rnn_titles(spatial_dropout.permute(1, 0, 2))

        concat = torch.cat([rnn_titles[-1], input_domains, input_dayofweeks, input_hours], dim=1)
        i = 0
        for layer in self.hidden_layers:
            concat = layer(concat)

        output = self.output_layer(concat)
        return output

In [110]:
import torch.optim as optim
model = Model()
batch_lr = 1e-3
num_epochs = 100
optimizer = optim.Adam(model.parameters(), lr=1e-3)
lr_scheduler = optim.lr_scheduler.LinearLR(optimizer)

In [123]:
from tqdm import tqdm

batch_size = 12
total_len = titles.shape[0]
gt_scores = torch.tensor(df['score'].values)
for epoch in range(num_epochs):
    loss_per_batch = []
    for i in tqdm(range(0, total_len, batch_size)):
        X_title = titles[i:i+batch_size]
        X_domain = domains[i:i+batch_size]
        X_dayofweeks = dayofweeks[i:i+batch_size]
        X_hours = hours[i:i+batch_size]
        y_true = gt_scores[i:i+batch_size]

        score = model(X_title, X_domain, X_dayofweeks, X_hours)
        
        optimizer.zero_grad()
        loss = hybrid_loss(score, y_true)
        loss.backward()
        loss_per_batch.append(loss.item())
        optimizer.step()
    
    print(f"Epoch: {epoch}/{num_epochs} ; Training loss: {np.mean(loss_per_batch)}")

  0%|          | 4/10817 [00:00<04:33, 39.55it/s]

100%|██████████| 10817/10817 [04:47<00:00, 37.61it/s]


Epoch: 0/100 ; Training loss: 4.07519202375377


100%|██████████| 10817/10817 [04:51<00:00, 37.15it/s]


Epoch: 1/100 ; Training loss: 4.01695658219623


100%|██████████| 10817/10817 [04:44<00:00, 37.96it/s]


Epoch: 2/100 ; Training loss: 4.013249814462836


100%|██████████| 10817/10817 [04:39<00:00, 38.65it/s]


Epoch: 3/100 ; Training loss: 4.011533654568707


100%|██████████| 10817/10817 [04:31<00:00, 39.85it/s]


Epoch: 4/100 ; Training loss: 4.010807378475456


100%|██████████| 10817/10817 [04:34<00:00, 39.44it/s]


Epoch: 5/100 ; Training loss: 4.010070092854905


100%|██████████| 10817/10817 [04:32<00:00, 39.71it/s]


Epoch: 6/100 ; Training loss: 4.009645540442315


100%|██████████| 10817/10817 [04:36<00:00, 39.18it/s]


Epoch: 7/100 ; Training loss: 4.009058134294455


100%|██████████| 10817/10817 [04:35<00:00, 39.30it/s]


Epoch: 8/100 ; Training loss: 4.008829973905289


100%|██████████| 10817/10817 [04:37<00:00, 38.98it/s]


Epoch: 9/100 ; Training loss: 4.008490686151469


100%|██████████| 10817/10817 [04:33<00:00, 39.59it/s]


Epoch: 10/100 ; Training loss: 4.008407439134019


100%|██████████| 10817/10817 [04:35<00:00, 39.28it/s]


Epoch: 11/100 ; Training loss: 4.008176884540046


100%|██████████| 10817/10817 [04:34<00:00, 39.45it/s]


Epoch: 12/100 ; Training loss: 4.008252148236976


  2%|▏         | 222/10817 [00:05<04:16, 41.29it/s]


KeyboardInterrupt: 

The model uses a linear learning rate decay to allow it to learn better once it starts converging.

Note: in this Kaggle Notebook, the training times out after 33 epochs when committing, so I set it to 25 here. You should probably train for longer. (50+ epochs)

## Check Predictions Against Validation Set

Predicting against data that was not trained in the model: the model does this poorly. :(

In [125]:
torch.save(model, 'model_v2.pkl')

In [None]:
import hsml
connection = hsml.connection()
mr = connection.get_model_registry()
ms = connection.get_model_serving()

In [None]:
def prepare_for_pred(title, url, time):
    domain = url_to_domain(url)
    df['domain'] = df['url'].apply(url_to_domain)
    

In [None]:
val_size = int(split_prop * df.shape[0])

predictions = model.predict([titles[-val_size:],
                             domains[-val_size:],
                             dayofweeks[-val_size:],
                             hours[-val_size:]])[:, 0]

predictions

In [None]:
df_preds = pd.concat([pd.Series(df['title'].values[-val_size:]),
                      pd.Series(df['score'].values[-val_size:]),
                      pd.Series(predictions)],
                     axis=1)
df_preds.columns = ['title', 'actual', 'predicted']
# df_preds.to_csv('hn_val.csv', index=False)
df_preds.head(50)

## Check Predictions Against Training Set

The model should be able to predict these better.

In [None]:
train_size = int((1-split_prop) * df.shape[0])

predictions = model.predict([titles[:train_size],
                             domains[:train_size],
                             dayofweeks[:train_size],
                             hours[:train_size]])[:, 0]

df_preds = pd.concat([pd.Series(df['title'].values[:train_size]),
                      pd.Series(df['score'].values[:train_size]),
                      pd.Series(predictions)],
                     axis=1)
df_preds.columns = ['title', 'actual', 'predicted']
# df_preds.to_csv('hn_train.csv', index=False)
df_preds.head(50)