# Hacker News Submission Score Predictor w/ Keras and TensorFlow

by Max Woolf ([@minimaxir](https://minimaxir.com))

In [1]:
import pandas as pd
import numpy as np
import hopsworks
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# project = hopsworks.login(project='id2223_enric')
# fs = project.get_feature_store()
# hackernews_fg = fs.get_feature_group("hackernews_fg", 2)
# query = hackernews_fg.select_all()
# feature_view = fs.get_or_create_feature_view(name="hackernews_fv",
#                                   version=2,
#                                   description="Hackernews feature view",
#                                   labels=["score"],
#                                   query=query)
from sklearn.model_selection import train_test_split
from urllib.parse import urlparse

def url_to_domain(url):
    parsed_url = urlparse(url)

    domain = parsed_url.netloc
    return domain

feature_view = pd.read_csv('../feature_pipeline/pd_combined.csv')

train_df, test_df = train_test_split(feature_view, test_size=0.05)

df = train_df.sample(frac=1, random_state=123).dropna().reset_index(drop=True)
df['domain'] = df['url'].apply(url_to_domain)

df.head(10)

Unnamed: 0,id,title,url,score,time,descendants,by,karma,domain
0,12680033.0,"Medical school can be brutal, and it’s making ...",https://www.washingtonpost.com/national/health...,113.0,1476136000.0,165.0,snake117,2750.0,www.washingtonpost.com
1,7787848.0,A new way to buy seeds for your garden,http://www.MySeedz.com,1.0,1400816000.0,0.0,MySeedz,1.0,www.MySeedz.com
2,12740841.0,Facebook's 100 GbE Wedge Switch Makes Strides,http://www.networkcomputing.com/data-centers/f...,2.0,1476848000.0,0.0,kungfudoi,14163.0,www.networkcomputing.com
3,1261361.0,Iron Man 2,http://www.imax.com/movie/ironman2,1.0,1271145000.0,-1.0,Ladarius11,2.0,www.imax.com
4,19007270.0,"Mark Zuckerberg, Let Me Be Your Ghost Writer",https://www.nytimes.com/2019/01/25/opinion/mar...,1.0,1548524000.0,0.0,theBashShell,9917.0,www.nytimes.com
5,10240614.0,Is Big Tech Too Powerful? Ask Google,http://www.nytimes.com/2015/09/20/opinion/is-b...,3.0,1442597000.0,0.0,Amorymeltzer,18668.0,www.nytimes.com
6,222098.0,"Bedsteads,soft and home furninshings from Dunelm",http://www.dunelm-mill.com/category/Beds%5FBed...,1.0,1213905000.0,-1.0,bedsteads,1.0,www.dunelm-mill.com
7,1611421.0,The quest for the next Journeyman Programming ...,http://kirkwylie.blogspot.com/2010/08/i-want-n...,3.0,1282060000.0,2.0,KirkWylie,111.0,kirkwylie.blogspot.com
8,265114.0,Feedback on Q&A apps in FB,http://www.facebook.com/add.php?api_key=7712d2...,1.0,1217701000.0,0.0,whleung,1.0,www.facebook.com
9,12080269.0,A Proposal For the Dartmouth Summer Research P...,http://www-formal.stanford.edu/jmc/history/dar...,124.0,1468340000.0,47.0,projectramo,4048.0,www-formal.stanford.edu


In [3]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

def pad_sequences(seq, max_len=15):
    seq =  torch.tensor(seq)
    seq = nn.ConstantPad1d((0, max_len - len(seq)), 0)(seq)
    return seq

tokenizer = get_tokenizer("basic_english")

In [4]:
from collections import Counter
from functools import partial

counter = Counter()
for title in df['title']:
    counter.update(tokenizer(title))
vocab = Vocab(counter)
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
df['title'] = df['title'].apply(text_pipeline)
df['title'] = df['title'].apply(pad_sequences)

### Top Domains

Identify the top *n* domains by count (in this case *n* = 100), then transform it to a *n*D vector for each post.

In [5]:
num_domains = 100

domain_counts = df['domain'].value_counts()[0:num_domains]

print(domain_counts)

domain
github.com           4756
medium.com           3462
www.youtube.com      2760
www.nytimes.com      2143
techcrunch.com       1567
                     ... 
edition.cnn.com       105
www.cbsnews.com       103
www.usatoday.com      103
chrome.google.com     103
www.techdirt.com      103
Name: count, Length: 100, dtype: int64


In [6]:
titles = torch.stack([title for title in df['title'].values])

In [7]:
df['domain'].values.astype(str)

array(['www.washingtonpost.com', 'www.MySeedz.com',
       'www.networkcomputing.com', ..., 'www.slate.com',
       'stackoverflow.com', 'www.techcrunch.com'], dtype='<U95')

In [8]:
from sklearn.preprocessing import LabelBinarizer

top_domains = np.array(domain_counts.index, dtype=object)

domain_encoder = LabelBinarizer()
domain_encoder.fit(top_domains)

domains = domain_encoder.transform(df['domain'].values.astype(str))
domains[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

### Day-of-Week and Hour

Convert day-of-week to a 7D vector and hours to a 24D vector. Both pandas and keras have useful functions for this workflow.

In [9]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    return np.eye(num_classes, dtype='uint8')[y]

In [10]:
# from keras.utils import to_categorical

dayofweeks = to_categorical(pd.to_datetime(df['time']).dt.dayofweek, 7)
hours = to_categorical(pd.to_datetime(df['time']).dt.hour, 24)

print(dayofweeks[0:5])
print(hours[0:5])

[[0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]]
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


## Sample Weights

Weight `score=1` samples lower so model places a higher importance on atypical submissions.

In [12]:
weights = np.where(df['score'].values == 1, 0.5, 1.0)
print(weights[0:5])

[1.  0.5 1.  0.5 0.5]


## Trend and Time on New

Unused in final model, but kept here for reference.

In [13]:
from sklearn.preprocessing import MinMaxScaler

trend_encoder = MinMaxScaler()
trends = trend_encoder.fit_transform(pd.to_datetime(df['time']).values.reshape(-1, 1))
trends[0:5]

array([[0.58101883],
       [0.44240484],
       [0.58232834],
       [0.20376753],
       [0.71423612]])

In [14]:
# newtime_encoder = MinMaxScaler()
# newtimes = trend_encoder.fit_transform(df['time_on_new'].values.reshape(-1, 1))
# newtimes[0:5]

## Build the Model Prototype

Add R^2 as a performance metric: https://jmlb.github.io/ml/2017/03/20/CoeffDetermination_CustomMetric4Keras/

In [15]:
# from keras import backend as K
epsilon = 1e-7
def r_2(y_true, y_pred):
    SS_res =  torch.sum(torch.square( y_true - y_pred )) 
    SS_tot = torch.sum(torch.square( y_true - torch.mean(y_true) ) ) 
    return ( 1 - SS_res/(SS_tot + epsilon) )

In [16]:
def hybrid_loss(y_true, y_pred):
    weight_mae = 0.1
    weight_msle = 1.
    weight_poisson = 0.1
    
    mae_loss = weight_mae * torch.mean(torch.abs(y_pred - y_true), axis=-1)
    
    first_log = torch.log(torch.clip(y_pred, 1, None) + 1.)
    second_log = torch.log(torch.clip(y_true, epsilon, None) + 1.)
    msle_loss = weight_msle * torch.mean(torch.square(first_log - second_log), axis=-1)
    
    poisson_loss = weight_poisson * torch.mean(y_pred - y_true * torch.log(y_pred + epsilon), axis=-1)
    return torch.mean(mae_loss + msle_loss + poisson_loss)

In [17]:
domains = torch.tensor(domains)
dayofweeks = torch.tensor(dayofweeks)
hours = torch.tensor(hours)

In [18]:
titles.shape, domains.shape, dayofweeks.shape, hours.shape

(torch.Size([129814, 15]),
 torch.Size([129814, 100]),
 torch.Size([129814, 7]),
 torch.Size([129814, 24]))

In [19]:
class Model(nn.Module):
    def __init__(self, num_words=15, num_hidden_layers=5):
        super().__init__()

        self.embedding_titles = nn.Embedding(31001, 50)
        self.spatial_dropout = nn.Dropout2d(0.2)
        self.rnn_titles = nn.LSTM(50, 128)

        self.hidden_layers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(259, 259),
                nn.ReLU(),
                nn.BatchNorm1d(259),
                nn.Dropout(0.5)
            )
            for _ in range(num_hidden_layers)
        ])

        self.output_layer = nn.Linear(259, 1)

    def forward(self, input_titles, input_domains, input_dayofweeks, input_hours):
        embedding_titles = self.embedding_titles(input_titles)
        spatial_dropout = self.spatial_dropout(embedding_titles)
        rnn_titles, _ = self.rnn_titles(spatial_dropout.permute(1, 0, 2))

        concat = torch.cat([rnn_titles[-1], input_domains, input_dayofweeks, input_hours], dim=1)
        for layer in self.hidden_layers:
            concat = layer(concat)

        output = self.output_layer(concat)
        return output

In [110]:
import torch.optim as optim
model = Model()
batch_lr = 1e-3
num_epochs = 100
optimizer = optim.Adam(model.parameters(), lr=1e-3)
lr_scheduler = optim.lr_scheduler.LinearLR(optimizer)

In [None]:
from tqdm import tqdm

batch_size = 12
total_len = titles.shape[0]
gt_scores = torch.tensor(df['score'].values)
for epoch in range(num_epochs):
    loss_per_batch = []
    for i in tqdm(range(0, total_len, batch_size)):
        X_title = titles[i:i+batch_size]
        X_domain = domains[i:i+batch_size]
        X_dayofweeks = dayofweeks[i:i+batch_size]
        X_hours = hours[i:i+batch_size]
        y_true = gt_scores[i:i+batch_size]

        score = model(X_title, X_domain, X_dayofweeks, X_hours)
        
        optimizer.zero_grad()
        loss = hybrid_loss(score, y_true)
        loss.backward()
        loss_per_batch.append(loss.item())
        optimizer.step()
    
    print(f"Epoch: {epoch}/{num_epochs} ; Training loss: {np.mean(loss_per_batch)}")

The model uses a linear learning rate decay to allow it to learn better once it starts converging.

Note: in this Kaggle Notebook, the training times out after 33 epochs when committing, so I set it to 25 here. You should probably train for longer. (50+ epochs)

## Check Predictions Against Validation Set

Predicting against data that was not trained in the model: the model does this poorly. :(

In [20]:
model = torch.load('model_v2.pkl')

In [21]:
import hopsworks
project = hopsworks.login()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/223381


In [22]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

In [35]:
X1.dtype, X2.dtype, X3.dtype, X4.dtype

(torch.int64, torch.int64, torch.uint8, torch.uint8)

In [34]:
input_schema = Schema([X1, X2, X3, X4])
output_schema = Schema(score)
model_schema = ModelSchema(input_schema, output_schema)

RuntimeError: Tensor.__contains__ only supports Tensor or scalar, but you passed in a <class 'str'>.

In [None]:
hknews_model = mr.python.create_model(
    name="knews_model",
    version=2,
    model_schema=model_schema,
    description="hknews_model mark 1"
)
model_dir = './model_dir/'
hknews_model.save(model_dir)

In [24]:
def prepare_for_pred(title, url, time):
    title = pad_sequences(text_pipeline(title)).unsqueeze(0)
    domain = torch.tensor(domain_encoder.transform(np.array([url_to_domain(url)]).astype(str)))
    dayofweek = torch.tensor(to_categorical(pd.to_datetime(time).dayofweek, 7)).unsqueeze(0)
    hour = torch.tensor(to_categorical(pd.to_datetime(time).hour, 24)).unsqueeze(0)
    return title, domain, dayofweek, hour

In [25]:
title = test_df['title'].iloc[55][0]
url = test_df['url'].iloc[55]
time = test_df['time'].iloc[55]

In [26]:
X1, X2, X3, X4 = prepare_for_pred(title, url, time)

In [27]:
X1.shape, X2.shape, X3.shape, X4.shape

(torch.Size([1, 15]),
 torch.Size([1, 100]),
 torch.Size([1, 7]),
 torch.Size([1, 24]))

In [28]:
model.eval()
score = model(X1, X2, X3, X4)



In [29]:
score

tensor([[4.1687]], grad_fn=<AddmmBackward0>)

In [152]:
val_size = int(split_prop * df.shape[0])

predictions = model.predict([titles[-val_size:],
                             domains[-val_size:],
                             dayofweeks[-val_size:],
                             hours[-val_size:]])[:, 0]

predictions

NameError: name 'split_prop' is not defined

In [None]:
df_preds = pd.concat([pd.Series(df['title'].values[-val_size:]),
                      pd.Series(df['score'].values[-val_size:]),
                      pd.Series(predictions)],
                     axis=1)
df_preds.columns = ['title', 'actual', 'predicted']
# df_preds.to_csv('hn_val.csv', index=False)
df_preds.head(50)

## Check Predictions Against Training Set

The model should be able to predict these better.

In [None]:
train_size = int((1-split_prop) * df.shape[0])

predictions = model.predict([titles[:train_size],
                             domains[:train_size],
                             dayofweeks[:train_size],
                             hours[:train_size]])[:, 0]

df_preds = pd.concat([pd.Series(df['title'].values[:train_size]),
                      pd.Series(df['score'].values[:train_size]),
                      pd.Series(predictions)],
                     axis=1)
df_preds.columns = ['title', 'actual', 'predicted']
# df_preds.to_csv('hn_train.csv', index=False)
df_preds.head(50)