# News Engagement Prediction

Create a system that predicts the engagement of a given news article based on its content (title and/or article text):
* Data from https://webhose.io/free-datasets/popular-news-articles/
* word2vec-CNN-BiLSTM model
* Trained on Triton supercomputer GPUs
* Created in Jupyter Notebook using PyTorch
* Deliverable as a requestable API on AWS
* Possibly create a dashboard for users to test and rank different word choices

## Preprocessing

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import json
import pickle
import pytz
import warnings
# warnings.filterwarnings('ignore')
import random
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR

import preprocess as pp
import graph as gp
import hparams as hp
import model as ml
import train as tr
import postprocess as pop
import analysis as al

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Experiment selection.
#   ml_choice: cnn-bilstm
#   dataset:   webhose-popular
ml_choice = 'cnn-bilstm'
dataset = 'webhose-popular'

# Hyper-parameter overrides; handed to hp.setup_hparams() in a later cell.
update_dict = {
    'ml_choice': ml_choice,
    'dataset': dataset,
}

In [4]:
# Read the chosen dataset, then feed it through remove_keys() and
# cal_all_engagements(); each helper receives the previous helper's result.
data_all = pp.cal_all_engagements(
    pp.remove_keys(
        pp.read_data(dataset)
    )
)

# Two views of the processed articles that the following cells rely on.
titles = pp.get_titles(data_all)
data_all_ordered = pp.order_keys(data_all)

In [5]:
# Sanity check: pretty-print the first record returned by pp.order_keys().
pp.pprint(data_all_ordered[0])

{
    "title": "Here are The Funniest Reaction Memes To Beyonce's Pregnancy | Angie Martinez | Power 105.1 FM",
    "sanitized_title": "here are the funniest reaction memes to beyonces pregnancy",
    "text": "Here are The Funniest Reaction Memes To Beyonce's Pregnancy posted by Gabriel Pabon - \nThe internet wastes no time when reacting to big news especially when it comes to Beyonce . \nCheck out the funniest memes from today's announcement below. A photo posted by HONEY GERMAN (@honeygerman) on Feb 1, 2017 at 1:03pm PST A photo posted by HONEY GERMAN (@honeygerman) on Feb 1, 2017 at 12:57pm PST A photo posted by HONEY GERMAN (@honeygerman) on Feb 1, 2017 at 11:40am PST A photo posted by SSquared Podcast/Radio Show (@teamssquared) on Feb 1, 2017 at 2:07pm PST A photo posted by D-Roc \u264f\ufe0f (@inked_scorpio) on Feb 1, 2017 at 2:07pm PST A photo posted by At Random With Yeasha (@random_yeasha) on Feb 1, 2017 at 2:05pm PST Left or Right? \uf62d #beyonce pic.twitter.com/PPhB9QmUAy v

In [6]:
# Only the titles feed the model for now, which keeps training time down.
all_text = pp.get_all_text(titles)
words = pp.get_words(all_text)

# score: 'original', 'log_weigh', 'log_no_weigh', 'no_log_weigh', 'no_log_no_weigh'
# The unscaled scores keep their own name; the scaled version is what the
# rest of the notebook consumes as `scores`.
# NOTE(review): mean/std are computed over every article, i.e. before the
# train/val/test split performed in a later cell - confirm this is intended.
raw_scores = pp.get_scores(data_all_ordered)
mean, std = pp.get_mean_std(raw_scores)
scores = pp.scale_data(raw_scores, mean, std)

In [7]:
# Tokenise the word list and record its size as the embedding input dimension.
# NOTE(review): the model printed further down uses padding_idx=0; confirm that
# len(tokens) already leaves room for the padding index.
tokens = pp.tokenize_words(words)
update_dict['embed_in'] = len(tokens)
title_tokens = pp.tokenize_titles(titles, tokens)
# NOTE(review): title_lengths is computed before remove_shorts() runs, so it
# may no longer line up with the filtered lists below - confirm intent.
title_lengths = pp.get_title_lengths(title_tokens)
# Token lists, titles and scores are passed through remove_shorts() together
# (min_len=3) and rebound from its result, then the token lists are padded.
title_tokens, titles, scores = pp.remove_shorts(title_tokens, titles, scores, min_len=3)
padded_titles = pp.pad_titles(title_tokens)

In [8]:
# Split the padded titles (features) and the scaled scores (targets) into
# train / validation / test partitions using the same helper.
train_x, val_x, test_x = pp.split_data(padded_titles)
train_y, val_y, test_y = pp.split_data(scores)

# The two split_data() calls are independent, so verify that every feature
# partition has a target partition of the same length before building
# datasets from them.
# NOTE(review): equal lengths do not prove row alignment - if split_data()
# shuffles, confirm it does so deterministically for both calls.
assert len(train_x) == len(train_y), "train features/targets differ in length"
assert len(val_x) == len(val_y), "validation features/targets differ in length"
assert len(test_x) == len(test_y), "test features/targets differ in length"

print("Train Size: {}".format(train_x.shape),
      "\nValidation Size: {}".format(val_x.shape),
      "\nTest Size: {}".format(test_x.shape))

Train Size: (786, 25) 
Validation Size: (98, 25) 
Test Size: (99, 25)


In [9]:
# Wrap each (features, targets) pair in a dataset: train, validation, test.
train_data, val_data, test_data = (
    pp.create_tensor_dataset(features, targets)
    for features, targets in (
        (train_x, train_y),
        (val_x, val_y),
        (test_x, test_y),
    )
)

# One loader per dataset, created in the same train / validation / test order.
train_loader, val_loader, test_loader = (
    pp.create_loader(split)
    for split in (train_data, val_data, test_data)
)

In [12]:
# Look up the architecture entry for the selected model and build the
# hyper-parameters from it plus the overrides collected in update_dict.
architecture = ml.MODELS[ml_choice]
hps = hp.setup_hparams(architecture, update_dict)
hps = pp.update_hps(hps)
# NOTE(review): the class is hard-coded here even though ml_choice selected
# the architecture above - the two can drift apart if ml_choice changes.
# NOTE(review): the printed module list below shows embed/bilstm/fc layers but
# no convolution layer - confirm the CNN part is actually present.
model = ml.CNN_BiLSTM(hps)
print(model)

CNN_BiLSTM(
  (embed): Embedding(4317, 200, padding_idx=0)
  (bilstm): LSTM(200, 200, num_layers=2, dropout=0.1, bidirectional=True)
  (fc1): Linear(in_features=700, out_features=350, bias=True)
  (fc2): Linear(in_features=350, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [13]:
# Create an output folder, set the random seed, then run the full training
# routine for 10 epochs; model and hps are rebound from its return value.
# NOTE(review): the seed is set here, after the model was already constructed
# in the previous cell - confirm run_all() builds its own model, otherwise
# the initial weights are not covered by the seed.
# NOTE(review): test_loader is passed while val_loader (created above) is not
# used in any visible cell - confirm the test split is not used for tuning.
# NOTE(review): the recorded output of this cell ends in
# "NameError: name 'hps_data' is not defined", so it does not currently run.
folder = tr.create_folder()
seed = tr.set_seed()
model, hps, ml_file_losses = tr.run_all(train_loader, test_loader, folder, epochs=10)

[autoreload of train failed: Traceback (most recent call last):
  File "C:\Users\Eugene\anaconda3\envs\deep-learning\lib\site-packages\IPython\extensions\autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "C:\Users\Eugene\anaconda3\envs\deep-learning\lib\site-packages\IPython\extensions\autoreload.py", line 394, in superreload
    module = reload(module)
  File "C:\Users\Eugene\anaconda3\envs\deep-learning\lib\imp.py", line 314, in reload
    return importlib.reload(module)
  File "C:\Users\Eugene\anaconda3\envs\deep-learning\lib\importlib\__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 604, in _exec
  File "<frozen importlib._bootstrap_external>", line 783, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "C:\Code\Jupyter Notebook\news-engagement-prediction\news-engagement-prediction\train.py", line 4, in <module>
    from termcolor 

NameError: name 'hps_data' is not defined

In [None]:
# Train the model directly with tr.run() and plot the returned losses.
# NOTE(review): `opt` and `criterion` are not defined in any cell visible in
# this notebook and this cell has no execution count; on a fresh kernel it
# would raise NameError. Define both before this call.
# NOTE(review): the saved output starts with the source line of an
# F.mse_loss warning and the loss stays near 8.0 - presumably a target/input
# shape mismatch; confirm the prediction and target shapes agree.
train_losses = tr.run(model, train_loader, opt, criterion, hps)
gp.graph_losses(train_losses)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 1/100...
Train Loss: 8.227258
Epoch: 10/100...
Train Loss: 8.051239
Epoch: 20/100...
Train Loss: 7.995482
Epoch: 30/100...
Train Loss: 7.974991
Epoch: 40/100...
Train Loss: 7.969732
Epoch: 50/100...
Train Loss: 8.077700
Epoch: 60/100...
Train Loss: 8.028047
