# News data modeling

In [1]:
# !pip install mkl

Collecting mkl
[?25l  Downloading https://files.pythonhosted.org/packages/ac/1e/c713b011b90cd238023df1c0025130c40bc40870a46273d942e89114233c/mkl-2019.0-py2.py3-none-macosx_10_12_intel.macosx_10_12_x86_64.whl (193.8MB)
[K    100% |████████████████████████████████| 193.8MB 256kB/s ta 0:00:011   35% |███████████▏                    | 67.9MB 7.5MB/s eta 0:00:17    90% |████████████████████████████▉   | 174.6MB 5.6MB/s eta 0:00:04
[?25hCollecting intel-openmp (from mkl)
[?25l  Downloading https://files.pythonhosted.org/packages/df/51/1138f9df9fa1659c035927297d275a57404f174a1405febe4a5084e77320/intel_openmp-2019.0-py2.py3-none-macosx_10_12_intel.macosx_10_12_x86_64.whl (1.1MB)
[K    100% |████████████████████████████████| 1.1MB 3.3MB/s ta 0:00:011
[?25hInstalling collected packages: intel-openmp, mkl
Successfully installed intel-openmp-2019.0 mkl-2019.0


In [16]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
# Others
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk import tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.manifold import TSNE
# ref: https://medium.com/@sabber/classifying-yelp-review-comments-using-cnn-lstm-and-pre-trained-glove-word-embeddings-part-3-53fcea9a17fa

Using TensorFlow backend.


In [7]:
# Load CNN
import pickle
news_summaries = pickle.load(open('/Users/dbm/Documents/Insight S19/data/cnn_dmail_news_summary.pkl', 'rb'))
print('Loaded CNN+DailyMail Stories %d' % len(news_summaries))

Loaded CNN+DailyMail Stories 312085


## Tokenize text

In [8]:
story = []
summary = []
# range(len(master))
for i in news_summaries:
#     story["story"] = nltk.wordpunct_tokenize(story["story"])
#     story["summary"] = nltk.wordpunct_tokenize(story["summary"])
#     story.append(nltk.wordpunct_tokenize(story["story"]))
#     summary.append(nltk.wordpunct_tokenize(story["story"]))
    story.append(i["story"])
    summary.append(i["summary"])    

In [4]:
# For story
tknzr_story = Tokenizer(num_words = 300000)
tknzr_story.fit_on_texts(story)
seq_story = tknzr_story.texts_to_sequences(story)

word_idx_story = tknzr_story.word_index
print('Found %s unique tokens in the stories.' % len(word_idx_story))

Found 8444059 unique tokens in the stories.


In [5]:
# For summary
tknzr_summary = Tokenizer(num_words = 300000)
tknzr_summary.fit_on_texts(summary)
seq_summary = tknzr_summary.texts_to_sequences(summary)

word_idx_summary = tknzr_summary.word_index
print('Found %s unique tokens in the summaries.' % len(word_idx_summary))

Found 1123526 unique tokens in the summaries.


In [8]:
# summary_length = []
# story_length = []
# for i in news_summaries:
#     story_length.append(len(i["story"]))
#     summary_length.append(len(i["summary"]))
# max_story_length = max(story_length)
# max_summary_length = max(summary_length)

In [9]:
# print(f"Max story length = {max_story_length}", f"Max summary length = {max_summary_length}")

Max story length = 396 Max summary length = 128


In [6]:
# Pad the datasets to have similar lengths
story_data = pad_sequences(seq_story, maxlen=500)
summary_data = pad_sequences(seq_summary, maxlen=500)

print('Shape of data tensor:', story_data.shape)

Shape of data tensor: (312085, 500)


In [7]:
print('Shape of label tensor:', summary_data.shape)

Shape of label tensor: (312085, 500)


## Split data into training and test

In [8]:
indices = np.arange(story_data.shape[0])
np.random.shuffle(indices)
story_data = story_data[indices]
summary_data = summary_data[indices]
nb_validation_samples = int(0.20 * story_data.shape[0])

x_train = story_data[:-nb_validation_samples]
y_train = summary_data[:-nb_validation_samples]
x_val = story_data[-nb_validation_samples:]
y_val = summary_data[-nb_validation_samples:]

## Embedding layer

In [9]:
word_embeddings = {}
f = open('/Users/dbm/Documents/Insight S19/data/glove.840B.300d.txt', encoding='utf-8')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    word_embeddings[word] = coefs
f.close()
print('Loaded dailymail Stories %d' %len(word_embeddings))

Loaded dailymail Stories 2195884


In [None]:
embedding_matrix = np.zeros((len(word_index_story) + 1, len(word_embeddings)))
for word, i in word_idx.items():
    embedding_vector = word_embeddings.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

## LSTM

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Word2Vec

In [4]:
import gensim as gs

In [9]:
story[0]

['at the start of a big week for the higgs boson the most sought after particle in all of physics scientists in illinois said monday that they had crept closer to proving that the particle exists but had been unable to reach a definitive conclusion.',
 "the scientists outlined their final analysis based on more than 10 years of research and 500 trillion particle collisions using the u.s. department of energy's fermilab tevatron collider near batavia illinois whose budgetary woes shut it down last year.",
 'what is the higgs boson and why is it important?',
 'their announcement came two days before researchers at the large hadron collider under the alps are due to unveil their latest results at an eagerly awaited seminar at the cern particle physics laboratory in geneva switzerland.',
 'our data strongly point toward the existence of the higgs boson rob roser a spokesman for one of two independent experiments at the tevatron said in a statement. but it will take results from the experim

In [31]:
from nltk.tokenize import RegexpTokenizer

def tokenize_word(txt):
    tokenized_words = []
    # For each line in text
    for l in txt:
        # keep words
        # Split by whitespace
        tokenizer = RegexpTokenizer(r"[a-z]+")        
        l = tokenizer.tokenize(l)
        tokenized_words.append(l)
    # Remove empty strings
    tokenized_words = [c_txt for c_txt in tokenized_words if len(c_txt) > 0]
    tokenized_words.append(" ".join(l))
    return tokenized_words

story_w2v = []
summary_w2v = []

for i in news_summaries:
    story_w2v.append(tokenize_word(i["story"]))
    summary_w2v.append(tokenize_word(i["summary"]))
# for i in story:
#     tokenizer = RegexpTokenizer(r"\w+")        
#     i = tokenizer.tokenize(i)
#     story_w2v.append(i)
                     
# for i in summary:    
#     summary_w2v.append(gs.utils.simple_preprocess(i))    

In [32]:
story_w2v[0]

[['at',
  'the',
  'start',
  'of',
  'a',
  'big',
  'week',
  'for',
  'the',
  'higgs',
  'boson',
  'the',
  'most',
  'sought',
  'after',
  'particle',
  'in',
  'all',
  'of',
  'physics',
  'scientists',
  'in',
  'illinois',
  'said',
  'monday',
  'that',
  'they',
  'had',
  'crept',
  'closer',
  'to',
  'proving',
  'that',
  'the',
  'particle',
  'exists',
  'but',
  'had',
  'been',
  'unable',
  'to',
  'reach',
  'a',
  'definitive',
  'conclusion'],
 ['the',
  'scientists',
  'outlined',
  'their',
  'final',
  'analysis',
  'based',
  'on',
  'more',
  'than',
  'years',
  'of',
  'research',
  'and',
  'trillion',
  'particle',
  'collisions',
  'using',
  'the',
  'u',
  's',
  'department',
  'of',
  'energy',
  's',
  'fermilab',
  'tevatron',
  'collider',
  'near',
  'batavia',
  'illinois',
  'whose',
  'budgetary',
  'woes',
  'shut',
  'it',
  'down',
  'last',
  'year'],
 ['what',
  'is',
  'the',
  'higgs',
  'boson',
  'and',
  'why',
  'is',
  'it',
  '

In [33]:
story_w2v_model = gs.models.Word2Vec(story_w2v, size=350, window=10, min_count=2, workers=10)

TypeError: unhashable type: 'list'

In [None]:
story_w2v_model.train(story_w2v,total_examples=len(story_w2v),epochs=10)