**Text Summarization Using Keras and Tensorflow Backend**

In [205]:
# Importing all the modules
import numpy as np
import tensorflow as tf
import newspaper as ns
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dropout, Dense, LSTM, Activation
import os
import pickle
import re
from nltk.tokenize import word_tokenize

**Reading BBC News Dataset**

In [183]:
heading = {}
description = {}
for root, dirs, files in os.walk('bbc/'):
    for key, name in enumerate(files):
        if ".txt" in name:
            with open(os.path.join(root, name), "rb") as f:
                tmp = f.read().splitlines()
                heading[key] = tmp[0].decode('utf-8')
                description[key] = tmp[2:]

In [184]:
len(heading)

511

In [185]:
len(description)

511

In [186]:
print("Heading: ", heading[0])
print("Description:\n", description[0])

Heading:  Mobiles rack up 20 years of use
Description:
 [b'Mobile phones in the UK are celebrating their 20th anniversary this weekend.', b'', b"Britain's first mobile phone call was made across the Vodafone network on 1 January 1985 by veteran comedian Ernie Wise. In the 20 years since that day, mobile phones have become an integral part of modern life and now almost 90% of Britons own a handset. Mobiles have become so popular that many people use their handset as their only phone and rarely use a landline.", b'', b"The first ever call over a portable phone was made in 1973 in New York but it took 10 years for the first commercial mobile service to be launched. The UK was not far behind the rest of the world in setting up networks in 1985 that let people make calls while they walked. The first call was made from St Katherine's dock to Vodafone's head office in Newbury which at the time was over a curry house. For the first nine days of 1985 Vodafone was the only firm with a mobile net

In [187]:
for k, v in description.items():
    description[k] = "".join(w.decode('utf-8') for w in v)

In [188]:
print(heading[0])
print(description[0])

Mobiles rack up 20 years of use
Mobile phones in the UK are celebrating their 20th anniversary this weekend.Britain's first mobile phone call was made across the Vodafone network on 1 January 1985 by veteran comedian Ernie Wise. In the 20 years since that day, mobile phones have become an integral part of modern life and now almost 90% of Britons own a handset. Mobiles have become so popular that many people use their handset as their only phone and rarely use a landline.The first ever call over a portable phone was made in 1973 in New York but it took 10 years for the first commercial mobile service to be launched. The UK was not far behind the rest of the world in setting up networks in 1985 that let people make calls while they walked. The first call was made from St Katherine's dock to Vodafone's head office in Newbury which at the time was over a curry house. For the first nine days of 1985 Vodafone was the only firm with a mobile network in the UK. Then on 10 January Cellnet (now

**Scraping content to test the text summarizer**

In [8]:
site = "https://www.cnbc.com/2018/01/12/earnings-will-be-the-next-test-for-the-seemingly-unstoppable-30-trillion-stock-market.html"
article = ns.Article(site)
article.download()
article.parse()
text = article.text
text



**Creating the text corpus for creating vocablary**

In [316]:
all_text = []
for (i, j) in list(zip(description.values(), heading.values())):
    all_text.append(word_tokenize(i + j))

print(all_text[0])
vocab_list = [i.lower() for j in all_text for i in j]

['Mobile', 'phones', 'in', 'the', 'UK', 'are', 'celebrating', 'their', '20th', 'anniversary', 'this', 'weekend.Britain', "'s", 'first', 'mobile', 'phone', 'call', 'was', 'made', 'across', 'the', 'Vodafone', 'network', 'on', '1', 'January', '1985', 'by', 'veteran', 'comedian', 'Ernie', 'Wise', '.', 'In', 'the', '20', 'years', 'since', 'that', 'day', ',', 'mobile', 'phones', 'have', 'become', 'an', 'integral', 'part', 'of', 'modern', 'life', 'and', 'now', 'almost', '90', '%', 'of', 'Britons', 'own', 'a', 'handset', '.', 'Mobiles', 'have', 'become', 'so', 'popular', 'that', 'many', 'people', 'use', 'their', 'handset', 'as', 'their', 'only', 'phone', 'and', 'rarely', 'use', 'a', 'landline.The', 'first', 'ever', 'call', 'over', 'a', 'portable', 'phone', 'was', 'made', 'in', '1973', 'in', 'New', 'York', 'but', 'it', 'took', '10', 'years', 'for', 'the', 'first', 'commercial', 'mobile', 'service', 'to', 'be', 'launched', '.', 'The', 'UK', 'was', 'not', 'far', 'behind', 'the', 'rest', 'of', 'th

In [372]:
def indexing(txt):
    """
    The function creates word to index mapping
    Param:
        txt: List of Vocab words
    """
    vocab = set(txt)
    vocab_to_idx = {v:k for k, v in enumerate(vocab)}
    idx_to_vocab = {v:k for k, v in vocab_to_idx.items()}
    return vocab, vocab_to_idx, idx_to_vocab

vocab, vocab_to_idx, idx_to_vocab = indexing(vocab_list)

In [373]:
print(len(vocab_list))
print(len(vocab))
print(len(vocab_to_idx))
print(len(all_text))

267650
17279
17279
511


**Reading in the pre-trained GloVe vectors from https://nlp.stanford.edu/projects/glove/** 

In [12]:
with open("glove.6B.200d.txt","rb") as f:
    glove = f.readlines()

In [13]:
def glove_dict(glove_vector):
    """
    The function creates mapping between words and the GloVe vectors
    Param:
        glove_vector: List of GloVe words and their weights
    """
    word_weights = []
    for word in glove_vector:
        word_weights.append(word.split())
    
    print("Creating GloVe word and weight dictionary....")
    glove_words_weights = dict((i[0], i[1:]) for i in word_weights)
    
    print("Completed!")
    return glove_words_weights

glove_words_weights = glove_dict(glove)
pickle.dump(glove_words_weights, open('glove_words_weight.pkl','wb'))

Creating GloVe word and weight dictionary....
Completed!


In [361]:
n_embeddings = 200
n_glove_symbols = len(glove_words_weights.keys())
print("Number of GloVe symbols: ", n_glove_symbols)
glove_weight_matrix = np.empty((n_glove_symbols, n_embeddings))
print(glove_weight_matrix.shape)
print(glove_weight_matrix[:10,:10])

Number of GloVe symbols:  400000
(400000, 200)
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


In [362]:
c = 0
glove_index_dict = {}
global_scale = .1
for i in glove_words_weights.keys():
    w = i.decode("utf-8")
    glove_index_dict[w] = c 
    glove_weight_matrix[c,:] = glove_words_weights[i] 
    c += 1
    if c % 100000 == 0:
        print(c)
        
glove_weight_matrix *= global_scale
print("GloVe weight matrix std...", glove_weight_matrix.std())
print(glove_weight_matrix[:5,:5])

100000
200000
300000
400000
GloVe weight matrix std... 0.0381862057272
[[-0.0071549   0.0093459   0.0023738  -0.0090339   0.0056123 ]
 [ 0.017651    0.029208   -0.00020768 -0.037523    0.00049139]
 [ 0.012289    0.058037   -0.0069635  -0.050288    0.010503  ]
 [ 0.0052924   0.025427    0.031353   -0.035613    0.0029629 ]
 [ 0.057346    0.05417    -0.023477   -0.03624     0.04037   ]]


In [363]:
vocab_size = len(vocab)
shape = (vocab_size, n_embeddings)
scale = glove_weight_matrix.std() * np.sqrt(12/2)
print("Scale: ", scale)
embedding = np.random.uniform(low=-scale, high=scale, size=shape)
print("Embedding shape...",embedding.shape,"Std...",embedding.std())

Scale:  0.0935367192446
Embedding shape... (17279, 200) Std... 0.0540163978217


In [374]:
c = 0
for i in range(vocab_size):
    w = idx_to_vocab[i]
    g = glove_index_dict.get(w, glove_index_dict.get(w))
    if g is not None:
        embedding[i,:] = glove_weight_matrix[g]
        c +=1

print("No. of tokens, found in GloVe matrix and copied to embedding...", c, float(c / vocab_size))

No. of tokens, found in GloVe matrix and copied to embedding... 14354 0.8307193703339314


**Lots of words in the vocabulary not found in pre-trained GloVe dictionary. For those words, trying find the closest word in the GloVe dictionary** 

In [375]:
glove_threshold = 0.5
word2glove = {}
for w in vocab_to_idx:
    if w in glove_index_dict:
        g = w
        word2glove[w] = g

len(word2glove)

14354

In [376]:
normed_embedding = embedding/np.array([np.sqrt(np.dot(gweight,gweight)) for gweight in embedding])[:,None]

nb_unknown_words = 2600

glove_match = []
for w, idx in vocab_to_idx.items():
    if idx >= vocab_size-nb_unknown_words and w.isalpha() and w in word2glove:
        gidx = glove_index_dict[word2glove[w]]
        gweight = glove_weight_matrix[gidx,:].copy()
        
        # find row in embedding that has the highest cos score with gweight
        gweight /= np.sqrt(np.dot(gweight,gweight))
        score = np.dot(normed_embedding[:vocab_size-nb_unknown_words], gweight)
        while True:
            embedding_idx = score.argmax()
            s = score[embedding_idx]
            if s < glove_threshold:
                break
            if idx_to_vocab[embedding_idx] in word2glove :
                glove_match.append((w, embedding_idx, s)) 
                break
            score[embedding_idx] = -1
glove_match.sort(key = lambda x: -x[2])
print('# of glove substitutes found', len(glove_match))

# of glove substitutes found 1678


**Manually checking the substitute words**

In [377]:
for orig, sub, score in glove_match[-10:]:
    print(score, orig, '=>', idx_to_vocab[sub])

0.505118716841 dots => pixels
0.504919795961 wallets => pockets
0.504600572303 scratches => bumps
0.504364499959 programmed => configured
0.503212410739 issuers => card
0.50306687316 speegle => nienhuis
0.502371861575 podshow => bbci
0.501750363598 mater => cornell
0.500028319271 aus => x1
0.500019282124 upstart => rival


In [378]:
glove_idx2idx = dict((vocab_to_idx[w], embedding_idx) for  w, embedding_idx, _ in glove_match)

In [379]:
print(embedding.shape)
print(len(glove_match))

(17279, 200)
1678


In [380]:
c = 0
for i in glove_idx2idx.keys():
    g = glove_idx2idx[i]
    embedding[i, :] = embedding[g]
    c += 1

print("Number of word substitued with the closest word is...", c)

Number of word substitued with the closest word is... 1678


**Building Encoder Network**

In [408]:
def model_builder(embeds):
    model = keras.Sequential()
    model.add(Embedding(weights=[embeds], name="embedding_1", input_dim=vocab_size,
                        output_dim=n_embeddings, input_length = max_length))
    for i in range(2):
        lstm = LSTM(rnn_size, name="layer_%s" %(i), return_sequences=True)
        model.add(lstm)
        model.add(Dropout(prob, name="drop_%s" %(i)))
        
    lstm = LSTM(rnn_size, name="layer_2", return_sequences=False)
    model.add(lstm)
    model.add(Dropout(prob, name="drop_2"))
    model.add(Dense(1))
    model.add(Activation('softmax', name="activation"))
    return model

In [409]:
rnn_size = 200
prob = 0.5
encoder = model_builder(embedding)
encoder.compile(loss='categorical_crossentropy', optimizer='rmsprop')

**Encoding document words with their the rescpective word indices (i.e. vocab_to_idx) and paddign them to make them of same lenght**

In [381]:
desc_list = []
for i in description.values():
    desc_list.append(word_tokenize(i))

In [382]:
head_list = []
for i in heading.values():
    head_list.append(word_tokenize(i))

In [383]:
head_list[0]

['Mobiles', 'rack', 'up', '20', 'years', 'of', 'use']

In [386]:
doc2idx = []
for i in desc_list:
    doc2idx.append([vocab_to_idx[w.lower()] if w.lower() in vocab_to_idx.keys() else 0 for w in i])
    
print(len(doc2idx))

511


In [387]:
head2idx = []
for i in head_list:
    head2idx.append([vocab_to_idx[w.lower()] if w.lower() in vocab_to_idx.keys() else 0 for w in i])
    
print(head2idx[0])

[1230, 8893, 17174, 5792, 10803, 359, 11990]


In [414]:
max_length = 200
padded_docs = pad_sequences(doc2idx, maxlen=max_length)

In [415]:
padded_docs.shape

(511, 200)

**Fitting the Network defined on the Documents**

In [416]:
encoder.fit(padded_docs, head2idx, epochs=10, verbose=0)

ValueError: You are passing a target array of shape (511, 1) while using as loss `categorical_crossentropy`. `categorical_crossentropy` expects targets to be binary matrices (1s and 0s) of shape (samples, classes). If your targets are integer classes, you can convert them to the expected format via:
```
from keras.utils import to_categorical
y_binary = to_categorical(y_int)
```

Alternatively, you can use the loss function `sparse_categorical_crossentropy` instead, which does expect integer targets.