Text Summarisation

In [1]:
import numpy as np
import os
from random import shuffle
import re
import sklearn as sk
import matplotlib.pyplot as plt
import random

In [2]:
import urllib.request
import zipfile
import lxml.etree

In [3]:
import tensorflow as tf

In [4]:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
    
raw_text = doc.xpath('//content/text()')
raw_summary = doc.xpath('//head/description/text()')

del doc

In [5]:
talk_sentences = []
talknum = len(raw_text)

for i in range(talknum):
    temp = re.sub(r'\([^)]*\)', '', raw_text[i])
    temp = re.sub(r'\n', '', raw_text[i])
    temp = temp.split('.')
    talk_sentences.append(temp)

In [6]:
summary_sentences = []
talknum = len(raw_summary)

for i in range(talknum):
    temp = re.sub(r'\([^)]*\)', '', raw_summary[i])
    temp = re.sub(r'\n', '', raw_summary[i])
    temp = temp.split('.')
    summary_sentences.append(temp)

In [7]:
print(talk_sentences[0][1])

To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation


In [8]:
print(summary_sentences[0][1])

 He shares insights on how to strike a balance between perfecting what we already know and exploring totally new ideas -- and lays out how to avoid two major strategy traps


In [9]:
talk_sentence_word = []

for talk in talk_sentences:
    temp = []
    for sent in talk:
        tokens = re.sub(r"[^a-z0-9]+", " ", sent.lower()).split()
        temp.append(tokens)#
    talk_sentence_word.append(temp)

In [10]:
summary_sentence_word = []

for talk in summary_sentences:
    temp = []
    for sent in talk:
        tokens = re.sub(r"[^a-z0-9]+", " ", sent.lower()).split()
        temp.append(tokens)#
    summary_sentence_word.append(temp)

In [11]:
talk_sentence_word[0][1]

['to',
 'me',
 'the',
 'real',
 'real',
 'solution',
 'to',
 'quality',
 'growth',
 'is',
 'figuring',
 'out',
 'the',
 'balance',
 'between',
 'two',
 'activities',
 'exploration',
 'and',
 'exploitation']

In [12]:
summary_sentence_word[0][1]

['he',
 'shares',
 'insights',
 'on',
 'how',
 'to',
 'strike',
 'a',
 'balance',
 'between',
 'perfecting',
 'what',
 'we',
 'already',
 'know',
 'and',
 'exploring',
 'totally',
 'new',
 'ideas',
 'and',
 'lays',
 'out',
 'how',
 'to',
 'avoid',
 'two',
 'major',
 'strategy',
 'traps']

In [13]:
temp = list(zip(talk_sentence_word, summary_sentence_word))
random.shuffle(temp)
talk_sentence_word_shuffle, summary_sentence_word_shuffle = zip(*temp)

In [14]:
train_data = talk_sentence_word_shuffle[:1835]
test_data = talk_sentence_word_shuffle[1835:]

train_summaries = summary_sentence_word_shuffle[:1835]
test_summaries = summary_sentence_word_shuffle[1835:]

In [15]:
print(np.shape(talk_sentence_word_shuffle))
print(np.shape(summary_sentence_word_shuffle))
print(np.shape(train_data))
print(np.shape(test_data))
print(np.shape(train_summaries))
print(np.shape(test_summaries))

(2085,)
(2085,)
(1835,)
(250,)
(1835,)
(250,)


In [16]:
train_talk_lenghts = []

for talk in train_data:
    N = 0
    for sent in talk:
        for word in sent:
            N += 1
    train_talk_lenghts.append(N)

In [17]:
train_summary_lenghts = []

for talk in train_summaries:
    N = 0
    for sent in talk:
        for word in sent:
            N += 1
    train_summary_lenghts.append(N)

In [18]:
print(train_talk_lenghts[:10])
print(train_summary_lenghts[:10])

[3641, 3243, 3357, 431, 2388, 1235, 2749, 769, 1885, 1492]
[36, 70, 57, 64, 73, 60, 47, 47, 66, 52]


In [19]:
test_talk_lenghts = []

for talk in test_data:
    N = 0
    for sent in talk:
        for word in sent:
            N += 1
    test_talk_lenghts.append(N)

In [20]:
test_summary_lenghts = []

for talk in test_summaries:
    N = 0
    for sent in talk:
        for word in sent:
            N += 1
    test_summary_lenghts.append(N)

In [21]:
print(test_talk_lenghts[:10])
print(test_summary_lenghts[:10])

[1585, 2804, 1848, 1157, 2808, 931, 2942, 1308, 801, 790]
[29, 50, 116, 53, 43, 88, 39, 50, 52, 62]


In [22]:
train_talk_word = []

for talk in train_data:
    temp = []
    for sent in talk:
        for word in sent:
            temp.append(word)
    train_talk_word.append(temp)
    
print(len(train_talk_word[0]))

3641


In [23]:
train_summary_word = []

for talk in train_summaries:
    temp = []
    for sent in talk:
        for word in sent:
            temp.append(word)
    train_summary_word.append(temp)
    
print(len(train_summary_word[0]))

36


In [24]:
test_talk_word = []

for talk in test_data:
    temp = []
    for sent in talk:
        for word in sent:
            temp.append(word)
    test_talk_word.append(temp)
    
print(len(test_talk_word[0]))

1585


In [25]:
test_summary_word = []

for talk in test_summaries:
    temp = []
    for sent in talk:
        for word in sent:
            temp.append(word)
    test_summary_word.append(temp)
    
print(len(test_summary_word[0]))

29


In [26]:
vocab = {}
N = 1

for talk in train_data:
    for sent in talk:
        for word in sent:
            if word in vocab:
                N = N
            else:
                vocab[word] = N
                N += 1
                
vocab["unknown_word"] = 0

In [27]:
print(len(vocab))

52083


In [28]:
train_talk_word_index = []

for talk in train_talk_word:
    temp = []
    for word in talk:
        if word in vocab:
            temp.append(vocab[word])
        else:
            temp.append(vocab["unknown_word"])
    train_talk_word_index.append(temp)

In [29]:
train_summary_word_index = []

for talk in train_summary_word:
    temp = []
    for word in talk:
        if word in vocab:
            temp.append(vocab[word])
        else:
            temp.append(vocab["unknown_word"])
    train_summary_word_index.append(temp)

In [30]:
test_talk_word_index = []

for talk in test_talk_word:
    temp = []
    for word in talk:
        if word in vocab:
            temp.append(vocab[word])
        else:
            temp.append(vocab["unknown_word"])
    test_talk_word_index.append(temp)

In [31]:
test_summary_word_index = []

for talk in test_summary_word:
    temp = []
    for word in talk:
        if word in vocab:
            temp.append(vocab[word])
        else:
            temp.append(vocab["unknown_word"])
    test_summary_word_index.append(temp)

In [38]:
train_talk_word_index_exp = []

for i in range(len(train_talk_word_index)):
    temp = []
    for j in range(max(train_talk_lenghts)):
        if j <= (train_talk_lenghts[i]-1):
            temp.append(train_talk_word_index[i][j])
        else:
            temp.append(0)
    train_talk_word_index_exp.append(temp)

In [39]:
train_summary_word_index_exp = []

for i in range(len(train_summary_word_index)):
    temp = []
    for j in range(max(train_summary_lenghts)):
        if j <= (train_summary_lenghts[i]-1):
            temp.append(train_summary_word_index[i][j])
        else:
            temp.append(0)
    train_summary_word_index_exp.append(temp)

In [40]:
test_talk_word_index_exp = []

for i in range(len(test_talk_word_index)):
    temp = []
    for j in range(max(test_talk_lenghts)):
        if j <= (test_talk_lenghts[i]-1):
            temp.append(test_talk_word_index[i][j])
        else:
            temp.append(0)
    test_talk_word_index_exp.append(temp)

In [41]:
test_summary_word_index_exp = []

for i in range(len(test_summary_word_index)):
    temp = []
    for j in range(max(test_summary_lenghts)):
        if j <= (test_summary_lenghts[i]-1):
            temp.append(test_summary_word_index[i][j])
        else:
            temp.append(0)
    test_summary_word_index_exp.append(temp)

In [48]:
tf.reset_default_graph()
try:
    sess.close()
except:
    
    pass
sess = tf.InteractiveSession()

In [49]:
input_seq_length = 16
output_seq_length = 15
batch_size = 128

input_vocab_size = 70
output_vocab_size = 28
embedding_dim = 256

In [50]:
encode_input = [tf.placeholder(tf.int32, 
                                shape=(None,),
                                name = "ei_%i" %i)
                                for i in range(input_seq_length)]

labels = [tf.placeholder(tf.int32,
                                shape=(None,),
                                name = "l_%i" %i)
                                for i in range(output_seq_length)]

decode_input = [tf.zeros_like(encode_input[0], dtype=np.int32, name="GO")] + labels[:-1]

In [64]:
keep_prob = tf.placeholder("float")

cells = [tf.nn.rnn_cell.DropoutWrapper(
        tf.nn.rnn_cell.BasicLSTMCell(embedding_dim), output_keep_prob=keep_prob
    ) for i in range(3)]

stacked_lstm = tf.nn.rnn_cell.MultiRNNCell(cells)

with tf.variable_scope("decoders") as scope:
    decode_outputs, decode_state = tf.nn.seq2seq.embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm, input_vocab_size, output_vocab_size, reuse=True)
    
    scope.reuse_variables()
    
    decode_outputs_test, decode_state_test = tf.nn.seq2seq.embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm, input_vocab_size, output_vocab_size,
    feed_previous=True)



TypeError: embedding_rnn_seq2seq() got an unexpected keyword argument 'reuse'