<a href="https://colab.research.google.com/github/devhemza/deeplearningproject/blob/main/text_summarization_feats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# Colab magic: select TensorFlow 1.x, since the cells below rely on tf.contrib.
%tensorflow_version 1.x

In [20]:
# Mount Google Drive at /content/drive so the project folder stored there is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
# Change this to the path of the cloned repo; later cells use paths relative to it.
%cd /content/drive/MyDrive/M2/DeepLearning/deeplearningproject/

/content/drive/MyDrive/M2/DeepLearning/deeplearningproject


In [1]:
#imports
import tensorflow as tf
from tensorflow.contrib import rnn
from utils import get_init_embedding_feats
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import pos_tag, ne_chunk
import re
import collections
import pickle
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
!pip install wget



### ENV Preparation:

- Download the GloVe vectors if they are not already present.

We used the Wikipedia 2014 + Gigaword 5 GloVe vectors (6B tokens, 400K vocab, uncased, 300d) to initialize the word embeddings.

https://nlp.stanford.edu/projects/glove/



In [3]:
import os 
import wget
import zipfile
from os import path
import nltk

# NLTK resources for tokenization, POS tagging and named-entity chunking.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

glove_dir = "glove"
glove_url = "https://nlp.stanford.edu/data/wordvecs/glove.6B.300d.zip"
glove_zip = os.path.join(glove_dir, "glove.6B.300d.zip")

# Download and extract the GloVe vectors if they are not already present.
# An existing but empty directory (e.g. left over from a failed run) is treated
# as "not present" so the download is retried instead of silently skipped.
if not path.isdir(glove_dir) or not os.listdir(glove_dir):
    os.makedirs(glove_dir, exist_ok=True)
    wget.download(glove_url, out=glove_dir)
    # Extract the archive next to the downloaded zip file.
    with zipfile.ZipFile(glove_zip, "r") as z:
        z.extractall(glove_dir)

[nltk_data] Downloading package punkt to /Users/amine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/amine/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/amine/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/amine/nltk_data...
[nltk_data]   Package words is already up-to-date!


### Model

**Architecture** : Encoder-decoder RNN with attention. (Feature-rich encoder).
* Encoder : Bidirectional GRU-RNN. 
* Decoder : Unidirectional GRU-RNN with the same hidden-state size as the encoder. 
* Attention : Bahdanau attention mechanism.
* Softmax layer over the target vocabulary.

**Feature-rich encoder** :
The encoder captures additional linguistic features, such as part-of-speech (**POS**) tags, named-entity (**NER**) tags, and TF-IDF statistics of the words.

All of these features and the word-based embeddings are concatenated into one long vector, which is passed as input to the encoder.

In [4]:
import tensorflow as tf
from tensorflow.contrib import rnn
from utils import get_init_embedding


class Model(object):
    """Encoder-decoder summarization graph with Bahdanau attention (TF 1.x graph mode).

    The encoder is a stacked bidirectional RNN over the embedded article tokens;
    the decoder is a single attention-wrapped RNN cell of size ``2 * num_hidden``
    followed by a bias-free dense projection onto the vocabulary.

    Args:
        reversed_dict: id -> word mapping; only its length is used here (it is
            also forwarded to ``get_init_embedding`` when GloVe is enabled).
        article_max_len: fixed width of the article placeholder ``X``.
        summary_max_len: fixed width of the decoder placeholders and the
            maximum number of beam-search decoding steps.
        args: object exposing ``embedding_size``, ``num_hidden``, ``num_layers``,
            ``learning_rate``, ``beam_width``, ``keep_prob`` and ``glove``.
        forward_only: when True, build the beam-search inference graph
            (``self.prediction``) instead of the training graph
            (``self.logits``, ``self.loss``, ``self.update``).
    """

    def __init__(self, reversed_dict, article_max_len, summary_max_len, args, forward_only=False):
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate
        self.beam_width = args.beam_width
        # Dropout is only active while training; inference keeps every unit.
        if not forward_only:
            self.keep_prob = args.keep_prob
        else:
            self.keep_prob = 1.0
        self.cell = tf.nn.rnn_cell.GRUCell
        with tf.variable_scope("decoder/projection"):
            self.projection_layer = tf.layers.Dense(self.vocabulary_size, use_bias=False)

        self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
        self.X = tf.placeholder(tf.int32, [None, article_max_len])
        self.X_len = tf.placeholder(tf.int32, [None])
        self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
        self.decoder_len = tf.placeholder(tf.int32, [None])
        self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            if not forward_only and args.glove:
                init_embeddings = tf.constant(get_init_embedding(reversed_dict, self.embedding_size), dtype=tf.float32)
            else:
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings)
            # Lookups are transposed to time-major layout: [time, batch, embedding].
            self.encoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.X), perm=[1, 0, 2])
            self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.decoder_input), perm=[1, 0, 2])

        with tf.name_scope("encoder"):
            fw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            # Pass keep_prob explicitly: DropoutWrapper defaults to keeping
            # everything, so without it the configured dropout was never applied.
            fw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in bw_cells]

            encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.encoder_emb_inp,
                sequence_length=self.X_len, time_major=True, dtype=tf.float32)
            self.encoder_output = tf.concat(encoder_outputs, 2)
            # The decoder is seeded with the first layer's forward/backward states.
            fw_state, bw_state = encoder_state_fw[0], encoder_state_bw[0]
            if isinstance(fw_state, rnn.LSTMStateTuple):
                # LSTM cells carry a (c, h) pair: concatenate each part separately.
                encoder_state_c = tf.concat((fw_state.c, bw_state.c), 1)
                encoder_state_h = tf.concat((fw_state.h, bw_state.h), 1)
                self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
            else:
                # GRU cells carry a single state tensor (no .c/.h attributes).
                self.encoder_state = tf.concat((fw_state, bw_state), 1)

        with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
            # Twice the encoder size so it can accept the concatenated fw/bw state.
            decoder_cell = self.cell(self.num_hidden * 2)

            if not forward_only:
                # Training graph: teacher forcing through TrainingHelper.
                attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size)
                initial_state = initial_state.clone(cell_state=self.encoder_state)
                helper = tf.contrib.seq2seq.TrainingHelper(self.decoder_emb_inp, self.decoder_len, time_major=True)
                decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, initial_state)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, scope=decoder_scope)
                self.decoder_output = outputs.rnn_output
                self.logits = tf.transpose(
                    self.projection_layer(self.decoder_output), perm=[1, 0, 2])
                # Pad the time axis with zeros up to summary_max_len so the logits
                # line up with the fixed-width decoder_target placeholder.
                self.logits_reshape = tf.concat(
                    [self.logits, tf.zeros([self.batch_size, summary_max_len - tf.shape(self.logits)[1], self.vocabulary_size])], axis=1)
            else:
                # Inference graph: every encoder tensor is tiled beam_width times.
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width)
                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, multiplier=self.beam_width)
                tiled_seq_len = tf.contrib.seq2seq.tile_batch(self.X_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
                initial_state = initial_state.clone(cell_state=tiled_encoder_final_state)
                # NOTE(review): ids 2 and 3 are presumably "<s>" and "</s>" in the
                # vocabulary built by utils.build_dict -- confirm against that helper.
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embeddings,
                    start_tokens=tf.fill([self.batch_size], tf.constant(2)),
                    end_token=tf.constant(3),
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.projection_layer
                )
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope)
                # Reorder predicted ids from [time, batch, beam] to [batch, beam, time].
                self.prediction = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

        with tf.name_scope("loss"):
            if not forward_only:
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits_reshape, labels=self.decoder_target)
                # Mask out positions beyond each target's length.
                weights = tf.sequence_mask(self.decoder_len, summary_max_len, dtype=tf.float32)
                self.loss = tf.reduce_sum(crossent * weights / tf.to_float(self.batch_size))

                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                # Clip by global norm before applying the Adam update.
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.update = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

### Training


Here we tried to reproduce the training phase of the paper.
* Encoder-decoder hidden-state size : 400
* Optimizer : Adadelta.
* Learning rate : 0.001.
* batch-size : 50.
* Gradient clipping.

Due to lack of time and for performance reasons, we used a reduced dataset to train and test the model.

**Train** : 10,000 pairs of article, summary.

**Test**  : 1000 pairs of article, summary.

In [5]:
import time
start = time.perf_counter()
import tensorflow as tf
import argparse
import pickle
import os
from utils import build_dict, build_dataset, batch_iter

# Hyper-parameters of this training run.
params = {
    "batch_size":64, 
    "beam_width":10, 
    "embedding_size":300, 
    "glove":False, 
    "keep_prob":0.8, 
    "learning_rate":0.001, 
    "num_epochs":10, 
    "num_hidden":150, 
    "num_layers":2,
    "toy":False, 
    "with_model":False
}
class Arg(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
args = Arg(params)

# Persist the hyper-parameters so the test cell (which loads "args.pickle") can
# rebuild a graph matching the checkpoint. A plain argparse.Namespace is used:
# it gives attribute access and unpickles without needing the Arg class.
with open("args.pickle", "wb") as f:
    pickle.dump(argparse.Namespace(**params), f)

os.makedirs("saved_model", exist_ok=True)

# Always (re)define this name so a value left over from an earlier run of the
# cell cannot trigger an unintended restore.
old_model_checkpoint_path = None
if args['with_model']:
    # Returns None when "saved_model" holds no checkpoint yet.
    old_model_checkpoint_path = tf.train.latest_checkpoint("saved_model")


print("Building dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("train", args.toy)
print("Loading training dataset...")
train_x, train_y = build_dataset("train", word_dict, article_max_len, summary_max_len, args.toy)


with tf.Session() as sess:
    model = Model(reversed_dict, article_max_len, summary_max_len, args)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())
    if old_model_checkpoint_path is not None:
        print("Continuing from previous trained model:" , old_model_checkpoint_path , "...")
        saver.restore(sess, old_model_checkpoint_path )

    batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
    num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1

    start_id = word_dict["<s>"]
    end_id = word_dict["</s>"]
    pad_id = word_dict["<padding>"]

    print("\nIteration starts.")
    print("Number of batches per epoch :", num_batches_per_epoch)
    for batch_x, batch_y in batches:
        # Lengths are the number of non-zero ids in each row.
        batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]
        # Decoder input is "<s>" + summary; decoder target is summary + "</s>".
        batch_decoder_input = [[start_id] + list(x) for x in batch_y]
        batch_decoder_len = [len([y for y in x if y != 0]) for x in batch_decoder_input]
        batch_decoder_output = [list(x) + [end_id] for x in batch_y]

        # Right-pad both sequences to the fixed placeholder width.
        batch_decoder_input = [
            d + (summary_max_len - len(d)) * [pad_id] for d in batch_decoder_input]
        batch_decoder_output = [
            d + (summary_max_len - len(d)) * [pad_id] for d in batch_decoder_output]

        train_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
            model.X_len: batch_x_len,
            model.decoder_input: batch_decoder_input,
            model.decoder_len: batch_decoder_len,
            model.decoder_target: batch_decoder_output
        }

        _, step, loss = sess.run([model.update, model.global_step, model.loss], feed_dict=train_feed_dict)

        if step % 1000 == 0:
            print("step {0}: loss = {1}".format(step, loss))

        # Save a checkpoint at the end of every epoch.
        if step % num_batches_per_epoch == 0:
            hours, rem = divmod(time.perf_counter() - start, 3600)
            minutes, seconds = divmod(rem, 60)
            saver.save(sess, "./saved_model/model.ckpt", global_step=step)
            print(" Epoch {0}: Model is saved.".format(step // num_batches_per_epoch),
            "Elapsed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds) , "\n")


Building dictionary...
Loading training dataset...
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.

KeyboardInterrupt: 

### Test


In [45]:
import tensorflow as tf
tf.reset_default_graph()
import pickle
from utils import build_dict, build_dataset, batch_iter


# SECURITY NOTE: pickle.load can execute arbitrary code; only load an
# "args.pickle" that you produced yourself.
with open("args.pickle", "rb") as f:
    args = pickle.load(f)

print("Loading dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("valid", args.toy)
print("Loading validation dataset...")
valid_x = build_dataset("valid", word_dict, article_max_len, summary_max_len, args.toy)

with tf.Session() as sess:
    print("Loading saved model...")
    model = Model(reversed_dict, article_max_len, summary_max_len, args, forward_only=True)
    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state("./saved_model/")
    if ckpt is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise FileNotFoundError("No checkpoint found in './saved_model/'; run the training cell first.")
    saver.restore(sess, ckpt.model_checkpoint_path)

    # Labels are not needed for decoding, so a dummy list of zeros is passed.
    batches = batch_iter(valid_x, [0] * len(valid_x), args.batch_size, 1)

    print("Writing summaries to 'result.txt'...")

    # Open the output once in write mode: this truncates any previous results
    # and avoids reopening the file for every batch.
    with open("result.txt", "w") as f:
        for batch_x, _ in batches:
            batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]

            valid_feed_dict = {
                model.batch_size: len(batch_x),
                model.X: batch_x,
                model.X_len: batch_x_len,
            }

            prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
            # Keep only the first beam (index 0) of every example.
            prediction_output = [[reversed_dict[y] for y in x] for x in prediction[:, 0, :]]

            for line in prediction_output:
                summary = list()
                for word in line:
                    # Stop at the end-of-sentence marker.
                    if word == "</s>":
                        break
                    # Each word is emitted at most once per summary.
                    if word not in summary:
                        summary.append(word)
                print(" ".join(summary), file=f)

    print('Summaries are saved to "result.txt"...')


Loading dictionary...
Loading validation dataset...
Loading saved model...
INFO:tensorflow:Restoring parameters from ./saved_model/model.ckpt-1570
Writing summaries to 'result.txt'...
Summaries are saved to "result.txt"...


#### Evaluation

For evaluation we used ROUGE-1, ROUGE-2, and ROUGE-L. 

The average ROUGE recall (R), precision (P), and F1-score over all summaries are printed.

In [46]:
import numpy as np

In [47]:
def _read_lines(file_path):
    """Return the lines of a text file with their trailing newline removed."""
    with open(file_path, "r") as f:
        # rstrip("\n") only removes a newline that is really there; slicing with
        # [:-1] would drop a real character from a final line without one.
        return [line.rstrip("\n") for line in f]


# Generated summaries written by the test cell.
summaries_hat = _read_lines("result.txt")
# Reference summaries (titles) and their source articles.
summaries = _read_lines("reduceddata/sumdata/train/valid.title.filter.txt")
articles = _read_lines("reduceddata/sumdata/train/valid.article.filter.txt")


def printRandomExamples(articles, y, y_hat, n_examples = 3):
    """Print randomly chosen (article, original summary, generated summary) triples.

    Args:
        articles: sequence of article texts.
        y: sequence of original summaries, aligned with ``articles``.
        y_hat: sequence of generated summaries, aligned with ``articles``.
        n_examples: maximum number of distinct examples to print.

    Returns:
        None. Nothing is printed when any of the sequences is empty.
    """
    # Only indices valid for all three sequences may be drawn; otherwise a
    # shorter y or y_hat would raise an IndexError.
    n_available = min(len(articles), len(y), len(y_hat))
    n_shown = min(n_examples, n_available)
    if n_shown <= 0:
        return

    # Draw without replacement so the same example is never shown twice.
    picks = np.random.choice(n_available, size=n_shown, replace=False)
    for i, j in enumerate(picks):
        print("-"*40)
        print(f'Article {i+1} :')
        print(articles[j])
        print('Original summary :')
        print(y[j])
        print('Generated summary :')
        print(y_hat[j])
        print('\n')
    return 

In [48]:
# Show a few randomly drawn validation examples; no seed is set, so the
# selection differs between runs.
printRandomExamples(articles, summaries, summaries_hat)

----------------------------------------
Article 1 :
a maltese cargo ship was impounded friday on suspicion of colliding with a trawler that sank in the english channel , as the search resumed for five missing crew members , french maritime police said .

Original summary :
ship impounded in france over trawler 's sinking

Generated summary :
< unk > body for mexican spy crash


----------------------------------------
Article 2 :
the state of israel is one of the world 's youngest countries , having been carved out in the last century from what used to be the ottoman empire .

Original summary :
israel in a nutshell

Generated summary :
< unk > committee to be built in bulgaria


----------------------------------------
Article 3 :
the top french chef marc <unk> has seriously injured his leg and shoulder in a ski accident , he told afp on monday , adding that he hoped to be back on the <unk> within days .

Original summary :
top french chef injured in ski accident

Generated summary :

https://pypi.org/project/py-rouge/
* The folder "rouge" of this package should be placed in the root of the project directory


In [49]:
import rouge

In [50]:
# ROUGE-N up to bigrams plus ROUGE-L, averaged over all summaries; texts are
# limited to 15 words and no stemming is applied.
evaluator = rouge.Rouge(
    metrics=['rouge-n', 'rouge-l'],
    max_n=2,
    apply_avg=True,
    stemming=False,
    limit_length=True,
    length_limit_type='words',
    length_limit=15,
)


In [51]:
# Score the generated summaries (first argument) against the original titles (second argument).
scores = evaluator.get_scores(summaries_hat, summaries)

In [52]:
def prepare_results(p, r, f, metric_name=None):
    """Format one ROUGE result as a tab-separated line of percentages.

    Args:
        p: precision as a fraction (multiplied by 100 for display).
        r: recall as a fraction (multiplied by 100 for display).
        f: F1-score as a fraction (multiplied by 100 for display).
        metric_name: label printed in front of the scores. When omitted, the
            module-level variable ``metric`` is used, which keeps existing
            three-argument callers working.

    Returns:
        The formatted line, e.g. ``'\\trouge-1:\\tP: 50.00\\tR: 25.00\\tF1: 12.50'``.

    Raises:
        NameError: if ``metric_name`` is omitted and no global ``metric`` exists.
    """
    if metric_name is None:
        # Backward-compatible fallback: read the loop variable of the calling cell.
        metric_name = metric
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric_name, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


In [53]:
# Print one line per metric in alphabetical order. The loop variable must stay
# named `metric`, because prepare_results reads it as a global.
for metric in sorted(scores):
    results = scores[metric]
    print(prepare_results(results['p'], results['r'], results['f']))

	rouge-1:	P: 13.97	R: 10.40	F1: 11.67
	rouge-2:	P:  3.39	R:  2.56	F1:  2.82
	rouge-l:	P: 13.66	R: 10.19	F1: 11.42
