<a href="https://colab.research.google.com/github/devhemza/deeplearningproject/blob/main/Abstractive_text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 Since the GRUs are bi-directional, there is both a forward and a backward state which are combined (concatenated) as the encoder state.

In [1]:
!pip install pyrouge



In [110]:
from numpy.random import seed
seed(1)

from sklearn.model_selection import train_test_split as tts
import logging
import tensorflow as tf

from pyrouge import Rouge155 
import matplotlib.pyplot as plt
import keras
from keras import initializers
from keras.optimizers import RMSprop, Adadelta
from keras.models import Model
from keras.layers import Bidirectional, Dense,GRU,Input,Activation,Add,TimeDistributed,\
Permute,Flatten,RepeatVector,merge,Lambda,Multiply,Reshape, Attention, Embedding, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

Load data 

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%cd "/content/drive/MyDrive/M2/DeepLearning/DeepProject/text-summarization-tensorflow-master"

/content/drive/MyDrive/M2/DeepLearning/DeepProject/text-summarization-tensorflow-master


In [5]:
!pip install gensim
!pip install wget
  
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [59]:
from nltk.tokenize import word_tokenize
import re
import collections
import pickle
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

default_path = "./sample_data/"

train_article_path = default_path + "sumdata/train/train.article.txt"
train_title_path   = default_path + "sumdata/train/train.title.txt"
valid_article_path = default_path + "sumdata/train/valid.article.filter.txt"
valid_title_path   = default_path + "sumdata/train/valid.title.filter.txt"

#valid_article_path = default_path + "sumdata/DUC2003/input.txt"
#valid_title_path   = default_path + "sumdata/DUC2003/task1_ref0.txt"

def clean_str(sentence):
    sentence = re.sub("[#.]+", "#", sentence)
    return sentence


def get_text_list(data_path, toy):
    with open (data_path, "r", encoding="utf-8") as f:
        if not toy:
            return [clean_str(x.strip()) for x in f.readlines()][:200000]
        else:
            return [clean_str(x.strip()) for x in f.readlines()][:50]

  
def build_dict(step, toy=False):
    if step == "train":
        train_article_list = get_text_list(train_article_path, toy)
        train_title_list = get_text_list(train_title_path, toy)

        words = list()
        for sentence in train_article_list + train_title_list:
            for word in word_tokenize(sentence):
                words.append(word)

        word_counter = collections.Counter(words).most_common()
        word_dict = dict()
        word_dict["<padding>"] = 0
        word_dict["<unk>"] = 1
        word_dict["<s>"] = 2
        word_dict["</s>"] = 3
        for word, _ in word_counter:
            word_dict[word] = len(word_dict)

        with open(default_path + "word_dict.pickle", "wb") as f:
            pickle.dump(word_dict, f)

    elif step == "valid":
        with open(default_path + "word_dict.pickle", "rb") as f:
            word_dict = pickle.load(f)

    reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))

    article_max_len = 50
    summary_max_len = 15

    return word_dict, reversed_dict, article_max_len, summary_max_len


def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
    if step == "train":
        article_list = get_text_list(train_article_path, toy)
        title_list = get_text_list(train_title_path, toy)
    elif step == "valid":
        article_list = get_text_list(valid_article_path, toy)
    else:
        raise NotImplementedError

    x = [word_tokenize(d) for d in article_list]
    x = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in x]
    x = [d[:article_max_len] for d in x]
    x = [d + (article_max_len - len(d)) * [word_dict["<padding>"]] for d in x]
    
    if step == "valid":
        return x
    else:        
        y = [word_tokenize(d) for d in title_list]
        y = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in y]
        y = [d[:(summary_max_len - 1)] for d in y]
        y = [d + (summary_max_len - len(d)) * [word_dict["<padding>"]] for d in y]
        return x, y


def get_init_embedding(word_dict , reversed_dict, embedding_size):
    embedding_matrix = np.zeros((len(word_dict) + 1, embedding_size))
    print("Loading Lists...")
    train_article_list = get_text_list(train_article_path, False)
    train_title_list = get_text_list(train_title_path, False)

    print("Loading TF-IDF...")
    tf_idf_list = tf_idf_generate(train_article_list+train_title_list)
    
    print("Loading Pos Tags...")
    pos_list , postags_for_named_entity = get_pos_tags_dict(word_dict.keys())

    print("Loading Named Entity...")
    named_entity_recs = named_entity(postags_for_named_entity) 
    
    print("Loading Glove vectors...")

    with open( default_path + "model_glove_300.pkl", 'rb') as handle:
        word_vectors = pickle.load(handle)
     
    
    used_words = 0
    word_vec_list = list()
    for i, word in sorted(reversed_dict.items()):
        try:
            word_vec = word_vectors.word_vec(word)
            if word in tf_idf_list:
              v= tf_idf_list[word]
              rich_feature_array = np.array([v,v,v,v,v,v,v,v,v,v])
              word_vec = np.append(word_vec, rich_feature_array)
            else:
              v=0
              rich_feature_array = np.array([v,v,v,v,v,v,v,v,v,v])
              word_vec = np.append(word_vec, rich_feature_array)

            if word in pos_list:
              v=pos_list[word]
              rich_feature_array_2 = np.array([v,v,v,v,v,v,v,v,v,v])
              word_vec = np.append(word_vec, rich_feature_array_2)
            else:
              v=0
              rich_feature_array_2 = np.array([v,v,v,v,v,v,v,v,v,v])
              word_vec = np.append(word_vec, rich_feature_array_2) 

            if word in named_entity_recs:
              v=named_entity_recs[word]
              rich_feature_array_3 = np.array([v,v,v,v,v,v,v,v,v,v])
              word_vec = np.append(word_vec, rich_feature_array_3)
            else:
              v=0
              rich_feature_array_3 = np.array([v,v,v,v,v,v,v,v,v,v])
              word_vec = np.append(word_vec, rich_feature_array_3)  
          
            used_words += 1
        except KeyError:
            word_vec = np.zeros([embedding_size], dtype=np.float32) #to generate for <padding> and <unk>
        
        
        word_vec_list.append(np.array(word_vec))
        embedding_matrix[i] = word_vec

    print("words found in glove percentage = " + str((used_words/len(word_vec_list))*100) )
          
    # Assign random vector to <s>, </s> token
    word_vec_list[2] = np.random.normal(0, 1, embedding_size)
    word_vec_list[3] = np.random.normal(0, 1, embedding_size)
     
    return embedding_matrix

#### TF-IDF

In [7]:
# _____TF-IDF libraries_____
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# _____helper Libraries_____
import pickle  # would be used for saving temp files
import csv     # used for accessing the dataset
import timeit  # to measure time of training
import random  # used to get a random number


def tf_idf_generate(sentences):
    #https://stackoverflow.com/questions/30976120/find-the-tf-idf-score-of-specific-words-in-documents-using-sklearn

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    # our corpus
    data = sentences

    cv = CountVectorizer()

    # convert text data into term-frequency matrix
    data = cv.fit_transform(data)

    tfidf_transformer = TfidfTransformer()

    # convert term-frequency matrix into tf-idf
    tfidf_matrix = tfidf_transformer.fit_transform(data)

    # create dictionary to find a tfidf word each word
    word2tfidf = dict(zip(cv.get_feature_names(), tfidf_transformer.idf_))

    #i = 0
    #for word, score in word2tfidf.items():
    #    print(word, score)
    #    if (i == 10):
    #      break
    #    i+=1  
  
    return word2tfidf

### POS tags

In [8]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
#https://stackoverflow.com/questions/38088652/pandas-convert-categories-to-numbers
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

#ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

def get_pos_tags_dict(words):
    #sent = nltk.word_tokenize(sent)
    #print(sent)
    post_tags_for_words = nltk.pos_tag(words)

    pos_list ={}
    #sent = preprocess(ex)
    for word,pos in post_tags_for_words:
        pos_list[word] = pos
    #print(pos_list)

    import pandas as pd
    df = pd.DataFrame(list(pos_list.items()))
    df.columns = ['word', 'pos']
    df.pos = pd.Categorical(df.pos)
    df['code'] = df.pos.cat.codes
    #print(df)

    pos_list ={}
    for index, row in df.iterrows():
       pos_list[row['word']] = row['code']
    print(pos_list)
    return pos_list , post_tags_for_words

### Named Entity Reognition

In [10]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [11]:
#https://nlpforhackers.io/named-entity-extraction/
from nltk import word_tokenize, pos_tag, ne_chunk

#sentence = "Mark and John are working at Google."


#print (ne_chunk(pos_tag(word_dict.keys())[:5]))
#names = ne_chunk(pos_tag(word_tokenize(sentence)))

#names = ne_chunk(pos_tag(word_tokenize(sentence)))

def named_entity(post_tags_for_words):
  names = ne_chunk(post_tags_for_words)
  names_dict = {}
  for n in names:
    if (len(n) == 1):
      named_entity = str(n).split(' ')[0][1:]
      word = str(n).split(' ')[1].split('/')[0]
      names_dict[word] = named_entity
  print (names_dict)

  import pandas as pd
  df = pd.DataFrame(list(names_dict.items()))
  df.columns = ['word', 'pos']
  df.pos = pd.Categorical(df.pos)
  df['code'] = df.pos.cat.codes
  #print(df)

  names_dict ={}
  for index, row in df.iterrows():
     names_dict[row['word']] = row['code']
  print(names_dict)
  return names_dict

In [12]:
!ls sample_data

Glove		     result_featurerich_15_11_2018_5_28pm.xml  word_dict.pickle
model_glove_300.pkl  saved_model_2
result2.txt	     sumdata


In [60]:
print("Building dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("train", False)
print("Loading training dataset...")
train_x, train_y = build_dataset("train", word_dict, article_max_len, summary_max_len, False)

Building dictionary...
Loading training dataset...


In [14]:
train_embedding = get_init_embedding(word_dict , reversed_dict, 330)


Loading Lists...
Loading TF-IDF...
Loading Pos Tags...
Loading Named Entity...
{'dutch-belgian': 'GPE'}
{'dutch-belgian': 0}
Loading Glove vectors...
words found in glove percentage = 96.82779456193353


In [15]:
train_embedding.shape

(17213, 330)

In [16]:
train_embedding[1].shape

(330,)

In [17]:
train_embedding[50]


array([-2.19050005e-01, -4.92289990e-01, -2.98319995e-01, -1.51740000e-01,
        7.21120000e-01, -4.07070011e-01,  2.07949996e-01,  7.25679994e-02,
       -5.42850018e-01, -1.95749998e+00, -3.71899992e-01, -1.48000002e-01,
        6.96280003e-01, -1.62000000e-01,  4.11630005e-01, -3.29739988e-01,
       -1.72570005e-01, -9.59800005e-01, -1.94429997e-02, -2.42870003e-01,
        1.82779998e-01, -2.15039998e-01,  5.52010000e-01, -4.29650009e-01,
       -1.18469996e-02, -2.94450015e-01, -9.67909992e-02,  1.96090005e-02,
       -6.37449980e-01,  1.35959998e-01, -4.60889995e-01,  5.40569983e-02,
        4.37050015e-01,  6.08479977e-01, -3.86900008e-01, -4.19950008e-01,
        2.98570007e-01,  2.19549999e-01, -3.04500014e-01, -1.55790001e-01,
       -5.34510016e-01, -2.87699997e-02, -2.46299997e-01,  5.09379983e-01,
        3.57050002e-01,  1.57140002e-01, -4.45380002e-01, -1.05090000e-01,
       -5.12629986e-01,  4.16229993e-01, -2.94970006e-01,  1.35330006e-01,
        5.91769993e-01,  

In [18]:
train_embedding.shape

(17213, 330)

In [19]:
np.shape(train_x[1])

(50,)

In [20]:
def embedding_matrix_creater(embedding_dim, word_index):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
          # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

Model

In [164]:
### Constructor for baseline enc. dec. network.
### Adding the encoder
#######################model params###########################
batch_size = 50
num_classes = 1
epochs = 20
learning_rate = 0.005
clip_norm = 2.0
en_shape = np.shape(train_x)
de_shape = np.shape(train_y[1])
hidden_units = 400
###############################################################
###############################################################

encoder_inputs = Input(shape = (en_shape[1], ))
print(encoder_inputs)

"""_______encoder________"""

# GRU
encoder_embedding_layer = Embedding(1000, 330, input_length = 50,  weights = [train_embedding[:1000]])
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_GRU = Bidirectional(GRU(hidden_units, return_sequences=True, return_state=True))
encoder_output, encoder_fs, encoder_bs = encoder_GRU(encoder_embedding)

encoder_state = Concatenate()([encoder_fs, encoder_bs])

Tensor("input_24:0", shape=(None, 50), dtype=float32)


In [165]:
encoder_embedding.shape

TensorShape([None, 50, 330])

In [166]:
### Adding the decoder
decoder_input = Input(shape=(None, ))
decoder_embedding = Embedding(train_embedding.shape[0], train_embedding.shape[1], input_length = 50,  weights = [train_embedding])(decoder_input)

# GRU using encoder_states as initial state
decoder_gru = GRU(hidden_units * 2, return_sequences=True, return_state=True)
decoder_output, decoder_state = decoder_gru(decoder_embedding, initial_state=[encoder_state])

In [167]:
# Attention Layer for alignment model and computation of weights for all encoder hidden states
attention_layer = Attention() 
attention_out = attention_layer([encoder_output, decoder_output])

# Concat attention output and decoder GRU output
decoder_concatenate = Concatenate(axis=-1)([decoder_output, attention_out])

# Dense layer for generating words from vocabulary distribution
decoder_dense = TimeDistributed(
    Dense(train_embedding.shape[0], activation='softmax'))
decoder_dense_output = decoder_dense(decoder_concatenate)

# Define the model
model = Model([encoder_inputs, decoder_input], decoder_dense_output)
model.compile(optimizer='rmsprop', 
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [168]:
model.summary()

Model: "functional_39"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_24 (InputLayer)           [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, 50, 330)      330000      input_24[0][0]                   
__________________________________________________________________________________________________
input_25 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
bidirectional_10 (Bidirectional [(None, 50, 800), (N 1756800     embedding_18[0][0]               
______________________________________________________________________________________

In [173]:
train_X = tf.convert_to_tensor(train_x)
train_Y = tf.convert_to_tensor(train_y)

In [174]:
model.fit(train_X, train_Y, batch_size=50,epochs=10, verbose=2)

Epoch 1/10


AssertionError: ignored