<a href="https://colab.research.google.com/github/dikshank/Extractive-Text-Summarization/blob/main/ExtractiveTextSummarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pickle
import re
import string
from itertools import chain

import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, TimeDistributed, Embedding, Convolution1D, Dense, Flatten, Activation, RepeatVector, Permute, Add, multiply,BatchNormalization
from tensorflow.keras.layers import Lambda, Bidirectional, LSTM, Layer, MultiHeadAttention, LayerNormalization, Dropout, Concatenate
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm

In [None]:
cd /content/drive/MyDrive/NLP_PROJECTS

In [None]:
!mkdir "text_summerization"

In [None]:
cd "/content/drive/MyDrive/NLP_PROJECTS/text_summerization"

In [None]:
!tar -xvf /content/drive/MyDrive/NLP_PROJECTS/text_summerization/cnn_stories.tgz

In [None]:
#step 1
def load_story(single_story_path):
  """Read one story file and return its complete text.

  Args:
    single_story_path: path of the story file; it is decoded as UTF-8.

  Returns:
    The whole file content as a single string.
  """
  # The context manager closes the handle even when read() raises.
  with open(single_story_path, encoding="utf-8") as file_handle:
    return file_handle.read()

In [None]:
#step 2
def split_story_into_para_highlights(single_complete_story):
  """Split a raw story into its body text and its list of highlights.

  Everything before the first "@highlight" marker is the body; the text
  after it is split on the marker, and each piece is one highlight.

  Args:
    single_complete_story: full text of one story file.

  Returns:
    (para, highlights): the body string and a list of stripped, non-empty
    highlight strings. When the story has no "@highlight" marker the whole
    text is returned as the body with an empty highlight list.
  """
  highlight_loc = single_complete_story.find("@highlight")
  if highlight_loc == -1:
    # find() returns -1 when the marker is absent; slicing with -1 would cut
    # the last character off the body and report it as a highlight.
    return single_complete_story, []

  para = single_complete_story[:highlight_loc]
  pieces = single_complete_story[highlight_loc:].split("@highlight")
  # The text starts with the marker, so the first piece is always empty;
  # filter on each stripped piece, not on the list itself.
  highlights = [piece.strip() for piece in pieces]
  highlights = [summary for summary in highlights if len(summary) > 0]

  return para,highlights

In [None]:
# Read every story file in the extracted corpus and keep its body text and
# its highlights in two parallel lists (same position = same story).
paragraphs, summaries = [], []

for story_filename in os.listdir("./cnn/stories"):
  single_story_path = os.path.join("./cnn/stories", story_filename)
  para, highlights = split_story_into_para_highlights(load_story(single_story_path))
  paragraphs.append(para)
  summaries.append(highlights)

stories = {"Story_paragraphs": paragraphs, "Abstractive_summaries": summaries}

In [None]:
# saving the stories in pickle file (step 3)
# The with-block closes the file, which guarantees the pickle is fully flushed to disk.
with open("cnn_news_stories.pkl","wb") as pkl_file_handle:
  pickle.dump(stories,pkl_file_handle)

In [None]:
# load the pickle file data (step 4)
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/cnn_news_stories.pkl","rb") as pkl_file_handle:
  stories = pickle.load(pkl_file_handle)

In [None]:
len(stories["Story_paragraphs"])

92579

In [None]:
len(stories["Abstractive_summaries"])

92579

In [None]:
#step 5
def preprocess_single_sent_per_story(sents_per_story):
  """Normalise the sentences of one story.

  Each sentence has its text up to and including the '(CNN)' tag dropped
  (when the '(CNN) -- ' marker is present), is split on whitespace,
  lower-cased and stripped of ASCII punctuation; only purely alphabetic
  tokens are kept.

  Args:
    sents_per_story: iterable of raw sentence strings.

  Returns:
    List of cleaned sentences (tokens joined by one space); sentences that
    end up empty are left out.
  """
  # ASCII code points 33-47, 58-64, 91-96 and 123-126 are the punctuation
  # characters; mapping a code point to None makes str.translate delete it.
  punctuation_codes = [*range(33, 48), *range(58, 65), *range(91, 97), *range(123, 127)]
  strip_table = dict.fromkeys(punctuation_codes)

  cleaned_sents = []
  for raw_sent in sents_per_story:
    marker_pos = raw_sent.find('(CNN) -- ')
    if marker_pos != -1:
      # The remaining ' -- ' is removed by the punctuation/alpha filters below.
      raw_sent = raw_sent[marker_pos + len('(CNN)'):]

    words = (token.lower().translate(strip_table) for token in raw_sent.split())
    cleaned = ' '.join(word for word in words if word.isalpha())
    if cleaned:
      cleaned_sents.append(cleaned)

  return cleaned_sents


# other method to preprocess the sentance

def clean_doc(doc):
  """Alternative sentence cleaner based on a punctuation regex.

  Each sentence is split on whitespace, lower-cased, stripped of every
  character in string.punctuation, and reduced to its alphabetic tokens.

  Args:
    doc: iterable of raw sentence strings.

  Returns:
    List of cleaned sentences (tokens joined by one space); sentences that
    end up empty are left out.
  """
  # Compile once: the pattern does not depend on the sentence.
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))

  cleaned_sents = []
  for sent in doc:
    # Tokens stay local; nothing is written to the notebook's global namespace.
    words = [re_punc.sub('', token.lower()) for token in sent.split()]
    text = " ".join(word for word in words if word.isalpha())
    if text:
      cleaned_sents.append(text)

  return cleaned_sents

In [None]:
# Clean every story in place: the body is split into lines first, the
# highlights are already a list of strings.
story_total = len(stories["Story_paragraphs"])
for i in tqdm(range(story_total)):
  raw_lines = stories["Story_paragraphs"][i].split("\n")
  raw_highlights = stories["Abstractive_summaries"][i]
  stories["Story_paragraphs"][i] = preprocess_single_sent_per_story(raw_lines)
  stories["Abstractive_summaries"][i] = preprocess_single_sent_per_story(raw_highlights)

In [None]:
# Our data contains abstractive summaries; we need to convert them into extractive summaries.
# For that we use the ROUGE score to extract the story sentences that best match the abstractive summary.
! pip install Rouge
from rouge import Rouge
rouge_obj = Rouge() 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: Rouge
Successfully installed Rouge-1.0.1


In [None]:
# step 5 (computing rouge score)
def compute_rouge_score(story_para_sent, abstractive_summaries):
  """Best ROUGE-1 F-score of one story sentence against the summary lines.

  The sentence is scored against every summary line with the module-level
  `rouge_obj`, and the highest 'rouge-1' 'f' value is kept.

  Args:
    story_para_sent: one cleaned story sentence.
    abstractive_summaries: list of cleaned summary lines of the same story.

  Returns:
    The maximum score as a float, or 0.0 when the story has no summary
    lines (the original raised ValueError from max() on an empty list).
  """
  # get_scores returns a list with one result dict for a single string pair.
  return max(
    (rouge_obj.get_scores(summary, story_para_sent)[0]['rouge-1']['f']
     for summary in abstractive_summaries),
    default=0.0,
  )


# (selecting top 5 sentances)
def fetch_each_story_top5_para_sents(story_para, abstractive_summaries, top_k=5):
  """Pick the story sentences that best match the abstractive summary.

  Every sentence is scored with compute_rouge_score, and the `top_k`
  highest-scoring sentences are returned in descending score order.

  Args:
    story_para: list of cleaned sentences of one story.
    abstractive_summaries: list of cleaned summary lines of that story.
    top_k: number of sentences to keep (default 5, the original behaviour).

  Returns:
    (sentences, scores): a list with at most `top_k` sentences and a numpy
    array with their scores, both ordered from best to worst.
  """
  scores = np.array(
    [compute_rouge_score(sent, abstractive_summaries) for sent in story_para],
    dtype=float,
  )
  story_para_sents = np.array(story_para)

  # argsort is ascending, so reverse it and keep the first top_k positions.
  # https://www.geeksforgeeks.org/numpy-argsort-in-python/
  idx = np.argsort(scores)[::-1][:top_k]

  return list(story_para_sents[idx]), scores[idx]

In [None]:
#step 6 (storing the top 5 sentances and their rouge score as a dictionary)

# story position -> its top-5 sentences / the scores of those sentences
all_stories_top5_sents_dict = {}
all_stories_top5_sents_scores = {}

for story_idx in tqdm(range(len(stories["Story_paragraphs"]))):
  top5_para_sents, top5_sents_scores = fetch_each_story_top5_para_sents(
    stories["Story_paragraphs"][story_idx],
    stories["Abstractive_summaries"][story_idx],
  )
  all_stories_top5_sents_dict[story_idx] = top5_para_sents
  all_stories_top5_sents_scores[story_idx] = top5_sents_scores

In [None]:
# pkl_file_handle = open("./all_stories_top5_sents_dict.pkl","wb")
# pickle.dump(all_stories_top5_sents_dict,pkl_file_handle)

# pkl_file_handle = open("./all_stories_top5_sents_scores.pkl","wb")
# pickle.dump(all_stories_top5_sents_scores,pkl_file_handle)

In [None]:
import pickle
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
# The with-blocks close both files as soon as they have been read.
with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/all_stories_top5_sents_dict.pkl","rb") as pkl_file_handle:
  all_stories_top5_sents_dict = pickle.load(pkl_file_handle)
with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/all_stories_top5_sents_scores.pkl","rb") as pkl_file_handle:
  all_stories_top5_sents_scores = pickle.load(pkl_file_handle)

In [None]:
len(all_stories_top5_sents_dict)

92579

In [None]:
#step7 (creating the dataset columns)
# One entry per (story, sentence) pair, collected column by column.
story_idx, sent_idx, sents_list, extractive_label = [], [], [], []

for i in tqdm(range(len(stories["Story_paragraphs"]))):

  top5_para_sents = all_stories_top5_sents_dict[i]

  for j, para_sent in enumerate(stories["Story_paragraphs"][i]):
    # Label 1 when the sentence is one of the story's top-5 sentences, else 0.
    ohe_label = 1 if para_sent in top5_para_sents else 0
    story_idx.append(i)
    sent_idx.append(j)
    sents_list.append(para_sent)
    extractive_label.append(ohe_label)

extractive_summaries_df = pd.DataFrame({
  "Story_idx": story_idx,
  "Sent_idx": sent_idx,
  "Para_sents": sents_list,
  "Extractive_label": extractive_label,
})

In [None]:
len(sent_idx)

1972394


In [None]:
len(extractive_summaries_df["Story_idx"].unique())

92465


In [None]:
#saving the pickle file
# extractive_summaries_df.to_pickle("extractive_summaries_df.pkl")

In [None]:
data = pd.read_pickle("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/extractive_summaries_df.pkl")

In [None]:
len(data)

1972394

In [None]:
data_story_sents_count = data.groupby("Story_idx").size().reset_index(name="Sentences_count")

In [None]:
data_story_sents_count.head()

Unnamed: 0,Story_idx,Sentences_count
0,0,17
1,1,20
2,2,23
3,3,17
4,4,34


In [None]:
# We set the maximum story length to 20 sentences, so only stories with 20 sentences or fewer are kept.
selected_stories_idx = list(data_story_sents_count[data_story_sents_count["Sentences_count"] <= 20]["Story_idx"])

In [None]:
len(selected_stories_idx)

52030

In [None]:
# Split the selected stories by position: first 30000 for training,
# next 10000 for cross-validation, the rest for testing.
train_story_ids = selected_stories_idx[:30000]
cv_story_ids = selected_stories_idx[30000:40000]
test_story_ids = selected_stories_idx[40000:]

# .copy() gives every split its own DataFrame; without it the columns added to
# the splits further down trigger pandas' SettingWithCopyWarning.
training_data = data[data["Story_idx"].isin(train_story_ids)].copy()
cv_data = data[data["Story_idx"].isin(cv_story_ids)].copy()
testing_data = data[data["Story_idx"].isin(test_story_ids)].copy()

In [None]:
training_data = training_data.sort_values(["Story_idx","Sent_idx"])
sents_count = training_data.groupby("Story_idx").size().reset_index(name="Sentences_count")

In [None]:
story_max_length = sents_count["Sentences_count"].max()
print(story_max_length)

20


In [None]:
unique_sents = set(training_data["Para_sents"].tolist())
# Print the number of distinct sentences, not the (very large) set itself.
print(len(unique_sents))

372544


In [None]:
num_labels = len(training_data["Extractive_label"].unique())
print(num_labels)

2


In [None]:
# Map every extractive label (in sorted order) to a positive index; index 0 is reserved for padding.
labels2idx = {l: i+1 for i,l in enumerate(np.sort(training_data["Extractive_label"].unique()))}
labels2idx["PAD"] = 0
# Reverse lookup: index -> label.
idx2labels = {i: l for l,i in labels2idx.items()}
print(labels2idx)

# index 2 (label 1): the sentence is part of the summary
# index 1 (label 0): the sentence is not part of the summary
# index 0 (label 'PAD'): a padded sentence slot

{0: 1, 1: 2, 'PAD': 0}


In [None]:
def create_token_count_list(df):
  """Add the token columns derived from "Para_sents" to `df`.

  Two columns are written to `df` itself: 'Number_tokens' (how many
  whitespace-separated tokens the sentence has) and 'Tokens_list' (those
  tokens as a list).

  Args:
    df: DataFrame with a "Para_sents" column of sentence strings.

  Returns:
    The same DataFrame object, with the two columns added.
  """
  para_sents = df["Para_sents"]
  df['Number_tokens'] = para_sents.apply(lambda sentence: len(sentence.split()))
  df['Tokens_list'] = para_sents.apply(lambda sentence: sentence.split())
  return df

In [None]:
training_data = create_token_count_list(training_data)
cv_data = create_token_count_list(cv_data)
testing_data = create_token_count_list(testing_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
training_data.head()

Unnamed: 0,Story_idx,Sent_idx,Para_sents,Extractive_label,Number_tokens,Tokens_list
0,0,0,london england reuters harry potter star danie...,1,32,"[london, england, reuters, harry, potter, star..."
1,0,1,daniel radcliffe as harry potter in harry pott...,1,14,"[daniel, radcliffe, as, harry, potter, in, har..."
2,0,2,to the disappointment of gossip columnists aro...,1,29,"[to, the, disappointment, of, gossip, columnis..."
3,0,3,i dont plan to be one of those people who as s...,0,41,"[i, dont, plan, to, be, one, of, those, people..."
4,0,4,the things i like buying are things that cost ...,0,16,"[the, things, i, like, buying, are, things, th..."


In [None]:
# Vocabulary: every distinct token that occurs in the training sentences.
total_unique_tokens = set(chain.from_iterable(training_data['Tokens_list']))
num_unique_tokens = len(total_unique_tokens)

# Number the tokens in sorted order so that the mapping is the same in every
# session (set iteration order of strings changes between interpreter runs,
# while the encoded arrays are pickled and reloaded later).
# Ids 0 and 1 are reserved for padding and unknown tokens.
token2idx = {token: i+2 for i,token in enumerate(sorted(total_unique_tokens))}
token2idx["UNK"] = 1
token2idx["PAD"] = 0

idx2token = {i: token for token, i in token2idx.items()}

In [None]:
token2idx['london']

128248

In [None]:
len(idx2token)

145176

In [None]:
def create_sent_label_example(df):
  """Add a "Sent_example" column holding (sentence, label) tuples.

  Args:
    df: DataFrame with "Para_sents" and "Extractive_label" columns.

  Returns:
    The same DataFrame object, with "Sent_example" added.
  """
  pair_columns = ["Para_sents", "Extractive_label"]
  # Row-wise tuple() turns each (sentence, label) row into one tuple.
  sentence_label_pairs = df[pair_columns].apply(tuple, axis=1)
  df["Sent_example"] = sentence_label_pairs
  return df

In [None]:
training_data = create_sent_label_example(training_data)
cv_data = create_sent_label_example(cv_data)
testing_data = create_sent_label_example(testing_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
training_data.tail()

Unnamed: 0,Story_idx,Sent_idx,Para_sents,Extractive_label,Number_tokens,Tokens_list,Sent_example
993412,49792,13,on friday superintendent john deasy said the d...,0,15,"[on, friday, superintendent, john, deasy, said...",(on friday superintendent john deasy said the ...
993413,49792,14,we are relieved that the students and families...,0,60,"[we, are, relieved, that, the, students, and, ...",(we are relieved that the students and familie...
993414,49792,15,los angeles district pays million to pupils in...,0,11,"[los, angeles, district, pays, million, to, pu...",(los angeles district pays million to pupils i...
993415,49792,16,la schools to review past years of teacher dis...,0,10,"[la, schools, to, review, past, years, of, tea...",(la schools to review past years of teacher di...
993416,49792,17,cnns linda hall and jaqueline hurtado contribu...,0,10,"[cnns, linda, hall, and, jaqueline, hurtado, c...",(cnns linda hall and jaqueline hurtado contrib...


In [None]:
max_sent_length = 40

In [None]:
def stories_representation(df):
  """Encode every story of `df` as a fixed-size matrix of token ids.

  Uses the module-level `token2idx`, `story_max_length` and
  `max_sent_length`. Stories keep their order of first appearance in `df`.

  Args:
    df: DataFrame with "Story_idx" and "Sent_example" columns, where each
      "Sent_example" is a (sentence, label) tuple.

  Returns:
    (X_token, stories_examples):
      X_token: float array of shape
        (number of stories, story_max_length, max_sent_length). Row s holds
        the token ids of story s; missing sentences/tokens are filled with
        the "PAD" id and tokens outside the vocabulary get the "UNK" id.
        Sentences beyond story_max_length and tokens beyond max_sent_length
        are dropped.
      stories_examples: list with one list of (sentence, label) tuples per
        story.
  """
  pad_id = token2idx["PAD"]
  unk_id = token2idx["UNK"]

  # One pass over the frame instead of one boolean filter per story.
  stories_examples = [
    list(story_rows)
    for _, story_rows in df.groupby("Story_idx", sort=False)["Sent_example"]
  ]

  # Allocate the output once; every story writes to its own row.
  X_token = np.full(
    (len(stories_examples), story_max_length, max_sent_length),
    pad_id,
    dtype=float,
  )

  for story_pos, story_example in enumerate(tqdm(stories_examples)):
    for sent_pos, sent_example in enumerate(story_example[:story_max_length]):
      tokens = sent_example[0].split()[:max_sent_length]
      token_ids = [token2idx.get(token, unk_id) for token in tokens]
      if token_ids:
        X_token[story_pos, sent_pos, :len(token_ids)] = token_ids

  return (X_token, stories_examples)

In [None]:
X_train,Y_train = stories_representation(training_data)

In [None]:
print(f'X train shape {X_train.shape}')
# Y_train is the plain list of per-story examples returned by
# stories_representation, so it has a length but no .shape attribute.
print(f'Y train shape {len(Y_train)}')

X train shape (30000, 20, 40)
Y train shape 30000


In [None]:
# pkl_file_handle = open("./X_train.pkl","wb")
# pickle.dump(X_train,pkl_file_handle)

In [None]:
X_cv,Y_cv = stories_representation(cv_data)

In [None]:
print(f'X cv shape {X_cv.shape}')
# Y_cv is the plain list of per-story examples returned by
# stories_representation, so it has a length but no .shape attribute.
print(f'Y cv shape {len(Y_cv)}')

X cv shape (10000, 20, 40)
Y cv shape 10000


In [None]:
# pkl_file_handle = open("./X_cv.pkl","wb")
# pickle.dump(X_cv,pkl_file_handle)

In [None]:
def prepare_labels(story_examples):
    """Turn per-story (sentence, label) examples into a padded label array.

    Uses the module-level `labels2idx` and `story_max_length`.

    Args:
        story_examples: list of stories, each a list of (sentence, label)
            tuples.

    Returns:
        Array of shape (number of stories, story_max_length, 1) with the
        label ids; stories are padded or truncated at the end, and padding
        uses the "PAD" id.
    """
    label_ids = []
    for story in story_examples:
        label_ids.append([labels2idx[example[1]] for example in story])

    padded = pad_sequences(
        sequences=label_ids,
        maxlen=story_max_length,
        padding='post',
        truncating='post',
        value=labels2idx["PAD"],
    )
    return padded.reshape(-1, story_max_length, 1)

In [None]:
train_labels = prepare_labels(Y_train)
cv_labels = prepare_labels(Y_cv)

In [None]:
print(train_labels.shape)
print(cv_labels.shape)

(30000, 20, 1)
(10000, 20, 1)


In [None]:
# pkl_file_handle = open("./train_labels.pkl","wb")
# pickle.dump(train_labels,pkl_file_handle)

# pkl_file_handle = open("./cv_labels.pkl","wb")
# pickle.dump(cv_labels,pkl_file_handle)

In [None]:
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
# The with-blocks close both files as soon as they have been read.
with open("./X_train.pkl","rb") as pkl_file_handle:
  X_train = pickle.load(pkl_file_handle)
with open("./X_cv.pkl","rb") as pkl_file_handle:
  X_cv = pickle.load(pkl_file_handle)

In [None]:
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
# The with-blocks close both files as soon as they have been read.
with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/train_labels.pkl","rb") as pkl_file_handle:
  train_labels = pickle.load(pkl_file_handle)
with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/cv_labels.pkl","rb") as pkl_file_handle:
  cv_labels = pickle.load(pkl_file_handle)

In [None]:
# It is not possible to have a tf.Tensor with more than one data type. 
# It is possible, however, to serialize arbitrary data structures as strings and store those in tf.Tensors.

training_data_batch_gen = tf.data.Dataset.from_tensor_slices((X_train, train_labels))
training_data_batch_gen = (training_data_batch_gen.batch(128).cache().prefetch(tf.data.experimental.AUTOTUNE))

cv_data_batch_gen = tf.data.Dataset.from_tensor_slices((X_cv, cv_labels))
cv_data_batch_gen = (cv_data_batch_gen.batch(128).cache().prefetch(tf.data.experimental.AUTOTUNE))

In [None]:
# We will create the sentence embedding matrix using a sentence transformer.

!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.6 MB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=5fb22ebf4b478b3508f062df2fb6a994c73dde8ca287f63d0fc39124298fc9d7
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [None]:
from sentence_transformers import SentenceTransformer,util
model = SentenceTransformer('all-MiniLM-L6-v2')


def creating_sentance_embedding(data, encoder=None):
  """Embed the sentences of every story in `data`.

  Args:
    data: DataFrame with "Story_idx" and "Para_sents" columns.
    encoder: object with an encode(list_of_sentences) method that returns
      one vector per sentence. Defaults to the module-level `model`; pass
      the sentence encoder explicitly once `model` has been rebound to the
      Keras network further down in the notebook.

  Returns:
    A list with one entry per story (in order of first appearance in
    `data`); each entry is the list of that story's sentence vectors.
    Every story is included, also the first and the last one.
  """
  if encoder is None:
    encoder = model

  sent_embeddings_list = []
  stories_sents = data.groupby('Story_idx', sort=False)['Para_sents']

  for _, story_sents in tqdm(stories_sents, total=stories_sents.ngroups):
    # Encoding a story's sentences together is much faster than one call per sentence.
    embeddings = encoder.encode(story_sents.tolist())
    sent_embeddings_list.append(list(embeddings))

  return sent_embeddings_list

embeddings = creating_sentance_embedding(training_data)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

  0%|          | 0/402724 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# import pickle
# pkl_file_handle = open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/embeddings.pkl","wb")
# pickle.dump(sent_embeddings,pkl_file_handle)

# with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/embeddings.pkl", 'wb') as f:
#     pickle.dump(sent_embeddings_list, f)

In [None]:
with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/embeddings.pkl", 'rb') as f:
    sent_embeddings_list = pickle.load(f)

In [None]:
len(sent_embeddings_list)

29999

In [None]:
len(sent_embeddings_list[0])

17

In [None]:
len(sent_embeddings_list[0][0])

384

In [None]:
new_train_labels = train_labels[:-1]

In [None]:
len(new_train_labels)

29999

In [None]:
# Bring every story to exactly 20 sentence vectors of size 384 so that the
# result can be stacked into one dense array: short stories are filled with
# zero vectors, anything beyond 20 sentences is cut off.
# A new list is built per story; the loaded sent_embeddings_list is not modified.
list_to_add = list(np.zeros((384)))
sent_embeddings_matrix = [
  list(para[:20]) + [list_to_add] * max(0, 20 - len(para))
  for para in sent_embeddings_list
]

In [None]:
len(sent_embeddings_matrix)

29999

In [None]:
sent_embeddings_matrix = np.array(sent_embeddings_matrix)

In [None]:
sent_embeddings_matrix.shape

(29999, 20, 384)

In [None]:
new_train_labels = np.array(new_train_labels)

In [None]:
new_train_labels.shape

(29999, 20, 1)

In [None]:
cv_embeddings = creating_sentance_embedding(cv_data)

  0%|          | 0/137256 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/cv_embeddings.pkl", 'wb') as f:
#     pickle.dump(cv_sent_embeddings_list, f)

In [None]:
# with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/new_cv_labels.pkl", 'wb') as f:
#     pickle.dump(new_cv_labels, f)

In [None]:
with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/cv_embeddings.pkl", 'rb') as f:
    cv_sent_embeddings_list = pickle.load(f)

with open("/content/drive/MyDrive/NLP_PROJECTS/text_summerization/new_cv_labels.pkl", 'rb') as f:
    new_cv_labels = pickle.load(f)

In [None]:
# Bring every story to exactly 20 sentence vectors of size 384 so that the
# result can be stacked into one dense array: short stories are filled with
# zero vectors, anything beyond 20 sentences is cut off.
# A new list is built per story; the loaded cv_sent_embeddings_list is not modified.
list_to_add = list(np.zeros((384)))
cv_sent_embeddings_matrix = [
  list(para[:20]) + [list_to_add] * max(0, 20 - len(para))
  for para in cv_sent_embeddings_list
]

In [None]:
cv_sent_embeddings_matrix = cv_sent_embeddings_matrix[:-1]

In [None]:
cv_sent_embeddings_matrix = np.array(cv_sent_embeddings_matrix)
print(cv_sent_embeddings_matrix.shape)

(9999, 20, 384)


In [None]:
training_data_batch_gen = tf.data.Dataset.from_tensor_slices((sent_embeddings_matrix, new_train_labels))
training_data_batch_gen = (training_data_batch_gen.batch(64).cache().prefetch(tf.data.experimental.AUTOTUNE))

cv_data_batch_gen = tf.data.Dataset.from_tensor_slices((cv_sent_embeddings_matrix, new_cv_labels))
cv_data_batch_gen = (cv_data_batch_gen.batch(128).cache().prefetch(tf.data.experimental.AUTOTUNE))

In [None]:
story_max_length = 20
max_sent_length = 384
num_labels = 2
def text_summarization_model():
  """Build the sentence-level tagging network.

  The input has shape (story_max_length, max_sent_length), read from the
  module-level values. It is passed through a bidirectional LSTM
  (16 units per direction, one output per time step) and a per-time-step
  softmax layer with num_labels + 1 outputs.

  Returns:
    The uncompiled Keras Model.
  """
  inputs = Input(shape=(story_max_length, max_sent_length,))

  recurrent = Bidirectional(LSTM(units=16, return_sequences=True))
  # +1 output for the padding label.
  classifier = TimeDistributed(Dense(num_labels + 1, activation='softmax'))

  outputs = classifier(recurrent(inputs))
  return Model([inputs], outputs)

In [None]:
model = text_summarization_model()

In [None]:
lr_start = 1e-5
lr_max = 1e-3
lr_rampup_epochs = 5
lr_to_sustain_epochs = 0
lr_step_decay = 0.75

In [None]:
def lr_scheduler(epoch):
  """Learning rate for `epoch`: linear ramp-up, optional plateau, step decay.

  Reads the module-level lr_start, lr_max, lr_rampup_epochs,
  lr_to_sustain_epochs and lr_step_decay.

  Args:
    epoch: zero-based epoch number.

  Returns:
    The learning rate to use in that epoch.
  """
  # Phase 1: go linearly from lr_start towards lr_max.
  if epoch < lr_rampup_epochs:
    return (lr_max - lr_start) / lr_rampup_epochs * epoch + lr_start

  # Phase 2: hold lr_max.
  if epoch < lr_rampup_epochs + lr_to_sustain_epochs:
    return lr_max

  # Phase 3: multiply by lr_step_decay once per 10 completed epochs.
  decay_steps = (epoch - lr_rampup_epochs - lr_to_sustain_epochs)//10
  return lr_max * lr_step_decay**decay_steps

In [None]:
lr_scheduler_cb = tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=True)
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [None]:
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [None]:
model.fit(training_data_batch_gen,validation_data=cv_data_batch_gen, epochs=50,callbacks=[lr_scheduler_cb, early_stopping_cb], verbose=1)


Epoch 1: LearningRateScheduler setting learning rate to 1e-05.
Epoch 1/50

Epoch 2: LearningRateScheduler setting learning rate to 0.000208.
Epoch 2/50

Epoch 3: LearningRateScheduler setting learning rate to 0.000406.
Epoch 3/50


<keras.callbacks.History at 0x7f979a043f90>

In [None]:
sentance_encoder = SentenceTransformer('all-MiniLM-L6-v2')
def summarize(senatnce):
    """Return the sentences of a story that the model selects for the summary.

    Uses the module-level `sentance_encoder` (sentence embeddings) and
    `model` (the trained tagging network).

    Args:
        senatnce: list of cleaned sentences of one story. (The parameter
            name is kept as it was for compatibility; the body now reads
            this argument, not the global `sentance`.)

    Returns:
        List of the input sentences whose predicted label id is 2, in their
        original order. Only the first 20 sentences are considered, because
        that is the story length the input is padded to. An empty input
        gives an empty list.
    """
    max_sents = 20
    sentences = list(senatnce)[:max_sents]
    if not sentences:
        return []

    embeddings = np.asarray(sentance_encoder.encode(sentences), dtype=np.float32)

    # Zero rows pad the story up to max_sents sentence slots; the leading
    # axis is the batch dimension (one story).
    embedding_matrix = np.zeros((1, max_sents, embeddings.shape[1]), dtype=np.float32)
    embedding_matrix[0, :len(sentences)] = embeddings

    predictions = model(embedding_matrix)

    # Most probable label id per sentence slot of the single story.
    predicted_id = np.argmax(np.asarray(predictions), axis=-1)[0]

    # Label id 2 marks "sentence is part of the summary".
    return [sent for sent, label_id in zip(sentences, predicted_id) if label_id == 2]

In [None]:
sentance = ['london england reuters harry potter star daniel radcliffe gains access to a reported million million fortune as he turns on monday but he insists the money wont cast a spell on him',
 'daniel radcliffe as harry potter in harry potter and the order of the phoenix',
 'to the disappointment of gossip columnists around the world the young actor says he has no plans to fritter his cash away on fast cars drink and celebrity parties',
 'i dont plan to be one of those people who as soon as they turn suddenly buy themselves a massive sports car collection or something similar he told an australian interviewer earlier this month i dont think ill be particularly extravagant',
 'the things i like buying are things that cost about pounds books and cds and dvds',
 'at radcliffe will be able to gamble in a casino buy a drink in a pub or see the horror film hostel part ii currently six places below his number one movie on the uk box office chart',
 'details of how hell mark his landmark birthday are under wraps his agent and publicist had no comment on his plans',
 'ill definitely have some sort of party he said in an interview hopefully none of you will be reading about it',
 'radcliffes earnings from the first five potter films have been held in a trust fund which he has not been able to touch',
 'despite his growing fame and riches the actor says he is keeping his feet firmly on the ground',
 'people are always looking to say kid star goes off the rails he told reporters last month but i try very hard not to go that way because it would be too easy for them',
 'his latest outing as the boy wizard in harry potter and the order of the phoenix is breaking records on both sides of the atlantic and he will reprise the role in the last two films watch ireporter give her review of potters latest',
 'there is life beyond potter however',
 'the londoner has filmed a tv movie called my boy jack about author rudyard kipling and his son due for release later this year he will also appear in december boys an australian film about four boys who escape an orphanage',
 'earlier this year he made his stage debut playing a tortured teenager in peter shaffers equus',
 'meanwhile he is braced for even closer media scrutiny now that hes legally an adult i just think im going to be more sort of fair game he told reuters email to a friend',
 'copyright reuters all rights reservedthis material may not be published broadcast rewritten or redistributed']

In [None]:
summarize(sentance)

['london england reuters harry potter star daniel radcliffe gains access to a reported million million fortune as he turns on monday but he insists the money wont cast a spell on him',
 'daniel radcliffe as harry potter in harry potter and the order of the phoenix',
 'to the disappointment of gossip columnists around the world the young actor says he has no plans to fritter his cash away on fast cars drink and celebrity parties',
 'i dont plan to be one of those people who as soon as they turn suddenly buy themselves a massive sports car collection or something similar he told an australian interviewer earlier this month i dont think ill be particularly extravagant',
 'the things i like buying are things that cost about pounds books and cds and dvds',
 'at radcliffe will be able to gamble in a casino buy a drink in a pub or see the horror film hostel part ii currently six places below his number one movie on the uk box office chart',
 'details of how hell mark his landmark birthday are