## Importing Libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
from string import punctuation
from nltk.tokenize import word_tokenize
from math import log
from nltk.stem import PorterStemmer, WordNetLemmatizer

import os
import re
import pickle
import unicodedata
from random import randint


from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, TimeDistributed

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

## Data Pre-processing 

#### Get the dataset 

In [None]:
df1 = pd.read_csv('news_summary.csv', encoding='iso-8859-1').reset_index(drop=True)
df2 = pd.read_csv('news_summary_more.csv', encoding='iso-8859-1').reset_index(drop=True)

In [None]:
df1.sample(5)

In [None]:
df2.sample(5)

In [None]:
# Remove unwanted columns
df1 = df1[['text', 'headlines']]

# Remove any rows with missing data
df1.dropna(inplace=True)

# Concatenate df1 and df2 in df 
df = pd.concat([df1, df2], axis='rows')
del df1, df2

# Shuffling the data frame
df = df.sample(frac=1).reset_index(drop=True)

print(f'Dataset size Dataset: {len(df)}')
df.sample(5)

#### Convert to Lower case  

In [None]:
df.text = df.text.apply(str.lower)
df.headlines = df.headlines.apply(str.lower)

df.sample(5)

#### Remove Punctuations

In [None]:
# Remove puncuation from word
def rm_punc_from_word(word):
    clean_alphabet_list = [alphabet for alphabet in word if alphabet not in string.punctuation]
    return ''.join(clean_alphabet_list)

# Remove puncuation from text
def rm_punc_from_text(text):
    clean_word_list = [rm_punc_from_word(word) for word in text]
    return ''.join(clean_word_list)

# Remove numbers from text
def rm_number_from_text(text):
    text = re.sub('[0-9]+', '', text)
    return ' '.join(text.split())

#### Remove Stop words 

In [None]:
# Function to remove stop words
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [None]:
# Define function for clean text
def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    text = re.sub("(__+)", ' ', str(text)).lower()
    text = re.sub("(--+)", ' ', str(text)).lower()
    text = re.sub("(~~+)", ' ', str(text)).lower()
    text = re.sub("(\+\++)", ' ', str(text)).lower()
    text = re.sub("(\.\.+)", ' ', str(text)).lower()

    text = re.sub(r"[<>()|&©ø\[\]\'\",;?~*!]", ' ', str(text)).lower()
    text = text.lower()
    text = remove_stopwords(text)
    text = rm_punc_from_text(text)
    text = rm_number_from_text(text)
    text = re.sub('–', '', text)
    text = ' '.join(text.split())
    
    return text

df.text = df.text.apply(clean_text)
df.headlines = df.headlines.apply(clean_text)
df.sample(5)

#### Word Tokenization and Stemming 

In [None]:
# Define function for text preprocessing
def preprocess_text(text):
    
    # Tokenize into sentences
    sentences = sent_tokenize(text)
     # Tokenize the text
    words = word_tokenize(text)
 
    # Stem each word
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]
        

    # Join the preprocessed sentences
    preprocessed_sentences = []
    preprocessed_sentences.append(' '.join(stemmed_words))
    # Join the preprocessed sentences
    preprocessed_text = ' '.join(preprocessed_sentences)
    
    return preprocessed_text

# Apply the preprocessing function to the text and headlines columns
df['text'] = df['text'].apply(preprocess_text)
df['headlines'] = df['headlines'].apply(preprocess_text) 


In [None]:
# Save the cleaned dataset
df.to_csv('cleaned_data.csv')

In [None]:
df

#### TF-IDF

In [22]:
# Load the cleaned dataset
df = pd.read_csv('cleaned_data.csv')

# Define the function for computing the TF-IDF scores
def compute_tfidf(corpus):
    # Compute the document frequency for each term in the corpus
    df_dict = {}
    for doc in corpus:
        for term in set(doc):
            df_dict[term] = df_dict.get(term, 0) + 1  # The dict.get() method returns the value for a given key 
                                                     # if it exists in the dictionary, or a default value (in this case, 0) 
                                                     # if it doesn't exist. We can then add 1 to this value to increment the count.
    
    # Compute the inverse document frequency for each term
    N = len(corpus)         # N is the total number of documents in the corpus
    # computes  IDF by taking the natural logarithm of N divided by its document frequency.
    idf_dict = {term: log(N/df) for term, df in df_dict.items()}      # df(term) is the number of documents in the corpus that contain the term.
    
    # Compute the term frequency - inverse document frequency for each term in each document
    tfidf_matrix = []
    for doc in corpus:
        tfidf_doc = {}
        for term in set(doc):
            df = doc.count(term) / len(doc)
            idf = idf_dict[term]
            tfidf_doc[term] = df * idf
        tfidf_matrix.append(tfidf_doc)
    
    return tfidf_matrix


# Preprocess the text data
corpus = [word_tokenize(text) for text in df['text']]

# Compute the TF-IDF scores for the text data
tfidf_matrix = compute_tfidf(corpus)

# Print the top 10 most frequent words in the corpus
freq_dist = nltk.FreqDist([term for doc in corpus for term in doc])
print('The Top 10 most frequent words in the corpus is: \n',freq_dist.most_common(10))
print('-------------------------------------------------')
# Print the TF-IDF scores for the first document
print('The TF-IDF scores for the first article is: \n',tfidf_matrix[0])

The Top 10 most frequent words in the corpus is: 
 [('said', 15367), ('year', 6535), ('ad', 6501), ('india', 5484), ('also', 3405), ('polic', 3159), ('us', 3053), ('minist', 2991), ('govern', 2978), ('old', 2963)]
-------------------------------------------------
The TF-IDF scores for the first article is: 
 {'major': 0.11445516428587571, 'inflammatori': 0.22473511888715572, 'ace': 0.21695992773981032, 'base': 0.0810684701065436, 'till': 0.11838138163420596, 'centr': 0.10445231358372034, 'drug': 0.3966505224684188, 'among': 0.10607767463070772, 'court': 0.14661977085588, 'indian': 0.0643922222275481, 'submit': 0.1358257973802258, 'allow': 0.10031614243073968, 'proxyvon': 0.2544273429052128, 'relev': 0.1851043872981442, 'ministri': 0.11177559535844202, 'arriv': 0.13499921837016957, 'combin': 0.1543921544156859, 'medicin': 0.32591467157551884, 'delhi': 0.0730783182962899, 'high': 0.09318013373970757, 'dose': 0.21695992773981032, 'wockhardt': 0.2544273429052128, 'health': 0.12029640695224

#### PoS tagging

In [23]:
# Load the cleaned dataset
df = pd.read_csv('cleaned_data.csv')

# Define function for PoS tagging
def pos_tagging(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Perform PoS tagging
    pos_tags = nltk.pos_tag(tokens)
    # Extract the POS tags
    pos_tags_only = [tag[1] for tag in pos_tags]
    # Join the PoS tags into a string
    pos_tags_string = " ".join(pos_tags_only)
    return pos_tags_string

# Apply PoS tagging to the text and headlines columns
df['text_pos'] = df['text'].apply(pos_tagging)
df['headlines_pos'] = df['headlines'].apply(pos_tagging)

# Save the POS-tagged dataset
df.to_csv('news_summ_pos.csv', index=False)

df.drop('Unnamed: 0',axis=1,inplace=True)
df = pd.DataFrame(df)
df

Unnamed: 0,text,headlines,text_pos,headlines_pos
0,delhi high court allow indian drug major wockh...,hc allow wockhardt sell drug ban govt till sept,RB JJ NN JJ JJ NN JJ NN NN IN JJ JJ NN NN NN J...,NN VB NN JJ NN NN NN NN VBD
1,talk ms dhoni slam three consecut fifti odi se...,ms dhoni superstar time great justin langer,NN NN NN VBD CD NN NN IN JJ JJ NN NN JJ NN NN ...,NN NN NN NN JJ NN NN
2,accord monitor committe overse clean yamuna ab...,delhi slum major caus yamuna pollut panel,NN NN NN JJ JJ NN NN NN NN JJ NN NN VBZ CD JJ ...,NN NN JJ NN NN NN NN
3,comedian televis host krushna abhishek said ka...,kapil sharma give one liner perform krushna,JJ NN NN NN NN VBD JJ JJ VB CD JJR NN NN VBD J...,NNS VBD JJ CD NN NN NN
4,border road organis bro reportedli construct t...,tunnel propos arunach cut distanc china border,NN NN NN NN JJ NN CD NN NN JJ NN NN NN NN NN N...,NN NN NN NN NN NN NN
...,...,...,...,...
24508,former pm manmohan singh monday alleg central ...,bjp govt take nation wrong path ex pm manmohan...,JJ NN NN NN NN IN JJ NNS VBP NN IN JJ NN VBD J...,NN NNS VBP NN JJ NN NN NN NN NN
24509,photograph actress kareena kapoor celebr chris...,pic kareena celebr christma eve share onlin,NN NN VBD NNP NN NN VBP NN NN NN FW JJ NN NNS ...,NN NN NN NN VBP NN NN
24510,sridevi said alway pressur actor look good eve...,star look best even tackl hell within sridevi,NN VBD RB JJ NN VBP JJ RB VBP NN NN RBR NN JJS...,NN NN RBS RB VBP NN IN NN
24511,comment sexual harass alleg made tanushre dutt...,silenc voic pooja tanushre row,NN JJ NN NN VBD JJ NN NN NN NN NN VBD VBP JJ N...,NN NN NN NN NN


In [24]:
df = pd.read_csv('cleaned_data.csv')
df

Unnamed: 0.1,Unnamed: 0,text,headlines
0,0,delhi high court allow indian drug major wockh...,hc allow wockhardt sell drug ban govt till sept
1,1,talk ms dhoni slam three consecut fifti odi se...,ms dhoni superstar time great justin langer
2,2,accord monitor committe overse clean yamuna ab...,delhi slum major caus yamuna pollut panel
3,3,comedian televis host krushna abhishek said ka...,kapil sharma give one liner perform krushna
4,4,border road organis bro reportedli construct t...,tunnel propos arunach cut distanc china border
...,...,...,...
24508,24508,former pm manmohan singh monday alleg central ...,bjp govt take nation wrong path ex pm manmohan...
24509,24509,photograph actress kareena kapoor celebr chris...,pic kareena celebr christma eve share onlin
24510,24510,sridevi said alway pressur actor look good eve...,star look best even tackl hell within sridevi
24511,24511,comment sexual harass alleg made tanushre dutt...,silenc voic pooja tanushre row


In [22]:
# df.headlines = df.headlines.apply(lambda x: f'_START_ {x} _END_')

In [23]:
# بديل ---> NER 
# تستخدم لتحديد بداية النص ونهايته  
# start_token = 'sostok' # start of Summary
# end_token = 'eostok'   # end of Summary
# df.headlines = df.headlines.apply(lambda x: f'{start_token} {x} {end_token}')
# df.sample(5)


In [25]:
#Model to summarize the text between 0-15 words for Summary and 0-100 words for Text
max_text_len=100
max_summary_len=15

In [26]:
# This function I use to determine the biggest length and the smallest height I can deal with 
# select the summary and text between their defined max lens respectively
def trim_text_and_summary(df, max_text_len, max_summary_len):
    cleaned_text = np.array(df['text'])
    cleaned_summary = np.array(df['headlines'])

    short_text = []
    short_summary = []

    for i in range(len(cleaned_text)):
        if len(cleaned_text[i].split()) <= max_text_len and len(
            cleaned_summary[i].split()
        ) <= max_summary_len:
            short_text.append(cleaned_text[i])
            short_summary.append(cleaned_summary[i])
        # change headlines to summary  
    df = pd.DataFrame({'text': short_text, 'summary': short_summary})
    return df


df = trim_text_and_summary(df, max_text_len, max_summary_len)
print(f'Dataset size: {len(df)}')
df.sample(5)



Dataset size: 23687


Unnamed: 0,text,summary
11224,pm narendra modi saturday met european council...,pm eu leader discuss way fight terror g meet
22465,ladi bu driver us milwauke rescu less year old...,us ladi bu driver rescu babi roam highway cold...
20760,pakistan state run news channel ptv wrote beij...,imran khan govt fire md state run tv beg error
10598,fijian cricket ilikena lasarusa talebulamainei...,cricket longest known surnam histori
12996,video clip show weatherman struggl stand repor...,report stand peopl walk calmli viral hurrican ...


In [27]:
# تستخدم لحذف الكلمات الشاذة 
# rare word analysis
def get_rare_word_percent(tokenizer, threshold):
    # threshold: if the word's occurrence is less than this then it's rare word

    count = 0
    total_count = 0
    frequency = 0
    total_frequency = 0

    for key, value in tokenizer.word_counts.items():
        total_count += 1
        total_frequency += value
        if value < threshold:
            count += 1
            frequency += value

    return {
        'percent': round((count / total_count) * 100, 2),
        'total_coverage': round(frequency / total_frequency * 100, 2),
        'count': count,
        'total_count': total_count
    }


### Building Our Model 

#### Train / Test Split

In [28]:
# Splitting the training and validation sets
x_train, x_val, y_train, y_val = train_test_split( np.array(df['text']), np.array(df['summary']), 
    test_size=0.2,
    random_state=1,
    shuffle=True
)   # x_val = x_test  AND  y_val = y_test

In [29]:
x_train.shape

(18949,)

In [30]:
x_val.shape

(4738,)

In [None]:
#Lets tokenize the text to get the vocab count , you can use Spacy here also

from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences

#prepare a tokenizer for reviews on training data
x_tokenizer = Tokenizer() 
x_tokenizer.fit_on_texts(list(x_train))

In [None]:
#prepare a tokenizer for reviews on training data
x_tokenizer = Tokenizer(num_words=tot_cnt-cnt) 
x_tokenizer.fit_on_texts(list(x_train))

#convert text sequences into integer sequences (i.e one-hot encodeing all the words)
x_tr_seq    =   x_tokenizer.texts_to_sequences(x_train) 
x_val_seq   =   x_tokenizer.texts_to_sequences(x_val)

#padding zero upto maximum length
x_train    =   pad_sequences(x_tr_seq,  maxlen=max_text_len, padding='post')
x_val   =   pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')

#size of vocabulary ( +1 for padding token)
x_voc   =  x_tokenizer.num_words + 1

print("Size of vocabulary in X = {}".format(x_voc))

In [None]:
#prepare a tokenizer for reviews on training data
y_tokenizer = Tokenizer()   
y_tokenizer.fit_on_texts(list(y_train))

In [None]:
#prepare a tokenizer for reviews on training data
y_tokenizer = Tokenizer(num_words=tot_cnt-cnt) 
y_tokenizer.fit_on_texts(list(y_train))

#convert text sequences into integer sequences (i.e one hot encode the text in Y)
y_tr_seq    =   y_tokenizer.texts_to_sequences(y_train) 
y_val_seq   =   y_tokenizer.texts_to_sequences(y_val) 

#padding zero upto maximum length
y_train    =   pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
y_val   =   pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')

#size of vocabulary
y_voc  =   y_tokenizer.num_words +1
print("Size of vocabulary in Y = {}".format(y_voc))

In [None]:
ind=[]
for i in range(len(y_train)):
    cnt=0
    for j in y_train[i]:
        if j!=0:
            cnt=cnt+1
    if(cnt==2):
        ind.append(i)

y_train=np.delete(y_train,ind, axis=0)
x_train=np.delete(y_train,ind, axis=0)

In [None]:
ind=[]
for i in range(len(y_val)):
    cnt=0
    for j in y_val[i]:
        if j!=0:
            cnt=cnt+1
    if(cnt==2):
        ind.append(i)

y_val=np.delete(y_val,ind, axis=0)
x_val=np.delete(x_val,ind, axis=0)

In [None]:
from keras import backend as K 
import gensim
from numpy import *
import numpy as np
import pandas as pd 
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

print("Size of vocabulary from the w2v model = {}".format(x_voc))

K.clear_session()

latent_dim = 300
embedding_dim=200

# Encoder
encoder_inputs = Input(shape=(max_text_len,))

#embedding layer
enc_emb =  Embedding(x_voc, embedding_dim,trainable=True)(encoder_inputs)

#encoder lstm 1
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

#encoder lstm 2
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

#encoder lstm 3
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))

#embedding layer
dec_emb_layer = Embedding(y_voc, embedding_dim,trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.4,recurrent_dropout=0.2)
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

#dense layer
decoder_dense =  TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model 
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

In [None]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [None]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)

In [None]:
history=model.fit([x_tr,y_tr[:,:-1]], y_tr.reshape(y_tr.shape[0],y_tr.shape[1], 1)[:,1:] ,epochs=50,callbacks=[es],batch_size=128, validation_data=([x_val,y_val[:,:-1]], y_val.reshape(y_val.shape[0],y_val.shape[1], 1)[:,1:]))

In [None]:
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index

In [None]:
# Encode the input sequence to get the feature vector
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(max_text_len,latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs) 
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2) 

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)
    
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    
    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
      
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]
        
        if(sampled_token!='eostok'):
            decoded_sentence += ' '+sampled_token

        # Exit condition: either hit max length or find stop word.
        if (sampled_token == 'eostok'  or len(decoded_sentence.split()) >= (max_summary_len-1)):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        e_h, e_c = h, c

    return decoded_sentence

In [None]:
def seq2summary(input_seq):
    newString=''
    for i in input_seq:
        if((i!=0 and i!=target_word_index['sostok']) and i!=target_word_index['eostok']):
            newString=newString+reverse_target_word_index[i]+' '
    return newString

def seq2text(input_seq):
    newString=''
    for i in input_seq:
        if(i!=0):
            newString=newString+reverse_source_word_index[i]+' '
    return newString

In [None]:
for i in range(0,100):
    print("Review:",seq2text(x_tr[i]))
    print("Original summary:",seq2summary(y_tr[i]))
    print("Predicted summary:",decode_sequence(x_tr[i].reshape(1,max_text_len)))
    print("\n")