In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras.models import Model
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

# Ensure NLTK data is downloaded
nltk.download('stopwords')

# 1. Data Collection
def load_data(filepath):
    data = pd.read_csv(filepath, encoding='latin1')  # or encoding='ISO-8859-1'
    return data

# 2. Data Processing
def clean_text(text):
    text = BeautifulSoup(text, "lxml").text  # Remove HTML tags
    text = re.sub(r'\([^)]*\)', '', text)  # Remove text in parentheses
    text = re.sub('"', '', text)  # Remove quotations
    text = re.sub(r"'s\b", "", text)
    text = re.sub("[^a-zA-Z]", " ", text)  # Remove special characters and digits
    text = text.lower()  # Lowercase all characters
    return text

def preprocess_data(data, text_column, summary_column):
    data['cleaned_text'] = data[text_column].apply(lambda x: clean_text(x))
    data['cleaned_summary'] = data[summary_column].apply(lambda x: clean_text(x))
    return data

# 3. Exploratory Data Analysis (EDA)
def eda(data):
    data_len = [len(s.split()) for s in data['cleaned_text']]
    plt.hist(data_len, bins=30)
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.show()

# 4. Text Preprocessing
def tokenize_text(data, max_len):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data)
    sequences = tokenizer.texts_to_sequences(data)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post')
    return tokenizer, padded

def preprocess_text(data):
    stop_words = set(stopwords.words('english'))
    data['cleaned_text'] = data['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    data['cleaned_summary'] = data['cleaned_summary'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    return data

# 5. Abstractive Summarization Model (Seq2Seq)
def build_model(vocab_size, max_len_text, max_len_summary):
    embedding_dim = 300
    latent_dim = 500

    # Encoder
    encoder_inputs = Input(shape=(max_len_text,))
    enc_emb = Embedding(vocab_size, embedding_dim, trainable=True)(encoder_inputs)

    # LSTM 1
    encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True)
    encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

    # LSTM 2
    encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True)
    encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

    # LSTM 3
    encoder_lstm3 = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

    # Decoder
    decoder_inputs = Input(shape=(None,))
    dec_emb_layer = Embedding(vocab_size, embedding_dim, trainable=True)
    dec_emb = dec_emb_layer(decoder_inputs)

    # LSTM using encoder_states as initial state
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

    # Dense layer
    decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return model

# 6. Model Training
def train_model(model, x_train, y_train, x_val, y_val, batch_size, epochs):
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
    model.fit([x_train['text'], x_train['summary'][:, :-1]],
              y_train['summary'][:, 1:],
              epochs=epochs,
              batch_size=batch_size,
              validation_data=([x_val['text'], x_val['summary'][:, :-1]], y_val['summary'][:, 1:]))
    return model

# 7. Evaluation using ROUGE, BLEU
def evaluate_model(model, x_val, y_val, tokenizer, max_len_summary):
    y_pred = model.predict([x_val['text'], x_val['summary'][:, :-1]])
    y_pred = np.argmax(y_pred, axis=-1)

    rouge = Rouge()
    references = [' '.join([tokenizer.index_word[i] for i in y if i != 0]) for y in y_val['summary'][:, 1:]]
    hypotheses = [' '.join([tokenizer.index_word[i] for i in y if i != 0]) for y in y_pred]

    scores = rouge.get_scores(hypotheses, references, avg=True)

    bleu_scores = []
    for i in range(len(y_pred)):
        ref = [tokenizer.index_word[j] for j in y_val['summary'][i, 1:] if j != 0]
        pred = [tokenizer.index_word[j] for j in y_pred[i] if j != 0]
        bleu_scores.append(sentence_bleu([ref], pred))

    return scores, np.mean(bleu_scores)

# 8. Model Testing
def test_model(model, text, tokenizer, max_len_text, max_len_summary):
    text = clean_text(text)
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])

    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len_text, padding='post')

    pred = model.predict([padded, np.zeros((1, max_len_summary))])
    pred = np.argmax(pred, axis=-1)

    summary = ' '.join([tokenizer.index_word[i] for i in pred[0] if i != 0])
    return summary

# Load the data
filepath = 'd:\\PROJECTS\\News Summarizer\\news_summary.csv'  # Change this to the correct path
data = load_data(filepath)

# Print column names to verify correct column names
print("Column names:", data.columns)

# Columns containing the text and summary
text_column = 'text'
summary_column = 'headlines'

# Check if the specified columns exist in the DataFrame
if text_column not in data.columns or summary_column not in data.columns:
    raise KeyError(f"Columns '{text_column}' or '{summary_column}' not found in the dataset.")

data = preprocess_data(data, text_column=text_column, summary_column=summary_column)
print("First few rows after preprocessing:")
print(data.head())

eda(data)

max_len_text = 300
max_len_summary = 50

data = preprocess_text(data)
tokenizer, padded_texts = tokenize_text(data['cleaned_text'], max_len_text)
_, padded_summaries = tokenize_text(data['cleaned_summary'], max_len_summary)

print("Shape of padded_texts:", padded_texts.shape)
print("Shape of padded_summaries:", padded_summaries.shape)

# Ensure padded_summaries is in the correct format
padded_summaries = np.array(padded_summaries)

# Perform train-test split
text_train, text_val, summary_train, summary_val = train_test_split(
    padded_texts, padded_summaries, test_size=0.2, random_state=0)

# Create dictionaries for training and validation sets
x_train = {'text': text_train, 'summary': summary_train}
x_val = {'text': text_val, 'summary': summary_val}
y_train = {'text': text_train, 'summary': summary_train}
y_val = {'text': text_val, 'summary': summary_val}

print("Shapes of train and validation sets:")
print("x_train:", {k: v.shape for k, v in x_train.items()})
print("y_train:", {k: v.shape for k, v in y_train.items()})
print("x_val:", {k: v.shape for k, v in x_val.items()})
print("y_val:", {k: v.shape for k, v in y_val.items()})

model = build_model(len(tokenizer.word_index) + 1, max_len_text, max_len_summary)
model = train_model(model, x_train, y_train, x_val, y_val, batch_size=64, epochs=10)

scores, bleu_score = evaluate_model(model, x_val, y_val, tokenizer, max_len_summary)
print('ROUGE Scores:', scores)
print('BLEU Score:', bleu_score)

# Test the model
test_text = "Your input text here."
print('Summary:', test_model(model, test_text, tokenizer, max_len_text, max_len_summary))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Charan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  data = pd.read_csv(filepath, encoding='latin1')  # or encoding='ISO-8859-1'


Column names: Index(['author', 'date', 'headlines', 'read_more', 'text', 'ctext',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       ...
       'Unnamed: 272', 'Unnamed: 273', 'Unnamed: 274', 'Unnamed: 275',
       'Unnamed: 276', 'Unnamed: 277', 'Unnamed: 278', 'Unnamed: 279',
       'Unnamed: 280', 'Unnamed: 281'],
      dtype='object', length=282)


FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?