# NLTK Statistical Based Text Summarizer

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def frequency_table(text):
    """
    Function to create a frequency table of words in the text, ignoring stopwords.
    """
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text)
    freq_table = {}
    
    for word in words:
        word = word.lower()
        if word not in stop_words:
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
                
    return freq_table

def score_sentences(sentences, freq_table):
    """
    Function to score sentences based on the frequency of the words that appear in them.
    """
    sentence_weight = {}
    
    for sentence in sentences:
        sentence_wordcount = 0
        words = word_tokenize(sentence)
        for word in words:
            if word.lower() in freq_table:
                sentence_wordcount += 1
                if sentence[:10] in sentence_weight:
                    sentence_weight[sentence[:10]] += freq_table[word.lower()]
                else:
                    sentence_weight[sentence[:10]] = freq_table[word.lower()]
                    
        sentence_weight[sentence[:10]] = sentence_weight.get(sentence[:10], 0) / sentence_wordcount
        
    return sentence_weight

def summarize_text(text, max_summary_length=3):
    """
    Function to summarize the text.
    """
    # Create a frequency table for words
    freq_table = frequency_table(text)
    
    # Tokenize the sentences
    sentences = sent_tokenize(text)
    
    # Score the sentences
    sentence_scores = score_sentences(sentences, freq_table)
    
    # Sort the sentences in descending order of importance
    sorted_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)
    
    summary = ' '.join([item[0] for item in sorted_sentences[:max_summary_length]])
    
    return summary

text = """
    As AI continues to evolve, the implications for the tech industry are profound. The development of advanced AI systems has the potential to reshape the landscape of technology and business, offering new opportunities and challenges alike. With the increasing integration of AI into various sectors, companies must adapt to stay competitive and innovative.
"""

summary = summarize_text(text)
print("Summary:", summary)

Summary: 
    As AI With the i The develo


[nltk_data] Downloading package punkt to C:\Users\Fatima
[nltk_data]     Azfar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Fatima
[nltk_data]     Azfar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Hugging Face Pre-Trained Text Summarizer

In [2]:
from transformers import pipeline

def summarize_text(text):
    """
    Function to summarize text using a pre-trained model from Hugging Face's transformers library.
    """
    # Load the summarization pipeline
    summarizer = pipeline("summarization")
    
    # Generate summary
    summary = summarizer(text, max_length=130, min_length=30, do_sample=False)
    
    # Extract and return the summary text
    return summary[0]['summary_text']

text = """
    As AI continues to evolve, the implications for the tech industry are profound. The development of advanced AI systems has the potential to reshape the landscape of technology and business, offering new opportunities and challenges alike. With the increasing integration of AI into various sectors, companies must adapt to stay competitive and innovative. Moreover, the ethical and societal impacts of AI deployments must be carefully considered to ensure they benefit society as a whole.
"""

summary = summarize_text(text)
print("Summary:", summary)




  from pandas.core import (
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:02<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Your max_length is set to 130, but your input_length is only 91. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


Summary:  The development of advanced AI systems has the potential to reshape the landscape of technology and business . With the increasing integration of AI into various sectors, companies must adapt to stay competitive . The ethical and societal impacts of AI deployments must be carefully considered .


# Sequence-to-Sequence (Seq2Seq) model with LSTM units

Here's a breakdown of the components and the architecture:

1. **Encoder-Decoder Architecture**: This model consists of two primary components: an encoder and a decoder.
   - **Encoder**: Takes the input sequence and processes it into a fixed-sized vector (or state), capturing the essence of the input data.
   - **Decoder**: Takes the output from the encoder and generates the target sequence. The initial state of the decoder is set to the final state of the encoder, allowing the decoder to use the learned context.

2. **LSTM Layers**: Both the encoder and decoder use Long Short-Term Memory (LSTM) layers, which are a type of recurrent neural network (RNN) suitable for sequence prediction problems. LSTM helps the model to retain long-term dependencies and handle vanishing gradient problems that can occur with standard RNNs.

3. **Embedding Layer**: Both the encoder and decoder are equipped with an embedding layer that transforms the integer-encoded vocabulary into dense vectors of a fixed size. This provides a more expressive representation of words and reduces the dimensionality compared to one-hot encoding.

4. **Dense Layer**: The output of the decoder's LSTM is passed through a dense (fully connected) layer with a softmax activation function to predict the probability distribution over the vocabulary for each time step in the output sequence.

# Dataset Used for Training
We are using the BBC News Summary data to train our model: https://www.kaggle.com/datasets/pariza/bbc-news-summary?select=BBC+News+Summary

### Data Preparation

In [10]:
import os
import chardet

def read_files(directory):
    files_content = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            # Detect encoding
            with open(filepath, 'rb') as file:  # Open file in binary mode
                raw_data = file.read()
                encoding = chardet.detect(raw_data)['encoding']
            
            # Read file with detected encoding
            with open(filepath, 'r', encoding=encoding) as file:
                files_content.append(file.read().strip())
    return files_content

def load_data(main_directory):
    """
    Function to load news articles and their summaries from given directory structure.
    """
    categories = ['business', 'entertainment', 'politics', 'sport', 'tech']  # List of categories
    texts = []
    summaries = []

    # Paths for articles and summaries
    articles_path = os.path.join(main_directory, 'News Articles')
    summaries_path = os.path.join(main_directory, 'Summaries')

    for category in categories:
        # Full path to category folder for articles and summaries
        category_articles_path = os.path.join(articles_path, category)
        category_summaries_path = os.path.join(summaries_path, category)

        # Read all articles and summaries from category folder
        category_articles = read_files(category_articles_path)
        category_summaries = read_files(category_summaries_path)

        # Extend the main lists
        texts.extend(category_articles)
        summaries.extend(category_summaries)

    return texts, summaries

main_directory = 'BBC News Summary'

texts, summaries = load_data(main_directory)
print("Number of texts: ", len(texts))
print("Number of summaries: ", len(summaries))
print("Example text: ", texts[0])
print("Example summary: ", summaries[0])

Number of texts:  2225
Number of summaries:  2225
Example text:  Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service

In [11]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts + summaries)

# Convert texts to sequences
text_seq = tokenizer.texts_to_sequences(texts)
summary_seq = tokenizer.texts_to_sequences(summaries)

# Pad sequences
text_seq = pad_sequences(text_seq, maxlen=50)
summary_seq = pad_sequences(summary_seq, maxlen=20)

### Model Design

In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Parameters
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
lstm_units = 128

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

### Training

In [13]:
# Prepare decoder input data that just contains the start token
decoder_input_data = np.zeros_like(summary_seq)
decoder_input_data[:, 1:] = summary_seq[:, :-1]
decoder_input_data[:, 0] = 1  # Assuming 1 is the start token

# Prepare decoder target data
decoder_target_data = np.expand_dims(summary_seq, -1)

# Training the model
model.fit([text_seq, decoder_input_data], decoder_target_data, batch_size=16, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x29952dbb990>

### Inference

In [20]:
# Inference setup: Define encoder model
encoder_model = Model(encoder_inputs, encoder_states)

# Inference setup: Define decoder model
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Function to decode sequence
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1 with only the start character.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = 1  # Start token

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = tokenizer.index_word.get(sampled_token_index, 'unknown')  # Handle unknown tokens

        # Append sampled word (or 'unknown') to the decoded sentence
        decoded_sentence += ' ' + sampled_char

        # Exit condition: either hit max length or find stop character.
        if (sampled_char == 'end' or len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence


print(decode_sequence(text_seq[0:1]))

 for example was in the us and whether people's privacy


In [23]:
input_text = input("Enter a text to summarize: ")
input_seq = tokenizer.texts_to_sequences([input_text])
input_seq = pad_sequences(input_seq, maxlen=50)
summary = decode_sequence(input_seq)
print("Summary:", summary)

Summary:  for example was in the us and whether people's privacy
