In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

In [2]:
# Import file and load in dataframe
#------------------------------------------------------------
file ='bbc_text_cls.csv'
# Load the CSV file into a DataFrame.
# The text is decoded as UTF-8 first: decoding UTF-8 bytes as ISO-8859-1 turns
# multi-byte characters into mojibake (e.g. "£" shows up as "Â£" in the output).
# ISO-8859-1 is kept only as a fallback for files that are not valid UTF-8.
try:
    df = pd.read_csv(file, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(file, encoding="ISO-8859-1")

In [5]:
# For each doc, split by sentence
#------------------------------------------------------------
df_numpy = df['text'].to_numpy()
# Split every text into sentences using nltk's sent_tokenize.
# docs keeps one list of sentences per document, in the same order as
# df_numpy, so docs[i] holds the sentences of df_numpy[i].
docs = [sent_tokenize(text) for text in df_numpy]

In [14]:
## --- FUNCTIONS ---

# Function to print a document in an easy-to-read format
#------------------------------------------------------------
def print_single_document(doc):
    """Print the sentences of `doc` one per line, separated by blank lines.

    `doc` is a sized sequence of sentences. No blank line is printed after
    the final sentence, and nothing is printed for an empty sequence.
    """
    for position, sentence in enumerate(doc, start=1):
        print(f"{sentence}")
        # Every sentence except the last one is followed by an empty line.
        if position < len(doc):
            print()

# Function for text summarization
#------------------------------------------------------------
def text_summarization(doc, num_sentences = 2, just_print=0):
    """Build an extractive summary from the highest-scoring sentences of `doc`.

    Every sentence is converted to a TF-IDF vector (English stop words
    removed) and scored with the mean of that vector across all columns of
    the fitted vocabulary (zeros included). The `num_sentences` best-scoring
    sentences are kept and emitted in the order in which they appear in `doc`.

    Parameters
    ----------
    doc : list of str
        The sentences of one document.
    num_sentences : int, default 2
        Number of sentences to keep; must satisfy 0 <= num_sentences <= len(doc).
    just_print : int, default 0
        When equal to 1 the summary is printed with print_single_document and
        nothing is returned; for any other value the summary is returned.

    Returns
    -------
    list or None
        The selected sentences when just_print != 1.
        None when just_print == 1, and also when num_sentences is larger than
        len(doc) (in that case only a warning is printed).

    Raises
    ------
    ValueError
        If num_sentences is negative. A negative value would otherwise be
        interpreted as a slice offset from the end and silently return the
        wrong number of sentences.

    Notes
    -----
    Errors raised by TfidfVectorizer.fit_transform (for instance when no
    usable term is left after stop-word removal) are not caught here.
    """
    # doc must be a list that contains sentences
    L = len(doc) # doc's size
    if num_sentences < 0:
        raise ValueError(f"num_sentences must be non-negative, got {num_sentences}")
    if num_sentences > L:
        print("Warning: Please ensure that the number of sentences for summarization are less than the sentences contained inside your document")
        return None

    # OUTPUT
    sum_doc = []
    # Asking for zero sentences yields an empty summary without fitting the
    # vectorizer, so an empty document no longer crashes on this path.
    if num_sentences > 0:
        ## compute tfidf vectors for each word in each sentence of the document
        vectorizer = TfidfVectorizer(stop_words='english',decode_error='ignore')
        docs_tfidf = vectorizer.fit_transform(doc)
        ## compute average tfidf score per sentence (one score per row)
        scores = np.array(np.mean(docs_tfidf, axis=1)).flatten()
        # sentence indices ordered from highest to lowest score
        sorted_indices = np.argsort(scores)[::-1]
        ## keep the top N sentences, restored to their original document order
        sum_indices = np.sort(sorted_indices[:num_sentences])
        sum_doc = [doc[index] for index in sum_indices]

    if just_print == 1:
        print_single_document(sum_doc)
        return None
    return sum_doc

In [115]:
## CHOOSE DOCUMENT IN docs
#------------------------------------------------------------
# Position in `docs` of the document that is printed and summarized below.
doc_index = 12

In [116]:
## PRINT THE ENTIRE DOCUMENT
#------------------------------------------------------------
# Print every sentence of the selected document, with a blank line between sentences.
print_single_document(docs[doc_index])

Peugeot deal boosts Mitsubishi

Struggling Japanese car maker Mitsubishi Motors has struck a deal to supply French car maker Peugeot with 30,000 sports utility vehicles (SUV).

The two firms signed a Memorandum of Understanding, and say they expect to seal a final agreement by Spring 2005.


The SUVs will be built in Japan using Peugeot's diesel engines and sold mainly in the European market.

Falling sales have left Mitsubishi Motors with underused capacity, and the production deal with Peugeot gives it a chance to utilise some of it.


Its sales have slid 41% in the past year, catalysed by the revelation that the company had systematically been hiding records of faults and then secretly repairing vehicles.

As a result, the Japanese car maker has sought a series of financial bailouts.

Last month it said it was looking for a further 540bn yen ($5.2bn; Â£2.77bn) in fresh financial backing, half of it from other companies in the Mitsubishi group.

US-German carmaker DaimlerChrylser, a 

In [117]:
## PRINT THE SUMMARIZED VERSION
#------------------------------------------------------------
# Number of sentences to keep in the summary.
num_sentences = 3
# just_print=1 makes the function print the summary instead of returning it.
text_summarization(docs[doc_index], num_sentences = num_sentences, just_print=1)

Peugeot deal boosts Mitsubishi

Struggling Japanese car maker Mitsubishi Motors has struck a deal to supply French car maker Peugeot with 30,000 sports utility vehicles (SUV).

The deal with Peugeot was celebrated by Mitsubishi's newly-appointed chief executive Takashi Nishioka, who took over after three top bosses stood down last month to shoulder responsibility for the firm's troubles.

Last month, it signed a production agreement with Japanese rival Nissan Motor to supply it with 36,000 small cars for sale in Japan.
