In [7]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from scipy.ndimage import gaussian_filter1d

In [5]:
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

In [8]:
# Import file and load in dataframe
#------------------------------------------------------------
file ='bbc_text_cls.csv'
# Load the CSV file into a DataFrame
df = pd.read_csv(file, encoding="ISO-8859-1")

In [9]:
# For each doc, split by sentence
#------------------------------------------------------------
df_numpy = df['text'].to_numpy()
docs = []
for index, doc in enumerate(df_numpy):
    # Split the text into sentences using nltk's sent_tokenize
    sentences = sent_tokenize(doc)
    docs.append(sentences)

In [10]:
## --- FUNCTIONS ---

# Function to print a document in a nice-readable format
#------------------------------------------------------------
def print_single_document(doc):
    for i, sentence in enumerate(doc):
        print(f"{sentence}")
        if i < len(doc) - 1:
            print()  # Add a blank line between sentences

In [11]:
## CHOOSE DOCUMENT IN docs
#------------------------------------------------------------
doc_index = 2222

In [12]:
## PRINT THE ENTIRE DOCUMENT
#------------------------------------------------------------
print_single_document(docs[doc_index])

Be careful how you code

A new European directive could put software writers at risk of legal action, warns former programmer and technology analyst Bill Thompson.

If it gets its way, the Dutch government will conclude its presidency of the European Union by pushing through a controversial measure that has been rejected by the European Parliament, lacks majority support from national governments and will leave millions of European citizens in legal limbo and facing the possibility of court cases against them.

If the new law was about border controls, defence or even the new constitution, then our TV screens would be full of experts agonising over the impact on our daily lives.

Sadly for those who will be directly affected, the controversy concerns the patenting of computer programs, a topic that may excite the bloggers, campaigning groups and technical press but does not obsess Middle Britain.

After all, how much fuss can you generate about the Directive on the Patentability of Com

In [13]:
docs[doc_index]

['Be careful how you code\n\nA new European directive could put software writers at risk of legal action, warns former programmer and technology analyst Bill Thompson.',
 'If it gets its way, the Dutch government will conclude its presidency of the European Union by pushing through a controversial measure that has been rejected by the European Parliament, lacks majority support from national governments and will leave millions of European citizens in legal limbo and facing the possibility of court cases against them.',
 'If the new law was about border controls, defence or even the new constitution, then our TV screens would be full of experts agonising over the impact on our daily lives.',
 'Sadly for those who will be directly affected, the controversy concerns the patenting of computer programs, a topic that may excite the bloggers, campaigning groups and technical press but does not obsess Middle Britain.',
 'After all, how much fuss can you generate about the Directive on the Pate

In [22]:
## SETUP SUMMARIZER
#------------------------------------------------------------
summarizer = TextRankSummarizer()
parser = PlaintextParser.from_string(
    docs[doc_index],
    Tokenizer("english"))

In [17]:
## PRINT THE SUMMARIZED VERSION
#------------------------------------------------------------
num_sentences = 4
summary = summarizer(parser.document, sentences_count = num_sentences)

In [20]:
summary

(<Sentence: ', 'If it gets its way, the Dutch government will conclude its presidency of the European Union by pushing through a controversial measure that has been rejected by the European Parliament, lacks majority support from national governments and will leave millions of European citizens in legal limbo and facing the possibility of court cases against them.>,
 <Sentence: ', "Yet if the new directive is nodded through at the next meeting of one of the EU's ministerial councils, as seems likely, it will allow programs to be patented in Europe just as they are in the US.>,
 <Sentence: ', 'First, there is the abuse of the democratic process involved in disregarding the views of the parliament and abandoning all of their carefully argued amendments.>,
 <Sentence: ', 'Much of the really useful software we use every day, programs like the Apache web server, the GNU/Linux operating system and the fearsomely popular Firefox browser, is developed outside company structures by people who d

In [19]:
print_single_document(summary)

', 'If it gets its way, the Dutch government will conclude its presidency of the European Union by pushing through a controversial measure that has been rejected by the European Parliament, lacks majority support from national governments and will leave millions of European citizens in legal limbo and facing the possibility of court cases against them.

', "Yet if the new directive is nodded through at the next meeting of one of the EU's ministerial councils, as seems likely, it will allow programs to be patented in Europe just as they are in the US.

', 'First, there is the abuse of the democratic process involved in disregarding the views of the parliament and abandoning all of their carefully argued amendments.

', 'Much of the really useful software we use every day, programs like the Apache web server, the GNU/Linux operating system and the fearsomely popular Firefox browser, is developed outside company structures by people who do not have legal departments to check for patent in