In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from scipy.ndimage import gaussian_filter1d

In [2]:
# Load the dataset into a DataFrame
#------------------------------------------------------------
# Path of the CSV file, relative to the notebook's working directory
file ='bbc_text_cls.csv'
# Read the CSV with an explicit ISO-8859-1 (Latin-1) encoding
# NOTE(review): presumably the file is not valid UTF-8 — confirm
df = pd.read_csv(file, encoding="ISO-8859-1")

In [3]:
# Tokenize every document of the 'text' column into sentences
#------------------------------------------------------------
df_numpy = df['text'].to_numpy()
# docs[i] is the list of sentences nltk's sent_tokenize produced for document i
docs = [sent_tokenize(text) for text in df_numpy]

In [33]:
## --- FUNCTIONS ---

# Function to print a document in a nice-readable format
#------------------------------------------------------------
def print_single_document(doc):
    """Print each sentence of ``doc`` on its own line, with one blank line
    between consecutive sentences (none after the last one).

    Parameters
    ----------
    doc : list
        The sentences of the document, in the order they should be printed.
    """
    for position, sentence in enumerate(doc):
        if position > 0:
            print()  # Blank separator line before every sentence but the first
        print(f"{sentence}")

# Function for text summarization
#------------------------------------------------------------
def textrank_summarization(doc, num_sentences = 2, just_print=0):
    """Build an extractive summary of a document with TextRank.

    Every sentence is vectorized with TF-IDF, the pairwise cosine
    similarities are row-normalized into a transition matrix, the matrix is
    smoothed with a Gaussian filter, and the stationary distribution of the
    resulting Markov chain is used as the score of each sentence.

    Parameters
    ----------
    doc : list of str
        The document, already split into sentences.
    num_sentences : int, default 2
        Number of sentences to keep in the summary.
    just_print : int, default 0
        If 1, the summary is printed and nothing is returned; any other
        value returns the summary instead.

    Returns
    -------
    list of str or None
        The selected sentences in their original document order. None is
        returned when ``just_print == 1`` or when ``num_sentences`` exceeds
        the number of sentences in ``doc`` (a warning is printed instead).
    """
    if num_sentences > len(doc):
        print("Warning: Please ensure that the number of sentences for summarization are less than the sentences contained inside your document")
        return None

    vectorizer = TfidfVectorizer(stop_words='english', decode_error='ignore')
    doc_tfidf = vectorizer.fit_transform(doc)  # MxN, where M: sentences, N: words
    edges = cosine_similarity(doc_tfidf)       # MxM

    # Divide each row by its sum so that rows add up to 1. A sentence whose
    # TF-IDF vector is all zeros has a zero row sum; dividing by it would
    # fill the matrix with NaN, so such rows become a uniform distribution.
    row_sum = edges.sum(axis=1, keepdims=True)
    uniform = np.full_like(edges, 1.0 / edges.shape[0])
    A = np.divide(edges, row_sum, out=uniform, where=row_sum > 0)

    # Apply Gaussian smoothing (sigma = 1.0) along each row
    A = gaussian_filter1d(A, 1.0)

    # Solve the eigenvector problem - TextRank: A^T u = λu
    eigenvals, eigenvecs = np.linalg.eig(A.T)
    # np.linalg.eig returns eigenvalues in no particular order, so pick the
    # one closest to 1 explicitly instead of assuming it is in column 0.
    dominant = np.argmin(np.abs(eigenvals - 1.0))
    # eig may return complex arrays; the scores are the real part.
    u = np.real(eigenvecs[:, dominant])
    # u represents p(inf); rescale it so its elements add up to 1
    # (this also fixes the arbitrary sign of the eigenvector).
    scores = u / u.sum()

    # Highest score first, keep the top N, then restore document order
    sorted_indices = np.argsort(scores)[::-1]
    sum_indices = np.sort(sorted_indices[:num_sentences])
    sum_doc = [doc[index] for index in sum_indices]

    if just_print == 1:
        print_single_document(sum_doc)
        return None
    return sum_doc

In [49]:
## CHOOSE DOCUMENT IN docs
#------------------------------------------------------------
# Position in `docs` of the document that the following cells print and summarize
doc_index = 2222

In [50]:
## PRINT THE ENTIRE DOCUMENT
#------------------------------------------------------------
# Print every sentence of the selected document
print_single_document(docs[doc_index])

Be careful how you code

A new European directive could put software writers at risk of legal action, warns former programmer and technology analyst Bill Thompson.

If it gets its way, the Dutch government will conclude its presidency of the European Union by pushing through a controversial measure that has been rejected by the European Parliament, lacks majority support from national governments and will leave millions of European citizens in legal limbo and facing the possibility of court cases against them.

If the new law was about border controls, defence or even the new constitution, then our TV screens would be full of experts agonising over the impact on our daily lives.

Sadly for those who will be directly affected, the controversy concerns the patenting of computer programs, a topic that may excite the bloggers, campaigning groups and technical press but does not obsess Middle Britain.

After all, how much fuss can you generate about the Directive on the Patentability of Com

In [51]:
## PRINT THE SUMMARIZED VERSION
#------------------------------------------------------------
# Number of sentences to keep in the summary
num_sentences = 4
# just_print=1 makes the function print the summary instead of returning it
textrank_summarization(docs[doc_index], num_sentences = num_sentences, just_print=1)

Be careful how you code

A new European directive could put software writers at risk of legal action, warns former programmer and technology analyst Bill Thompson.

If coders are treated like this today, who is to say that it will not be you tomorrow?

But small companies, and the free and open software movement do not have any patents to trade.

Much of the really useful software we use every day, programs like the Apache web server, the GNU/Linux operating system and the fearsomely popular Firefox browser, is developed outside company structures by people who do not have legal departments to check for patent infringements.
