In [12]:
import nltk
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

from sklearn.feature_extraction.text import TfidfVectorizer

In [1]:
from scripts.clear_data import FileDeleter

In [5]:
%run scripts/setup.py

Beginning download...
Starting pariza/bbc-news-summary Download
Completed pariza/bbc-news-summary Download
Starting rmisra/news-category-dataset Download
Completed rmisra/news-category-dataset Download
Download completed!
json converted to DataFrame!
Downloading articles...
Articles downloaded!
Moving and renaming files...
Shutil task completed!


In [2]:
fd = FileDeleter()
fd.delete_all_in_directory()

Deleted directory: data/processed/test
Deleted directory: data/processed/train
Deleted directory: data/processed/val


In [25]:
nltk.download('popular')

[nltk_data]    |   Unzipping corpora\wordnet_ic.zip.
[nltk_data]    | Downloading package words to
[nltk_data]    |     C:\Users\19105\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\words.zip.
[nltk_data]    | Downloading package maxent_ne_chunker to
[nltk_data]    |     C:\Users\19105\AppData\Roaming\nltk_data...
[nltk_data]    |   Package maxent_ne_chunker is already up-to-date!
[nltk_data]    | Downloading package punkt to
[nltk_data]    |     C:\Users\19105\AppData\Roaming\nltk_data...
[nltk_data]    |   Package punkt is already up-to-date!
[nltk_data]    | Downloading package snowball_data to
[nltk_data]    |     C:\Users\19105\AppData\Roaming\nltk_data...
[nltk_data]    |   Package snowball_data is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\19105\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]   

True

In [2]:
def get_sentences(document):
    sentences = sent_tokenize(document)
    return sentences

In [3]:
def preprocess(sents):
    sentences_processed = []
    for sentence in sents:
        sentence_reduced = sentence.replace("[^a-zA-Z0-9_]", '')
        sentence_reduced = [word.lower() for word in sentence_reduced.split(' ') if word.lower() not in stopwords.words('english')]
        sentences_processed.append(' '.join(word for word in sentence_reduced))
    return sentences_processed

In [4]:
def vectorize(sentences, vectorizer_type='count'):
    if vectorizer_type == 'count':
        # Get vocabulary for entire document
        sentences = [sent.split(' ') for sent in sentences]
        all_words = list(set([word for s in sentences for word in s]))

        # Create feature vector for each sentence
        feature_vecs = []
        for sentence in sentences:
            feature_vec = [0] * len(all_words)
            for word in sentence:
                feature_vec[all_words.index(word)] += 1
            feature_vecs.append(feature_vec)
    else:
        vectorizer = TfidfVectorizer()
        feature_vecs = vectorizer.fit_transform(sentences)
        feature_vecs = feature_vecs.todense().tolist()
        
    return feature_vecs

In [5]:
def generate_adjacency_matrix(feature_vecs):
    # Create empty adjacency matrix
    adjacency_matrix = np.zeros((len(feature_vecs), len(feature_vecs)))
 
    # Populate the adjacency matrix using the similarity of all pairs of sentences
    for i in range(len(feature_vecs)):
        for j in range(len(feature_vecs)):
            if i == j: #ignore if both are the same sentence
                continue 
            adjacency_matrix[i][j] = 1 - cosine_distance(feature_vecs[1], feature_vecs[j])
    
    return adjacency_matrix

In [6]:
def summarize(sentences,adjacency_matrix,top_n):

    # Create the graph representing the document
    document_graph = nx.from_numpy_array(adjacency_matrix)

    # Apply PageRank algorithm to get centrality scores for each node/sentence
    scores = nx.pagerank(document_graph)
    scores_list = list(scores.values())

    # Sort and pick top sentences
    ranking_idx = np.argsort(scores_list)[::-1]
    ranked_sentences = [sentences[i] for i in ranking_idx]   

    summary = []
    for i in range(top_n):
        summary.append(ranked_sentences[i])

    summary = " ".join(summary)

    return summary

In [7]:
from scripts.build_features import BuildFeatures

bf = BuildFeatures()
datasets = bf.get_datasets()

In [8]:
train_data = datasets['train']

In [9]:
test_text = train_data[0]['text']

In [10]:
sentences_extracted = get_sentences(test_text)
sentences_processed = preprocess(sentences_extracted)

In [11]:
feature_vecs = vectorize(sentences_processed,vectorizer_type='count')
adjacency_matrix = generate_adjacency_matrix(feature_vecs)
summary = summarize(sentences_extracted,adjacency_matrix,top_n=5)
print(summary)

Market benchmarks in Europe and Asia fell by as much as 4% as traders tried to figure out how large Putin’s incursion would be and the scale of Western retaliation. In early trading, the FTSE 100 in London fell 2.5% to 7,311.69 as Europe awakened to news of explosions in the Ukrainian capital of Kyiv, the major city of Kharkiv and other areas. The euro fell to $1.1243 from $1.1306. India’s Sensex fell 3.4% to 55,283.65. President Joe Biden denounced the attack as “unprovoked and unjustified” and said Moscow would be held accountable, which many took to mean Washington and its allies would impose additional sanctions.


In [13]:
import evaluate

rouge = evaluate.load("rouge")

In [14]:
test_summary = train_data[0]['summary']

In [19]:
rouge_scores = rouge.compute(predictions=[summary], references=[test_summary])

In [20]:
rouge_scores

{'rouge1': 0.0588235294117647,
 'rouge2': 0.0,
 'rougeL': 0.04411764705882353,
 'rougeLsum': 0.04411764705882353}