# Document Summarization

In [1]:
# Sample passage used as the input document for the summarization steps below.
# The text is kept exactly as written (line breaks and trailing spaces included)
# because it is fed verbatim to the sentence tokenizer.
toy_text = """
Elephants are large mammals of the family Elephantidae 
and the order Proboscidea. Two species are traditionally recognised, 
the African elephant and the Asian elephant. Elephants are scattered 
throughout sub-Saharan Africa, South Asia, and Southeast Asia. Male 
African elephants are the largest extant terrestrial animals. All 
elephants have a long trunk used for many purposes, 
particularly breathing, lifting water and grasping objects. Their 
incisors grow into tusks, which can serve as weapons and as tools 
for moving objects and digging. Elephants' large ear flaps help 
to control their body temperature. Their pillar-like legs can 
carry their great weight. African elephants have larger ears 
and concave backs while Asian elephants have smaller ears 
and convex or level backs.  
"""

In [2]:
import nltk

# Split the raw passage into a list of sentences with NLTK's sentence tokenizer.
sentences = nltk.sent_tokenize(toy_text)
# Display how many sentences were found.
len(sentences)

9

In [3]:
import numpy as np
import re

# English stop-word list from NLTK's stopwords corpus; normalize_document
# consults it when filtering tokens.
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize a single document for bag-of-words style vectorization.

    The text is stripped of every character that is not an ASCII letter or
    whitespace, lower-cased, trimmed, tokenized with ``nltk.word_tokenize``
    and filtered against the module-level ``stop_words`` list.

    Parameters
    ----------
    doc : str
        Raw text of one document (here: one sentence).

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Remove special characters. The flags MUST be passed by keyword: the
    # fourth positional parameter of re.sub is `count`, so passing
    # `re.I|re.A` positionally applied no flags at all and silently capped
    # the number of substitutions at int(re.I | re.A) == 258.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    # lower case and trim surrounding whitespace
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

# Element-wise wrapper so normalize_document can be applied to a whole
# sequence of documents in one call; the result comes back as a NumPy array.
normalize_corpus = np.vectorize(normalize_document)

In [4]:
# Normalize every sentence (special characters removed, lower-cased,
# stop words dropped) and display the resulting array.
norm_sentences = normalize_corpus(sentences)
norm_sentences

array(['elephants large mammals family elephantidae order proboscidea',
       'two species traditionally recognised african elephant asian elephant',
       'elephants scattered throughout subsaharan africa south asia southeast asia',
       'male african elephants largest extant terrestrial animals',
       'elephants long trunk used many purposes particularly breathing lifting water grasping objects',
       'incisors grow tusks serve weapons tools moving objects digging',
       'elephants large ear flaps help control body temperature',
       'pillarlike legs carry great weight',
       'african elephants larger ears concave backs asian elephants smaller ears convex level backs'],
      dtype='<U93')

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF weights with one row per normalized sentence. The min_df/max_df
# bounds are left fully open so no term is dropped by document frequency.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_sentences)
tv_matrix = tv_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `vocab` a plain Python list, as it was before.
if hasattr(tv, 'get_feature_names_out'):
    vocab = tv.get_feature_names_out().tolist()
else:
    vocab = tv.get_feature_names()

# Transpose to a term-by-sentence matrix (rows = terms, columns = sentences).
td_matrix = tv_matrix.transpose()
# Keep only strictly positive weights; anything else is zeroed.
td_matrix = np.multiply(td_matrix, td_matrix > 0)
td_matrix.shape

(63, 9)

In [6]:
from scipy.sparse.linalg import svds
    
def low_rank_svd(matrix, singular_count=2):
    """Compute a truncated singular value decomposition of ``matrix``.

    Thin wrapper around ``scipy.sparse.linalg.svds`` that requests
    ``singular_count`` singular triplets.

    Parameters
    ----------
    matrix : array-like or sparse matrix
        The matrix to decompose.
    singular_count : int, optional
        Number of singular values/vectors to compute (default 2).

    Returns
    -------
    tuple
        ``(u, s, vt)`` exactly as produced by ``svds``.
    """
    return svds(matrix, k=singular_count)

In [7]:
# Summary length and number of latent topics kept by the truncated SVD.
num_sentences, num_topics = 3, 2
# Fraction of the largest singular value below which a topic is discarded.
sv_threshold = 0.5

u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)

# Zero out singular values that fall under the threshold.
min_sigma_value = s.max() * sv_threshold
s = np.where(s < min_sigma_value, 0, s)

# Salience of sentence i: sqrt(sum_k s[k]^2 * vt[k, i]^2).
salience_scores = np.sqrt(np.square(s).dot(np.square(vt)))
print(np.round(salience_scores, 2))

[0.69 0.68 0.29 0.6  0.33 0.14 0.68 0.   0.66]


In [8]:
# Pick the positions of the num_sentences highest salience scores, then put
# them back into ascending (document) order.
top_sentence_indices = np.sort(salience_scores.argsort()[-num_sentences:])

In [9]:
# Emit the selected sentences, one per line, in document order.
print('\n'.join(sentences[idx] for idx in top_sentence_indices))


Elephants are large mammals of the family Elephantidae 
and the order Proboscidea.
Two species are traditionally recognised, 
the African elephant and the Asian elephant.
Elephants' large ear flaps help 
to control their body temperature.
