In [15]:
# Document Summarization
# Sample biography text used as the input document for the summarization
# steps that follow (it is split into sentences with nltk.sent_tokenize).
# NOTE(review): the second line of the text has no terminating period, and
# the normalized output shows it merged with the following line into a
# single sentence -- confirm whether that is intended.
toy_text = """
A doctor in statistics from Osmania University, Venugopala Rao Manneni is an experienced data analyst who has over 15 years of work experience in a diverse areas  of verticals such as manufacturing, service, media, telecom,  retail,  pharma  and  education.  
Prior  to  Juxt-Smart  Mandate,  he  has  worked  with  reputed organizations like TNS India (Kantar, WPP) and NFO MBL and served clients across UK, France and Asia Pacific region
Venu’s  primary  responsibility  is  to  architecting  the  solutions  for  the  data  driven  problems  using  statistical methods, Machine Learning and deep learning algorithms for both structured and unstructured data.  
Venu  holds  an  Phd & M.Phil  from  Osmania  University,  Hyderabad.  The  focus  of  his  PhD  (2012)was  on  AI& Machine Learning  Methods  titled  “Neural  network  applications  in  classification  problems”  and  his  MPhil (2008)  thesis focused  on  Multivariate  analysis  applications  in  business  and  research  titled  “Evolution  of  business organization  using  multivariate  techniques”.  
He  was  the  topper  in  MSc.  Statistics Program  in  2003  at  the University. He was adjudged the best Quantum programmer in the year 2004 in TNS India and has over 10 publications in prestigious national/international journals on Hybrid analytical approaches in data mining and forecasting. 
"""


In [16]:
import nltk

# Split the raw text into a list of sentences with NLTK's sentence tokenizer.
sentences = nltk.sent_tokenize(toy_text)
# Display how many sentences were detected.
len(sentences)

7

In [17]:
import numpy as np
import re

# English stop words from NLTK's stopwords corpus; used by normalize_document
# to filter tokens.
# NOTE(review): presumably requires the 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) -- confirm for fresh environments.
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize a single document (here: one sentence) for vectorization.

    The text is stripped of every character that is not an ASCII letter or
    whitespace, lower-cased, trimmed, tokenized with NLTK and filtered
    against the module-level ``stop_words`` list.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    # Remove special characters and digits. The original call passed
    # `re.I|re.A` as the 4th positional argument of re.sub, which is
    # `count`, not `flags`: no flag was ever applied and replacements were
    # silently capped at 258. The explicit a-zA-Z class needs no flags, so
    # the stray argument is dropped and all matches are now replaced.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

# Element-wise wrapper so normalize_document can be applied to a whole
# list/array of documents in one call.
normalize_corpus = np.vectorize(normalize_document)

In [18]:
# Normalize every sentence (special characters removed, lower-cased,
# stop words dropped).
norm_sentences = normalize_corpus(sentences)
# Display the normalized sentences.
norm_sentences

array(['doctor statistics osmania university venugopala rao manneni experienced data analyst years work experience diverse areas verticals manufacturing service media telecom retail pharma education',
       'prior juxtsmart mandate worked reputed organizations like tns india kantar wpp nfo mbl served clients across uk france asia pacific region venus primary responsibility architecting solutions data driven problems using statistical methods machine learning deep learning algorithms structured unstructured data',
       'venu holds phd mphil osmania university hyderabad',
       'focus phd ai machine learning methods titled neural network applications classification problems mphil thesis focused multivariate analysis applications business research titled evolution business organization using multivariate techniques',
       'topper msc', 'statistics program university',
       'adjudged best quantum programmer year tns india publications prestigious nationalinternational journals hybr

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF representation of the normalized sentences
# (one row per sentence, one column per vocabulary term).
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_sentences)
tv_matrix = tv_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back for older releases.
# The result is kept as a plain list either way.
if hasattr(tv, 'get_feature_names_out'):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()
# Transpose into a term-document matrix (rows = terms, columns = sentences).
td_matrix = tv_matrix.transpose()
# Zero out any non-positive weights.
td_matrix = np.multiply(td_matrix, td_matrix > 0)
td_matrix.shape

(98, 7)

In [20]:
# Vocabulary size; matches the number of rows of the term-document matrix.
len(vocab)

98

In [21]:
from scipy.sparse.linalg import svds
    
def low_rank_svd(matrix, singular_count=2):
    """Compute a truncated singular value decomposition of ``matrix``.

    Thin wrapper around ``scipy.sparse.linalg.svds``.

    Parameters
    ----------
    matrix : array_like or sparse matrix
        Matrix to decompose.
    singular_count : int, optional
        Number of singular values/vectors to compute (forwarded as ``k``).

    Returns
    -------
    tuple
        ``(u, s, vt)`` exactly as produced by ``svds``.
    """
    left_vectors, singular_values, right_vectors_t = svds(matrix, k=singular_count)
    return left_vectors, singular_values, right_vectors_t

In [28]:
# Summary length and number of latent topics kept by the truncated SVD.
num_sentences = 3
num_topics = 2

# Decompose the term-document matrix; vt has one column per sentence.
u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)

# Discard singular values smaller than half of the largest one.
sv_threshold = 0.5
min_sigma_value = s.max() * sv_threshold
s = np.where(s < min_sigma_value, 0, s)

# Salience of each sentence: square root of the sum over topics of
# (singular value squared) * (sentence weight squared).
salience_scores = np.sqrt(np.square(s).dot(np.square(vt)))
print(np.round(salience_scores, 2))

[0.64 0.74 0.62 0.61 0.   0.71 0.47]


In [26]:
# Pick the positions of the num_sentences highest salience scores and put
# them in ascending (document) order.
top_sentence_indices = np.sort(np.argsort(salience_scores)[-num_sentences:])

In [27]:
# Print the selected sentences, one per line, as the summary.
print('\n'.join(sentences[idx] for idx in top_sentence_indices))

Prior  to  Juxt-Smart  Mandate,  he  has  worked  with  reputed organizations like TNS India (Kantar, WPP) and NFO MBL and served clients across UK, France and Asia Pacific region
Venu’s  primary  responsibility  is  to  architecting  the  solutions  for  the  data  driven  problems  using  statistical methods, Machine Learning and deep learning algorithms for both structured and unstructured data.
