In [171]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

import os,glob
documents = {}
import requests
import datetime
from bs4 import BeautifulSoup

#Uncomment this code to read text from the folder
#folder_path = '../datasets/IDF Texts'
#for filename in glob.glob(os.path.join(folder_path, '*.txt')):
  #with open(filename, 'r') as f:
    #text = f.read()
    #documents[text[:15]]= text

url = 'https://www.cnn.com/studentnews/article/sitemap-2020-4.html'
currentDate = datetime.datetime.now().strftime("%Y-%m-%d")
limit = 2
keyword = "coronavirus"

print("********************* INPUT **********************")
print("URL: " + url)
print("Date: " + currentDate)
print("No. of Articles: " + str(limit))
print("Keywords: " + keyword)

print("********************* Extracted OUTPUT Content **********************")
code = requests.get(url)
plain = code.text
s = BeautifulSoup(plain, "html.parser")

i=0
for span in s.findAll('span', {'class':'date'}):
    if span.text == currentDate:
        if keyword in span.next_sibling.text:
            if i >= limit:
                break
            i+=1
            print(i)
            href = span.next_sibling.a.get('href')
            article_code = requests.get(href)
            article_plain = article_code.text
            article_soap = BeautifulSoup(article_plain, "html.parser")
            document = ''
            #print(article_soap)
            for span in article_soap.findAll('div', {'class':'zn-body__paragraph'}):
                document += span.text
            documents[document[:15]] = document
            print(document)
            print("**************************************************")

def _create_frequency_matrix(documents):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for document in documents:
        freq_table = {}
        words = word_tokenize(documents[document])
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[document] = freq_table

    return frequency_matrix


def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for document, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_document = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_document

        tf_matrix[document] = tf_table

    return tf_matrix


def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for document, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table


def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for document, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[document] = idf_table

    return idf_matrix


def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (doc1, f_table1), (doc2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[doc1] = tf_idf_table

    return tf_idf_matrix


def _score_sentences(tf_idf_matrix, documents) -> dict:
 
    sentenceValue = {}
    sentences = {}
    total_sentences_in_all_documents = 0
    for document, f_table in tf_idf_matrix.items():
        
        sentences_tokenized = sent_tokenize(documents[document]) # NLTK function        
        for sentence in sentences_tokenized :
            
            total_score_per_sentence = 0
            words = word_tokenize(sentence)
            count_words_in_sentence = len(words)
            for word in words:
                for search_word, score in f_table.items():
                    if search_word == word :
                        total_score_per_sentence += score
            sentenceValue[sentence[:15]] = total_score_per_sentence / count_words_in_sentence  
            sentences[sentence[:15]] = sentence
            total_sentences_in_all_documents += 1
        
    return total_sentences_in_all_documents, sentences, sentenceValue


def _find_average_score(sentenceValue) -> int:

    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average


def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += "\n \n" + sentences[sentence].replace('\n', ' ').replace('\r', '')
            sentence_count += 1

    return sentence_count, summary

********************* INPUT **********************
URL: https://www.cnn.com/studentnews/article/sitemap-2020-4.html
Date: 2020-04-25
No. of Articles: 2
Keywords: coronavirus
********************* Extracted OUTPUT Content **********************
1
In recent weeks, the Asian city-state has had a dramatic spike in coronavirus infections, with thousands of new cases linked to clusters in foreign worker dormitories. To control the spread, the government has attempted to isolate the dormitories, test workers and move symptomatic patients into quarantine facilities. But those measures have left hundreds of thousands of workers trapped in their dormitories, living cheek by jowl in cramped conditions that make social distancing near impossible.Singapore is home to about 1.4 million migrant workers who come largely from South and Southeast Asia. As housekeepers, domestic helpers, construction workers and manual laborers, these migrants are essential to keeping Singapore functioning -- but are als

**************************************************


In [172]:
 # 1 Length of documents
total_documents = len(documents)
print("Total no. of documents : " + str(total_documents))

Total no. of documents : 2


In [173]:
 # 2 Create the Frequency matrix of the words in each sentence.
freq_matrix = _create_frequency_matrix(documents)
print("Frequency Matrix : \n")
print(freq_matrix)

Frequency Matrix : 

{'In recent weeks': {'recent': 1, 'week': 1, ',': 119, 'asian': 1, 'city-st': 2, 'ha': 9, 'dramat': 1, 'spike': 3, 'coronaviru': 6, 'infect': 2, 'thousand': 2, 'new': 4, 'case': 7, 'link': 1, 'cluster': 2, 'foreign': 7, 'worker': 35, 'dormitori': 15, '.': 44, 'control': 1, 'spread': 2, 'govern': 15, 'attempt': 1, 'isol': 1, 'test': 1, 'move': 2, 'symptomat': 1, 'patient': 1, 'quarantin': 1, 'facil': 1, 'measur': 5, 'left': 3, 'hundr': 1, 'trap': 1, 'live': 8, 'cheek': 1, 'jowl': 1, 'cramp': 2, 'condit': 4, 'make': 4, 'social': 1, 'distanc': 3, 'near': 1, 'impossible.singapor': 1, 'home': 2, '1.4': 1, 'million': 2, 'migrant': 15, 'come': 1, 'larg': 1, 'south': 1, 'southeast': 1, 'asia': 1, 'housekeep': 1, 'domest': 1, 'helper': 1, 'construct': 3, 'manual': 1, 'labor': 4, 'essenti': 1, 'keep': 1, 'singapor': 18, 'function': 1, '--': 9, 'also': 7, 'lowest': 1, 'paid': 4, 'vulner': 1, 'peopl': 2, 'city.rubel': 1, 'goe': 1, 'onli': 4, 'one': 3, 'name': 1, 'came': 1, 'ba

In [174]:
# 3 Calculate TermFrequency and generate a matrix
tf_matrix = _create_tf_matrix(freq_matrix)
print("Term Frequency Matrix : \n")
print(tf_matrix)

Term Frequency Matrix : 



{'In recent weeks': {'recent': 0.0017761989342806395, 'week': 0.0017761989342806395, ',': 0.2113676731793961, 'asian': 0.0017761989342806395, 'city-st': 0.003552397868561279, 'ha': 0.015985790408525755, 'dramat': 0.0017761989342806395, 'spike': 0.0053285968028419185, 'coronaviru': 0.010657193605683837, 'infect': 0.003552397868561279, 'thousand': 0.003552397868561279, 'new': 0.007104795737122558, 'case': 0.012433392539964476, 'link': 0.0017761989342806395, 'cluster': 0.003552397868561279, 'foreign': 0.012433392539964476, 'worker': 0.06216696269982238, 'dormitori': 0.02664298401420959, '.': 0.07815275310834814, 'control': 0.0017761989342806395, 'spread': 0.003552397868561279, 'govern': 0.02664298401420959, 'attempt': 0.0017761989342806395, 'isol': 0.0017761989342806395, 'test': 0.0017761989342806395, 'move': 0.003552397868561279, 'symptomat': 0.0017761989342806395, 'patient': 0.0017761989342806395, 'quarantin': 0.0017761989342806395, 'facil': 0.0017761989342806395, 'measur': 0.0088809946

In [175]:
# 4 creating table for documents per words
count_doc_per_words = _create_documents_per_words(freq_matrix)
print("Documents per word : \n")
print(count_doc_per_words)

Documents per word : 

{'recent': 2, 'week': 1, ',': 2, 'asian': 1, 'city-st': 1, 'ha': 2, 'dramat': 1, 'spike': 2, 'coronaviru': 2, 'infect': 2, 'thousand': 1, 'new': 2, 'case': 2, 'link': 1, 'cluster': 1, 'foreign': 1, 'worker': 1, 'dormitori': 1, '.': 2, 'control': 2, 'spread': 2, 'govern': 1, 'attempt': 1, 'isol': 2, 'test': 2, 'move': 2, 'symptomat': 1, 'patient': 2, 'quarantin': 1, 'facil': 1, 'measur': 2, 'left': 1, 'hundr': 2, 'trap': 1, 'live': 2, 'cheek': 1, 'jowl': 1, 'cramp': 1, 'condit': 1, 'make': 2, 'social': 1, 'distanc': 1, 'near': 1, 'impossible.singapor': 1, 'home': 1, '1.4': 1, 'million': 2, 'migrant': 1, 'come': 2, 'larg': 2, 'south': 1, 'southeast': 1, 'asia': 1, 'housekeep': 1, 'domest': 1, 'helper': 1, 'construct': 1, 'manual': 1, 'labor': 1, 'essenti': 1, 'keep': 1, 'singapor': 1, 'function': 1, '--': 2, 'also': 2, 'lowest': 1, 'paid': 1, 'vulner': 1, 'peopl': 2, 'city.rubel': 1, 'goe': 1, 'onli': 2, 'one': 2, 'name': 1, 'came': 1, 'bangladesh': 1, 'six': 1, 'y

In [176]:
# 5 Calculate IDF and generate a matrix
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
print("IDF Matrix : \n")
print(idf_matrix)

IDF Matrix : 

{'In recent weeks': {'recent': 0.0, 'week': 0.3010299956639812, ',': 0.0, 'asian': 0.3010299956639812, 'city-st': 0.3010299956639812, 'ha': 0.0, 'dramat': 0.3010299956639812, 'spike': 0.0, 'coronaviru': 0.0, 'infect': 0.0, 'thousand': 0.3010299956639812, 'new': 0.0, 'case': 0.0, 'link': 0.3010299956639812, 'cluster': 0.3010299956639812, 'foreign': 0.3010299956639812, 'worker': 0.3010299956639812, 'dormitori': 0.3010299956639812, '.': 0.0, 'control': 0.0, 'spread': 0.0, 'govern': 0.3010299956639812, 'attempt': 0.3010299956639812, 'isol': 0.0, 'test': 0.0, 'move': 0.0, 'symptomat': 0.3010299956639812, 'patient': 0.0, 'quarantin': 0.3010299956639812, 'facil': 0.3010299956639812, 'measur': 0.0, 'left': 0.3010299956639812, 'hundr': 0.0, 'trap': 0.3010299956639812, 'live': 0.0, 'cheek': 0.3010299956639812, 'jowl': 0.3010299956639812, 'cramp': 0.3010299956639812, 'condit': 0.3010299956639812, 'make': 0.0, 'social': 0.3010299956639812, 'distanc': 0.3010299956639812, 'near': 0.30

In [177]:
# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
print("TF-IDF Matrix : \n")
print(tf_idf_matrix)

TF-IDF Matrix : 

{'In recent weeks': {'recent': 0.0, 'week': 0.000534689157484869, ',': 0.0, 'asian': 0.000534689157484869, 'city-st': 0.001069378314969738, 'ha': 0.0, 'dramat': 0.000534689157484869, 'spike': 0.0, 'coronaviru': 0.0, 'infect': 0.0, 'thousand': 0.001069378314969738, 'new': 0.0, 'case': 0.0, 'link': 0.000534689157484869, 'cluster': 0.001069378314969738, 'foreign': 0.0037428241023940824, 'worker': 0.01871412051197041, 'dormitori': 0.008020337362273033, '.': 0.0, 'control': 0.0, 'spread': 0.0, 'govern': 0.008020337362273033, 'attempt': 0.000534689157484869, 'isol': 0.0, 'test': 0.0, 'move': 0.0, 'symptomat': 0.000534689157484869, 'patient': 0.0, 'quarantin': 0.000534689157484869, 'facil': 0.000534689157484869, 'measur': 0.0, 'left': 0.0016040674724546067, 'hundr': 0.0, 'trap': 0.000534689157484869, 'live': 0.0, 'cheek': 0.000534689157484869, 'jowl': 0.000534689157484869, 'cramp': 0.001069378314969738, 'condit': 0.002138756629939476, 'make': 0.0, 'social': 0.000534689157484

In [178]:
# 7 Important Algorithm: score the sentences
total_sentences_in_all_documents, sentences, sentence_scores = _score_sentences(tf_idf_matrix, documents)
print("Total Sentences in all documents : " + str(total_sentences_in_all_documents))
print("Part of a Sentence and its score : \n")
print(sentence_scores)


Total Sentences in all documents : 123
Part of a Sentence and its score : 

{'In recent weeks': 0.0007743774004953274, 'To control the ': 0.0, 'But those measu': 0.00029704953193603826, 'As housekeepers': 0.00013579407174218894, 'Now, with his h': 2.673445787424345e-05, '"I\'m scared of ': 3.5061584097368456e-05, 'Since March 17,': 0.00012152026306474295, 'Even as the num': 0.0003459753371960917, 'Singapore was b': 9.435691014438865e-05, 'But much of Sin': 0.00017435516004941378, 'Without much la': 0.0001008847466952583, 'So the governme': 0.0003675987957708474, "Of the city's 5": 0.00022683782438752016, 'The cheap forei': 0.000243040526129486, 'Authorities tra': 0.0005092277690332084, 'About 200,000 w': 0.00017248037338221577, 'Government regu': 0.00022210165003217633, 'The workers, th': 0.0003804519005180798, '"When the gover': 0.00011881981277441533, '"The failure, I': 8.624018669110789e-05, '"As the number ': 1.572615169073144e-05, 'They also tried': 0.00010396733617761342, 'Zasim,

In [179]:
# 8 Find the threshold
threshold = _find_average_score(sentence_scores)
print("Calulated threshold : ")
print(threshold)

Calulated threshold : 
0.000174693109619421


In [180]:
# 9 Important Algorithm: Generate the summary
sentence_count_result, summary = _generate_summary(sentences, sentence_scores, 1 * threshold)
print("total number of sentences : ", format(total_sentences_in_all_documents), "\n" )
print("total number of sentences after Summarization : ", format(sentence_count_result), "\n")
print("Input Sentences : \n")
print(sentences)

total number of sentences :  123 

total number of sentences after Summarization :  41 

Input Sentences : 



In [181]:
print("Output Summary :")
print(result)

Output Summary :

 
Between _them_ it was more the intimacyof sisters.
 
It was MissTaylor's loss which first brought grief.
 
It was on the wedding-dayof this beloved friend that Emma first sat in mournful thoughtof any continuance.
 
The wedding over, and the bride-people gone,her father and herself were left to dine together, with no prospectof a third to cheer a long evening.
 
Her father composed himselfto sleep after dinner, as usual, and she had then only to sitand think of what she had lost.
 
The event had every promise of happiness for her friend.
 
Mr.  Westonwas a man of unexceptionable character, easy fortune, suitable age,and pleasant manners; and there was some satisfaction in consideringwith what self-denying, generous friendship she had always wishedand promoted the match; but it was a black morning's work for her.
 
She dearly loved her father, but hewas no companion for her.
 
The evil of the actual disparity in their ages (and Mr.  Woodhouse hadnot married early) wa