In [3]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

import os,glob
# Corpus to summarize: maps a short key (the first 15 characters of a text)
# to the full document text. Filled in by the loading/scraping code below.
documents = {}
# Alternative, currently disabled input source: read every .txt file from a
# local folder and register it under the first 15 characters of its content.
#folder_path = '../datasets/IDF Texts'
#for filename in glob.glob(os.path.join(folder_path, '*.txt')):
#  with open(filename, 'r') as f:
#    text = f.read()
#    documents[text[:15]]= text
import requests
from bs4 import BeautifulSoup

# Download the CNN student-news sitemap for April 2020 and store the body text
# of up to `limit` articles dated 2020-04-19 whose link text mentions
# "coronavirus". Each article is registered in `documents` under the first
# 15 characters of its text.
url = 'https://www.cnn.com/studentnews/article/sitemap-2020-4.html'
# Without a timeout, requests waits forever on an unresponsive server.
code = requests.get(url, timeout=30)
plain = code.text
s = BeautifulSoup(plain, "html.parser")
limit = 1
i=0
for span in s.find_all('span', {'class':'date'}):
    if span.text == '2020-04-19':
        if "coronavirus" in span.next_sibling.text:
            # Stop as soon as one more matching article than allowed shows up.
            if i >= limit:
                break
            i+=1
            print(i)
            href = span.next_sibling.a.get('href')
            article_code = requests.get(href, timeout=30)
            article_plain = article_code.text
            article_soap = BeautifulSoup(article_plain, "html.parser")
            #print(article_soap)
            # Use a distinct loop name so the outer `span` is not overwritten,
            # and join once instead of growing the string piece by piece.
            document = ''.join(
                paragraph.text
                for paragraph in article_soap.find_all('div', {'class':'zn-body__paragraph'})
            )
            documents[document[:15]] = document
            print(document)
            print("**************************************************")

def _create_frequency_matrix(documents):
    """Count the stemmed, non-stopword tokens of every document.

    Args:
        documents: mapping of document key -> document text.

    Returns:
        dict mapping each document key to a ``{stem: occurrences}`` table.
        Tokens are lower-cased and stemmed with NLTK's ``PorterStemmer``;
        English stopwords are left out.
    """
    frequency_matrix = {}
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    for doc_key, text in documents.items():
        freq_table = {}
        for token in word_tokenize(text):
            token = token.lower()
            # The stopword list holds unstemmed forms, so the check has to
            # happen before stemming; otherwise a word such as "this"
            # (stem "thi") is never recognised as a stopword.
            if token in stop_words:
                continue

            stem = stemmer.stem(token)
            # Keep the original post-stemming check too, so every token that
            # used to be dropped is still dropped.
            if stem in stop_words:
                continue

            freq_table[stem] = freq_table.get(stem, 0) + 1

        frequency_matrix[doc_key] = freq_table

    return frequency_matrix


def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for document, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_document = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_document

        tf_matrix[document] = tf_table

    return tf_matrix


def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for document, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table


def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for document, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[document] = idf_table

    return idf_matrix


def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (doc1, f_table1), (doc2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[doc1] = tf_idf_table

    return tf_idf_matrix


def _score_sentences(tf_idf_matrix, documents) -> tuple:
    """Score every sentence by the average TF-IDF weight of its tokens.

    Args:
        tf_idf_matrix: mapping of document key -> ``{term: tf-idf weight}``.
            Terms are expected in the lower-cased, stemmed form produced by
            ``_create_frequency_matrix``.
        documents: mapping of document key -> document text.

    Returns:
        A 3-tuple ``(total_sentences, sentences, sentenceValue)``:
        the number of sentences scored, a dict from sentence key to the full
        sentence, and a dict from sentence key to its score. The sentence key
        is the first 15 characters of the sentence.

    NOTE(review): sentences sharing their first 15 characters overwrite each
    other in both dicts while still being counted in the total. The key
    scheme is kept because ``_generate_summary`` looks scores up with it.
    """
    sentenceValue = {}
    sentences = {}
    total_sentences_in_all_documents = 0
    stemmer = PorterStemmer()

    for document, f_table in tf_idf_matrix.items():
        for sentence in sent_tokenize(documents[document]):  # NLTK function
            words = word_tokenize(sentence)
            # Normalize each token the same way the table keys were built
            # (lower-case, then stem); raw tokens would rarely match. A dict
            # lookup replaces the scan over the whole table.
            total_score_per_sentence = sum(
                f_table.get(stemmer.stem(word.lower()), 0) for word in words
            )

            key = sentence[:15]
            # Guard against a sentence that tokenizes to nothing.
            sentenceValue[key] = total_score_per_sentence / len(words) if words else 0
            sentences[key] = sentence
            total_sentences_in_all_documents += 1

    return total_sentences_in_all_documents, sentences, sentenceValue


def _find_average_score(sentenceValue) -> int:

    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average


def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += "\n \n" + sentences[sentence].replace('\n', ' ').replace('\r', '')
            sentence_count += 1

    return sentence_count, summary


def run_summarization(documents):
    """Run the whole TF-IDF pipeline on ``documents`` and build a summary.

    Args:
        documents: mapping of document key -> document text.

    Returns:
        A 3-tuple ``(total_sentences, kept_sentences, summary)``: the number
        of sentences scored across all documents, the number that made it
        into the summary, and the summary text itself.
    """
    doc_total = len(documents)

    # Term counts per document, and the term frequencies derived from them.
    counts_by_doc = _create_frequency_matrix(documents)
    tf_by_doc = _create_tf_matrix(counts_by_doc)

    # Inverse document frequency, based on how many documents hold each term.
    docs_per_term = _create_documents_per_words(counts_by_doc)
    idf_by_doc = _create_idf_matrix(counts_by_doc, docs_per_term, doc_total)

    # Combine both into the TF-IDF weights used for scoring.
    weights_by_doc = _create_tf_idf_matrix(tf_by_doc, idf_by_doc)

    # Score the sentences; the average score serves as the cut-off.
    sentence_total, sentence_lookup, scores = _score_sentences(weights_by_doc, documents)
    cutoff = _find_average_score(scores)

    # Keep every sentence scoring at least 1x the average.
    kept, summary = _generate_summary(sentence_lookup, scores, 1 * cutoff)
    return sentence_total, kept, summary


if __name__ == '__main__':
    # Summarize the collected documents, then report the sentence counts
    # before and after summarization followed by the summary itself.
    total_sentences_in_all_documents, sentence_count_result, result = run_summarization(documents)
    for label, value in (
        ("total number of sentences : ", total_sentences_in_all_documents),
        ("total number of sentences after Summarization : ", sentence_count_result),
    ):
        print(label, format(value), "\n")
    print(result)

1
Mount Neboh Baptist Church, a fixture in the cultural center of black America, has lost 11 parishioners in the last month, nine to Covid-19, according to Green and church members. Two died of natural causes. "We deal with death all the time but we've never had to deal with a succession of death like now," said Green, who has been ministering to his flock via Facebook Live and Zoom from the dining room of his New Jersey home. "It was as if every other day I was getting a call that another parishioner had passed."Even after four decades in the ministry, the experience overwhelms Green. The mounting death toll leaves little time for proper grieving.  "We see a lot of violence," Green said via Zoom. "We see gang activity from time to time. I've had to preside over the funerals of kids who were literally killed outside the doors of the church. But we've never seen anything like this."The pandemic has hit black Americans especially hard. It has fallen on Green's close-knit congregation wit