In [1]:
# Importing Libraries
import re
# Importing Spacy English Library
import en_core_web_md
# Importing Spacy
import spacy
# Importing Stop Words
from spacy.lang.en.stop_words import STOP_WORDS
# Importing Count Vectorizer from SKLEARN
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Opening File
file = open('input_text.txt','r',encoding = 'utf-8')
# Reading File Contents
file_lines = file.readlines()
# Closing Connection to File
file.close()

In [3]:
file_lines

['The earliest reports of an illness caused by a coronavirus occurred in the late 1920s, when an acute respiratory infection of domesticated chickens emerged in North America.[15][16] Arthur Schalk and M.C. Hawn in 1931 made the first detailed report which described a new respiratory infection of chickens in North Dakota. The infection of new-born chicks was characterized by gasping and listlessness with high mortality rates of 40â€“90%.[17] Leland David Bushnell and Carl Alfred Brandly isolated the virus in 1933.[18] The virus was then known as infectious bronchitis virus (IBV). Charles D. Hudson and Fred Robert Beaudette cultivated the virus for the first time in 1937.[19] The specimen came to be known as the Beaudette strain. In the late 1940s, two more animal coronaviruses, JHM that causes brain disease (murine encephalitis) and mouse hepatitis virus (MHV) that causes hepatitis in mice were discovered.[20] It was not realized at the time that these three different viruses were rela

In [4]:
index = 0
# Cleaning The File Content Line By Line
while index < len(file_lines):
    if file_lines[index] == '\n':
        file_lines.pop(index)
        continue
        
    # RE To Remove [] Wikipedia Annotations & New Line Special Character
    file_lines[index] = re.sub('[[0-9]+]','',file_lines[index]).replace('\n','').strip()
    index = index + 1
# Combining Seperate Lines into Single Paragraph
paragraph = ''.join(file_lines)

  if __name__ == '__main__':


In [5]:
# Loading the Spacy English MD Library
nlp = en_core_web_md.load()
# Creating Spacy Doc
doc = nlp(paragraph)

In [6]:
# Building Corpus
corpus = []

for s in doc.sents:
    corpus.append(s.text.lower())
print('Number of Sentences in Corpus : ' + str(len(corpus)))

Number of Sentences in Corpus : 30


In [7]:
# Applying Count Vectorizer To Corpus
cv = CountVectorizer(stop_words=list(STOP_WORDS))
cv_fit = cv.fit_transform(corpus)

In [8]:
# Getting Word List from Count Vectorizer
word_list = cv.get_feature_names()

# Getting Sum of Word Occurences
word_counts = cv_fit.toarray().sum(axis=0)

# Creating Dictionary for Words and Occurences
word_freq_dict = dict(zip(word_list, word_counts))

In [9]:
# Sorting Word Counts in Descending Order
sort_word_counts = sorted(word_freq_dict.values(), reverse=True)
# Getting Higher Freqeuncy Words
higher_frequency_words = []
# Words With Top 5 Frequency are considered as Higher Frequency Words
for word, freq in word_freq_dict.items():
    if freq in sort_word_counts[0:5]:
        higher_frequency_words.append(word)

In [10]:
print('Most Frequently Occurring Words In Text')
for word in higher_frequency_words:
    print(word)

Most Frequently Occurring Words In Text
b814
cold
coronavirus
coronaviruses
human
new
novel
virus
viruses


In [11]:
# Scaling The Frequency Of Words With Relative To Highest Frequency
for word in word_freq_dict.keys():
    word_freq_dict[word] = word_freq_dict[word]/sort_word_counts[0]

In [12]:
# Assigning Ranks To Sentences
sentence_ranks = {}

for sentence in doc.sents:
    for word in sentence:
        if word.text.lower() in word_freq_dict.keys():
            if sentence in sentence_ranks.keys():
                sentence_ranks[sentence] += word_freq_dict[word.text.lower()]
            else:
                sentence_ranks[sentence] = word_freq_dict[word.text.lower()]

# Sorting Sentences According To Ranks
sorted_sentences = sorted(sentence_ranks.items(), key=lambda x: x[1], reverse=True)

In [13]:
number_of_sentences = 10
final_para = ""

for i in range(0, number_of_sentences):
    sentence_tuple = sorted_sentences[i]
    final_para = final_para + sentence_tuple[0].text

In [14]:
final_para

'The novel virus caused a cold in volunteers and was inactivated by ether similarly as B814.Scottish virologist June Almeida at St. Thomas Hospital in London, collaborating with Tyrrell, compared the structures of IBV, B814 and 229E in 1967.E.C. Kendall, Malcolm Bynoe, and David Tyrrell working at the Common Cold Unit of the British Medical Research Council collected a unique common cold virus designated B814 in 1961.The IBV-like novel cold viruses were soon shown to be also morphologically related to the mouse hepatitis virus.The virus could not be cultivated using standard techniques which had successfully cultivated rhinoviruses, adenoviruses and other known common cold viruses.In 1965, Tyrrell and Bynoe successfully cultivated the novel virus by serially passing it through organ culture of human embryonic trachea.the novel cold virus OC43 had distinctive club-like spikes when observed with the electron microscope.In the late 1940s, two more animal coronaviruses, JHM that causes bra