<a href="https://colab.research.google.com/github/dxiong2001/malibu-ml/blob/main/textrank_function_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
import nltk.tokenize.texttiling as tt
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
# nltk.download('gutenberg')

# NLTK resources needed by this notebook, as {download package: lookup path}.
# Each one is checked on its own so only the missing packages are downloaded,
# and only LookupError (what nltk.data.find raises for a missing resource) is
# treated as "missing" -- other errors are no longer silently swallowed.
_NLTK_RESOURCES = {
  'punkt': 'tokenizers/punkt',
  'stopwords': 'corpora/stopwords',
  'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
}
for _nltk_package, _nltk_path in _NLTK_RESOURCES.items():
  try:
    nltk.data.find(_nltk_path)
  except LookupError:
    nltk.download(_nltk_package)
# nltk.download('brown')

# Shared spaCy pipeline used by the later cells.
nlp = spacy.load("en_core_web_sm")
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [17]:
def textrank(text, top_n=4):
  """Pick the highest-scoring sentences of ``text`` with a TextRank-style graph.

  Sentences are lower-cased, stripped of punctuation and English stop words,
  embedded with a Word2Vec model trained on the text itself, compared with
  cosine similarity and scored with PageRank over the similarity graph.

  Args:
    text: The document to summarise.
    top_n: Maximum number of sentences to return (default 4, the value that
      used to be hard-coded).

  Returns:
    A list of at most ``top_n`` sentences, in their original document order,
    with ``[n] ``-style bracketed markers removed. Empty or whitespace-only
    input returns ``[]``.
  """
  # Nothing to rank; bail out before touching the tokenizer or Word2Vec.
  if not text or not text.strip():
    return []

  # Preprocess text
  sentences = sent_tokenize(text)
  if not sentences:
    return []
  sentences_clean = [re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in sentences]
  stop_words = set(stopwords.words('english'))
  # split() (rather than split(' ')) drops empty tokens and does not leave
  # newlines glued inside a token.
  sentence_tokens = [
      [word for word in sentence.split() if word not in stop_words]
      for sentence in sentences_clean
  ]

  def _strip_markers(sentence):
    # Remove bracketed numeric markers such as "[12] ".
    return re.sub(r'\[\d*\] ', '', sentence)

  # Every sentence consisted only of stop words/punctuation: there is no
  # vocabulary to train Word2Vec on, so fall back to the leading sentences.
  if not any(sentence_tokens):
    return [_strip_markers(sentence) for sentence in sentences[:max(top_n, 0)]]

  # NOTE(review): `size` / `iter` are the keyword names of the gensim version
  # this notebook was written against; newer gensim releases rename them --
  # confirm against the pinned version before upgrading.
  w2v = Word2Vec(sentence_tokens, size=128, min_count=1, iter=100)
  # Each word contributes only the first component of its vector; a sentence
  # is therefore a sequence of one number per token, zero-padded to max_len.
  sentence_embeddings = [[w2v.wv[word][0] for word in words] for words in sentence_tokens]
  max_len = max(len(tokens) for tokens in sentence_tokens)
  sentence_embeddings = [
      np.pad(embedding, (0, max_len - len(embedding)), 'constant')
      for embedding in sentence_embeddings
  ]

  similarity_matrix = np.zeros([len(sentence_tokens), len(sentence_tokens)])
  for i, row_embedding in enumerate(sentence_embeddings):
    for j, column_embedding in enumerate(sentence_embeddings):
      similarity_matrix[i][j] = 1 - spatial.distance.cosine(row_embedding, column_embedding)
  # A sentence with no remaining tokens is an all-zero vector, for which the
  # cosine distance may be NaN; treat that as "no similarity".
  similarity_matrix = np.nan_to_num(similarity_matrix)

  nx_graph = nx.from_numpy_array(similarity_matrix)
  scores = nx.pagerank_numpy(nx_graph)

  # Rank by sentence index, not by sentence text, so repeated sentences keep
  # their own scores and the result never exceeds top_n entries.
  top_indices = sorted(nlargest(top_n, range(len(sentences)), key=scores.get))
  return [_strip_markers(sentences[index]) for index in top_indices]

In [3]:
def summarize(text, per, nlp_model=None):
    """Frequency-based extractive summary of ``text``.

    Words (case-insensitively, ignoring stop words, punctuation and
    whitespace tokens) are weighted by their frequency relative to the most
    frequent word; each sentence is scored by the sum of its word weights and
    the best-scoring sentences are returned.

    Args:
        text: The document to summarise.
        per: Fraction of the sentences to keep; ``int(n_sentences * per)``
            sentences are selected.
        nlp_model: Optional already-loaded spaCy pipeline. When omitted,
            ``en_core_web_sm`` is loaded on each call as before.

    Returns:
        The selected sentences, highest score first, joined by single spaces.
        Returns ``''`` when ``text`` is empty or contains no scorable words.
    """
    # Nothing to summarise; skip loading the model altogether.
    if not text or not text.strip():
        return ''

    if nlp_model is None:
        nlp_model = spacy.load('en_core_web_sm')
    doc = nlp_model(text)

    # Count words under their lower-cased form so the lookup below (which also
    # lower-cases) actually finds capitalised words.
    word_frequencies = {}
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        lowered = token.text.lower()
        if lowered in STOP_WORDS or lowered in punctuation:
            continue
        word_frequencies[lowered] = word_frequencies.get(lowered, 0) + 1

    if not word_frequencies:
        return ''

    # Normalise counts to the range (0, 1] relative to the most frequent word.
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_frequency

    sentence_tokens = list(doc.sents)
    sentence_scores = {}
    for sent in sentence_tokens:
        for token in sent:
            weight = word_frequencies.get(token.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    select_length = int(len(sentence_tokens) * per)
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    # Join with a space: a sentence's text carries no trailing whitespace, so
    # joining with '' would glue consecutive sentences together.
    return ' '.join(sent.text.strip() for sent in best_sentences)



In [4]:

class Summarizer:
  """Sections a document with a text tiler and summarises each section via ``textrank``."""

  def __init__(self, texttiler):
    # Object whose ``tokenize(text)`` method is used by ``texttile``.
    self.texttiler = texttiler

  def texttile(self, text):
    """Return whatever ``self.texttiler.tokenize(text)`` produces."""
    return self.texttiler.tokenize(text)

  def generate(self, tokenized, top=1, percentage=0.2):
    """Build a per-section summary.

    For every entry of ``tokenized`` the sentences returned by ``textrank``
    are taken in order, at most ``top`` of them, stopping early once the
    accumulated sentence length exceeds ``percentage`` of the section length
    (the sentence that crosses the threshold is still kept).

    Returns:
      ``(tokenized, summaries)`` where ``summaries`` maps ``"section N"``
      (N starting at 1) to the list of chosen sentences with ``"\\n\\n"``
      removed.
    """
    summaries = {}
    for position in range(len(tokenized)):
      section_body = tokenized[position]
      candidates = textrank(section_body)
      limit = min(top, len(candidates))
      section_size = len(section_body)

      chosen = []
      chars_used = 0
      for candidate in candidates[:max(limit, 0)]:
        chosen.append(candidate.replace("\n\n", ""))
        # The budget is measured on the sentence as returned, before cleanup.
        chars_used += len(candidate)
        if chars_used / section_size > percentage:
          break

      summaries["section " + str(position + 1)] = chosen
    return tokenized, summaries

  def process(self, raw_text):
    """Concatenate the items of ``raw_text``, dropping newlines and appending a space after each."""
    return "".join(chunk.replace("\n", "") + " " for chunk in raw_text)



In [22]:
#Example
import time
text='''Preliminary clinical data from South Africa suggest that the COVID-19 Omicron variant may be less dangerous than initially feared. 

The South African Medical Research Council posted a report on Saturday outlining early Omicron cases from several hospitals. They found that patients infected with the Omicron strain tended to have less severe disease—most of the hospitalized patients did not need supplemental oxygen, a few developed COVID pneumonia, and fewer still required intensive care. 

"The signals are a bit encouraging" but Omicron's risk profile is still incomplete, Anthony Fauci, head of the National Institute of Allergy and Infectious Diseases, told CNN on Sunday. More clinical data from around the world will need to be assessed for epidemiologists to get a true sense of where Omicron stands in relation to other variants. "Clearly, in South Africa, Omicron has a transmission advantage," Fauci added, "although it's too early to make any definitive statements about it, thus far it does not look like there's a great degree of severity to it."

The South African report is based on limited data, however. It included just 166 patients, most of whom were admitted to hospitals for reasons unrelated to COVID-19—their infections were only found because those hospitals are testing all incoming patients for the coronavirus. Many did not have respiratory symptoms and stayed at the hospital for a relatively short time. Vaccination status was not reported for all of the patients.

"We're just not seeing the number of patients that have been seen in previous surges who are seriously ill, even this soon into the surge," Michael Osterholm, director of the University of Minnesota's Center for Infectious Diseases Research and Policy, told STAT, adding that he is "impressed by the relative lack of severe illness" with Omicron so far. 



Data from the South African report show that the patients included tended to skew younger, which the report's authors write could be a vaccination effect, "as 57 percent of people over the age of 50 have been vaccinated in the province compared to 34 percent in the 18-to-49-year group."

Although Omicron appears to produce milder disease, scientists will need to study the effects of Omicron in different demographics, Marion Koopmans, head of virology at Erasmus Medical Center in Rotterdam, the Netherlands, told STAT. "Only once we have seen this spread across age groups, we will be able to tell," she said.

Omicron has been detected in at least 16 states and at least 45 nations worldwide. Nevertheless, the Delta variant still seems to be the primary cause for concern, making up the majority of severe cases in the US—despite Omicron's high transmissibility.

Omicron may have picked up genetic materials from the virus that causes the common cold in humans, resulting in the variant's high contagiousness. A preprint, non-peer-reviewed study from a Massachusetts-based firm says that Omicron's genetic code contains a sequence shared with the cold virus—a sequence that no other coronavirus variant seems to contain.

"By virtue of Omicron adopting this insertion ... it is essentially taking a leaf out of the seasonal coronaviruses' page, which [explains] ... how it lives and transmits more efficiently with human beings," Venky Soundararajan, a biological engineer at Nference who co-wrote the study, told The Washington Post.
'''



# Section the example article with NLTK's TextTiling tokenizer.
texttiler = tt.TextTilingTokenizer(w=30, k=40)

s1 = Summarizer(texttiler)
# #start_time = time.time()
tokenized = s1.texttile(text)
print(tokenized)

# Wrap every section in its own one-element list (only printed here).
processed_tokenized = [[section] for section in tokenized]
print(processed_tokenized)

# Summarise the whole article as one single section, keeping up to 3 sentences.
raw_text1 = s1.generate([text], 3)
# #print("--- %s seconds ---" % (time.time() - start_time))

#raw_text2 = s1.generate(text, 2)
#print(s1.process(raw_text1))
#print(s1.process(raw_text2))




['Preliminary clinical data from South Africa suggest that the COVID-19 Omicron variant may be less dangerous than initially feared. \n\nThe South African Medical Research Council posted a report on Saturday outlining early Omicron cases from several hospitals. They found that patients infected with the Omicron strain tended to have less severe disease—most of the hospitalized patients did not need supplemental oxygen, a few developed COVID pneumonia, and fewer still required intensive care.', ' \n\n"The signals are a bit encouraging" but Omicron\'s risk profile is still incomplete, Anthony Fauci, head of the National Institute of Allergy and Infectious Diseases, told CNN on Sunday. More clinical data from around the world will need to be assessed for epidemiologists to get a true sense of where Omicron stands in relation to other variants. "Clearly, in South Africa, Omicron has a transmission advantage," Fauci added, "although it\'s too early to make any definitive statements about it

  # This is added back by InteractiveShellApp.init_path()


In [23]:

# Show the per-section summaries and how many sections there are.
print(raw_text1[1])
print(len(raw_text1[1]))

# Run the spaCy pipeline over the first summary sentence of section 1.
sent_nlp = nlp(raw_text1[1]['section 1'][0])

# (text, label) pair for every entity spaCy found in that sentence.
named_entities = [(entity.text, entity.label_) for entity in sent_nlp.ents]

# Drop the section when its first sentence contains no named entity.
if len(named_entities) == 0:
  raw_text1[1].pop('section 1')

print(raw_text1[1])

{'section 1': ['They found that patients infected with the Omicron strain tended to have less severe disease—most of the hospitalized patients did not need supplemental oxygen, a few developed COVID pneumonia, and fewer still required intensive care.', '"Clearly, in South Africa, Omicron has a transmission advantage," Fauci added, "although it\'s too early to make any definitive statements about it, thus far it does not look like there\'s a great degree of severity to it."', 'It included just 166 patients, most of whom were admitted to hospitals for reasons unrelated to COVID-19—their infections were only found because those hospitals are testing all incoming patients for the coronavirus.']}
1
{'section 1': ['They found that patients infected with the Omicron strain tended to have less severe disease—most of the hospitalized patients did not need supplemental oxygen, a few developed COVID pneumonia, and fewer still required intensive care.', '"Clearly, in South Africa, Omicron has a tr

In [None]:
# raw_text1[0] is the first element of the tuple returned by generate (the
# section list that was passed in); print its entries separated by blank lines.
print('\n\n'.join(raw_text1[0]))

The ocean is big, and our attempts to understand it are still largely surface-deep. According to the National Oceanic and Atmospheric Organization, around 80 percent of the big blue is "unmapped, unobserved, and unexplored."



Ships are the primary way to collect information about the seas, but they're costly to send out frequently. More recently, robotic buoys called Argo floats have been drifting with the currents, diving up and down to take a variety of measurements at depths up to 6,500 feet. But new aquatic robots from a lab at Caltech could rove deeper and take on more tailored underwater missions.

"We're imagining an approach for global ocean exploration where you take swarms of smaller robots of various types and populate the ocean with them for tracking, for climate change, for understanding the physics of the ocean," says John O. Dabiri, a professor of aeronautics and mechanical engineering at the California Institute of Technology. 

In comes CARL-Bot (Caltech Autonomous R