# Trying models from various libraries

## NER and text summarization using spaCy

In [4]:
#load SpaCy
import spacy 

In [5]:
# Load the small English spaCy pipeline (en_core_web_sm)
nlp = spacy.load("en_core_web_sm")

In [6]:
# Sample text taken from a newsgroup document (lines starting with ">" are quoted from the earlier post)
text = """In article <1p7ciqINN3th@tamsun.tamu.edu> covingc@ee.tamu.edu (Just George) writes:
>I will be traveling to Bangaldesh this summer, and am wondering
>if there are any immunizations I should get before going.
You can probably get this information by calling your public health
department in your county (in Pittsburgh, they give the shots free,
as well).  There are bulletins in medical libraries that give
recommendations, or you could call the infectious diseases section
of the medicine department of your local medical school.  You also
will probably want to talk about Malaria prophylaxis.  You will
need your doctor to get the prescription. Meet me at home or the school. 
Gordon Banks  N3JXP      | "Skepticism is the chastity of the intellect, and
geb@cadre.dsl.pitt.edu   |  it is shameful to surrender it too soon." """

# Run the pipeline over the text; the resulting doc supplies the entities,
# tokens and sentences used in the cells below
doc = nlp(text)

In [7]:
# List every named entity the pipeline detected, followed by its label.
# NOTE(review): the recorded output tags "Malaria" as GPE and "N3JXP" as
# PRODUCT, so treat the labels from this model as approximate.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Bangaldesh GPE
this summer DATE
Pittsburgh GPE
Malaria GPE
Gordon Banks PERSON
N3JXP       PRODUCT


In [None]:
# Summarize the text using extractive summarization: sentences are scored by the frequencies of their words and the top-scoring ones are kept

In [8]:
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [14]:
# Fraction of the document's sentences to keep in the summary (0.2 -> top 20%); can be set to a different value
percentage = 0.2

In [15]:
# Raw token strings of the document, plus the (initially empty) table
# that will hold the frequency score of each word.
tokens = [tok.text for tok in doc]
freq_of_word = {}

In [16]:
# Count how often each meaningful word occurs in the document.
# Stop words, punctuation and whitespace-only tokens (e.g. the newlines in
# the sample text) are skipped so they cannot dominate the frequency table.
# Keys are lower-cased because the sentence-scoring step looks words up via
# word.text.lower(); keying by the raw text would silently drop every
# capitalised word from the sentence scores.
for word in doc:
    lowered = word.text.lower()
    # STOP_WORDS is used directly for membership instead of rebuilding a list per token
    if word.is_space or lowered in STOP_WORDS or lowered in punctuation:
        continue
    freq_of_word[lowered] = freq_of_word.get(lowered, 0) + 1

In [17]:
# Highest raw count in the table, used as the normalisation divisor.
# default=1 keeps this cell from raising ValueError when the table is empty
# (i.e. no word survived the filtering).
max_freq = max(freq_of_word.values(), default=1)

In [18]:
# Express every count as a fraction of the largest count.
# NOTE: the table is rewritten in place, so re-running this cell divides again.
freq_of_word.update(
    {term: count / max_freq for term, count in freq_of_word.items()}
)

In [19]:
# Score each sentence as the sum of the normalised frequencies of the words it
# contains. Words are looked up by their lower-cased text; a sentence without
# any scored word gets no entry in sent_scores.
sent_tokens = list(doc.sents)
sent_scores = {}
for sent in sent_tokens:
    for word in sent:
        key = word.text.lower()
        if key in freq_of_word:
            sent_scores[sent] = sent_scores.get(sent, 0) + freq_of_word[key]

# Number of sentences to keep. Plain int() truncation yields 0 for short texts
# (fewer than 1/percentage sentences), which would produce an empty summary,
# so keep at least one sentence whenever the document has any.
len_tokens = max(1, int(len(sent_tokens) * percentage)) if sent_tokens else 0

In [20]:
# Pick the len_tokens highest-scoring sentences, best first. Each element of the result is a spaCy Span.
summary = nlargest(len_tokens, sent_scores, key=lambda sent: sent_scores[sent])

In [21]:
# Extract the plain text of every selected sentence
final_summary = [sentence.text for sentence in summary]

In [22]:
# Glue the sentence texts together into one space-separated string.
# NOTE: this rebinds `summary` from a list of sentences to a str.
summary = str.join(" ", final_summary)

In [23]:
# Display the final summary text
print(summary)

There are bulletins in medical libraries that give
recommendations, or you could call the infectious diseases section
of the medicine department of your local medical school.  
