# Summarizer

### Wikipedia API

If you intend to run scraping projects or automated requests, consider alternatives such as Pywikipediabot (now Pywikibot) or the MediaWiki API, which offer more advanced features.

* wikipedia.search('keywords', results=2)
* wikipedia.suggest('keyword')
* wikipedia.summary('keywords', sentences=2)
* wikipedia.page('keywords')
* wikipedia.page('keywords').content
* wikipedia.page('keywords').references
* wikipedia.page('keywords').title
* wikipedia.page('keywords').url
* wikipedia.page('keywords').categories
* wikipedia.page('keywords').content
* wikipedia.page('keywords').links
* wikipedia.geosearch(33.2075, 97.1526)
* wikipedia.set_lang('hi')
* wikipedia.languages()
* wikipedia.page('keywords').images[0]
* wikipedia.page('keywords').html()

In [None]:
# pip install wikipedia

In [None]:
# https://kleiber.me/blog/2017/07/22/tutorial-lda-wikipedia/
import pandas as pd
import random
import wikipedia

# rtitles = wikipedia.random(5)

# Resolve each keyword to the top-ranked Wikipedia page title; edit `keywords`
# to change the topics that get downloaded.
titles = []
keywords = ['Titanic', 'JP Morgan', 'immigration', 'suffrage', 'racist']
for key in keywords:
    # search() returns a list that may be empty, so check before indexing.
    matches = wikipedia.search(key, results=1)
    if matches:
        titles.append(matches[0])
    else:
        print('No Wikipedia results for:', key)

print(titles)
data = []


def _fetch_article(name):
    """Return the (full content, 5-sentence summary) pair for the page `name`.

    auto_suggest=False makes the library look up `name` exactly as given
    instead of first replacing it with a search suggestion.
    """
    content = wikipedia.page(name, auto_suggest=False).content
    page_summary = wikipedia.summary(name, auto_suggest=False, sentences=5)
    return content, page_summary


for title in titles:
    try:
        content, page_summary = _fetch_article(title)
    except wikipedia.exceptions.DisambiguationError as e:
        # The title is ambiguous: pick one of the suggested pages at random and
        # fetch BOTH content and summary from it. Asking for the summary of the
        # original ambiguous title again would just raise the same error.
        # NOTE: if the chosen option is itself ambiguous, the error propagates.
        s = random.choice(e.options)
        content, page_summary = _fetch_article(s)
    # Rows stay keyed by the title returned from the search step.
    data.append([title, content, page_summary])

df = pd.DataFrame(data, columns=['title', 'content', 'summary'])
df.head()

## Summarization Using spaCy

In [None]:
# https://medium.com/analytics-vidhya/text-summarization-using-spacy-ca4867c6b744
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

# Load the 'en_core_web_md' pipeline once; later cells reuse `nlp` to parse text.
# NOTE(review): the model package must already be installed, presumably via
# `python -m spacy download en_core_web_md` — confirm for your environment.
nlp = spacy.load('en_core_web_md')

In [None]:
# Parse the concatenated article summaries (one row of `df` per article).
summary_text = ' '.join(df.summary)
doc = nlp(summary_text)

# STOP_WORDS is already a set, so membership tests are O(1) without copying.
stopwords = STOP_WORDS
# Only these part-of-speech tags are counted as keywords.
pos_tag = {'PROPN', 'ADJ', 'NOUN', 'VERB'}

keyword = []
for token in doc:
    if token.text in stopwords or token.text in punctuation:
        continue
    if token.pos_ in pos_tag:
        keyword.append(token.text)

# count most frequent words
freq_word = Counter(keyword)
print(freq_word.most_common(5))

# Normalize counts into (0, 1] by dividing by the highest count. default=0
# keeps the cell from crashing when no token qualified as a keyword.
max_freq = max(freq_word.values(), default=0)
if max_freq:
    for word in freq_word:
        freq_word[word] = freq_word[word] / max_freq

print(freq_word.most_common(5))


In [None]:
# Score every sentence by adding up the normalized frequency of each keyword
# it contains; sentences with no keyword at all get no entry.
sent_strength = {}
for sentence in doc.sents:
    for tok in sentence:
        text = tok.text
        if text not in freq_word:
            continue
        sent_strength[sentence] = sent_strength.get(sentence, 0) + freq_word[text]

print(sent_strength)

In [None]:
# Take the (up to) 10 highest-scoring sentences, best first, and join their
# text into a single summary string.
top_sentences = nlargest(10, sent_strength, key=sent_strength.get)
summary = ' '.join(span.text for span in top_sentences)
summary

## Summarization Using Hugging Face

* Input limit: 512 tokens for most models (1,024 tokens for BART)
* Models: https://huggingface.co/docs/transformers/index

In [None]:
# pip install transformers

In [None]:
import pandas as pd
from transformers import pipeline

# Concatenate the per-article summaries into one input document.
summary_text = ' '.join(df.summary)
# No model is named, so transformers picks its default summarization checkpoint.
model = pipeline('summarization')
# truncation=True clips input longer than the model's maximum length; without it,
# over-long text makes the pipeline fail instead of producing a summary.
model(summary_text, min_length=300, max_length=400, truncation=True)

In [None]:
# https://huggingface.co/facebook/bart-large-cnn
import pandas as pd
from transformers import pipeline

# Concatenate the per-article summaries into one input document.
summary_text = ' '.join(df.summary)
# Same task as the default pipeline, but pinned to the BART CNN checkpoint.
model = pipeline('summarization', model='facebook/bart-large-cnn')
# truncation=True clips input longer than the model's maximum length; without it,
# over-long text makes the pipeline fail instead of producing a summary.
model(summary_text, min_length=300, max_length=400, truncation=True)

## Original Text

In [None]:
# split() with no argument splits on any run of whitespace, so newlines between
# paragraphs and repeated spaces do not distort the word count.
print('Word Count:', len(summary_text.split()))
print(summary_text)