In [None]:
import heapq
import re
import urllib.request
from urllib.error import HTTPError
from urllib.parse import quote

import bs4 as bs
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Testing out a try-except for a wiki url that doesn't exist:
# a missing page should surface as an HTTPError that we can catch.
try:
    # The with-block closes the HTTP response even if read() fails, so the
    # probe never leaves a connection open.
    with urllib.request.urlopen('https://en.wikipedia.org/wiki/ThisArticleShouldntExist') as response:
        article = response.read()
except HTTPError:
    print("Error caught")

In [None]:
# Title of the Wikipedia article to summarize; it is interpolated into the
# https://en.wikipedia.org/wiki/ URL when the page is fetched.
KEYWORD = "Rome"

In [None]:
# Open the Wikipedia page for KEYWORD and read the text of all its paragraphs.
# quote() percent-encodes spaces and non-ASCII characters so multi-word or
# accented titles still form a valid URL; plain titles such as "Rome" are
# left unchanged. The with-block closes the HTTP response once it is read.
with urllib.request.urlopen(f'https://en.wikipedia.org/wiki/{quote(KEYWORD)}') as data:
    article = data.read()

parsed_article = bs.BeautifulSoup(article, 'lxml')

paragraphs = parsed_article.find_all('p')

# Join once instead of growing the string one paragraph at a time.
text = "".join(p.text for p in paragraphs)

In [None]:
# Show the raw concatenated paragraph text as a quick sanity check of the
# scrape, before citation markers and extra whitespace are stripped.
print(text)

In [None]:
# Replace bracketed numeric citation markers such as [12] with a space, then
# squeeze every run of whitespace down to a single space.
text = re.sub(r'\s+', ' ', re.sub(r'\[[0-9]*\]', ' ', text))

In [None]:
# Build a letters-only copy of the text for the weighted-frequency count:
# every character outside a-z / A-Z becomes a space, then whitespace runs are
# collapsed to a single space.
formatted_text = re.sub(r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', text))

In [None]:
# Extract a list of sentences from text.
# sent_tokenize runs on `text` rather than `formatted_text`, so the sentences
# keep their punctuation and digits.
sentence_list = sent_tokenize(text)

In [None]:
# Calculate weighted frequency for all words in formatted_text.
# The stop-word collection gets its own name: rebinding `stopwords` would
# shadow the imported NLTK corpus reader and make this cell fail with an
# AttributeError when it is run a second time. A set gives O(1) membership.
stop_words = set(stopwords.words('english'))
word_freqs = {}

# Lower-case before counting: NLTK's English stop words are lower-case and the
# sentence-scoring step looks words up via sentence.lower(), so the keys of
# word_freqs must be lower-case for either comparison to match.
for word in word_tokenize(formatted_text.lower()):
    if word not in stop_words:
        word_freqs[word] = word_freqs.get(word, 0) + 1

# default=1 keeps a text with no countable words from raising ValueError.
max_freq = max(word_freqs.values(), default=1)

# Make all frequencies a fraction of 1 (the most frequent word scores 1.0)
word_freqs = {word: count / max_freq for word, count in word_freqs.items()}

In [None]:
# Calculating sentence frequency scores as a sum of the individual word frequencies
MAX_SENTENCE_LEN = 30
sentence_scores = {}

for sentence in sentence_list:
    # Only sentences with fewer than MAX_SENTENCE_LEN space-separated words are
    # scored. The length does not depend on the token being inspected, so it
    # is checked once per sentence and long sentences are never tokenized.
    if len(sentence.split(' ')) >= MAX_SENTENCE_LEN:
        continue
    for word in word_tokenize(sentence.lower()):
        # A sentence gets an entry only once one of its tokens has a weight.
        if word in word_freqs:
            sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freqs[word]

In [None]:
# Pick the NUM_SENTENCES highest-scoring sentences (best first) and glue them
# together with single spaces to form the summary.
NUM_SENTENCES = 8
summary_sentences = heapq.nlargest(
    NUM_SENTENCES,
    sentence_scores.keys(),
    key=lambda candidate: sentence_scores.get(candidate),
)
summary = ' '.join(summary_sentences)

In [None]:
# Print the summary: the top-scoring sentences joined by single spaces.
print(summary)

In [None]:
# Add the parent of the current working directory to the import path so the
# project-local `src` package can be imported.
# NOTE(review): assumes the notebook is run from a folder directly below the
# project root — confirm.
import sys
sys.path.append("../")
from src.inc.freq_summary import MostFrequentSummary

In [None]:
# Utility function used to get text for testing purposes
def get_text(keyword="Rome"):
    """Return the concatenated paragraph text of an English Wikipedia article.

    Args:
        keyword: Title of the article to fetch. Defaults to "Rome", which is
            the page this helper originally fetched. The title is
            percent-encoded, so spaces and non-ASCII characters are allowed.

    Returns:
        str: The text of every <p> element on the page, joined with no
        separator.

    Raises:
        urllib.error.HTTPError: Propagated from urlopen when the server
            answers with an error status (for example, a missing article).
    """
    url = f'https://en.wikipedia.org/wiki/{quote(keyword)}'
    # The with-block closes the HTTP response as soon as the body is read.
    with urllib.request.urlopen(url) as data:
        article = data.read()
    parsed_article = bs.BeautifulSoup(article, 'lxml')
    # Join once instead of growing the string one paragraph at a time.
    return "".join(p.text for p in parsed_article.find_all('p'))

In [None]:
# Testing the MostFrequentSummary class, which summarizes the text passed in
t = MostFrequentSummary(get_text())

In [None]:
# Print the summary that MostFrequentSummary produced for the fetched text.
print(t.get_summary())

In [None]:
# The WikiSummarizer class can summarize the wikis for a list of keywords.
# NOTE(review): presumably it delegates to the text-summary classes (e.g.
# MostFrequentSummary) for each wiki — confirm in src/inc/wiki_summarizer.py.
# w1 and w2 cover the same keywords but differ in summarizer ("cluster" vs
# "freq") and target ("fr" vs "ja").
# NOTE(review): `target` looks like an output/translation language code — confirm.
from src.inc.wiki_summarizer import WikiSummarizer
w1 = WikiSummarizer(keywords=["Basketball", "Soccer"], summarizer="cluster", lang='english', target="fr")
w2 = WikiSummarizer(keywords=["Basketball", "Soccer"], summarizer="freq", lang='english', target="ja")

In [None]:
# Run the "cluster"-configured summarizer; as the cell's last expression, any
# return value is shown as the cell output.
w1.get_summaries()

In [None]:
# Run the "freq"-configured summarizer; as the cell's last expression, any
# return value is shown as the cell output.
w2.get_summaries()