# Alternative text summarization techniques

Here we experiment with different Python libraries and techniques to tackle our text summarization task.


In [None]:
import requests
import json
import time
import os, sys
from IPython.display import display
import datetime
import pandas as pd
import unicodedata
from bs4 import BeautifulSoup
import nltk
import numpy as np
import time

In [None]:
# Load one article to work on: the first entry os.listdir returns for the 2016/1 folder.
# NOTE: os.listdir returns entries in arbitrary order, so "first" is not guaranteed
# to be the same file on every machine.
article_dir = data_path+"2016/1/"
filename = os.listdir(article_dir)[0]
# The context manager closes the file even if json.load raises.
with open(article_dir+str(filename), "r") as file:
    content = json.load(file)
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# NFKD-decompose the body, then drop every character that has no ASCII representation.
body_raw = unicodedata.normalize("NFKD",content["body"]).encode("ascii", "ignore")
# Strip the HTML markup from the article body.
soup = BeautifulSoup(body_raw, "html.parser")
# soup.text
# Replace newlines, tabs and "=" with spaces (carriage returns are removed), split the text
# into sentences and store them one per line, so a sentence's line number in body_text
# is its position in the article.
body_text = "\n".join(tokenizer.tokenize(soup.get_text().replace("\n", " ").replace("\r", "").replace("="," ").replace("\t", " ")))
# body_text

### Gensim

In [None]:
# Gensim ranks the sentences of the document with TextRank and builds the summary
# out of the most significant ones. The summary is purely extractive: sentences are
# copied from the document, no new sentences are generated.
# More information at https://github.com/RaRe-Technologies/gensim
# NOTE(review): gensim.summarization appears to have been removed in gensim 4.0 --
# confirm the installed gensim version before running this cell.
from gensim.summarization import keywords, summarize

# The size of the summary can be given as a fraction of the size of the original document
ratio_summary = summarize(body_text, ratio=0.15)
display(ratio_summary)

In [None]:
# Alternatively, the size of the summary can be given as an absolute word count
word_count_summary = summarize(body_text, word_count=50)
display(word_count_summary)

In [None]:
# Gensim can also extract the keywords of the document
document_keywords = keywords(body_text)
display(document_keywords)

### Sumy

In [None]:
# Sumy provides a variety of text summarization algorithms, including TextRank.
# More information about Sumy can be found at https://pypi.python.org/pypi/sumy

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

In [None]:
# Summarize the article with LexRank, TextRank and LSA.
# The three summaries are generally different, so it might be a good idea to run all of
# them and look for the sentences they agree on: those sentences are more likely to
# contain valuable information.

LANGUAGE = "english"
SENTENCES_COUNT = 2
parser = PlaintextParser.from_string(body_text, Tokenizer(LANGUAGE))
# or for plain text files
# parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)
summarizers = [LexRankSummarizer, TextRankSummarizer, LsaSummarizer]
# Display names, in the same order as `summarizers`.
summarizer_names = ["LexRank", "TextRank", "LSA"]
for name, summarizer_class in zip(summarizer_names, summarizers):
    print("Summarizing with: "+name)
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        # Print the number of the sentence in the original article. body_text holds one
        # sentence per line, so the count of newlines before the match gives its position.
        # sumy re-tokenizes body_text on its own, so the sentence text is presumably not
        # guaranteed to appear verbatim; use find() so a miss does not abort the whole cell.
        sentence_start = body_text.find(str(sentence))
        if sentence_start == -1:
            print("(sentence not found verbatim in the original article)")
        else:
            print(body_text[:sentence_start].count("\n")+1)
    print("\n")