In [1]:
import sys
sys.path.insert(0, '../')
from FletcherLibrary import MongoQuery as MQ
from FletcherLibrary import TextSummarization as TS

# Try Text Summarization
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.evaluation import cosine_similarity, recall, precision, rouge, rouge_1

# Query the per-category counts for the 'reuters_all' collection over the
# given date range, then display the (category, count) pairs in sorted order.
category_counts = TS.GetCategoryCounts(
    'reuters_all',
    'February 22, 2018',
    'February 27, 2018',
)
sorted(category_counts.items())

[('2014 Winter Olympics', 64),
 ('2018 Winter Olympics', 190),
 ('Afghanistan / Pakistan', 24),
 ('Africa', 71),
 ('Aircraft', 59),
 ('Automotive', 205),
 ('Brexit / European Union', 109),
 ('Business', 224),
 ('Chinese Trade', 80),
 ('Crime / Security', 142),
 ('Economy', 101),
 ('Energy / Environmental', 131),
 ('European Politics / Government', 56),
 ('Financial / Cryptocurrency', 294),
 ('German Politics', 19),
 ('Immigration', 98),
 ('Iran / Yemen', 26),
 ('Israel / Palestine', 47),
 ('Medicine', 219),
 ('North America', 85),
 ('North Korea', 97),
 ('Politics', 121),
 ('Saudi Arabia', 33),
 ('South America', 36),
 ('Sports', 85),
 ('Stock Market', 80),
 ('Technology', 94),
 ('Tennis', 40),
 ('Turkey / Syria', 85)]

In [None]:
import sys
sys.path.insert(0, '../')

from flask import Flask, render_template, request, jsonify
from FletcherLibrary import MongoQuery as MQ
from FletcherLibrary import TopicModel as TM
from FletcherLibrary import TextSummarization as TS

import numpy as np
import pandas as pd
import pickle

# Try Text Summarization
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.evaluation import cosine_similarity, recall, precision, rouge, rouge_1


# Flask application object; the @app.route decorators below register the
# request handlers on it and app.run() serves it.
app = Flask(__name__)

# Homepage
@app.route("/")
def viz_page():
    """Serve the homepage by returning the text of ``index.html``.

    The file path is relative, so it is resolved against the current
    working directory of the server process.
    """
    with open("index.html", 'r') as homepage:
        html = homepage.read()
    return html

@app.route("/get_categories", methods=["POST"])
def get_categories():
    """Return the category counts for a collection and date range as JSON.

    Reads ``blog_name``, ``start_date`` and ``end_date`` from the JSON
    request body, hands them to ``TS.GetCategoryCounts`` and responds with
    the resulting (category, count) pairs in ascending sorted order.
    """
    payload = request.json
    collection_name, start_date, end_date = (
        payload[field] for field in ('blog_name', 'start_date', 'end_date')
    )
    print(collection_name, start_date, end_date)

    category_counts = TS.GetCategoryCounts(collection_name, start_date, end_date)
    print(category_counts)

    pairs = list(category_counts.items())
    pairs.sort()
    return jsonify(pairs)

@app.route('/get_summaries', methods=['POST'])
def get_summaries():
    """Summarize the articles of the selected topics with each sumy summarizer.

    Reads ``blog_name``, ``start_date``, ``end_date``, ``selected_categories``
    and ``num_sentences`` from the JSON request body. Documents returned by
    ``TS.FetchDataInDateRange`` are kept when their ``topic`` is one of the
    selected categories and their ``topic_confidence`` exceeds 0.3. Every kept
    document receives a ``summaries`` dict mapping summarizer name to the
    summary text, and its ``_id`` is blanked.

    Returns:
        The rendered ``result.html`` template. Its ``result`` argument is the
        list of kept documents sorted by ``datetime``, skipping repeated
        titles and titles that start with "brief" (case-insensitive). When no
        document matches, the template is rendered with an empty list.

    Raises:
        KeyError: if a required request field or document field is missing.
        ValueError: if ``num_sentences`` cannot be converted to an int.
    """
    LANGUAGE = 'english'
    MIN_TOPIC_CONFIDENCE = 0.3

    inputs = request.json
    print(inputs)
    collection_name = inputs['blog_name']
    start_date = inputs['start_date']
    end_date = inputs['end_date']
    data = TS.FetchDataInDateRange(collection_name,
                                   start_date,
                                   end_date)
    topic_list = inputs['selected_categories']
    SENTENCES_COUNT = int(inputs['num_sentences'])

    # A document without a confidence score is treated as confidence 0 and
    # dropped, instead of failing the whole request with a KeyError.
    filt_data = [
        d for d in data
        if 'topic' in d
        and d['topic'] in topic_list
        and d.get('topic_confidence', 0) > MIN_TOPIC_CONFIDENCE
    ]
    print(len(data))
    print(len(filt_data))
    if not filt_data:
        # Nothing matched: skip model loading and render an empty result
        # rather than crashing on filt_data[0].
        return render_template("result.html", result=[])
    print(filt_data[0])

    print('Load Data')
    # NOTE(security): the file names are built from the request's
    # ``blog_name`` and the files are unpickled. pickle can execute arbitrary
    # code, so ``blog_name`` must be restricted to known collection names
    # before this endpoint is exposed to untrusted clients.
    with open(collection_name+'_count_vectorizer_tfidf.pkl', 'rb') as pkl:
        count_vectorizer = pickle.load(pkl)
    with open(collection_name+'_NMF.pkl', 'rb') as pkl:
        model = pickle.load(pkl)

    print('Initial Setup')
    print('Load Stemmer')
    stemmer = Stemmer(LANGUAGE)
    print('Load Summarizers')
    named_summarizers = [('LSA', LsaSummarizer(stemmer)),
                         ('Edmundson', EdmundsonSummarizer(stemmer)),
                         ('LexRank', LexRankSummarizer(stemmer)),
                         ('Luhn', LuhnSummarizer(stemmer)),
                         ('KL', KLSummarizer(stemmer)),
                         ('SumBasic', SumBasicSummarizer(stemmer)),
                         ('TextRank', TextRankSummarizer(stemmer))]
    # The stop words and the tokenizer depend only on the language, so they
    # are built once here instead of once per document/summarizer.
    stop_words = get_stop_words(LANGUAGE)
    for _, summarizer in named_summarizers:
        summarizer.stop_words = stop_words
    tokenizer = Tokenizer(LANGUAGE)
    print('Get Top Words')
    # Indexed by topic number below; presumably the 10 highest-weighted
    # vocabulary words of each topic -- confirm against TM.GetTopWords.
    top_n_words = TM.GetTopWords(model, 10, count_vectorizer)

    print('Start Loop')
    for i, d in enumerate(filt_data):
        print('%d/%d' % (i+1, len(filt_data)))
        content = d['content']
        x = count_vectorizer.transform([content])
        # One weight per topic for this document.
        W = model.transform(x.toarray())[0]
        parser = PlaintextParser(content, tokenizer)

        # Words of topics the document has zero weight for count against a
        # sentence in the Edmundson summarizer.
        stigma_words = []
        for j, elem in enumerate(W):
            if elem == 0:
                stigma_words.extend(top_n_words[j][:5])
        title = d['title']

        all_summaries = {}
        for sum_name, summarizer in named_summarizers:
            print(sum_name)
            if isinstance(summarizer, EdmundsonSummarizer):
                # Edmundson needs explicit cue words: title words plus the
                # top words of the document's dominant topic are bonuses.
                summarizer.bonus_words = title.lower().split(' ') + \
                                         top_n_words[W.argmax()]
                summarizer.stigma_words = stigma_words
                summarizer.null_words = ['']
            all_summaries[sum_name] = ' '.join(
                str(sentence)
                for sentence in summarizer(parser.document, SENTENCES_COUNT))
        d['summaries'] = all_summaries
        # Replace the database id with an empty string; presumably because
        # the ObjectId is not serializable -- confirm before relying on it.
        d['_id'] = ''

    # Oldest first; of several documents sharing a title the first is kept.
    seen_titles = set()
    cs = []
    for summary in sorted(filt_data, key=lambda k: k['datetime']):
        summary_title = summary['title']
        if summary_title in seen_titles or summary_title.lower().startswith('brief'):
            continue
        seen_titles.add(summary_title)
        cs.append(summary)
    return render_template("result.html", result=cs)
    
#     return jsonify(complete_summaries)
    
# Start the development server once, listening on all interfaces.
# app.run() blocks until the server stops, so a second app.run() placed after
# it would only fire on shutdown and relaunch the app; keep a single call.
app.run(host='0.0.0.0')

 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [26/Feb/2018 10:44:25] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2018 10:44:25] "GET /favicon.ico HTTP/1.1" 404 -


reuters_all February 20, 2018 February 26, 2018


127.0.0.1 - - [26/Feb/2018 10:44:29] "POST /get_categories HTTP/1.1" 200 -


Counter({'Financial / Cryptocurrency': 343, '2018 Winter Olympics': 282, 'Business': 275, 'Medicine': 268, 'Automotive': 232, 'Crime / Security': 167, 'Energy / Environmental': 161, 'North Korea': 132, 'Politics': 124, 'Brexit / European Union': 119, 'Economy': 114, 'Stock Market': 114, 'Immigration': 114, 'North America': 107, 'Sports': 103, 'Technology': 102, 'Turkey / Syria': 93, '2014 Winter Olympics': 85, 'Chinese Trade': 81, 'Africa': 79, 'European Politics / Government': 74, 'Aircraft': 65, 'Israel / Palestine': 54, 'South America': 44, 'Saudi Arabia': 42, 'Tennis': 38, 'Afghanistan / Pakistan': 35, 'Iran / Yemen': 32, 'German Politics': 20})
{'start_date': 'February 20, 2018', 'end_date': 'February 26, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['Technology']}
3499
58
{'_id': ObjectId('5a95ee656b4a750da0a37328'), 'topic_confidence': 0.4274760312885275, 'author': 'Reuters Editorial', 'url': 'https://www.reuters.com/article/brief-activist-elli



Initial Setup
Load Stemmer
Load Summarizers
Get Top Words
Start Loop
1/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/58
LSA
Edmundson
Le

127.0.0.1 - - [26/Feb/2018 10:44:50] "POST /get_summaries HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2018 10:45:24] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2018 10:46:00] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 20, 2018 February 26, 2018
Counter({'Financial / Cryptocurrency': 343, '2018 Winter Olympics': 282, 'Business': 275, 'Medicine': 268, 'Automotive': 232, 'Crime / Security': 167, 'Energy / Environmental': 161, 'North Korea': 132, 'Politics': 124, 'Brexit / European Union': 119, 'Economy': 114, 'Stock Market': 114, 'Immigration': 114, 'North America': 107, 'Sports': 103, 'Technology': 102, 'Turkey / Syria': 93, '2014 Winter Olympics': 85, 'Chinese Trade': 81, 'Africa': 79, 'European Politics / Government': 74, 'Aircraft': 65, 'Israel / Palestine': 54, 'South America': 44, 'Saudi Arabia': 42, 'Tennis': 38, 'Afghanistan / Pakistan': 35, 'Iran / Yemen': 32, 'German Politics': 20})
{'start_date': 'February 20, 2018', 'end_date': 'February 26, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['Technology']}
3499
58
{'_id': ObjectId('5a95ee656b4a750da0a37328'), 'topic_confidence': 0.4274760312885275, 'author': 'Reuters Editorial', 'url': 'htt



Initial Setup
Load Stemmer
Load Summarizers
Get Top Words
Start Loop
1/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/58
LSA
Edmundson
Le

127.0.0.1 - - [26/Feb/2018 10:46:21] "POST /get_summaries HTTP/1.1" 200 -


KL
SumBasic
TextRank


127.0.0.1 - - [26/Feb/2018 10:47:44] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2018 10:48:21] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 20, 2018 February 26, 2018
Counter({'Financial / Cryptocurrency': 343, '2018 Winter Olympics': 282, 'Business': 275, 'Medicine': 268, 'Automotive': 232, 'Crime / Security': 167, 'Energy / Environmental': 161, 'North Korea': 132, 'Politics': 124, 'Brexit / European Union': 119, 'Economy': 114, 'Stock Market': 114, 'Immigration': 114, 'North America': 107, 'Sports': 103, 'Technology': 102, 'Turkey / Syria': 93, '2014 Winter Olympics': 85, 'Chinese Trade': 81, 'Africa': 79, 'European Politics / Government': 74, 'Aircraft': 65, 'Israel / Palestine': 54, 'South America': 44, 'Saudi Arabia': 42, 'Tennis': 38, 'Afghanistan / Pakistan': 35, 'Iran / Yemen': 32, 'German Politics': 20})
{'start_date': 'February 20, 2018', 'end_date': 'February 26, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['Technology']}
3499
58
{'_id': ObjectId('5a95ee656b4a750da0a37328'), 'topic_confidence': 0.4274760312885275, 'author': 'Reuters Editorial', 'url': 'htt



Initial Setup
Load Stemmer
Load Summarizers
Get Top Words
Start Loop
1/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/58
LSA
Edmundson
Le

127.0.0.1 - - [26/Feb/2018 10:48:39] "POST /get_summaries HTTP/1.1" 200 -


KL
SumBasic
TextRank


127.0.0.1 - - [26/Feb/2018 10:49:31] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2018 10:50:12] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 20, 2018 February 26, 2018
Counter({'Financial / Cryptocurrency': 343, '2018 Winter Olympics': 282, 'Business': 275, 'Medicine': 268, 'Automotive': 232, 'Crime / Security': 167, 'Energy / Environmental': 161, 'North Korea': 132, 'Politics': 124, 'Brexit / European Union': 119, 'Economy': 114, 'Stock Market': 114, 'Immigration': 114, 'North America': 107, 'Sports': 103, 'Technology': 102, 'Turkey / Syria': 93, '2014 Winter Olympics': 85, 'Chinese Trade': 81, 'Africa': 79, 'European Politics / Government': 74, 'Aircraft': 65, 'Israel / Palestine': 54, 'South America': 44, 'Saudi Arabia': 42, 'Tennis': 38, 'Afghanistan / Pakistan': 35, 'Iran / Yemen': 32, 'German Politics': 20})
{'start_date': 'February 20, 2018', 'end_date': 'February 26, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['Technology']}
3499
58
{'_id': ObjectId('5a95ee656b4a750da0a37328'), 'topic_confidence': 0.4274760312885275, 'author': 'Reuters Editorial', 'url': 'htt



Initial Setup
Load Stemmer
Load Summarizers
Get Top Words
Start Loop
1/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/58
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/58
LSA
Edmundson
Le

127.0.0.1 - - [26/Feb/2018 10:50:32] "POST /get_summaries HTTP/1.1" 200 -



KL
SumBasic
TextRank
