In [1]:
import sys
sys.path.insert(0, '../')
from FletcherLibrary import MongoQuery as MQ
from FletcherLibrary import TextSummarization as TS

# Try Text Summarization
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.evaluation import cosine_similarity, recall, precision, rouge, rouge_1

category_counts = TS.GetCategoryCounts('reuters_all', 'February 22, 2018', 'February 27, 2018')
sorted([(k, v) for k, v in category_counts.items()])

[('2014 Winter Olympics', 64),
 ('2018 Winter Olympics', 190),
 ('Afghanistan / Pakistan', 24),
 ('Africa', 71),
 ('Aircraft', 59),
 ('Automotive', 205),
 ('Brexit / European Union', 109),
 ('Business', 224),
 ('Chinese Trade', 80),
 ('Crime / Security', 142),
 ('Economy', 101),
 ('Energy / Environmental', 131),
 ('European Politics / Government', 56),
 ('Financial / Cryptocurrency', 294),
 ('German Politics', 19),
 ('Immigration', 98),
 ('Iran / Yemen', 26),
 ('Israel / Palestine', 47),
 ('Medicine', 219),
 ('North America', 85),
 ('North Korea', 97),
 ('Politics', 121),
 ('Saudi Arabia', 33),
 ('South America', 36),
 ('Sports', 85),
 ('Stock Market', 80),
 ('Technology', 94),
 ('Tennis', 40),
 ('Turkey / Syria', 85)]

In [None]:
import sys
sys.path.insert(0, '../')

from flask import Flask, render_template, request, jsonify
from FletcherLibrary import MongoQuery as MQ
from FletcherLibrary import TopicModel as TM
from FletcherLibrary import TextSummarization as TS

import numpy as np
import pandas as pd
import pickle

# Try Text Summarization
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.evaluation import cosine_similarity, recall, precision, rouge, rouge_1


app = Flask(__name__)

# Homepage
@app.route("/")
def viz_page():
    with open("index.html", 'r') as viz_file:
        return viz_file.read()

@app.route("/get_categories", methods=["POST"])
def get_categories():
    inputs = request.json
    collection_name = inputs['blog_name']
    start_date = inputs['start_date']
    end_date = inputs['end_date']
    print(collection_name, start_date, end_date)
    category_counts = TS.GetCategoryCounts(collection_name, start_date, end_date)
    print(category_counts)
    return jsonify(sorted([(k, v) for k, v in category_counts.items()]))

@app.route('/get_summaries', methods=['POST'])
def get_summaries():
    inputs = request.json
    print(inputs)
    collection_name = inputs['blog_name']
    start_date = inputs['start_date']
    end_date = inputs['end_date']
    data = TS.FetchDataInDateRange(collection_name, 
                                start_date, 
                                end_date)
    topic_list = inputs['selected_categories']
    SENTENCES_COUNT = int(inputs['num_sentences'])
    LANGUAGE = 'english'
    filt_data = []
    for d in data:
        if ('topic' in d.keys() and
           d['topic'] in topic_list and
           d['topic_confidence'] > 0.3):
            filt_data.append(d)
    print(len(data))
    print(len(filt_data))
    print(filt_data[0])
    
    print('Load Data')
    with open(collection_name+'_categories.pkl', 'rb') as pkl:
        category_names = pickle.load(pkl)
    with open(collection_name+'_count_vectorizer_tfidf.pkl', 'rb') as pkl:
        count_vectorizer = pickle.load(pkl)
    with open(collection_name+'_counts_tfidf.pkl', 'rb') as pkl:
        counts = pickle.load(pkl)
    with open(collection_name+'_NMF.pkl', 'rb') as pkl:
        model = pickle.load(pkl)
    
    print('Initial Setup')
    print('Load Stemmer')
    stemmer = Stemmer(LANGUAGE)
    print('Load Summarizers')
    summarizers = [LsaSummarizer(stemmer),
                   EdmundsonSummarizer(stemmer),
                   LexRankSummarizer(stemmer),
                   LuhnSummarizer(stemmer),
                   KLSummarizer(stemmer),
                   SumBasicSummarizer(stemmer),
                   TextRankSummarizer(stemmer)]
    summarizer_names = ['LSA', 'Edmundson', 'LexRank',
                        'Luhn', 'KL', 'SumBasic', 'TextRank']
    print('Get Top Words')
    top_n_words = TM.GetTopWords(model, 10, count_vectorizer)
    
    print('Start Loop')
    complete_summaries = filt_data
    for i, d in enumerate(filt_data):
        print('%d/%d' % (i+1, len(filt_data)))
        content = d['content']
        x = count_vectorizer.transform([content])
        W = model.transform(x.toarray())[0]
        parser = PlaintextParser(content, Tokenizer(LANGUAGE))
        stigma_words = []
        for j, elem in enumerate(W):
            if elem == 0:
                stigma_words += top_n_words[j][:5]
        title = d['title']
        all_summaries = {}
        for summarizer, sum_name in zip(summarizers, summarizer_names):
            print(sum_name)
            summarizer.stop_words = get_stop_words(LANGUAGE)
            if type(summarizer) == EdmundsonSummarizer:
                summarizer.bonus_words = title.lower().split(' ') + \
                                         top_n_words[W.argmax()]
                summarizer.stigma_words = stigma_words
                summarizer.null_words = ['']
            summary = []
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summary.append(str(sentence))
            all_summaries[sum_name] = ' '.join(summary)
        complete_summaries[i]['summaries'] = all_summaries
        complete_summaries[i]['_id'] = ''
    titles = []
    cs = []
    complete_summaries = sorted(complete_summaries, key=lambda k: k['datetime'])
    for summary in complete_summaries:
        if summary['title'] in titles:
            continue
        titles.append(summary['title'])
        cs.append(summary)
    return render_template("result.html", result=cs)
    
#     return jsonify(complete_summaries)
    
app.run(host='0.0.0.0')
app.run(debug=True)

 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [01/Mar/2018 00:44:35] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Mar/2018 00:44:36] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 27, 2018 February 27, 2018
Counter({'Financial / Cryptocurrency': 69, 'Automotive': 44, 'Business': 43, 'Brexit / European Union': 37, 'Medicine': 33, 'Politics': 31, 'Energy / Environmental': 26, 'Technology': 26, 'Immigration': 23, 'Stock Market': 21, 'Aircraft': 21, 'Economy': 21, 'Chinese Trade': 18, 'North America': 18, 'Crime / Security': 17, 'Africa': 16, 'Sports': 15, 'Turkey / Syria': 14, '2018 Winter Olympics': 12, 'Israel / Palestine': 12, 'European Politics / Government': 10, 'North Korea': 9, 'South America': 9, 'Saudi Arabia': 7, 'Tennis': 6, 'Afghanistan / Pakistan': 4, '2014 Winter Olympics': 2, 'Iran / Yemen': 1})


[2018-03-01 00:44:40,298] ERROR in app: Exception on /get_summaries [POST]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/flask/app.py", line 1982, in wsgi_app
    response = self.full_dispatch_request()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/flask/app.py", line 1614, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/flask/app.py", line 1517, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/flask/_compat.py", line 33, in reraise
    raise value
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/flask/app.py", line 1612, in full_dispatch_request
    rv = self.dispatch_request()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib

{'start_date': 'February 27, 2018', 'end_date': 'February 27, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['2014 Winter Olympics']}
565
0


127.0.0.1 - - [01/Mar/2018 00:44:43] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Mar/2018 00:44:44] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 27, 2018 February 27, 2018
Counter({'Financial / Cryptocurrency': 69, 'Automotive': 44, 'Business': 43, 'Brexit / European Union': 37, 'Medicine': 33, 'Politics': 31, 'Energy / Environmental': 26, 'Technology': 26, 'Immigration': 23, 'Stock Market': 21, 'Aircraft': 21, 'Economy': 21, 'Chinese Trade': 18, 'North America': 18, 'Crime / Security': 17, 'Africa': 16, 'Sports': 15, 'Turkey / Syria': 14, '2018 Winter Olympics': 12, 'Israel / Palestine': 12, 'European Politics / Government': 10, 'North Korea': 9, 'South America': 9, 'Saudi Arabia': 7, 'Tennis': 6, 'Afghanistan / Pakistan': 4, '2014 Winter Olympics': 2, 'Iran / Yemen': 1})
{'start_date': 'February 27, 2018', 'end_date': 'February 27, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['Business']}
565
22
{'_id': ObjectId('5a95ee656b4a750da0a38182'), 'topic_confidence': 0.44991027764914604, 'author': 'Reuters Editorial', 'url': 'https://www.reuters.com/article/anta-sports-results



Start Loop
1/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
19/22
LSA
Edmundson
LexRan

127.0.0.1 - - [01/Mar/2018 00:44:50] "POST /get_summaries HTTP/1.1" 200 -


LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
21/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
22/22
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank


127.0.0.1 - - [01/Mar/2018 00:45:25] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 27, 2018 February 27, 2018
Counter({'Financial / Cryptocurrency': 69, 'Automotive': 44, 'Business': 43, 'Brexit / European Union': 37, 'Medicine': 33, 'Politics': 31, 'Energy / Environmental': 26, 'Technology': 26, 'Immigration': 23, 'Stock Market': 21, 'Aircraft': 21, 'Economy': 21, 'Chinese Trade': 18, 'North America': 18, 'Crime / Security': 17, 'Africa': 16, 'Sports': 15, 'Turkey / Syria': 14, '2018 Winter Olympics': 12, 'Israel / Palestine': 12, 'European Politics / Government': 10, 'North Korea': 9, 'South America': 9, 'Saudi Arabia': 7, 'Tennis': 6, 'Afghanistan / Pakistan': 4, '2014 Winter Olympics': 2, 'Iran / Yemen': 1})


127.0.0.1 - - [01/Mar/2018 00:45:33] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 23, 2018 March 1, 2018
Counter({'Financial / Cryptocurrency': 245, 'Medicine': 165, 'Business': 164, '2018 Winter Olympics': 144, 'Automotive': 138, 'Crime / Security': 119, 'Politics': 99, 'Energy / Environmental': 98, 'Brexit / European Union': 88, 'Technology': 81, 'Immigration': 79, 'Economy': 75, 'Turkey / Syria': 72, 'Chinese Trade': 71, 'Sports': 70, 'North Korea': 70, 'Stock Market': 62, 'Africa': 58, '2014 Winter Olympics': 54, 'North America': 51, 'European Politics / Government': 48, 'Aircraft': 47, 'Israel / Palestine': 35, 'Tennis': 34, 'South America': 25, 'Afghanistan / Pakistan': 24, 'Saudi Arabia': 23, 'German Politics': 16, 'Iran / Yemen': 14})
{'start_date': 'February 23, 2018', 'end_date': 'March 1, 2018', 'blog_name': 'reuters_all', 'num_sentences': '5', 'selected_categories': ['2018 Winter Olympics']}
2269
131
{'_id': ObjectId('5a95ee656b4a750da0a37a50'), 'topic_confidence': 0.5614833981256936, 'author': "Philip O'Connor", 'url': 'https://www.




Get Top Words
Start Loop
1/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
Te

127.0.0.1 - - [01/Mar/2018 00:46:23] "POST /get_summaries HTTP/1.1" 200 -


LexRank
Luhn
KL
SumBasic
TextRank
131/131
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank


127.0.0.1 - - [01/Mar/2018 00:47:30] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Mar/2018 00:47:42] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Mar/2018 00:49:33] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 23, 2018 March 1, 2018
Counter({'Financial / Cryptocurrency': 245, 'Medicine': 165, 'Business': 164, '2018 Winter Olympics': 144, 'Automotive': 138, 'Crime / Security': 119, 'Politics': 99, 'Energy / Environmental': 98, 'Brexit / European Union': 88, 'Technology': 81, 'Immigration': 79, 'Economy': 75, 'Turkey / Syria': 72, 'Chinese Trade': 71, 'Sports': 70, 'North Korea': 70, 'Stock Market': 62, 'Africa': 58, '2014 Winter Olympics': 54, 'North America': 51, 'European Politics / Government': 48, 'Aircraft': 47, 'Israel / Palestine': 35, 'Tennis': 34, 'South America': 25, 'Afghanistan / Pakistan': 24, 'Saudi Arabia': 23, 'German Politics': 16, 'Iran / Yemen': 14})
{'start_date': 'February 23, 2018', 'end_date': 'March 1, 2018', 'blog_name': 'reuters_all', 'num_sentences': '2', 'selected_categories': ['2018 Winter Olympics', 'Sports']}
2269
178
{'_id': ObjectId('5a95ee656b4a750da0a37a50'), 'topic_confidence': 0.5614833981256936, 'author': "Philip O'Connor", 'url': 'ht




Get Top Words
Start Loop
1/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
Te

LexRank
Luhn
KL
SumBasic
TextRank
150/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
151/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
152/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
153/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
154/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
155/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
156/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
157/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
158/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
159/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
160/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
161/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
162/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
163/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
164/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
165/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
166/178
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
167/178
LSA
Ed

127.0.0.1 - - [01/Mar/2018 00:50:32] "POST /get_summaries HTTP/1.1" 200 -


LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
