In [4]:
import sys
sys.path.insert(0, '../')
from FletcherLibrary import MongoQuery as MQ
from FletcherLibrary import TextSummarization as TS

# Try Text Summarization
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.evaluation import cosine_similarity, recall, precision, rouge, rouge_1

# Fetch the per-category counts for the sample date window and display them
# ordered by category name (dict keys are unique, so the key decides the order).
category_counts = TS.GetCategoryCounts(
    'reuters_all',
    'February 22, 2018',
    'February 27, 2018',
)
sorted(category_counts.items())

[('2014 Winter Olympics', 64),
 ('2018 Winter Olympics', 190),
 ('Afghanistan / Pakistan', 24),
 ('Africa', 71),
 ('Aircraft', 59),
 ('Automotive', 205),
 ('Brexit / European Union', 109),
 ('Business', 224),
 ('Chinese Trade', 80),
 ('Crime / Security', 142),
 ('Economy', 101),
 ('Energy / Environmental', 131),
 ('European Politics / Government', 56),
 ('Financial / Cryptocurrency', 294),
 ('German Politics', 19),
 ('Immigration', 98),
 ('Iran / Yemen', 26),
 ('Israel / Palestine', 47),
 ('Medicine', 219),
 ('North America', 85),
 ('North Korea', 97),
 ('Politics', 121),
 ('Saudi Arabia', 33),
 ('South America', 36),
 ('Sports', 85),
 ('Stock Market', 80),
 ('Technology', 94),
 ('Tennis', 40),
 ('Turkey / Syria', 85)]

In [None]:
import sys
sys.path.insert(0, '../')

from flask import Flask, render_template, request, jsonify
from FletcherLibrary import MongoQuery as MQ
from FletcherLibrary import TopicModel as TM
from FletcherLibrary import TextSummarization as TS

import numpy as np
import pandas as pd
import pickle

# Try Text Summarization
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.evaluation import cosine_similarity, recall, precision, rouge, rouge_1


app = Flask(__name__)

# Homepage
@app.route("/")
def viz_page():
    """Return the text of index.html as the response for the root URL.

    The file name is relative, so it is resolved against the process's
    current working directory.
    """
    with open("index.html", 'r') as page_source:
        html = page_source.read()
    return html

@app.route("/get_categories", methods=["POST"])
def get_categories():
    """Respond with the category counts for a collection and date range.

    The JSON request body must provide 'blog_name', 'start_date' and
    'end_date'; they are forwarded unchanged to TS.GetCategoryCounts.
    The reply is the resulting (category, count) pairs, sorted, as JSON.
    The request parameters and the raw counts are echoed to stdout.
    """
    payload = request.json
    collection_name = payload['blog_name']
    start_date, end_date = payload['start_date'], payload['end_date']
    print(collection_name, start_date, end_date)

    category_counts = TS.GetCategoryCounts(collection_name, start_date, end_date)
    print(category_counts)

    ordered_pairs = sorted(category_counts.items())
    return jsonify(ordered_pairs)

def _select_articles(articles, topics, min_confidence=0.3):
    """Return the articles whose 'topic' is in `topics` with enough confidence.

    Articles without a 'topic' key are skipped. An article is kept only when
    its 'topic_confidence' is strictly greater than `min_confidence`.
    """
    return [
        article for article in articles
        if 'topic' in article
        and article['topic'] in topics
        and article['topic_confidence'] > min_confidence
    ]


@app.route('/get_summaries', methods=['POST'])
def get_summaries():
    """Summarize every matching article with each sumy summarizer.

    Reads from the JSON request body:
      'blog_name'            collection name; also the prefix of the pickle files
      'start_date'/'end_date' passed unchanged to TS.FetchDataInDateRange
      'selected_categories'  topics an article must belong to
      'num_sentences'        number of sentences per summary (converted to int)

    Each selected article dict is updated in place with a 'summaries' entry
    mapping summarizer name -> summary text, and its '_id' is blanked.
    Returns result.html rendered with the list of selected articles, which is
    empty when nothing matched.
    """
    inputs = request.json
    print(inputs)
    collection_name = inputs['blog_name']
    start_date = inputs['start_date']
    end_date = inputs['end_date']
    data = TS.FetchDataInDateRange(collection_name, start_date, end_date)
    topic_list = inputs['selected_categories']
    sentences_count = int(inputs['num_sentences'])
    language = 'english'

    filt_data = _select_articles(data, topic_list)
    print(len(data))
    print(len(filt_data))
    if not filt_data:
        # Nothing matched: indexing filt_data[0] below would raise IndexError
        # and surface as an HTTP 500, so render an empty result instead.
        return render_template("result.html", result=filt_data)
    print(filt_data[0])

    print('Load Data')
    # NOTE(review): collection_name comes straight from the request and is used
    # to build the pickle paths. pickle.load executes arbitrary code from the
    # file, so the name should be validated against known collections.
    # Only the two pickles that are actually used below are loaded.
    with open(collection_name+'_count_vectorizer_tfidf.pkl', 'rb') as pkl:
        count_vectorizer = pickle.load(pkl)
    with open(collection_name+'_NMF.pkl', 'rb') as pkl:
        model = pickle.load(pkl)

    print('Initial Setup')
    print('Load Stemmer')
    stemmer = Stemmer(language)
    print('Load Summarizers')
    summarizers = [
        ('LSA', LsaSummarizer(stemmer)),
        ('Edmundson', EdmundsonSummarizer(stemmer)),
        ('LexRank', LexRankSummarizer(stemmer)),
        ('Luhn', LuhnSummarizer(stemmer)),
        ('KL', KLSummarizer(stemmer)),
        ('SumBasic', SumBasicSummarizer(stemmer)),
        ('TextRank', TextRankSummarizer(stemmer)),
    ]
    # The stop words and tokenizer do not depend on the article, so build
    # them once instead of once per summarizer per article.
    stop_words = get_stop_words(language)
    for _, summarizer in summarizers:
        summarizer.stop_words = stop_words
    tokenizer = Tokenizer(language)
    print('Get Top Words')
    # Indexed below by topic position; presumably one word list per model
    # topic (TM.GetTopWords is defined outside this file).
    top_n_words = TM.GetTopWords(model, 10, count_vectorizer)

    print('Start Loop')
    for i, article in enumerate(filt_data):
        print('%d/%d' % (i+1, len(filt_data)))
        content = article['content']
        x = count_vectorizer.transform([content])
        W = model.transform(x.toarray())[0]
        parser = PlaintextParser(content, tokenizer)

        # Words from topics with exactly zero weight for this article are
        # given to the Edmundson summarizer as stigma words.
        stigma_words = []
        for j, weight in enumerate(W):
            if weight == 0:
                stigma_words += top_n_words[j][:5]

        title = article['title']
        all_summaries = {}
        for sum_name, summarizer in summarizers:
            print(sum_name)
            if isinstance(summarizer, EdmundsonSummarizer):
                summarizer.bonus_words = title.lower().split(' ') + \
                                         top_n_words[W.argmax()]
                # sumy rejects an empty stigma set with ValueError; fall back
                # to the same [''] placeholder already used for null_words.
                summarizer.stigma_words = stigma_words or ['']
                summarizer.null_words = ['']
            all_summaries[sum_name] = ' '.join(
                str(sentence)
                for sentence in summarizer(parser.document, sentences_count)
            )
        article['summaries'] = all_summaries
        # Blank the stored '_id' before handing the records to the template
        # (presumably because the database id object is not wanted there).
        article['_id'] = ''
    return render_template("result.html", result=filt_data)
    
#     return jsonify(complete_summaries)
    
if __name__ == '__main__':
    # Start the development server exactly once. app.run() blocks until the
    # server is stopped, so a second app.run(debug=True) placed after it would
    # only start once the first server was interrupted, and would then launch
    # another server with the interactive debugger enabled.
    # NOTE(review): host='0.0.0.0' listens on all interfaces; keep debug off
    # while that is the case.
    app.run(host='0.0.0.0')

 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [28/Feb/2018 22:10:22] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:10:23] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 26, 2018 February 26, 2018
Counter({'Financial / Cryptocurrency': 73, 'Medicine': 39, 'Technology': 37, 'Business': 35, 'Energy / Environmental': 34, 'Automotive': 34, 'Crime / Security': 34, 'Politics': 29, 'Africa': 26, 'Immigration': 23, 'Aircraft': 21, 'Sports': 20, 'Economy': 20, 'Chinese Trade': 18, 'North Korea': 17, 'Turkey / Syria': 17, 'Stock Market': 16, 'Brexit / European Union': 16, 'North America': 15, '2018 Winter Olympics': 13, 'South America': 10, 'European Politics / Government': 9, 'Tennis': 6, '2014 Winter Olympics': 5, 'Iran / Yemen': 4, 'German Politics': 3, 'Israel / Palestine': 3, 'Saudi Arabia': 3, 'Afghanistan / Pakistan': 2})
{'start_date': 'February 26, 2018', 'end_date': 'February 26, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['Africa']}
582
18
{'_id': ObjectId('5a95ee656b4a750da0a37f30'), 'topic_confidence': 0.38342203228854815, 'author': 'Reuters Editorial', 'url': 'https://af.reuters.com/article/



Start Loop
1/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/18
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/18


127.0.0.1 - - [28/Feb/2018 22:10:34] "POST /get_summaries HTTP/1.1" 200 -


LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank


127.0.0.1 - - [28/Feb/2018 22:11:28] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:11:35] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 21, 2018 February 24, 2018
Counter({'Financial / Cryptocurrency': 203, '2018 Winter Olympics': 187, 'Medicine': 180, 'Business': 172, 'Automotive': 164, 'Crime / Security': 98, 'Energy / Environmental': 97, 'Brexit / European Union': 82, 'North Korea': 82, 'Stock Market': 79, 'Politics': 75, 'Immigration': 74, 'Economy': 72, 'North America': 71, '2014 Winter Olympics': 60, 'Chinese Trade': 55, 'Sports': 54, 'Turkey / Syria': 49, 'European Politics / Government': 43, 'Aircraft': 38, 'Technology': 38, 'Africa': 37, 'Israel / Palestine': 32, 'Saudi Arabia': 30, 'South America': 23, 'Tennis': 22, 'Iran / Yemen': 17, 'Afghanistan / Pakistan': 15, 'German Politics': 12})
{'start_date': 'February 21, 2018', 'end_date': 'February 24, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['Crime / Security']}
2161
74
{'_id': ObjectId('5a95ee656b4a750da0a37526'), 'topic_confidence': 0.30393153015185165, 'author': 'Reuters Editorial', 'url': 'https:/



Load Summarizers
Get Top Words
Start Loop
1/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/74
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
Text

127.0.0.1 - - [28/Feb/2018 22:11:58] "POST /get_summaries HTTP/1.1" 200 -


SumBasic
TextRank


127.0.0.1 - - [28/Feb/2018 22:14:08] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:14:56] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:15:03] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:15:26] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:15:35] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:15:43] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 22, 2018 February 28, 2018
Counter({'Financial / Cryptocurrency': 294, 'Business': 224, 'Medicine': 219, 'Automotive': 205, '2018 Winter Olympics': 190, 'Crime / Security': 142, 'Energy / Environmental': 131, 'Politics': 121, 'Brexit / European Union': 109, 'Economy': 101, 'Immigration': 98, 'North Korea': 97, 'Technology': 94, 'North America': 85, 'Turkey / Syria': 85, 'Sports': 85, 'Chinese Trade': 80, 'Stock Market': 80, 'Africa': 71, '2014 Winter Olympics': 64, 'Aircraft': 59, 'European Politics / Government': 56, 'Israel / Palestine': 47, 'Tennis': 40, 'South America': 36, 'Saudi Arabia': 33, 'Iran / Yemen': 26, 'Afghanistan / Pakistan': 24, 'German Politics': 19})
{'start_date': 'February 22, 2018', 'end_date': 'February 28, 2018', 'blog_name': 'reuters_all', 'num_sentences': '2', 'selected_categories': ['Automotive']}
2915
118
{'_id': ObjectId('5a95ee656b4a750da0a377a3'), 'topic_confidence': 0.4028111416327527, 'author': 'Cynthia Kim', 'url': 'https://in.reu



Initial Setup
Load Stemmer
Load Summarizers
Get Top Words
Start Loop
1/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/118
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/11

127.0.0.1 - - [28/Feb/2018 22:16:22] "POST /get_summaries HTTP/1.1" 200 -


Luhn
KL
SumBasic
TextRank


127.0.0.1 - - [28/Feb/2018 22:17:54] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:18:02] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 27, 2018 February 27, 2018
Counter({'Financial / Cryptocurrency': 69, 'Automotive': 44, 'Business': 43, 'Brexit / European Union': 37, 'Medicine': 33, 'Politics': 31, 'Energy / Environmental': 26, 'Technology': 26, 'Immigration': 23, 'Stock Market': 21, 'Aircraft': 21, 'Economy': 21, 'Chinese Trade': 18, 'North America': 18, 'Crime / Security': 17, 'Africa': 16, 'Sports': 15, 'Turkey / Syria': 14, '2018 Winter Olympics': 12, 'Israel / Palestine': 12, 'European Politics / Government': 10, 'North Korea': 9, 'South America': 9, 'Saudi Arabia': 7, 'Tennis': 6, 'Afghanistan / Pakistan': 4, '2014 Winter Olympics': 2, 'Iran / Yemen': 1})
{'start_date': 'February 27, 2018', 'end_date': 'February 27, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['2018 Winter Olympics', 'Sports']}
565
15
{'_id': ObjectId('5a95ee656b4a750da0a381b8'), 'topic_confidence': 0.5097600488534383, 'author': 'Kiyoshi Takenaka', 'url': 'https://www.reuters.com/article



Start Loop
1/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/15


127.0.0.1 - - [28/Feb/2018 22:18:18] "POST /get_summaries HTTP/1.1" 200 -


LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank


127.0.0.1 - - [28/Feb/2018 22:19:08] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:19:16] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 26, 2018 February 26, 2018
Counter({'Financial / Cryptocurrency': 73, 'Medicine': 39, 'Technology': 37, 'Business': 35, 'Energy / Environmental': 34, 'Automotive': 34, 'Crime / Security': 34, 'Politics': 29, 'Africa': 26, 'Immigration': 23, 'Aircraft': 21, 'Sports': 20, 'Economy': 20, 'Chinese Trade': 18, 'North Korea': 17, 'Turkey / Syria': 17, 'Stock Market': 16, 'Brexit / European Union': 16, 'North America': 15, '2018 Winter Olympics': 13, 'South America': 10, 'European Politics / Government': 9, 'Tennis': 6, '2014 Winter Olympics': 5, 'Iran / Yemen': 4, 'German Politics': 3, 'Israel / Palestine': 3, 'Saudi Arabia': 3, 'Afghanistan / Pakistan': 2})


127.0.0.1 - - [28/Feb/2018 22:19:21] "POST /get_categories HTTP/1.1" 200 -


reuters_all January 9, 2018 January 9, 2018
Counter({'Financial / Cryptocurrency': 59, 'Automotive': 42, 'Medicine': 38, 'North America': 38, 'Energy / Environmental': 35, 'Politics': 33, 'Stock Market': 27, 'Technology': 26, 'Business': 26, 'Immigration': 24, 'Economy': 24, 'Chinese Trade': 20, 'Brexit / European Union': 18, 'European Politics / Government': 18, '2018 Winter Olympics': 16, 'Aircraft': 16, 'Crime / Security': 14, 'Israel / Palestine': 14, 'Tennis': 13, 'North Korea': 12, 'Turkey / Syria': 8, 'Africa': 7, '2014 Winter Olympics': 7, 'Iran / Yemen': 7, 'Saudi Arabia': 6, 'Sports': 6, 'Afghanistan / Pakistan': 4, 'South America': 4, 'German Politics': 1})
{'start_date': 'January 9, 2018', 'end_date': 'January 9, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['2018 Winter Olympics']}
563
15
{'_id': ObjectId('5a95ee656b4a750da0a31c53'), 'topic_confidence': 0.6211508739403049, 'author': 'Reuters Editorial', 'url': 'https://ca.reuters.com/arti



Start Loop
1/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/15
LSA


127.0.0.1 - - [28/Feb/2018 22:19:34] "POST /get_summaries HTTP/1.1" 200 -


Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/15
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
