In [1]:
import sys
sys.path.insert(0, '../')
from FletcherLibrary import MongoQuery as MQ
from FletcherLibrary import TextSummarization as TS

# Try Text Summarization
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.evaluation import cosine_similarity, recall, precision, rouge, rouge_1

category_counts = TS.GetCategoryCounts('reuters_all', 'February 22, 2018', 'February 27, 2018')
sorted([(k, v) for k, v in category_counts.items()])

[('2014 Winter Olympics', 64),
 ('2018 Winter Olympics', 190),
 ('Afghanistan / Pakistan', 24),
 ('Africa', 71),
 ('Aircraft', 59),
 ('Automotive', 205),
 ('Brexit / European Union', 109),
 ('Business', 224),
 ('Chinese Trade', 80),
 ('Crime / Security', 142),
 ('Economy', 101),
 ('Energy / Environmental', 131),
 ('European Politics / Government', 56),
 ('Financial / Cryptocurrency', 294),
 ('German Politics', 19),
 ('Immigration', 98),
 ('Iran / Yemen', 26),
 ('Israel / Palestine', 47),
 ('Medicine', 219),
 ('North America', 85),
 ('North Korea', 97),
 ('Politics', 121),
 ('Saudi Arabia', 33),
 ('South America', 36),
 ('Sports', 85),
 ('Stock Market', 80),
 ('Technology', 94),
 ('Tennis', 40),
 ('Turkey / Syria', 85)]

In [1]:
import sys
sys.path.insert(0, '../')

from flask import Flask, render_template, request, jsonify
from FletcherLibrary import MongoQuery as MQ
from FletcherLibrary import TopicModel as TM
from FletcherLibrary import TextSummarization as TS

import numpy as np
import pandas as pd
import pickle

# Try Text Summarization
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.evaluation import cosine_similarity, recall, precision, rouge, rouge_1


app = Flask(__name__)

# Homepage
@app.route("/")
def viz_page():
    with open("index.html", 'r') as viz_file:
        return viz_file.read()

@app.route("/get_categories", methods=["POST"])
def get_categories():
    inputs = request.json
    collection_name = inputs['blog_name']
    start_date = inputs['start_date']
    end_date = inputs['end_date']
    print(collection_name, start_date, end_date)
    category_counts = TS.GetCategoryCounts(collection_name, start_date, end_date)
    print(category_counts)
    return jsonify(sorted([(k, v) for k, v in category_counts.items()]))

@app.route('/get_summaries', methods=['POST'])
def get_summaries():
    inputs = request.json
    print(inputs)
    collection_name = inputs['blog_name']
    start_date = inputs['start_date']
    end_date = inputs['end_date']
    data = TS.FetchDataInDateRange(collection_name, 
                                start_date, 
                                end_date)
    topic_list = inputs['selected_categories']
    SENTENCES_COUNT = int(inputs['num_sentences'])
    LANGUAGE = 'english'
    filt_data = []
    for d in data:
        if ('topic' in d.keys() and
           d['topic'] in topic_list and
           d['topic_confidence'] > 0.3):
            filt_data.append(d)
    print(len(data))
    print(len(filt_data))
    print(filt_data[0])
    
    print('Load Data')
    with open(collection_name+'_categories.pkl', 'rb') as pkl:
        category_names = pickle.load(pkl)
    with open(collection_name+'_count_vectorizer_tfidf.pkl', 'rb') as pkl:
        count_vectorizer = pickle.load(pkl)
    with open(collection_name+'_counts_tfidf.pkl', 'rb') as pkl:
        counts = pickle.load(pkl)
    with open(collection_name+'_NMF.pkl', 'rb') as pkl:
        model = pickle.load(pkl)
    
    print('Initial Setup')
    print('Load Stemmer')
    stemmer = Stemmer(LANGUAGE)
    print('Load Summarizers')
    summarizers = [LsaSummarizer(stemmer),
                   EdmundsonSummarizer(stemmer),
                   LexRankSummarizer(stemmer),
                   LuhnSummarizer(stemmer),
                   KLSummarizer(stemmer),
                   SumBasicSummarizer(stemmer),
                   TextRankSummarizer(stemmer)]
    summarizer_names = ['LSA', 'Edmundson', 'LexRank',
                        'Luhn', 'KL', 'SumBasic', 'TextRank']
    print('Get Top Words')
    top_n_words = TM.GetTopWords(model, 10, count_vectorizer)
    
    print('Start Loop')
    complete_summaries = filt_data
    for i, d in enumerate(filt_data):
        print('%d/%d' % (i+1, len(filt_data)))
        content = d['content']
        x = count_vectorizer.transform([content])
        W = model.transform(x.toarray())[0]
        parser = PlaintextParser(content, Tokenizer(LANGUAGE))
        stigma_words = []
        for j, elem in enumerate(W):
            if elem == 0:
                stigma_words += top_n_words[j][:5]
        title = d['title']
        all_summaries = {}
        for summarizer, sum_name in zip(summarizers, summarizer_names):
            print(sum_name)
            summarizer.stop_words = get_stop_words(LANGUAGE)
            if type(summarizer) == EdmundsonSummarizer:
                summarizer.bonus_words = title.lower().split(' ') + \
                                         top_n_words[W.argmax()]
                summarizer.stigma_words = stigma_words
                summarizer.null_words = ['']
            summary = []
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summary.append(str(sentence))
            all_summaries[sum_name] = ' '.join(summary)
        complete_summaries[i]['summaries'] = all_summaries
        complete_summaries[i]['_id'] = ''
    titles = []
    cs = []
    complete_summaries = sorted(complete_summaries, key=lambda k: k['datetime'])
    for summary in complete_summaries:
        if summary['title'] in titles or summary['title'].lower().startswith('brief'):
            continue
        titles.append(summary['title'])
        cs.append(summary)
    return render_template("result.html", result=cs)
    
#     return jsonify(complete_summaries)
    
app.run(host='0.0.0.0')
app.run(debug=True)

 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [01/Mar/2018 11:46:36] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Mar/2018 11:46:38] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 27, 2018 February 27, 2018
Counter({'Financial / Cryptocurrency': 69, 'Automotive': 44, 'Business': 43, 'Brexit / European Union': 37, 'Medicine': 33, 'Politics': 31, 'Energy / Environmental': 26, 'Technology': 26, 'Immigration': 23, 'Stock Market': 21, 'Aircraft': 21, 'Economy': 21, 'Chinese Trade': 18, 'North America': 18, 'Crime / Security': 17, 'Africa': 16, 'Sports': 15, 'Turkey / Syria': 14, '2018 Winter Olympics': 12, 'Israel / Palestine': 12, 'European Politics / Government': 10, 'North Korea': 9, 'South America': 9, 'Saudi Arabia': 7, 'Tennis': 6, 'Afghanistan / Pakistan': 4, '2014 Winter Olympics': 2, 'Iran / Yemen': 1})
{'start_date': 'February 27, 2018', 'end_date': 'February 27, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['2018 Winter Olympics']}
565
7
{'_id': ObjectId('5a95ee656b4a750da0a381b8'), 'topic_confidence': 0.5097600488534383, 'author': 'Kiyoshi Takenaka', 'url': 'https://www.reuters.com/article/us-olympic



Start Loop
1/7
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/7
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/7
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/7
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/7
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank


127.0.0.1 - - [01/Mar/2018 11:46:42] "POST /get_summaries HTTP/1.1" 200 -


6/7
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/7
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank


127.0.0.1 - - [01/Mar/2018 11:46:45] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Mar/2018 11:46:52] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 18, 2018 February 24, 2018
Counter({'Financial / Cryptocurrency': 319, '2018 Winter Olympics': 297, 'Business': 289, 'Medicine': 237, 'Automotive': 217, 'Energy / Environmental': 154, 'Crime / Security': 152, 'Stock Market': 122, 'North Korea': 111, 'Politics': 110, 'North America': 108, 'Brexit / European Union': 106, 'Economy': 104, 'Sports': 98, 'Immigration': 97, '2014 Winter Olympics': 92, 'Chinese Trade': 81, 'Turkey / Syria': 79, 'European Politics / Government': 69, 'Israel / Palestine': 62, 'Technology': 62, 'Saudi Arabia': 61, 'Africa': 55, 'Aircraft': 53, 'Tennis': 37, 'South America': 37, 'Iran / Yemen': 35, 'Afghanistan / Pakistan': 34, 'German Politics': 18})
{'start_date': 'February 18, 2018', 'end_date': 'February 24, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['Automotive']}
3296
120
{'_id': ObjectId('5a95ee656b4a750da0a37020'), 'topic_confidence': 0.7296510001287516, 'author': 'Reuters Editorial', 'url': 'https



Initial Setup
Load Stemmer
Load Summarizers
Get Top Words
Start Loop
1/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/120
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/12

127.0.0.1 - - [01/Mar/2018 11:47:21] "POST /get_summaries HTTP/1.1" 200 -


LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank


127.0.0.1 - - [01/Mar/2018 12:02:04] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Mar/2018 12:02:08] "POST /get_categories HTTP/1.1" 200 -


reuters_all February 27, 2018 February 27, 2018
Counter({'Financial / Cryptocurrency': 69, 'Automotive': 44, 'Business': 43, 'Brexit / European Union': 37, 'Medicine': 33, 'Politics': 31, 'Energy / Environmental': 26, 'Technology': 26, 'Immigration': 23, 'Stock Market': 21, 'Aircraft': 21, 'Economy': 21, 'Chinese Trade': 18, 'North America': 18, 'Crime / Security': 17, 'Africa': 16, 'Sports': 15, 'Turkey / Syria': 14, '2018 Winter Olympics': 12, 'Israel / Palestine': 12, 'European Politics / Government': 10, 'North Korea': 9, 'South America': 9, 'Saudi Arabia': 7, 'Tennis': 6, 'Afghanistan / Pakistan': 4, '2014 Winter Olympics': 2, 'Iran / Yemen': 1})
{'start_date': 'February 27, 2018', 'end_date': 'February 27, 2018', 'blog_name': 'reuters_all', 'num_sentences': '3', 'selected_categories': ['Automotive']}
565
27
{'_id': ObjectId('5a95ee656b4a750da0a38189'), 'topic_confidence': 0.48650330131422315, 'author': 'Hyunjoo Jin', 'url': 'https://in.reuters.com/article/gm-southkorea/gm-korea-s



Start Loop
1/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
2/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
3/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
4/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
5/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
6/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
7/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
8/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
9/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
10/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
11/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
12/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
13/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
14/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
15/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
16/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
17/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
18/27
LSA
Edmundson
LexRank
Luhn
KL
SumBasic
TextRank
19/27
LSA
Edmundson
LexRan

127.0.0.1 - - [01/Mar/2018 12:02:19] "POST /get_summaries HTTP/1.1" 200 -
 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
