In [None]:
import os
from pprint import pprint
import pymongo
from pymongo import MongoClient    
import pyspark
from pyspark.ml import PipelineModel
import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
import time
import re
from zipfile import ZipFile
from pyspark.sql import dataframe
import json

# Initialize for View

In [None]:
# Spark configuration: fetch the MongoDB connector, spark-xml and Spark NLP jars
# at start-up and give the driver 8 GB of memory.
conf = pyspark.SparkConf()
conf.set('spark.jars.packages', 
         "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,com.databricks:spark-xml_2.12:0.18.0,com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.3")
conf.set('spark.driver.memory','8g')

# getOrCreate() reuses a SparkContext that is already running in this kernel, so
# re-running the cell no longer fails with "Cannot run multiple SparkContexts at
# once". When a context already exists, `conf` is not re-applied to it; restart
# the kernel to change the settings above.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = pyspark.SQLContext.getOrCreate(sc)
spark

In [None]:
# Set environment variables
# `username` / `password` are placeholders: replace them with real MongoDB
# credentials before running, and never commit real secrets in this cell.
# NOTE(review): presumably these variables are read when the database connection
# is built from config.json — confirm against ingest.database.
%env MONGODB_USER=username
%env MONGODB_PASS=password

In [None]:
# `sys` is not imported by the notebook's import cell; import it here so this
# cell runs on a fresh kernel instead of raising NameError.
import sys

# Put the project's `src` directory (under $SCRATCH) first on the import path so
# the `ingest` and `processing` packages imported below can be found.
# Raises KeyError if the SCRATCH environment variable is not set.
sys.path.insert(0, f"{os.environ['SCRATCH']}/youtube-for-newspapers/src")

In [None]:
# import database and pipeline from src
from ingest.database import NoSQLDatabase
from processing.nlp_pipeline import KeywordPipeline

In [None]:
# config reader
def read_config(fin):
    """Load a JSON configuration file.

    Args:
        fin: Path of the .json file to read.

    Returns:
        The parsed JSON content (whatever `json.load` produces for the file).
    """
    with open(fin) as handle:
        parsed = json.load(handle)
    return parsed

# create db object
# The project-wide JSON configuration is read from $SCRATCH (KeyError if the
# SCRATCH environment variable is not set).
database_config = read_config(f"{os.environ['SCRATCH']}/youtube-for-newspapers/config.json")
# NOTE(review): an instance is created and then immediately replaced by the
# result of from_config(); the pipeline below calls from_config on the class
# instead — confirm whether NoSQLDatabase.from_config is a classmethod.
db =  NoSQLDatabase()
db = db.from_config(database_config["nosql_database"])

# create pipeline
# Configured from the "processing_pipeline" -> "keywords" section of the config;
# setup_pipeline() must have run before `kw_pipeline` is handed to db.query_keyword.
# NOTE(review): the ordering requirement is assumed from usage — confirm in
# processing.nlp_pipeline.
kw_pipeline = KeywordPipeline.from_config(database_config["processing_pipeline"]["keywords"])
kw_pipeline.setup_pipeline()

# Visualizations

In [None]:
%%html
<style>
/* Styles for the search UI. The class names below are attached to widgets with
   add_class() in the layout cell. */

/* Header and footer bars. */
.box_style{
    width:100%;
    border : None;
    height: auto;
    background-color:#EEE;
    /* NOTE(review): `color=white` is not a valid declaration (CSS needs `:`), so
       browsers ignore it. Correcting it here would put white text on the light
       #EEE background — decide on the intended colour before fixing. */
    color=white;
}
/* Left sidebar holding the keyword input and the search button. */
.side_bar{
    width:100%;
    border: None;
    height: auto;
    background-color:#66b2b2;
    /* NOTE(review): same invalid `color=white` declaration as above; ignored. */
    color=white;
}

/* Forces widget description labels to white.
   NOTE(review): presumably targets the label class generated by ipywidgets —
   it is not attached anywhere in this notebook; confirm. */
.widget-label {
    color: white !important;
}

/* Rounded corners for the keyword text box. */
.widget_text {
    border-radius: 8px;
}
/* Vertical spacing above the search button. */
.button_style {
    margin-top: 15px;
}
</style>

In [None]:
import ipywidgets as ipw
from IPython.display import HTML, display, clear_output, Javascript
from bson.objectid import ObjectId
import re
from functools import partial

def show_document(header, text):
    """Show an article in a modal dialog of the notebook front end.

    The dialog is opened through the front end's `base/js/dialog` module loaded
    with `require`, so it only works where that module is available.
    NOTE(review): this appears to be the classic Notebook API — confirm it is
    present in the front end you deploy to.

    Args:
        header: Dialog title. Converted with `str()` and shown as the title.
        text: HTML markup for the dialog body; it is wrapped in a `<div>` and
            assigned to the modal body's `innerHTML`.

    Both values are encoded with `json.dumps` so that quotes, backslashes and
    line breaks inside them produce valid JavaScript string literals instead of
    breaking the generated script (e.g. a title containing an apostrophe).
    """
    js_title = json.dumps(str(header))
    js_body = json.dumps("<div>{}</div>".format(text))

    display(Javascript("""
        require(
            ["base/js/dialog"], 
            function(dialog) {
                dialog.modal({
                    title: %s,
                    body: 'replace_me',
                    buttons: {
                        'Done': {}
                    }
                });
                // The dialog body is created with a text placeholder; wait for the
                // modal to render, then swap the placeholder for the real HTML.
                setTimeout(function(){
                const found_modals = document.getElementsByClassName("modal-body");
                for( const fm of found_modals){
                    if(fm.innerText === 'replace_me'){
                        fm.innerHTML = %s;
                    }
                }
                }, 300)
            }
        );
        """ % (js_title, js_body)))

def view_doc_clicked(title, rendered_text, _b):
    """Click handler for a result's "View" button.

    `title` and `rendered_text` are bound in advance by the caller; `_b` is the
    extra argument supplied by the click event and is ignored.
    """
    show_document(header=title, text=rendered_text)


# Dedicated output area for alerts; it is displayed once here and every popup()
# call renders into it through the capture decorator below.
notify_output = ipw.Output()
display(notify_output)

@notify_output.capture()
def popup(text):
    """Show `text` in a browser alert box.

    The alert is produced by displaying a small `<script>` element inside
    `notify_output`; `clear_output()` is called first so a previous alert script
    is not kept in that output area.

    Args:
        text: Message to show. Converted with `str()`.
    """
    clear_output()
    # json.dumps yields a correctly quoted JavaScript string literal, so quotes
    # or line breaks in the message cannot break the script. "</" is split so the
    # message can never close the surrounding <script> element early.
    message = json.dumps(str(text)).replace("</", "<\\/")
    display(HTML("<script>alert({});</script>".format(message)))

def create_result_option(record_id, record_title, ctx_text, publisher, pub_date, object_id, rendered_text):
    """Build one row of the search-result list.

    The row is an HBox with three cells: the running number, a column of text
    details, and a cell that holds the "View" button.

    Args:
        record_id: Value shown as the row number (followed by a dot).
        record_title: Article title; when non-empty it is shown and a "View"
            button is added that opens `rendered_text` under this title.
        ctx_text: HTML snippet shown under the title; skipped when empty.
        publisher: Publisher name; skipped when empty.
        pub_date: Publication date text; skipped when empty.
        object_id: Identifier used (as a string) for the button tooltip.
        rendered_text: Full highlighted article HTML passed to the viewer.

    Returns:
        The `ipw.HBox` for the row.
    """
    text_layout = ipw.Layout(width='90%')
    cell_margin = '5px 5px 5px 5px'

    # Outer cells: the number on the left, the (initially empty) action cell on the right.
    number_cell = ipw.HBox([ipw.HTML(f"{record_id}.")], layout=ipw.Layout(width='3%', margin=cell_margin))
    action_cell = ipw.HBox(children=[], layout=ipw.Layout(width='10%', margin=cell_margin))

    details = []
    if len(record_title) > 0:
        # Only titled records can be opened, so the button is tied to the title.
        view_button = ipw.Button(description="View", tooltip=str(object_id))
        view_button.on_click(partial(view_doc_clicked, record_title, rendered_text))
        action_cell.children = [view_button]
        details.append(ipw.HTML(f"<b><font color='#1b75d0'; size=3px>{record_title}", layout=text_layout))
    if len(ctx_text) > 0:
        details.append(ipw.HTML(ctx_text, layout=text_layout))
    if len(publisher) > 0:
        details.append(ipw.HTML(f"<font color='grey'><b>Publisher:</b> {publisher}", layout=text_layout))
    if len(pub_date) > 0:
        details.append(ipw.HTML(f"<font color='grey'><b>Date:</b> {pub_date}", layout=text_layout))

    details_cell = ipw.VBox(children=details, layout=ipw.Layout(width='90%'))
    return ipw.HBox(
        children=[number_cell, details_cell, action_cell],
        layout=ipw.Layout(
            display='flex',
            flex_flow='row',
            align_items='stretch',
            border_bottom='solid 2px lightgrey',
            padding='5px',
        ),
    )

In [None]:
def clean_query(query, stopwords):
    """Normalise a raw user query into a space-separated keyword string.

    The query is stripped and lower-cased, every character outside a-z (digits,
    punctuation, whitespace) is treated as a word separator, and words found in
    `stopwords` are dropped.

    Args:
        query: Text typed by the user.
        stopwords: Collection of words to remove.

    Returns:
        The remaining words joined by single spaces (an empty string when
        nothing is left) — a string, not a list.
    """
    lowered = query.strip().lower()
    tokens = re.sub("[^a-z]", " ", lowered).split()
    kept = [token for token in tokens if token not in stopwords]
    return " ".join(kept)

def clean_text(text):
    """Reduce `text` to its ASCII letters and digits.

    Every run of other characters (punctuation, whitespace, non-ASCII) becomes a
    single space, and the result has no leading or trailing space.
    """
    alphanumeric_runs = re.findall("[0-9a-zA-Z]+", text)
    return " ".join(alphanumeric_runs)

def article_keyword_highlighter(fulltext, keyword, chunk_start=None, chunk_end=None):
    """Wrap every match of `keyword` in a yellow-background `<span>`.

    Args:
        fulltext: Text to search.
        keyword: Regular expression to highlight; matched case-insensitively.
        chunk_start: Start index of the slice of `fulltext` to process
            (defaults to 0).
        chunk_end: End index of the slice to process (defaults to the end of
            `fulltext`). An explicit 0 is honoured rather than being treated as
            "not given".

    Returns:
        The selected slice of `fulltext` with each non-empty match wrapped in
        `<span style='background-color: yellow;'>…</span>`. When `keyword` is
        empty the slice is returned unchanged.
    """
    start = 0 if chunk_start is None else chunk_start
    end = len(fulltext) if chunk_end is None else chunk_end
    block = fulltext[start:end]

    # An empty pattern matches between every character and would fill the output
    # with empty highlight spans, so there is nothing to highlight.
    if not keyword:
        return block

    def _mark(match):
        hit = match.group(0)
        # Zero-length matches (e.g. from an empty alternative) carry no text.
        if not hit:
            return hit
        return f"<span style='background-color: yellow;'>{hit}</span>"

    return re.sub(keyword, _mark, block, flags=re.IGNORECASE)

def word_highlighter(fulltext, keyword, ctx_length=100):
    """Return a highlighted excerpt around the first match of `keyword`.

    Args:
        fulltext: Text to search.
        keyword: Regular expression, matched case-insensitively.
        ctx_length: Number of characters of context kept on each side of the
            first match.

    Returns:
        The excerpt (clamped to the bounds of `fulltext`) with its matches
        highlighted by `article_keyword_highlighter`, or the whole `fulltext`
        unchanged when there is no match.
    """
    first_hit = re.search(keyword, fulltext, re.IGNORECASE)
    if first_hit is None:
        return fulltext

    hit_start, hit_end = first_hit.span()
    window_start = max(hit_start - ctx_length, 0)
    window_end = min(hit_end + ctx_length, len(fulltext))
    return article_keyword_highlighter(fulltext, keyword, window_start, window_end)

def search(query, num_articles=5):
    """Run a keyword search and build the result rows for the UI.

    The query is reduced to keywords with `clean_query` (using Spark NLP's
    default stop-word list), matching document ids are obtained from
    `db.query_keyword`, and the full documents are then fetched from
    `db.collection`. Each document becomes one row made by
    `create_result_option`, with the keywords highlighted in both the excerpt
    and the full text.

    Args:
        query: Text typed by the user.
        num_articles: NOTE(review): accepted but not applied — every matching
            document is returned. Applying it would change what the UI shows,
            so confirm the intended limit before wiring it in.

    Returns:
        A list of row widgets. When the query contains no usable keyword, or no
        document matches, the list holds a single "No articles found" row.

    NOTE(review): documents are listed in the order returned by
    `collection.find`, which need not match the order of `query_keyword` —
    confirm whether a ranking should be preserved.
    """
    def _no_results():
        return [create_result_option("#", '', "No articles found with this query", "", '', '', '')]

    stopwords = StopWordsCleaner().getStopWords()
    keywords = clean_query(query, stopwords)
    # A query made only of stop words or non-letters leaves nothing to search
    # for; stop here instead of sending an empty query and an empty regex on.
    if not keywords:
        return _no_results()
    keywords_regex = "|".join(keywords.split(" "))

    documents_cursor = db.query_keyword(keywords, kw_pipeline, spark)
    document_ids = [doc["_id"] for doc in documents_cursor]
    articles = list(db.collection.find({"_id": {"$in": document_ids}}))
    if not articles:
        return _no_results()

    rows = []
    for position, article in enumerate(articles, start=1):
        text = clean_text(article["FullText"])
        excerpt = word_highlighter(text, keywords_regex)
        rendered_text = article_keyword_highlighter(text, keywords_regex)
        rows.append(create_result_option(position,
                                         article["RecordTitle"],
                                         excerpt,
                                         article["Publisher"],
                                         article["AlphaPubDate"],
                                         article['_id'],
                                         rendered_text))
    return rows

In [None]:
# Layout shared by the title and footer text.
hor_layout = ipw.Layout(align_content='stretch', margin='0.1% 1% 0.1% 2% ', width='100%')

# Header and footer bars, both styled by the `box_style` CSS class.
app_title = ipw.HTML('<h1> YouTube for Newspapers </h1>', layout=hor_layout)
app_footer = ipw.HTML('<p> Apache Spark, HDFS, MongoDB, NLP</p>', layout=hor_layout)

headerBox = ipw.HBox([app_title])
headerBox.add_class('box_style')
footerBox = ipw.HBox([app_footer])
footerBox.add_class('box_style')

# Keyword input shown in the sidebar.
txt_keyword = ipw.Text(
    placeholder='Enter a Keyword',
    description='Keyword:',
    disabled=False,
    layout=ipw.Layout(width='auto', margin='15px 10px 5px 2px'),
)
txt_keyword.add_class('widget_text')

# Search button, centred in its own container.
btnSearch = ipw.Button(description="Search!", icon='search')
btnSearch.add_class('button_style')
box_layout = ipw.Layout(display='flex', flex_flow='column', align_items='center', width='100%')
btnContainer = ipw.HBox(children=[btnSearch], layout=box_layout)


def on_button_clicked(b):
    """Click handler for the search button.

    Shows an alert when the keyword box is blank; otherwise runs the search on
    the lower-cased input and puts the resulting rows into `main_panel`.
    `b` (the clicked button) is not used.
    """
    typed = txt_keyword.value
    if not typed.strip():
        popup("Enter search Key.")
        return
    main_panel.children = search(typed.lower())

# Run a search whenever the button is clicked.
btnSearch.on_click(on_button_clicked)

# Sidebar with the query controls; the (initially empty) centre panel receives
# the result rows.
side_bar = ipw.VBox([txt_keyword, btnContainer])
side_bar.add_class('side_bar')
main_panel = ipw.VBox([])

# The layout is the cell's last expression, so it is rendered as the cell output.
ipw.AppLayout(
    header=headerBox,
    left_sidebar=side_bar,
    center=main_panel,
    right_sidebar=None,
    footer=footerBox,
    pane_widths=[2, 7, 0],
    pane_heights=[1, 9, '40px'],
)