In [1]:
import os
from pprint import pprint
import pymongo
from pymongo import MongoClient    
import pyspark
from pyspark.ml import PipelineModel
import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
import time
import re
from zipfile import ZipFile

In [None]:
# Spark configuration: pull in the MongoDB connector, spark-xml and Spark NLP
# jars, and give the driver 8 GB of memory.
# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of
# SparkSession; kept as-is because downstream project code receives `spark`.
conf = (
    pyspark.SparkConf()
    .set(
        'spark.jars.packages',
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,com.databricks:spark-xml_2.12:0.18.0,com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.3",
    )
    .set('spark.driver.memory', '8g')
)
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.SQLContext.getOrCreate(sc)
spark

# Writing to DB

# Querying


In [None]:
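# Set the MongoDB credentials as environment variables (placeholder values: replace them locally and never commit real secrets)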
%env MONGODB_PASS=password
%env MONGODB_USER=username

In [5]:
# Make the project's `src` directory importable.
# `sys` is imported explicitly instead of relying on a star import to leak it,
# and "$USER" is expanded explicitly because Python does not expand shell
# variables inside string literals.
import os
import sys

sys.path.insert(0, os.path.expandvars("/scratch/$USER/youtube-for-newspapers/src"))

In [6]:
# import database and pipeline from src
from ingest.database import NoSQLDatabase
from processing.nlp_pipeline import KeywordPipeline
from pyspark.sql import dataframe
import json

In [7]:
def read_config(fin):
    """
    Read a .json config file and return its parsed content.

    Parameters
    ----------
    fin : str or path-like
        Location of the JSON file.

    Returns
    -------
    The deserialized JSON value (a dict for an object-rooted config).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale's
    # default encoding, which differs between machines.
    with open(fin, encoding="utf-8") as f:
        return json.load(f)

In [8]:
# Load the project configuration from $SCRATCH and build the database wrapper
# from its "nosql_database" section.
database_config = read_config(
    f"{os.environ['SCRATCH']}/youtube-for-newspapers/config.json"
)
db = NoSQLDatabase().from_config(database_config["nosql_database"])

# Build the keyword pipeline from its own config section, then initialise it.
kw_pipeline = KeywordPipeline.from_config(
    database_config["processing_pipeline"]["keywords"]
)
kw_pipeline.setup_pipeline()



In [12]:
# Sample query: ask the database wrapper for records matching the keyword
# "contest", handing it the keyword pipeline and the Spark handle.
res = db.query_keyword("contest", kw_pipeline, spark)

In [13]:
# Print every hit returned by the sample query.
# NOTE(review): if `res` is a database cursor it can only be consumed once,
# so re-running this cell alone would print nothing — confirm.
for doc in res:
    print(doc)

{'_id': ObjectId('66383f1b5867ae2d611f044a'), 'score': 16.72786954782567, 'keywords': ['contest']}


# Visualizations

In [None]:
%%html
<style>
/* Styles for the search UI; the widgets opt in to these classes through add_class(). */

/* Header and footer banners. */
.box_style{
    width:100%;
    border : None;
    height: auto;
    background-color:#EEE;
    /* NOTE(review): "color=white" is not a valid CSS declaration (a ":" is
       required between property and value), so it is ignored. Writing
       "color: white" here would put white text on the #EEE background. */
    color=white;
}
/* Left-hand panel holding the keyword box and the search button. */
.side_bar{
    width:100%;
    border: None;
    height: auto;
    background-color:#66b2b2;
    /* NOTE(review): same invalid "color=white" declaration as above; it has no effect. */
    color=white;
}

/* Forces label text to white; presumably targets the ipywidgets description labels (confirm). */
.widget-label {
    color: white !important;
}

/* Rounded corners for the keyword text box. */
.widget_text {
    border-radius: 8px;
}
/* Spacing above the search button. */
.button_style {
    margin-top: 15px;
}
</style>

In [None]:
import ipywidgets as ipw
from IPython.display import HTML, display, clear_output, Javascript
from bson.objectid import ObjectId
import re

def view_doc_clicked(b):
    """Click handler for a result row's "View" button.

    The button's tooltip holds the record's id as a string. The record is
    looked up in ``db.newspapers`` and, when found, its title and full text
    are passed to ``show_document``. Nothing happens for an empty tooltip or
    when no record matches.
    """
    object_id = b.tooltip
    if not len(object_id):
        return

    record = db.newspapers.find_one({"_id": ObjectId(object_id)})
    if record:
        show_document(record['RecordTitle'], record['FullText'])
    

def show_document(header, text):
    """Show a document in a modal dialog of the notebook front end.

    Parameters
    ----------
    header : str
        Dialog title (the record title).
    text : str
        Document body. Every character that is not an ASCII letter or digit
        is replaced by a space and runs of spaces are collapsed, so the body
        handed to the dialog cannot contain markup.

    Notes
    -----
    The dialog is created through the front end's ``base/js/dialog`` module
    loaded with ``require``; a front end without that module will not show it.
    """
    import json  # local import keeps this cell independent of cell order

    text = re.sub("[^0-9a-zA-Z]", " ", str(text or ""))
    text = re.sub(" +", " ", text).strip()

    # json.dumps produces correctly quoted and escaped JavaScript string
    # literals. Splicing the raw title between single quotes broke the script
    # for any title containing an apostrophe or a backslash.
    display(Javascript("""
        require(
            ["base/js/dialog"], 
            function(dialog) {
                dialog.modal({
                    title: %s,
                    body: %s,
                    buttons: {
                        'Done': {}
                    }
                });
            }
        );
        """ % (json.dumps(header), json.dumps(text))))


# Output widget that receives whatever popup() displays (popup is wrapped with
# notify_output.capture()); it is displayed here so that output can render.
notify_output = ipw.Output()
display(notify_output)

@notify_output.capture()
def popup(text):
    """Show ``text`` in a browser ``alert`` box.

    The output is captured by the ``notify_output`` widget; its previous
    content is cleared first.
    """
    import json  # local import keeps this cell independent of cell order

    clear_output()
    # Emit the message as a real JavaScript string literal so quotes and
    # backslashes cannot break the script. "<" is escaped as well so the
    # message can never close the surrounding <script> element.
    message = json.dumps(str(text)).replace("<", "\\u003c")
    display(HTML("<script>alert({});</script>".format(message)))

def create_result_option(record_id, record_title, ctx_text, publisher, pub_date, object_id):
    """Build one row of the search-result list.

    Parameters
    ----------
    record_id : int or str
        Label shown in the narrow left column, followed by a dot.
    record_title : str
        Record title. When non-empty, a "View" button is added whose tooltip
        carries ``str(object_id)`` for ``view_doc_clicked``.
    ctx_text : str
        Snippet shown under the title. It is inserted as HTML unchanged, so
        it may contain markup and must already be safe to render.
    publisher, pub_date : str
        Optional metadata lines; empty strings are skipped.
    object_id
        Identifier of the record, stored on the "View" button.

    Returns
    -------
    ipywidgets.HBox
        The row: label column, content column, button column.
    """
    from html import escape  # local import keeps this cell independent of cell order

    items_layout = ipw.Layout(width='90%')
    row_layout = ipw.Layout(display='flex', flex_flow='row', align_items='stretch', border_bottom='solid 2px lightgrey', padding='5px')
    sn_box = ipw.HBox([ipw.HTML(f"{record_id}.")], layout=ipw.Layout(width='3%', margin='5px 5px 5px 5px'))
    rn_box = ipw.HBox(children=[], layout=ipw.Layout(width='10%', margin='5px 5px 5px 5px'))

    children = []
    if len(record_title):
        btnView = ipw.Button(description="View", tooltip=str(object_id))
        btnView.on_click(view_doc_clicked)
        rn_box.children = [btnView]

        # Title, publisher and date come straight from the database: escape
        # them so "<" or "&" in a record cannot corrupt the rendered row.
        # Tags are closed explicitly instead of relying on browser recovery.
        children.append(ipw.HTML(
            f"<b><font color='#1b75d0' size='3'>{escape(str(record_title))}</font></b>",
            layout=items_layout))
    if len(ctx_text):
        children.append(ipw.HTML(ctx_text, layout=items_layout))
    if len(publisher):
        children.append(ipw.HTML(
            f"<font color='grey'><b>Publisher:</b> {escape(str(publisher))}</font>",
            layout=items_layout))
    if len(pub_date):
        children.append(ipw.HTML(
            f"<font color='grey'><b>Date:</b> {escape(str(pub_date))}</font>",
            layout=items_layout))

    row_content = ipw.VBox(children=children, layout=ipw.Layout(width='90%'))
    return ipw.HBox(children=[sn_box, row_content, rn_box], layout=row_layout)

In [None]:
def search_keyword(keyword, limit=None):
    """Find newspaper records tagged with ``keyword`` and build result rows.

    Parameters
    ----------
    keyword : str
        Keyword to look for; compared for equality with ``keywords.result``.
    limit : int, optional
        Maximum number of result rows. ``None`` (the default) returns every
        match.

    Returns
    -------
    list
        One widget row per matching keyword occurrence, numbered from 1, each
        built by ``create_result_option``. When nothing matches, the list
        holds a single "No Newspaper found" row.
    """
    # The results are streamed from the cursor instead of being packed into a
    # single `$facet` document: a `$facet` result is one BSON document and is
    # therefore capped at 16 MB, which full-text records can easily exceed.
    pipeline = [
        # Filter whole records first so only relevant ones are unwound.
        {'$match': {'keywords.result': keyword}},
        {'$unwind': '$keywords'},
        # After unwinding, keep only the entries for the requested keyword.
        {'$match': {'keywords.result': keyword}},
    ]
    if limit is not None:
        pipeline.append({'$limit': int(limit)})

    children = []
    for i, d in enumerate(db.newspapers.aggregate(pipeline), start=1):
        # Contextualize the keyword: highlight it with 90 characters of
        # surrounding text on each side.
        kw = d["keywords"]
        start = kw['begin'] or 0
        end = kw['end'] or 0
        ctx_text = word_highlighter(d.get("FullText") or "", start, end, 90)

        # Missing or null metadata is shown as absent rather than raising.
        children.append(create_result_option(
            i,
            d.get("RecordTitle") or "",
            ctx_text,
            d.get("Publisher") or "",
            d.get("AlphaPubDate") or "",
            d['_id'],
        ))

    if not children:
        children.append(create_result_option("#", '', "No Newspaper found with this keyword", "", '', ''))
    return children

def word_highlighter(fulltext, start, end, ctx_length=100):
    """Return an HTML snippet of ``fulltext`` with one span highlighted.

    Parameters
    ----------
    fulltext : str
        Text to excerpt.
    start, end : int
        Character offsets of the span to highlight; ``end`` is inclusive.
    ctx_length : int, optional
        Context length: the snippet starts ``ctx_length`` characters before
        ``start`` and stops at offset ``end + ctx_length``, both clipped to
        the bounds of the text.

    Returns
    -------
    str
        HTML with the span wrapped in a yellow-background ``<span>``. The
        text itself is HTML-escaped, so "<", ">" and "&" in the source render
        literally instead of being parsed as markup.
    """
    from html import escape  # local import keeps this cell independent of cell order

    tot_length = len(fulltext)
    chunk_start = max(start - ctx_length, 0)
    chunk_end = min(end + ctx_length, tot_length)

    before = escape(fulltext[chunk_start:start], quote=False)
    match = escape(fulltext[start:end + 1], quote=False)
    after = escape(fulltext[end + 1:chunk_end], quote=False)
    return f"{before}<span style='background-color: yellow;'>{match}</span>{after}"

# search_keyword("grass")

In [None]:
# Layout shared by the header and footer banners.
hor_layout = ipw.Layout(
    align_content='stretch',
    margin='0.1% 1% 0.1% 2% ',
    width='100%',
)

# Header and footer banners.
app_title = ipw.HTML('<h1> YouTube for Newspapers </h1>', layout=hor_layout)
app_footer = ipw.HTML('<p> Apache Spark, HDFS, MongoDB, NLP</p>', layout=hor_layout)

headerBox = ipw.HBox([app_title]).add_class('box_style')
footerBox = ipw.HBox([app_footer]).add_class('box_style')

# Keyword input and the search button, centred in its own container.
txt_keyword = ipw.Text(
    placeholder='Enter a Keyword',
    description='Keyword:',
    disabled=False,
    layout=ipw.Layout(width='auto', margin='15px 10px 5px 2px'),
).add_class('widget_text')
btnSearch = ipw.Button(description="Search!", icon='search').add_class('button_style')
box_layout = ipw.Layout(
    display='flex',
    flex_flow='column',
    align_items='center',
    width='100%',
)
btnContainer = ipw.HBox(children=[btnSearch], layout=box_layout)

def on_button_clicked(b):
    """Click handler for the search button.

    Reads the keyword box, trims surrounding whitespace and lower-cases it.
    An empty keyword triggers a popup; otherwise the main panel is filled
    with the rows returned by ``search_keyword``.
    """
    # Search with the same trimmed value that is validated: a keyword typed
    # with a stray leading or trailing space should still match.
    keyword = txt_keyword.value.strip().lower()
    if not keyword:
        popup("Enter search Key.")
    else:
        # The search function in this notebook is `search_keyword`; the
        # previous call to an undefined `search` raised NameError on click.
        main_panel.children = search_keyword(keyword)

# Run a search whenever the search button is clicked.
btnSearch.on_click(on_button_clicked)

# Left panel (keyword box + button) and the initially empty results panel.
side_bar = ipw.VBox([txt_keyword, btnContainer]).add_class('side_bar')
main_panel = ipw.VBox([])

# Assemble the application; as the cell's last expression it is displayed.
ipw.AppLayout(
    header=headerBox,
    left_sidebar=side_bar,
    center=main_panel,
    right_sidebar=None,
    footer=footerBox,
    pane_widths=[2, 7, 0],
    pane_heights=[1, 9, '40px'],
)