In [1]:
import os
from pprint import pprint
import pymongo
from pymongo import MongoClient    
import pyspark
from pyspark.ml import PipelineModel
import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
from time import time
import re
from zipfile import ZipFile
from pyspark.sql import dataframe
import json

# Initialize for View

In [2]:
conf = pyspark.SparkConf()
conf.set('spark.jars.packages', 
         "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,com.databricks:spark-xml_2.12:0.18.0,com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.3")
conf.set('spark.driver.memory','8g')
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.SQLContext.getOrCreate(sc)
spark



:: loading settings :: url = jar:file:/ext3/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/stm6992/.ivy2/cache
The jars for the packages stored in: /home/stm6992/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
com.databricks#spark-xml_2.12 added as a dependency
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2d9e6b99-9bed-4772-a20e-a2318b866dd0;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
	found com.databricks#spark-xml_2.12;0.18.0 in central
	found commons-io#commons-io;2.11.0 in central
	found org.glassfish.jaxb#txw2;3.0.2 in central
	found org.apache.ws.xmlschema#xmlschema-core;2.3.0 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.9.0 in central
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.3.3 in central
	found com.ty

24/05/07 23:19:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


<pyspark.sql.context.SQLContext at 0x15344e7346d0>

In [3]:
# Set environment variables
%env MONGODB_USER=dmb443
%env MONGODB_PASS=wXwfzBEQTangXV44

env: MONGODB_USER=dmb443
env: MONGODB_PASS=wXwfzBEQTangXV44


In [4]:
sys.path.insert(0, f"{os.environ['SCRATCH']}/youtube-for-newspapers/src")

In [13]:
# import database and pipeline from src
from ingest.database import NoSQLDatabase
from processing.nlp_pipeline import KeywordPipeline
from processing.nlp_pipeline import EmbeddingsPipeline

In [14]:
# config reader
def read_config(fin):
    """Read .json config file"""
    with open(fin) as f:
        return json.load(f)

# create db object
database_config = read_config(f"{os.environ['SCRATCH']}/youtube-for-newspapers/config.json")
db =  NoSQLDatabase()
db = db.from_config(database_config["nosql_database"])

# create keywords pipeline
kw_pipeline = KeywordPipeline.from_config(database_config["processing_pipeline"]["keywords"])
kw_pipeline.setup_pipeline()

# create keywords pipeline
em_pipeline = EmbeddingsPipeline.from_config(database_config["processing_pipeline"]["doc2vec"])
em_pipeline.setup_pipeline()

doc2vec_gigaword_300 download started this may take some time.
Approximate size to download 312.3 MB
[OK!]


# Visualizations

In [26]:
# load custom css
from IPython.core.display import HTML

def _set_css_style():
    res = !cd .. && pwd
    css_file_path = f"{res[0]}/chart.css"
    styles = open(css_file_path, "r").read()
    s = '<style>%s</style>' % styles     
    return HTML(s)

_set_css_style()

In [8]:
%%html
<style>
.box_style{
    width:100%;
    border : None;
    height: auto;
    background-color:#EEE;
    color=white;
}
.side_bar{
    width:100%;
    border: None;
    height: auto;
    background-color:#66b2b2;
    color=white;
}

.widget-label {
    color: white !important;
}

.widget_text {
    border-radius: 8px;
}
.button_style {
    margin-top: 15px;
}
.widget-html > .widget-html-content {
   line-height: 20px !important;
}
.widget-html {
    line-height: 20px !important;
}
.view_button_style {
    background: white;
    border: 1px solid #c4c3dd;
    border-radius: 5px;
}
.chart--sm {
    height: 0.5em;
}
</style>

In [21]:
import ipywidgets as ipw
from IPython.display import HTML, display, clear_output, Javascript
from bson.objectid import ObjectId
import re
from functools import partial
import cgi

def show_document(header, text):
    
    display(Javascript("""
        require(
            ["base/js/dialog"], 
            function(dialog) {
                console.log('d', dialog.modal);
                dialog.modal({
                    title: '%s',
                    body: 'replace_me',
                    buttons: {
                        'Done': {}
                    }
                });
                // Using setTimeout to wait for modal to render first
                setTimeout(function(){
                const found_modals = document.getElementsByClassName("modal-body");
                for( const fm of found_modals){
                    if(fm.innerText === 'replace_me'){
                        fm.innerHTML = "<div>%s</div>";
                    }
                }
                }, 300)
            }
        );
        """ % (header, text)))

def view_doc_clicked(title, rendered_text, _b):
    show_document(clean_text(title), rendered_text)

notify_output = ipw.Output()
display(notify_output)

@notify_output.capture()
def popup(text):
    clear_output()
    display(HTML("<script>alert('{}');</script>".format(text)))

def create_result_option(record_id, record_title, ctx_text, publisher, pub_date, object_id, vs_score, kw_score, rendered_text):
    items_layout = ipw.Layout(width='90%')
    
    children = []
    row_layout = ipw.Layout(display='flex', flex_flow='row', align_items='stretch', border_bottom='solid 2px lightgrey', padding='5px')
    sn_box = ipw.HBox([], layout=ipw.Layout(width='2%', margin='5px 5px 5px 5px'))
    rn_box = ipw.HBox(children=[], layout=ipw.Layout(width='10%', margin='5px 5px 5px 5px'))    
    
    html_out = ""      
        
    if len(ctx_text):
        html_out += f"<br>{ctx_text}"
    if len(publisher):
        html_out += f"<br><font color='#cc3333'/><b>{publisher}</b>"
    if len(pub_date):
        html_out += f": <font color='grey'/>{pub_date}"
    
    if len(record_title):
        html_out = f"""<span class='text-primary' style='margin-right: 5px; font-size: 1.5rem'>
                            <i class='fa fa-fw fa-globe'></i>
                        </span>
                        <b><font color='#1b75d0' size=3px/>{record_title}</b><font color='black' size=2px/> {html_out}"""
        
        btnView = ipw.Button(description="", tooltip=str(object_id), icon='file-text-o', layout=ipw.Layout(width='100px')).add_class('view_button_style')
        btnView.on_click(partial(view_doc_clicked, record_title, rendered_text))
        rn_box.children = [btnView]
    
    
    rrf_out = """
    <div class="charts charts--grouped">
		<div class="charts__chart chart--p100 chart--sm" data-percent>
			<div class="charts__chart chart--p100 chart--green" data-percent>
				<div class="charts__chart chart--p%s chart--blue" data-percent></div>
			</div>
		</div>
	</div>
    """ % kw_score
    
    rrf_row = ipw.HBox([
        ipw.HTML("<span class='text-primary'><b>Lex</b></span>", layout=ipw.Layout(width='5%')),
        ipw.HTML(rrf_out, layout=ipw.Layout(width='60%')),
        ipw.HTML("<span class='text-success' style='margin-left: 5px;'><b>Vec</b></span>", layout=ipw.Layout(width='5%'))
    ])
      
    row_content = ipw.VBox([ipw.HTML(html_out), rrf_row], layout=ipw.Layout(width='90%'))
    row = ipw.HBox(children=[sn_box, row_content, rn_box], layout=row_layout)
 
    return row

Output()

In [10]:
def get_fusion_strengths(json_object, _id):
    vec_perc = 0
    kw_perc = 0
    for doc in json_object:
        if int(doc['recordID']) == int(_id):
            if doc['vs_score'] == 0:
                kw_perc = 100
            elif doc['kw_score'] == 0:
                vec_perc = 100
            else:
                vec_perc = int("%.0f" % (doc['vs_score']/doc['score'] * 100))
                kw_perc = 100 - vec_perc
                
    return vec_perc, kw_perc

In [11]:
def clean_query(query, stopwords):
    """Takes an user query and stopwords array and returns a keywords array"""
    query = re.sub("[^a-z]", " ", query.strip().lower())
    query = re.sub("  +", " ", query)
    keywords = query.split()
    keywords = " ".join([word for word in keywords if word not in stopwords])
    return keywords

def clean_text(text):
        text = re.sub("[^0-9a-zA-Z]", " ", text)
        text = re.sub(" +", " ", text).strip()
        return text

def article_keyword_highlighter(fulltext, keyword, chunk_start=None, chunk_end=None):
    output = []
    chunk_end = chunk_end if chunk_end else len(fulltext)
    chunk_start = chunk_start if chunk_start else 0
    block = fulltext[chunk_start:chunk_end]
    m = re.finditer(keyword, block, re.IGNORECASE)
    ind = 0
    for f in m:
        sp = f.span()
        start = sp[0]
        end = sp[1]
        output.append(block[ind:start] + \
            f"<span style='background-color: yellow;'>{block[start:end]}</span>")
        ind = end
    output.append(block[ind:])
    return "".join(output)

def word_highlighter(fulltext, keyword, ctx_length=200):
    tot_length = len(fulltext)
    f = re.search(keyword, fulltext, re.IGNORECASE)
    if not f:
        return fulltext[0:ctx_length]
    sp = f.span()
    start = sp[0]
    end = sp[1]
    chunk_start = 0 if start-ctx_length < 0 else start-ctx_length
    chunk_end = tot_length if end+ctx_length > tot_length else end+ctx_length
    return article_keyword_highlighter(fulltext, keyword, chunk_start, chunk_end)

def search(query):
    keywords_regex = "|".join(query.split(" "))
    documents_cursor = db.query_hybrid(query, kw_pipeline, em_pipeline, spark)
    rrf_list = [doc for doc in documents_cursor]
    
    document_ids = list(doc["_id"] for doc in rrf_list)
    
    data = db.collection.find({"_id": {"$in": document_ids}})
    
    data = list(data)
    i = 1
    children = []
    if len(data) < 1:
        children.append(create_result_option("#", '', "No articles found with this query", "", '', '',''))
        return children

    for d in data:
        text = clean_text(d["FullText"])
        ctx_text= word_highlighter(text, keywords_regex)

        rendered_text = article_keyword_highlighter(text, keywords_regex)
        vs_score, kw_score = get_fusion_strengths(rrf_list, d['RecordID'])
        children.append(create_result_option(i,
                                             d["RecordTitle"],
                                             ctx_text,
                                             d["Publisher"],
                                             d["AlphaPubDate"],
                                             d['_id'],
                                             vs_score,
                                             kw_score,
                                            rendered_text)
                       )
        i += 1
    return children

In [12]:
hor_layout = ipw.Layout(align_content='stretch', margin='0.1% 1% 0.1% 2% ', width='100%')

app_title = ipw.HTML('<h1> YouTube for Newspapers </h1>', layout=hor_layout)
app_footer = ipw.HTML('<p> Apache Spark, HDFS, MongoDB, NLP</p>', layout=hor_layout)

headerBox = ipw.HBox([app_title]).add_class('box_style')
footerBox = ipw.HBox([app_footer]).add_class('box_style')

txt_keyword = ipw.Text(placeholder='Enter a Keyword', description='Keyword:', disabled=False, layout=ipw.Layout(width='auto', 
    margin='15px 10px 5px 2px')).add_class('widget_text')
btnSearch = ipw.Button(description="Search!", icon='search').add_class('button_style')
box_layout = ipw.Layout(display='flex', flex_flow='column', align_items='center', width='100%')
btnContainer = ipw.HBox(children=[btnSearch], layout=box_layout)


def on_button_clicked(b):
    start = time()
    if len(txt_keyword.value.strip()) < 1:
        popup("Enter search Key.")
    else:
        main_panel.children = search(txt_keyword.value.lower())
    stop = time()
    print(stop-start)

btnSearch.on_click(on_button_clicked)

side_bar = ipw.VBox([txt_keyword, btnContainer]).add_class('side_bar')
main_panel = ipw.VBox([])

ipw.AppLayout(header=headerBox, left_sidebar=side_bar, center=main_panel, right_sidebar=None, footer=footerBox,
    pane_widths=[2, 7, 0],
    pane_heights=[1, 9, '40px'])

AppLayout(children=(HBox(children=(HTML(value='<h1> YouTube for Newspapers </h1>', layout=Layout(align_content…

                                                                                

4.162949562072754
0.9905591011047363
0.7330286502838135
0.783376932144165
0.7048976421356201
0.8069908618927002
0.6878113746643066
0.6985025405883789


<IPython.core.display.Javascript object>

0.6495499610900879
0.6787228584289551
0.697368860244751


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>