# Note: Please run all cells once. When the widgets are ready for use, the notebook will automatically will hide all code chunks and ... voila!!! #
***

*    For the setup of the Virtuoso ODBC data source please see section 1a in https://github.com/eurostat/NLP4Stat/tree/testing/Software%20Environment
*    Download the notebook as "raw" file and save it with extension .ipynb (cut the .txt extension which is added)
*    Install the necessary libraries from your jupyter command prompt. These, together with the versions used, are:
    *    tqdm==4.62.3
    *    spacy==3.0.7
    *    seaborn==0.11.2
    *    scispacy==0.4.0
    *    pandas==1.3.5
    *    numpy==1.20.3
    *    matplotlib==3.4.3
    *    joblib==1.1.0
    *    ipywidgets==7.6.5
    *    if you have difficulties installing scispacy install first numpy and nmslib

*   Copy the input files: **data.csv**, **tm_topics.csv**, **df_topic_sents_keywords.csv**, **SVOs_all_5_24_17_37.xlsx** from the [notebook folder](https://github.com/eurostat/NLP4Stat/tree/testing/Use%20case%20A/Use%20Case%20A%20Widgets%20Demo) to the same folder where you downloaded the notebook. 

# Use case A 
### “Asset querying” – Making the most of internal data assets in their multiple forms


The Use Case A is the ability to annotate and enrich content from the Knowledge Database.
A user makes a query to the Eurostat Website, this query goes through a tool that is in direct relation with KDB. The KDB is used to enriched documents from the Eurostat Website, and the product of these enrich documents will be the answer to the query made by the user. 


The query made by the user will either be a query builder, a faceted search, a data exploration or a visual data browsing. As presented in the schema above, the two first solutions, query builders and faceted search are about looking into specific content, while the two latter, data exploration and visual data browsing (graphical navigation) are more used to provide a general idea of the content inside the data. 

### Scenario A: Query Builder
***


A query builder is a program that builds a query from a set of parameters designed to perform a search in a specific database.I the libraries are loaded and the data related to the statistical articles are uploaded.

In [1]:
#Load libraries
import pandas as pd
from pandas import read_excel
import numpy as np 
import os
##import ee
##import string 

#Widgets libraries

import ipywidgets
import ipywidgets as widgets
from ipywidgets import interact,Layout, IntSlider, interactive, widgets, interact_manual,HBox,fixed,VBox, Box, HTML
from ipywidgets import Button, FloatText, Textarea, Dropdown, Label
from ipywidgets import interact_manual
layout = widgets.Layout(width='400px', height='30px')

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

pd.set_option('display.max_rows' , 1000)
pd.set_option('display.max_columns' , 1000)
pd.options.display.float_format = "{:.2f}".format

##from sklearn.feature_extraction import text
##from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
##from sklearn.decomposition import LatentDirichletAllocation

import scispacy
#import spacy
#import en_core_sci_lg
from scipy.spatial.distance import jensenshannon

from IPython.display import HTML, display,clear_output
from IPython.utils import io
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#from os.path import isfile
from itertools import product
import re
import pprint
from pprint import pprint
import spacy
import sys
## Run to install the language library, then comment-out
!{sys.executable} -m spacy download en_core_web_md

nlp = spacy.load('en_core_web_md')
print('Finished loading.')
nlp.max_length = 3000000


import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

Collecting en-core-web-md==3.0.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl (47.1 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')
Finished loading.


#### **Builder 1: Simple Keyword Searching**
***
The user inserts a keyword and this query structure finds articles including the keyword in the raw content of the article and then it returns the titles of the articles (related to the keyword) with their urls.

We have created an interactive widget, where the user has the potential to change simultaneously the keyword, category and number of articles displayed.

In [2]:
data = pd.read_csv('data.csv')   
##data

In [3]:
def change_top_articles(Top_articles, Keywords, categories):
  
 
  #Search by category in raw content
  if Keywords == '':
    print("")
  else:
    if categories == "All Categories":
      df1 = data[data['raw content'].str.contains(Keywords)]
      if df1.title.count() == 0:
        print("Not matches found")
      else:
        print(df1.title.count()," articles found")
        #output  
        h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in df1[["url","title"]][0:Top_articles].values])
        return display(HTML( h))
        #print(df1[:Top_articles.value])
    
     
    else:
      candidate = data[data['categories'].str.contains(categories)]
      df1 = candidate[candidate['raw content'].str.contains(Keywords)]
      if df1.title.count() == 0:
        print("Not mathces found")
      else:
        print(df1.title.count()," articles found")
        #print(df1[:Top_articles.value])
        #output  
        h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in df1[["url","title"]][0:Top_articles].values])
        return display(HTML( h))            

In [4]:
def query_build2():
  style = {'description_width': 'initial'}
  categories = widgets.Dropdown(
      description='Select Category:',
      options=['All Categories','Health', 'Tourism',  'Energy', 'Agriculture', 'World trade' ],
      value='All Categories',
      style=style
  )

  Top_articles = widgets.IntSlider(
      description='Display',
      tooltip='maximum:',
      value=5,
      min=1, 
      max = 30,
      style={'description_width': 'initial'}
  )


  Keywords = widgets.Text(
      value='hospitals',
      placeholder='Type something',
      description='Keywords:',
      disabled=False
  )



  ui = widgets.HBox([Keywords, categories, Top_articles])

  out = widgets.interactive_output(change_top_articles, {'Top_articles': Top_articles, 'Keywords': Keywords, 'categories': categories})

  display(ui, out)


In [5]:
query_build2()

HBox(children=(Text(value='hospitals', description='Keywords:', placeholder='Type something'), Dropdown(descri…

Output()

#### **Builder 2: Advanced Keyword Searching**
***
This query structure allows for advanced searching in terms of searching options. For example, the user can either search by titles, keywords related to the abstract or to the content of the article. An individual could also narrow down results by selecting a spesific category, year of the last update of the article and finally how many articles to display.

We have created an interctive widget, where the user can shift from simple keyword searching (described in Builder 1)  to the advanced mode. There is still the potential for the user to change simultaneously the searching options.

In [6]:
def AdvancedSearch(Top_articles, Keywords, categories, title, abstract):
  #data = dat
  # Search by Title
  if title == '':
    # Abstract if
    if abstract == '':
      # Keyword if
      if Keywords == '':
        print("")

      # Keywords == string
      else:
        if categories == "All Categories":
          df1 = data[data['raw content'].str.contains(Keywords)]
          if df1.title.count() == 0:
            print("Not mathces found")
          else:
            print(df1.title.count()," articles found")
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in df1[["url","title"]][0:Top_articles].values])
            return display(HTML( h))
            #print(df1[:Top_articles.value])
        
        #Else for category = 'string'
        else:
          candidate = data[data['categories'].str.contains(categories)]
          df1 = candidate[candidate['raw content'].str.contains(Keywords)]
          if df1.title.count() == 0:
            print("Not mathces found")
          else:
            print(df1.title.count()," articles found")
            #print(df1[:Top_articles.value])
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in df1[["url","title"]][0:Top_articles].values])
            return display(HTML( h))








    # Else abstract ==string
    else:
      abstractArt = data[data['abstract'].str.contains(abstract)]

      #Keywords if
      if Keywords == '':
        if categories == "All Categories":
          
          if abstractArt.title.count() == 0:
            print("Not mathces found")
          else:
            print(abstractArt.title.count()," articles found")
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in abstractArt[["url","title"]][0:Top_articles].values])
            return display(HTML( h))
            
        
        #Else for category = 'string'
        else:
          candidate = abstractArt[abstractArt['categories'].str.contains(categories)]
          if candidate.title.count() == 0:
            print("Not mathces found")
          else:
            print(candidate.title.count()," articles found")
            #print(df1[:Top_articles.value])
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in candidate[["url","title"]][0:Top_articles].values])
            return display(HTML( h))


      # else keywords == string
      else:
        KeywordArt = abstractArt[abstractArt['raw content'].str.contains(Keywords)]
        if categories == "All Categories":
            if KeywordArt.title.count() == 0:
              print("Not mathces found")
            else:
              print(KeywordArt.title.count()," articles found")
              #output  
              h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in KeywordArt[["url","title"]][0:Top_articles].values])
              return display(HTML( h))
              
        else:
          candidate = KeywordArt[KeywordArt['categories'].str.contains(categories)]
          if candidate.title.count() == 0:
            print("Not mathces found")
          else:
            print(candidate.title.count()," articles found")
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in candidate[["url","title"]][0:Top_articles].values])
            return display(HTML( h))

    
    
 
 ################ Else for title = 'string' #######################################################################################################
  else:
    #Abstract if 
    titleArt = data[data['title'].str.contains(title)]
    if abstract == '':
      # Keyword is
      if Keywords == '':
        if categories == "All Categories":
          if titleArt.title.count() == 0:
            print("Not mathces found")
          else:
            print(titleArt.title.count()," articles found")
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in titleArt[["url","title"]][0:Top_articles].values])
            return display(HTML( h))
            
        else:
          candidate = titleArt[titleArt['categories'].str.contains(categories)]
          df1 = candidate[candidate['title'].str.contains(title)]
          if df1.title.count() == 0:
            print("Not mathces found")
          else:
            print(df1.title.count()," articles found")
            #print(df1[:Top_articles.value])
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in df1[["url","title"]][0:Top_articles].values])
            return display(HTML( h))

      # Keyword == string
      else:
        KeywordArt = titleArt[titleArt['raw content'].str.contains(Keywords)]
        if categories == "All Categories":
          if KeywordArt.title.count() == 0:
            print("Not mathces found")
          else:
            print(KeywordArt.title.count()," articles found")
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in KeywordArt[["url","title"]][0:Top_articles].values])
            return display(HTML( h))
            
        else:
          candidate = KeywordArt[KeywordArt['categories'].str.contains(categories)]
          df1 = candidate[candidate['title'].str.contains(title)]
          if df1.title.count() == 0:
            print("Not mathces found")
          else:
            print(df1.title.count()," articles found")
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in df1[["url","title"]][0:Top_articles].values])
            return display(HTML( h))



    #else Abstract == string  
    else:
      abstractArt = titleArt[titleArt['abstract'].str.contains(abstract)]
      # Keyword if
      if Keywords == '':
        if categories == "All Categories":
          
          if abstractArt.title.count() == 0:
            print("Not mathces found")
          else:
            print(abstractArt.title.count()," articles found")
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in abstractArt[["url","title"]][0:Top_articles].values])
            return display(HTML( h))
            
        else:
          candidate = abstractArt[abstractArt['categories'].str.contains(categories)]
          df1 = candidate[candidate['title'].str.contains(title)]
          if df1.title.count() == 0:
            print("Not mathces found")
          else:
            print(df1.title.count()," articles found")
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in df1[["url","title"]][0:Top_articles].values])
            return display(HTML( h))




      # Else Keyword == string
      else:
        KeywordArt = abstractArt[abstractArt['raw content'].str.contains(Keywords)]
        if categories == "All Categories":
          
          if KeywordArt.title.count() == 0:
            print("Not mathces found")
          else:
            print(KeywordArt.title.count()," articles found")
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in KeywordArt[["url","title"]][0:Top_articles].values])
            return display(HTML( h))
            
        else:
          candidate = KeywordArt[KeywordArt['categories'].str.contains(categories)]
          df1 = candidate[candidate['title'].str.contains(title)]
          if df1.title.count() == 0:
            print("Not mathces found")
          else:
            print(df1.title.count()," articles found")
            
            #output  
            h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in df1[["url","title"]][0:Top_articles].values])
            return display(HTML( h))


In [7]:
def query_build4():
  style = {'description_width': 'initial'}
 
  categories = widgets.Dropdown(
      description='',
      options=['All Categories','Health', 'Tourism',  'Energy', 'Agriculture', 'World trade' ],
      value='All Categories',
      style=style
  )

  Top_articles = widgets.IntSlider(
      description='Display:',
      tooltip='maximum:',
      value=5,
      min=1, 
      max = 30,
      style=style
  )
  Top_articles.style.handle_color = 'lightblue'


  Pick_dat = widgets.IntRangeSlider(
      description=' ',
      tooltip='maximum:',
      value=[2010,2020],
      min=1950, 
      max = 2021,
      style=style
  )

  
  Pick_dat.style.handle_color = 'lightblue'

  Search_byTitle = widgets.Text(
      value='',
      placeholder='Type something',
      description='',
      disabled=False
  )

  Search_byAbstract = widgets.Text(
      value='',
      placeholder='Type something',
      description='',
      disabled=False
  )

  Keywords = widgets.Text(
      value='hospitals',
      placeholder='Type something',
      description=' ',
      disabled=False
  )

  form_item_layout = Layout(justify_content='space-between')

  form_items = [
      HBox([Label(value='Search by Title:'), Search_byTitle], layout=form_item_layout),
      HBox([Label(value='Search in Abstract'), 
          Search_byAbstract], layout=form_item_layout),
      HBox([Label(value='Keywords'),Keywords], layout=form_item_layout),
      HBox([Label(value='Categories'), 
          categories], layout=form_item_layout),
      HBox([Label(value='Year'), 
          Pick_dat], layout=form_item_layout)
  ]

  form = VBox(form_items, layout=Layout(
      border='2px solid gray', padding='10px',
      align_items='stretch', width='65%')
  )

  box2 = widgets.HBox([Top_articles])

  out = widgets.interactive_output(AdvancedSearch, {'Top_articles': Top_articles, 'Keywords': Keywords, 'categories': categories, 'title':Search_byTitle, 'abstract':Search_byAbstract})

  return display(form, box2, out)
  

In [8]:
def builder_Advanced():
  output = widgets.Output()
  style = {'description_width': 'initial'}
  advanced_search = widgets.Button(
      description='Advanced search',
      button_style='primary',
      tooltip='Click to download article',
      disabled=False,
      style=style )
  
  minimize = widgets.Button(
    description='Minimize',
    button_style='primary',
    tooltip='Minimize',
    disabled=False,
    style=style)
  display ( query_build2(),widgets.HBox([advanced_search]))
  
 
  def on_search_clicked(b):
    clear_output()  
    #call a function a function for getting advanced searching options
    display ( query_build4(),widgets.HBox([minimize]))  
  advanced_search.on_click(on_search_clicked)  

  def on_minimize_clicked(b):
    clear_output() 
    display ( query_build2(),widgets.HBox([advanced_search]))

  minimize.on_click(on_minimize_clicked)


In [9]:
builder_Advanced()

HBox(children=(Text(value='hospitals', description='Keywords:', placeholder='Type something'), Dropdown(descri…

Output()

None

HBox(children=(Button(button_style='primary', description='Advanced search', style=ButtonStyle(), tooltip='Cli…

### Scenario B: Facets
***

For this case scenario, we have implemented some query structures based on topic modelling and semantic analysis (Name Entity Recognition and their correspondive relations).


#### **Topic Modelling - Related Articles**
***
This query scenario is based on datasets extracted by Topic Modelling with Gensim library. In particular, user has the potential to search articles according to the dominant topics, narrow down results by typing topic keywords or even to find related articles to topic keywords that are not necessarily articles of the selected dominant topic. 


In [10]:
tm_topics = pd.read_csv('tm_topics.csv')
df_topic_sents_keywords = pd.read_csv('df_topic_sents_keywords.csv')

In [11]:
def display_side_by_side(dfs:list, captions:list):
    """Display tables side by side to save vertical space
    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions
    """
    output = ""
    combined = dict(zip(captions, dfs))
    for caption, df in combined.items():
        output += df.style.set_table_attributes("style='display:inline'").set_caption(caption)._repr_html_()
        output += "\xa0\xa0\xa0"
    display(HTML(output))

In [12]:
def get_Related_Articles(selector,keyword,cloud):
  topics = tm_topics[tm_topics['Topic name'].str.contains(selector)]
  topicID = topics['Topic id'].values.tolist()
  if cloud:
    for i in topicID:
      candidateArticles = df_topic_sents_keywords[df_topic_sents_keywords['Dominant_Topic']== i]
      keywordArticles = df_topic_sents_keywords[df_topic_sents_keywords['Topic_Keywords'].str.contains(keyword)]
      related = keywordArticles.drop(keywordArticles[keywordArticles['Dominant_Topic']==i].index)
      #display_side_by_side([df1, df2, df3], ['caption1', 'caption2', 'caption3'])
    return display_side_by_side([candidateArticles[['Text title']],related[['Text title']]], ['Articles on dominant topic selected', 'Articles related to keyword'])

  else:
    if keyword == '':
      for i in topicID:
        candidateArticles = df_topic_sents_keywords[df_topic_sents_keywords['Dominant_Topic']== i]
      return display(candidateArticles[['Text title']])

    else:
      for i in topicID:
        candidateArticles = df_topic_sents_keywords[df_topic_sents_keywords['Dominant_Topic']== i]
        #candidateArticles = df_topic_sents_keywords[df_topic_sents_keywords['Topic_Keywords'].str.contains(keyword)]
      return display(candidateArticles[['Text title']])
    

In [13]:
def ShowRelatedArticles():
  style = {'description_width': 'initial'}

  selector =  widgets.Combobox(
  placeholder='Choose a Title',
  options=list(tm_topics['Topic name'].unique())+['All'],
  description='Select topic')
 

  cloud = widgets.Checkbox(
      value=False,
      description='Show related articles',
      style=style
  )
  
  Keywords = widgets.Text(
      value='',
      placeholder='Type something',
      description='Keywords:',
      disabled=False
  )
  ui  = widgets.HBox([selector, Keywords ,cloud]) 
  
  


  out = widgets.interactive_output(get_Related_Articles, {'selector':selector,'keyword': Keywords, 'cloud':cloud})

  display(ui, out)

In [14]:
ShowRelatedArticles()

HBox(children=(Combobox(value='', description='Select topic', options=('Technological product and process inno…

Output()

####  **Named-Entity Recognition methods**
***
This query structure relies on Subject-Verb-Object triplets extracted for both Statistics Explained and Glossary articles. 

We have implemented 3 subcases. In all cases, the difference is on text box selection options. 

The user types keywords, and instantly proposing phrases are returned along with a list of articles. Also there is an option to display only either statistics explained or glossary articles. 

***
##### Case 1: Proposing phrases - whole sentences containing at least one typed keyword

***

In [15]:
ner = pd.read_excel('SVOs_all_5_24_17_37.xlsx')
ner['SVOs_together'] = ner['Subject'] + ' ' + ner['Verb'] + ' ' + ner['Object']

In [16]:
def NER_articles1(selector,Statistics_Explained,Glossaries,howmany):
  # If Statistics_Explained == True, then show only Statistics Explained Articles
  if Statistics_Explained:
    candidateArticles = ner[ner['Sentence'].str.contains(selector)]
    proposedArticles = candidateArticles[candidateArticles['Source'].str.contains('SE')]
    proposedArticles = proposedArticles.drop_duplicates(subset=['Title'], keep='first')
    # drop duplicated titles
    print(proposedArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in proposedArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))
  
  # If Glossaries == True, then show only Glossary articles
  elif Glossaries:
    candidateArticles = ner[ner['Sentence'].str.contains(selector)]
    proposedArticles = candidateArticles[candidateArticles['Source'].str.contains('GL')]
    proposedArticles = proposedArticles.drop_duplicates(subset=['Title'], keep='first')
    # drop duplicated titles
    print(proposedArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in proposedArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))
  
  elif Statistics_Explained & Glossaries:
    Statistics_Explained = False
    candidateArticles = ner[ner['Sentence'].str.contains(selector)]
    proposedArticles = candidateArticles[candidateArticles['Source'].str.contains('GL')]
    proposedArticles = proposedArticles.drop_duplicates(subset=['Title'], keep='first')
    # drop duplicated titles
    print(proposedArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in proposedArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))


  else:
    candidateArticles = ner[ner['Sentence'].str.contains(selector)]
    candidateArticles = candidateArticles.drop_duplicates(subset=['Title'], keep='first')
    print(candidateArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in candidateArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))


In [17]:
def NER_builder1():
  style = {'description_width': 'initial'}

  selector =  widgets.Combobox(
    placeholder='Type something, e.g., EU, ASEA etc.',
    options=list(ner['Sentence'].unique()),
    description='Search')
  
  Statistics_Explained = widgets.Checkbox(
    value=False,
    description='Only Statistics Explained',
    style=style)
  
  Glossaries = widgets.Checkbox(
    value=False,
    description='Only Glossaries',
    style=style)
  
  howmany = widgets.IntSlider(
    description='Display:',
    tooltip='maximum:',
    value=5,
    min=1, 
    max = 30,
    style=style )
  howmany.style.handle_color = 'lightblue'
  
  ui = widgets.HBox([selector,Statistics_Explained,Glossaries, howmany])
  out = widgets.interactive_output(NER_articles1, {'selector':selector,'Statistics_Explained': Statistics_Explained, 'Glossaries':Glossaries, 'howmany':howmany})


  display(ui,out)

In [18]:
NER_builder1()

HBox(children=(Combobox(value='', description='Search', options=('This vision was confirmed by a Council Recom…

Output()

***
##### Case 2: Proposing organizations, countries, etc., instead of phrases
***

In [19]:
def NER_articles2(selector,Statistics_Explained,Glossaries,howmany):
  # If Statistics_Explained == True, then show only Statistics Explained Articles
  if Statistics_Explained:
    candidateArticles = ner[ner['Key'].str.contains(selector)]
    proposedArticles = candidateArticles[candidateArticles['Source'].str.contains('SE')]
    proposedArticles = proposedArticles.drop_duplicates(subset=['Title'], keep='first')
    # drop duplicated titles
    print(proposedArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in proposedArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))
  # If Glossaries == True, then show only Glossary articles
  elif Glossaries:
    candidateArticles = ner[ner['Key'].str.contains(selector)]
    proposedArticles = candidateArticles[candidateArticles['Source'].str.contains('GL')]
    proposedArticles = proposedArticles.drop_duplicates(subset=['Title'], keep='first')
    # drop duplicated titles
    print(proposedArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in proposedArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))
  
  elif Statistics_Explained & Glossaries:
    Statistics_Explained = False
    candidateArticles = ner[ner['Key'].str.contains(selector)]
    proposedArticles = candidateArticles[candidateArticles['Source'].str.contains('GL')]
    proposedArticles = proposedArticles.drop_duplicates(subset=['Title'], keep='first')
    # drop duplicated titles
    print(proposedArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in proposedArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))


  else:
    candidateArticles = ner[ner['Key'].str.contains(selector)]
    candidateArticles = candidateArticles.drop_duplicates(subset=['Title'], keep='first')
    print(candidateArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in candidateArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))


In [20]:
def NER_builder2():
  style = {'description_width': 'initial'}

  selector =  widgets.Combobox(
    placeholder='Type something, e.g., EU, ASEA etc.',
    options=list(ner['Key'].unique()),
    description='Search')
  
  Statistics_Explained = widgets.Checkbox(
    value=False,
    description='Only Statistics Explained',
    style=style)
  
  Glossaries = widgets.Checkbox(
    value=False,
    description='Only Glossaries',
    style=style)
  
  howmany = widgets.IntSlider(
    description='Display:',
    tooltip='maximum:',
    value=5,
    min=1, 
    max = 30,
    style=style )
  howmany.style.handle_color = 'lightblue'
  
  ui = widgets.HBox([selector,Statistics_Explained,Glossaries, howmany])
  out = widgets.interactive_output(NER_articles2, {'selector':selector,'Statistics_Explained': Statistics_Explained, 'Glossaries':Glossaries, 'howmany':howmany})


  display(ui,out)


In [21]:
NER_builder2()

HBox(children=(Combobox(value='', description='Search', options=('A COUNCIL RECOMMENDATION', 'AAA', 'ACER', 'A…

Output()

##### Case 3: Proposing phrases based on entity-relation-entity triplets (again Subject-Verb-Object relations)
***

In [22]:
def NER_articles3(selector,Statistics_Explained,Glossaries,howmany):

  # If Statistics_Explained == True, then show only Statistics Explained Articles
  if Statistics_Explained:
    candidateArticles = ner[ner['SVOs_together'].str.contains(selector)]
    proposedArticles = candidateArticles[candidateArticles['Source'].str.contains('SE')]
    proposedArticles = proposedArticles.drop_duplicates(subset=['Title'], keep='first')
    # drop duplicated titles
    print(proposedArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in proposedArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))

  # If Glossaries == True, then show only Glossary articles
  elif Glossaries:
    candidateArticles = ner[ner['SVOs_together'].str.contains(selector)]
    proposedArticles = candidateArticles[candidateArticles['Source'].str.contains('GL')]
    proposedArticles = proposedArticles.drop_duplicates(subset=['Title'], keep='first')
    # drop duplicated titles
    print(proposedArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in proposedArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))
  
  elif Statistics_Explained & Glossaries:
    Statistics_Explained = False
    candidateArticles = ner[ner['SVOs_together'].str.contains(selector)]
    proposedArticles = candidateArticles[candidateArticles['Source'].str.contains('GL')]
    proposedArticles = proposedArticles.drop_duplicates(subset=['Title'], keep='first')
    # drop duplicated titles
    print(proposedArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in proposedArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))


  else:
    candidateArticles = ner[ner['SVOs_together'].str.contains(selector)]
    candidateArticles = candidateArticles.drop_duplicates(subset=['Title'], keep='first')
    print(candidateArticles.Title.count(),'Articles found.')
    h = '<br/'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in candidateArticles[["URL","Title"]][0:howmany].values])
    return display(HTML(h))


In [23]:
def NER_builder3():
  style = {'description_width': 'initial'}

  selector =  widgets.Combobox(
    placeholder='Type something, e.g., EU, ASEA etc.',
    options=list(ner['SVOs_together'].unique()),
    description='Search')
  
  Statistics_Explained = widgets.Checkbox(
    value=False,
    description='Only Statistics Explained',
    style=style)
  
  Glossaries = widgets.Checkbox(
    value=False,
    description='Only Glossaries',
    style=style)
  
  howmany = widgets.IntSlider(
    description='Display:',
    tooltip='maximum:',
    value=5,
    min=1, 
    max = 30,
    style=style )
  howmany.style.handle_color = 'lightblue'
  
  ui = widgets.HBox([selector,Statistics_Explained,Glossaries, howmany])
  out = widgets.interactive_output(NER_articles3, {'selector':selector,'Statistics_Explained': Statistics_Explained, 'Glossaries':Glossaries, 'howmany':howmany})


  display(ui,out)


In [24]:
NER_builder3()

HBox(children=(Combobox(value='', description='Search', options=('a Council Recommendation on a comprehensive …

Output()

#### **Alternative way of proposing related articles**
***
We have also implemented an alterantive way of both exploring related articles and constructing query.

The purpose of this facet refers to users who have find an interesting article and want to easily find related research without browsing all existing publications themself. Also,it is very useful for users who have a specific research question and want to discover relevant articles.

At first we discover a number of topics using LDA (Latent Dirichlet Allocation). This allows us to view each article as a mixture of these topics. By mapping a specific article into the topic space, we can find related articles.

Articles are proposed according to a k-nearest algorithm in topic space based on Jensen - Shannon measure. The Jensen–Shannon divergence is a method of measuring the similarity between two probability distributions.

In this widget a user can insert any kind of text like abstract, paragraph, full text, keywords, questions and find related articles.

In [25]:
### Get Nearest Papers (in Topic Space) ###

In [26]:
#Install/Load packages.

import scispacy
import spacy
#import en_core_sci_lg
import joblib

from scipy.spatial.distance import jensenshannon
from tqdm import tqdm
from os.path import isfile

import seaborn as sb
import matplotlib.pyplot as plt
plt.style.use("dark_background")

#nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])


In [27]:
##### Latend Dirichlet Allocation ######
#Prepare dataset.

#all_texts = data['raw content']
#all_texts[0][:500]

def spacy_tokenizer(sentence):
    return [word.lemma_ for word in nlp(sentence) if not (word.like_num or word.is_stop or word.is_punct or word.is_space or len(word)==1)]
    
#vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, min_df=2)
#data_vectorized = vectorizer.fit_transform(tqdm(all_texts))
#data_vectorized.shape

#Most frequent words.

#word_count = pd.DataFrame({'word': vectorizer.get_feature_names(), 'count': np.asarray(data_vectorized.sum(axis=0))[0]})

#word_count.sort_values('count', ascending=False).set_index('word')[:20].sort_values('count', ascending=True).plot(kind='barh')
#joblib.dump(vectorizer, 'vectorizer.csv')
vectorizer = joblib.load('vectorizer.csv')
data_vectorized = joblib.load('data_vectorized.csv')
#joblib.dump(data_vectorized, 'data_vectorized.csv')
#lda = LatentDirichletAllocation(n_components=50, random_state=0)
#lda.fit(data_vectorized)
#joblib.dump(lda, 'lda.csv')

lda = joblib.load('lda.csv')

def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
#print_top_words(lda, vectorizer, n_top_words=25)
doc_topic_dist = pd.DataFrame(lda.transform(data_vectorized))
#doc_topic_dist.to_csv('doc_topic_dist.csv', index=False)


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [28]:
#Make variables 'title' and 'raw content' a list.

tasks = data.set_index('title').to_dict()['raw content']

In [29]:
article = data['raw content']

def get_k_nearest_docs(doc_dist, k=5, lower=1950, upper=2020, get_dist=False):
    '''
    doc_dist: topic distribution (sums to 1) of one article
    
    Returns the index of the k nearest articles (as by Jensen–Shannon divergence in topic space). 
    '''
    temp = doc_topic_dist

    distances = temp.apply(lambda x: jensenshannon(x, doc_dist), axis=1)
    k_nearest = distances[distances != 0].nsmallest(n=k).index
    
    if get_dist:
        k_distances = distances[distances != 0].nsmallest(n=k)
        return k_nearest, k_distances
    else:
        return k_nearest

def plot_article_dna(title, width=20):
    t = data[data.title == title].title.values[0]
    doc_topic_dist[data.title== title].T.plot(kind='bar', legend=None, title=t, figsize=(width, 4))
    plt.xlabel('Topic')

def compare_dnas(title, recommendation_id, width=20):
    t = data[data.title == recommendation_id].title.values[0]
    temp = doc_topic_dist[data.title == title]
    ymax = temp.max(axis=1).values[0]*1.25
    temp = pd.concat([temp, doc_topic_dist[data.title == recommendation_id]])
    temp.T.plot(kind='bar', title=t, figsize=(width, 4), ylim= [0, ymax])
    plt.xlabel('Topic')
    plt.legend(['Selection', 'Recommendation'])
    
def dna_tabs(title):
    k = len(title)
    outs = [widgets.Output() for i in range(k)]

    tab = widgets.Tab(children = outs)
    tab_titles = ['title ' + str(i+1) for i in range(k)]
    for i, t in enumerate(tab_titles):
        tab.set_title(i, t)
    display(tab)

    for i, t in enumerate(tab_titles):
        with outs[i]:
            ax = plot_article_dna(paper_ids[i])
            plt.show(ax)

def compare_tabs(title, recommendation_ids):
    k = len(recommendation_ids)
    outs = [widgets.Output() for i in range(k)]

    tab = widgets.Tab(children = outs)
    tab_titles = ['title ' + str(i+1) for i in range(k)]
    for i, t in enumerate(tab_titles):
        tab.set_title(i, t)
    display(tab)

    for i, t in enumerate(tab_titles):
        with outs[i]:
            ax = compare_dnas(title, recommendation_ids[i])
            plt.show(ax)

def recommendation(title, k=5, lower=1950, plot_dna=False):
    '''
    Returns the title of the k papers that are closest (topic-wise) to the paper given by id.
    '''
    
    print(data.title[data.title ==title].values[0])

    recommended, dist= get_k_nearest_docs(doc_topic_dist[data.title == title].iloc[0], k, lower, 10, get_dist=True)
    recommended = data.iloc[recommended].copy()
    recommended['similarity'] = 1 - dist 
    
    h = '<br/>'.join(['<a href="' + l + '" target="_blank">'+ n + '</a>' +' (Similarity: ' + "{:.2f}".format(s) + ')' for l, n, s in recommended[['url','title', 'similarity']].values])
    display(HTML(h))
    
    if plot_dna:
        compare_tabs(title, recommended.title.values)
              

In [30]:
def relevant_articles(tasks, k=3, lower=1950, upper=2020):
    tasks = [tasks] if type(tasks) is str else tasks 
    
    
    tasks_vectorized = vectorizer.transform(tasks)
    tasks_topic_dist = pd.DataFrame(lda.transform(tasks_vectorized))
   
    
    for index, bullet in enumerate(tasks):
        print(bullet)
       
        recommended = get_k_nearest_docs(tasks_topic_dist.iloc[index], k, lower, upper)
        recommended = data.iloc[recommended]
        
        h = '<br/>'.join(['<a href="' + l + '" target="_blank">'+ n + '</a>' for l, n in recommended[['url','title']].values])
        display(HTML(h))
        
      

In [31]:
#First_version
def relevant_articles_for_text():    
    textW = widgets.Textarea(
        value='Tourism',
        placeholder='Type something',
        description='',
        disabled=False,
        layout=Layout(width='90%', height='100px')
    )

    yearW = widgets.IntRangeSlider(min=1950, max=2020, value=[2010, 2020], description='Year Range', 
                               continuous_update=False, layout=Layout(width='40%'))
    kWidget = widgets.IntSlider(value=10, description='Display', max=50, min=1, layout=Layout(width='25%'))
    button = widgets.Button(description="Search")
    style = {'description_width': 'initial'}
    display(VBox([HBox([kWidget, yearW], layout=Layout(width='90%', justify_content='space-around')),
        textW, button], layout=Layout(align_items='center')))

    def on_button_clicked(b):
        clear_output()
        display(VBox([HBox([kWidget, yearW], layout=Layout(width='90%', justify_content='space-around')),
            textW, button], layout=Layout(align_items='center')))  
        relevant_articles(textW.value, kWidget.value, yearW.value[0], yearW.value[1])
        
    button.on_click(on_button_clicked)

In [32]:
relevant_articles_for_text()

VBox(children=(HBox(children=(IntSlider(value=10, description='Display', layout=Layout(width='25%'), max=50, m…

In [33]:
#HTML('''<script>
#code_show=true; 
#function code_toggle() {
# if (code_show){
# $('div.input').hide();
# } else {
# $('div.input').show();
# }
# code_show = !code_show
#} 
#$( document ).ready(code_toggle);
#</script>
#<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')