# Use Case B - Faceted search 
***

## Using Eurostat themes and sub-themes to search articles from the OECD's Glossary of Statistical Terms: https://stats.oecd.org/glossary/

### Revised (January 2022) to read all data from the database.
### Adjusted (May 2022) to read relations from the Knowledge Database and enrich the results.

### Installation instructions

This is a Google Colab notebook. You must have a Google account with a Google Drive. Please allow access to the Google Drive.

Launch the notebook and enter your own credentials in the cell titled "Connect to the Virtuoso database".

### Connect to the Google Drive

In [1]:
# Mount the user's Google Drive into the Colab filesystem (Colab asks for authorisation).
# NOTE(review): the only Drive paths visible in this notebook are in commented-out
# `to_excel` calls, so the mount appears to be needed only for those optional exports.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Installations

In [2]:
!pip install pyodbc

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install SPARQLWrapper
!pip install sparql_dataframe

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!apt-get install virtuoso-opensource

Reading package lists... Done
Building dependency tree       
Reading state information... Done
virtuoso-opensource is already the newest version (6.1.6+repack-0ubuntu9).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 42 not upgraded.


### Imports

In [5]:
import pandas as pd
import numpy as np

import ipywidgets as widgets

import gensim

import pyodbc

import os 
import re
import logging
import sys
import hashlib
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe

In [6]:
import datetime

def file_name(pre,ext):
    """Return ``<pre>_<month>_<day>_<hour>_<minute>.<ext>`` for the current local time.

    The date/time components are written without zero padding.
    """
    now = datetime.datetime.now()
    stamp = '_'.join(str(part) for part in (now.month, now.day, now.hour, now.minute))
    return pre + '_' + stamp + '.' + ext
    

### The data cleansing function

In [7]:
import re
import unicodedata as ud

def clean(x, quotes=True):
    """Normalise one text value read from the glossary table.

    Parameters
    ----------
    x : str or missing value
        Text to clean. Missing values (as detected by ``pd.isnull``) are returned unchanged.
    quotes : bool, default True
        When True, a question mark directly between two letters is rewritten as
        quote + space. Pass False for text that contains URLs, where ``?`` is meaningful.

    Returns
    -------
    str
        The cleaned text, reduced to plain ASCII.
    """
    if pd.isnull(x): return x
    x = x.strip()

    ## letter-question mark-letter -> letter-quote-space-letter !!! but NOT in the lists of URLs!!!
    if quotes:
        x = re.sub(r'([A-Za-z])\?([A-Za-z])','\\1\' \\2',x)

    ## letter-question mark-space-lower case letter -> letter-quote-space-letter
    x = re.sub(r'([A-Za-z])\? ([a-z])','\\1\' \\2',x)

    ## delete the separators inside numbers. The whole run of digit groups is matched at
    ## once so that "1,000,000" becomes "1000000"; replacing one pair of groups per match
    ## would stop at "1000,000" because regex matches cannot overlap.
    x = re.sub(r'\b\d+(?:,\d+)+\b', lambda m: m.group(0).replace(',', ''), x)
    x = re.sub(r'\b\d+(?: \d+)+\b', lambda m: m.group(0).replace(' ', ''), x)

    ## collapse runs of spaces
    x = re.sub(r' +', ' ',x)

    ## remove spaces at the start and end of every line
    x = re.sub(r'^ +| +$', '',x,flags=re.MULTILINE)

    ## space-comma -> comma
    x = re.sub(r' \,',',',x)

    ## space-dot -> dot
    x = re.sub(r' \.','.',x)

    ## decompose accented characters and drop whatever cannot be expressed in ASCII
    x = ud.normalize('NFKD',x).encode('ascii', 'ignore').decode()

    return x

### Create a dictionary with Eurostat's themes and sub-themes

* Include some artificial ones (theme: 'Other') to match some of the themes of the OECD's Glossary.

In [8]:

# Facet hierarchy: each key is a Eurostat theme, each value the list of its sub-themes.
# 'Transport' has no sub-themes; 'Other' is the artificial theme mentioned above.
# The keys fill the theme dropdown and the values the sub-theme dropdown of the search form.
themes = {'General and regional statistics/EU policies':
          ['Non-EU countries','Regions and cities','Sustainable development goals',
          'Policy indicators'],
          'Economy and finance': 
          ['Balance of payments','Comparative price levels (PPPs)','Consumer prices',
           'Exchange rates and interest rates','Government finance','National accounts (incl. GDP)'],
          'Population and social conditions':
          ['Asylum and migration','Crime','Culture','Education and training','Health',
           'Labour market','Living conditions','Population','Social protection','Sport','Youth'],
          'Industry and services': ['Short-term business statistics','Structural business statistics',
                                    'Business registers','Globalisation in businesses','Production statistics',
                                    'Tourism'],
          'Agriculture, forestry and fisheries':['Agriculture','Fisheries','Forestry'],
          'International trade':['Goods','Services'],
          'Transport':[],
          'Environment and energy':['Energy','Environment'],
          'Science, technology and digital society':['Digital economy and society','Science and technology'],
          'Other':['Methodology','Other']}



### Connect to the Virtuoso database

In [9]:
# Credentials must not be stored in the notebook: they are taken from the environment
# variables VIRTUOSO_USER / VIRTUOSO_PASSWORD when set, otherwise asked for interactively
# (the password prompt does not echo).
# NOTE(review): a password was previously committed in this cell — it should be rotated.
from getpass import getpass

user = os.environ.get('VIRTUOSO_USER') or input('Virtuoso user: ')
passw = os.environ.get('VIRTUOSO_PASSWORD') or getpass('Virtuoso password: ')

In [10]:
# Open the ODBC connection to the ESTAT database through the Virtuoso driver.
c = pyodbc.connect(';'.join([
    'DRIVER=/usr/lib/odbc/virtodbc.so',
    'HOST=lod.csd.auth.gr:1111',
    'UID=' + user,
    'PWD=' + passw,
    'DATABASE=ESTAT',
]))

In [11]:
# Exchange text with the server as Latin-1: decode incoming SQL_CHAR data and
# encode the strings sent to the database with the same codec.
c.setdecoding(pyodbc.SQL_CHAR, encoding='latin-1')
c.setencoding(encoding="latin-1")

In [12]:
# Cursor on the ODBC connection; it is passed to load_table() to run the SQL queries below.
cursor = c.cursor()

In [13]:
def load_table(cursor,query):
    """Run ``query`` on ``cursor`` and return the full result set as a DataFrame.

    The column names are taken from the first field of each entry in
    ``cursor.description``.
    """
    cursor.execute(query)
    rows = cursor.fetchall()
    column_names = [column[0] for column in cursor.description]
    return pd.DataFrame.from_records(rows, columns=column_names)

In [14]:
def connect_virtuoso(DSN, UID, PWD):
    """Return a SPARQLWrapper client for the endpoint ``DSN``.

    The client is configured for HTTP Digest authentication with the
    credentials ``UID`` / ``PWD`` and uses the GET method.
    """
    client = SPARQLWrapper(DSN)
    client.setMethod(GET)
    client.setHTTPAuth(DIGEST)
    client.setCredentials(UID, PWD)
    return client

# Connection to the KDB (knowledge database): a SPARQL client on the Virtuoso endpoint,
# created with the same user / password as the ODBC connection.
# NOTE(review): the endpoint URL is plain http — confirm whether an https endpoint exists.
endpoint = "http://lod.csd.auth.gr:8890/sparql/"
sparql = connect_virtuoso(endpoint,user,passw)

### Read the table with OECD's terms and definitions
* Do not load cross-references (terms and URLs). These will be read from the knowledge database.


In [15]:
#import ast

# Load every glossary article (identifiers, term, URL, definition, context, OECD theme and
# last-update text) from the ESTAT.V1.OECD_Glossary table into a DataFrame.
query      = """SELECT id, article_id, term, url, definition, context, theme, last_update
                FROM ESTAT.V1.OECD_Glossary """

OECD_df = load_table(cursor,query)

# Preview the first rows.
OECD_df.head()


Unnamed: 0,id,article_id,term,url,definition,context,theme,last_update
0,1,1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=1,See Pollution abatement.,,Environmental statistics,"Thursday, March 14, 2002"
1,2,2,Absence from work due to illness,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness refers to the...,,Health statistics,"Thursday, November 22, 2001"
2,3,3,Activity restriction - free expectancy,https://stats.oecd.org/glossary/detail.asp?ID=3,Functional limitation-free life expectancy is ...,,Health statistics,"Wednesday, October 31, 2001"
3,4,4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care is one in which the principal inten...,,Health statistics,"Thursday, April 25, 2013"
4,5,5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds are beds accommodating patient...,Acute care beds have alternatively been define...,Health statistics,"Thursday, April 25, 2013"


* Drop records with missing values and apply data cleansing.

In [16]:
# Empty strings count as missing values; report the missing counts before and after
# removing the articles that lack a term or a definition.
OECD_df = OECD_df.replace('', np.nan)
print(OECD_df.isnull().sum())
OECD_df.dropna(subset=['term', 'definition'], inplace=True)
OECD_df.reset_index(drop=True, inplace=True)
print(OECD_df.isnull().sum())

# Cleanse the text columns; in terms, any question mark still left becomes a hyphen.
OECD_df['term'] = OECD_df['term'].apply(clean).apply(lambda term: term.replace('?', '-'))
for text_column in ('definition', 'context'):
    OECD_df[text_column] = OECD_df[text_column].apply(clean)
OECD_df.head()

id                0
article_id        0
term              3
url               0
definition        0
context        5538
theme            35
last_update    1763
dtype: int64
id                0
article_id        0
term              0
url               0
definition        0
context        5536
theme            35
last_update    1761
dtype: int64


Unnamed: 0,id,article_id,term,url,definition,context,theme,last_update
0,1,1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=1,See Pollution abatement.,,Environmental statistics,"Thursday, March 14, 2002"
1,2,2,Absence from work due to illness,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness refers to the...,,Health statistics,"Thursday, November 22, 2001"
2,3,3,Activity restriction - free expectancy,https://stats.oecd.org/glossary/detail.asp?ID=3,Functional limitation-free life expectancy is ...,,Health statistics,"Wednesday, October 31, 2001"
3,4,4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care is one in which the principal inten...,,Health statistics,"Thursday, April 25, 2013"
4,5,5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds are beds accommodating patient...,Acute care beds have alternatively been define...,Health statistics,"Thursday, April 25, 2013"


### Tokenize and stem the articles' terms, definitions and contexts

* Also remove stop-words.
* Create columns _term tokens_, _definition tokens_, _context tokens_.

In [17]:
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS
all_stopwords_gensim = STOPWORDS

p = PorterStemmer()

def text_to_words(text):
    """Return ``text`` as a space-separated string of stemmed tokens.

    The text is tokenized with gensim's ``simple_preprocess`` (with ``deacc=True``),
    gensim stop-words are removed and the remaining tokens are stemmed with the
    Porter stemmer ``p``.
    """
    # simple_preprocess already returns a list of clean tokens, so it is used directly
    # instead of being converted to a string and tokenized a second time.
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    stems = [p.stem(token) for token in tokens if token not in all_stopwords_gensim]
    return ' '.join(stems)

texts=list()  # NOTE(review): not used by any cell visible in this notebook — kept for compatibility

# Tokenize column by column. `apply` works on the values themselves, so it does not
# depend on the frame having a 0..n-1 index the way a row-by-row `.loc[i, ...]` loop does.
OECD_df['term tokens'] = OECD_df['term'].apply(text_to_words)
OECD_df['definition tokens'] = OECD_df['definition'].apply(text_to_words)
# Articles without a context get an empty token string.
OECD_df['context tokens'] = OECD_df['context'].apply(
    lambda context: '' if pd.isnull(context) else text_to_words(context))

# Only the token columns are searched later, so the raw texts are dropped.
OECD_df.drop(columns=['definition','context'],inplace=True)
OECD_df.head()

Unnamed: 0,id,article_id,term,url,theme,last_update,term tokens,definition tokens,context tokens
0,1,1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=1,Environmental statistics,"Thursday, March 14, 2002",abat,pollut abat,
1,2,2,Absence from work due to illness,https://stats.oecd.org/glossary/detail.asp?ID=2,Health statistics,"Thursday, November 22, 2001",absenc work ill,absenc work ill refer number work dai lost yea...,
2,3,3,Activity restriction - free expectancy,https://stats.oecd.org/glossary/detail.asp?ID=3,Health statistics,"Wednesday, October 31, 2001",activ restrict free expect,function limit free life expect averag number ...,
3,4,4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=4,Health statistics,"Thursday, April 25, 2013",acut care,acut care princip intent follow manag labour o...,
4,5,5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=5,Health statistics,"Thursday, April 25, 2013",acut care bed,acut care bed bed accommod patient princip cli...,acut care bed altern defin bed accommod patien...


### Read the table with the correspondence between a) Eurostat's themes and sub-themes b) OECD's Glossary themes

* More than one OECD theme may correspond to a given combination of Eurostat theme and sub-theme.

In [18]:
# Correspondence table: one row per Eurostat theme / sub-theme pair, with the matching
# OECD Glossary themes stored as one ';'-separated string.
query = """SELECT ESTAT_theme, ESTAT_sub_theme, OECD_themes
                FROM ESTAT.V1.Eurostat_OECD_themes """

corresp_df = load_table(cursor, query)
# Turn the ';'-separated string into a list of OECD theme names.
corresp_df['OECD_themes'] = corresp_df['OECD_themes'].map(lambda joined: joined.split(';'))
corresp_df

Unnamed: 0,ESTAT_theme,ESTAT_sub_theme,OECD_themes
0,General and regional statistics/EU policies,Non-EU countries,[]
1,General and regional statistics/EU policies,Regions and cities,[]
2,General and regional statistics/EU policies,Sustainable development goals,[]
3,General and regional statistics/EU policies,Policy indicators,[]
4,Economy and finance,Balance of payments,[Financial statistics - Balance of payments]
5,Economy and finance,Comparative price levels (PPPs),[Prices and purchasing power parities - Price ...
6,Economy and finance,Consumer prices,[Prices and purchasing power parities - Price ...
7,Economy and finance,Exchange rates and interest rates,[Financial statistics - Exchange rates]
8,Economy and finance,Government finance,[Financial statistics - Government finance and...
9,Economy and finance,National accounts (incl. GDP),"[National accounts - Input-output tables, Nati..."


### Insert Eurostat's themes - sub-themes information into OECD Glossary articles dataframe

In [19]:
# Build two lookups keyed by OECD theme: the Eurostat themes and the Eurostat sub-themes
# that correspond to it. Entries keep the row order of corresp_df and contain no duplicates.
estat_themes_by_oecd = {}
estat_sub_themes_by_oecd = {}
for estat_theme, estat_sub_theme, oecd_themes in zip(corresp_df['ESTAT_theme'],
                                                     corresp_df['ESTAT_sub_theme'],
                                                     corresp_df['OECD_themes']):
    for oecd_theme in oecd_themes:
        matched_themes = estat_themes_by_oecd.setdefault(oecd_theme, [])
        if estat_theme not in matched_themes:  ## avoid duplicates
            matched_themes.append(estat_theme)
        matched_sub_themes = estat_sub_themes_by_oecd.setdefault(oecd_theme, [])
        if estat_sub_theme not in matched_sub_themes:  ## avoid duplicates
            matched_sub_themes.append(estat_sub_theme)

# One lookup per article instead of scanning the whole correspondence table for every row.
# Each article gets its own list copy; articles whose theme is missing or unmapped get [].
OECD_df['ESTAT_theme'] = OECD_df['theme'].map(
    lambda oecd_theme: list(estat_themes_by_oecd.get(oecd_theme, [])))
OECD_df['ESTAT_sub_theme'] = OECD_df['theme'].map(
    lambda oecd_theme: list(estat_sub_themes_by_oecd.get(oecd_theme, [])))

# Articles that could not be mapped to any Eurostat theme cannot be reached through the facets.
idx=OECD_df[OECD_df['ESTAT_theme'].apply(len)==0].index
OECD_df.drop(index=idx,inplace=True)
# reset_index() returns a new frame unless inplace=True; the bare call had no effect.
OECD_df.reset_index(drop=True, inplace=True)
OECD_df.head()

Unnamed: 0,id,article_id,term,url,theme,last_update,term tokens,definition tokens,context tokens,ESTAT_theme,ESTAT_sub_theme
0,1,1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=1,Environmental statistics,"Thursday, March 14, 2002",abat,pollut abat,,[Environment and energy],"[Energy, Environment]"
1,2,2,Absence from work due to illness,https://stats.oecd.org/glossary/detail.asp?ID=2,Health statistics,"Thursday, November 22, 2001",absenc work ill,absenc work ill refer number work dai lost yea...,,[Population and social conditions],[Health]
2,3,3,Activity restriction - free expectancy,https://stats.oecd.org/glossary/detail.asp?ID=3,Health statistics,"Wednesday, October 31, 2001",activ restrict free expect,function limit free life expect averag number ...,,[Population and social conditions],[Health]
3,4,4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=4,Health statistics,"Thursday, April 25, 2013",acut care,acut care princip intent follow manag labour o...,,[Population and social conditions],[Health]
4,5,5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=5,Health statistics,"Thursday, April 25, 2013",acut care bed,acut care bed bed accommod patient princip cli...,acut care bed altern defin bed accommod patien...,[Population and social conditions],[Health]


In [20]:

# last_update looks like "Thursday, March 14, 2002": the year is the text after the last
# comma. It is stripped of the surrounding spaces, so it holds e.g. "2002" rather than " 2002".
year_text = OECD_df['last_update'].str.rsplit(',', n=1).str[-1].str.strip()

# Assign the filled Series back to the column: a chained
# `OECD_df['year'].fillna(..., inplace=True)` is not guaranteed to modify the frame.
OECD_df['year'] = year_text.fillna('Not found')

OECD_df.reset_index(drop=True,inplace=True)
OECD_df.drop(columns=['last_update'],inplace=True)
OECD_df.head()

Unnamed: 0,id,article_id,term,url,theme,term tokens,definition tokens,context tokens,ESTAT_theme,ESTAT_sub_theme,year
0,1,1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=1,Environmental statistics,abat,pollut abat,,[Environment and energy],"[Energy, Environment]",2002
1,2,2,Absence from work due to illness,https://stats.oecd.org/glossary/detail.asp?ID=2,Health statistics,absenc work ill,absenc work ill refer number work dai lost yea...,,[Population and social conditions],[Health],2001
2,3,3,Activity restriction - free expectancy,https://stats.oecd.org/glossary/detail.asp?ID=3,Health statistics,activ restrict free expect,function limit free life expect averag number ...,,[Population and social conditions],[Health],2001
3,4,4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=4,Health statistics,acut care,acut care princip intent follow manag labour o...,,[Population and social conditions],[Health],2013
4,5,5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=5,Health statistics,acut care bed,acut care bed bed accommod patient princip cli...,acut care bed altern defin bed accommod patien...,[Population and social conditions],[Health],2013


### Read the relations from the Knowledge Database

* Objective: to display the related links together with their OECD themes.

In [21]:
## SPARQL query returning, for every OECD term that has a related term: the term, its URL
## and (optionally) the title of its OECD theme, plus the same three fields for the related term.
## Disabled pragma, kept for reference:
## DEFINE input:inference <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/>
RelationsStatements = """
PREFIX estat: <https://ec.europa.eu/eurostat/NLP4StatRef/ontology/>
select * where { 
	?x a estat:OECDTerm .
    ?x estat:term ?term1.
    ?x estat:hasURL ?url1 .
    optional{?x estat:hasOECDTheme ?theme1.
    		?theme1 estat:title ?themetitle1.}
    ?x estat:relatedTerm ?y.
    ?y estat:term ?term2.
    ?y estat:hasURL ?url2 .
    optional{    ?y estat:hasOECDTheme ?theme2.
    			?theme2 estat:title ?themetitle2.}
} 
"""
## estat:GlossaryArticle OR StatisticsExplainedArticle

##    ?x estat:relatedTerm ?y.
##        FILTER (str(?x) < str(?y))
##    ?y estat:term ?term2.


# Run the relations query (sent as POST) and flatten the JSON bindings into a DataFrame.
sparql.setQuery(RelationsStatements)
sparql.method = "POST"
sparql.setReturnFormat(JSON)
results = pd.json_normalize(sparql.query().convert()['results']['bindings'])
print(len(results))
#print(results.columns)
#results.to_excel('/content/drive/MyDrive/results.xlsx')
#results.head()

# Collapse to one row per source term: keep its own term / URL / theme title once and
# gather the related terms, URLs and theme titles into parallel lists.
results2 = (
    results
    .groupby(['x.value'])
    .agg({'term1.value': lambda values: values.iloc[0],
          'url1.value': lambda values: values.iloc[0],
          'themetitle1.value': lambda values: values.iloc[0],
          'term2.value': lambda values: list(values),
          'url2.value': lambda values: list(values),
          'themetitle2.value': lambda values: list(values),
          })
    .reset_index()
    .drop(columns=['x.value'])
    .rename(columns={'term1.value': 'term2',
                     'url1.value': 'url',
                     'themetitle1.value': 'OECD_theme',
                     'term2.value': 'related_terms',
                     'url2.value': 'related_urls',
                     'themetitle2.value': 'related_OECD_themes'})
)
#results2.to_excel('/content/drive/MyDrive/results2.xlsx')

results2.head()

3939


Unnamed: 0,term2,url,OECD_theme,related_terms,related_urls,related_OECD_themes
0,A posteriori audit,https://stats.oecd.org/glossary/detail.asp?ID=...,public management,[Ex poste control],[https://stats.oecd.org/glossary/detail.asp?ID...,[public management]
1,A priori audit,https://stats.oecd.org/glossary/detail.asp?ID=...,public management,[Ex ante control],[https://stats.oecd.org/glossary/detail.asp?ID...,[public management]
2,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=1,environmental statistics,[Pollution abatement],[https://stats.oecd.org/glossary/detail.asp?ID...,[environmental statistics]
3,Abatement cost,https://stats.oecd.org/glossary/detail.asp?ID=...,environmental statistics,[Abatement],[https://stats.oecd.org/glossary/detail.asp?ID=1],[environmental statistics]
4,ABO,https://stats.oecd.org/glossary/detail.asp?ID=...,financial statistics,[Accumulated benefit obligation ABO],[https://stats.oecd.org/glossary/detail.asp?ID...,[financial statistics]


### Merge with main file

In [22]:
# Left join on the article URL: every article is kept, and those without relations in the
# knowledge database get missing values in the columns coming from results2.
OECD_df2 = OECD_df.merge(results2, how='left', on='url')

#OECD_df2.to_excel('/content/drive/MyDrive/OECD_df2.xlsx')
OECD_df2.head()

Unnamed: 0,id,article_id,term,url,theme,term tokens,definition tokens,context tokens,ESTAT_theme,ESTAT_sub_theme,year,term2,OECD_theme,related_terms,related_urls,related_OECD_themes
0,1,1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=1,Environmental statistics,abat,pollut abat,,[Environment and energy],"[Energy, Environment]",2002,Abatement,environmental statistics,[Pollution abatement],[https://stats.oecd.org/glossary/detail.asp?ID...,[environmental statistics]
1,2,2,Absence from work due to illness,https://stats.oecd.org/glossary/detail.asp?ID=2,Health statistics,absenc work ill,absenc work ill refer number work dai lost yea...,,[Population and social conditions],[Health],2001,,,,,
2,3,3,Activity restriction - free expectancy,https://stats.oecd.org/glossary/detail.asp?ID=3,Health statistics,activ restrict free expect,function limit free life expect averag number ...,,[Population and social conditions],[Health],2001,,,,,
3,4,4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=4,Health statistics,acut care,acut care princip intent follow manag labour o...,,[Population and social conditions],[Health],2013,Acute care,health statistics,"[Acute care beds, Acute care hospital staff ra...",[https://stats.oecd.org/glossary/detail.asp?ID...,"[health statistics, health statistics, health ..."
4,5,5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=5,Health statistics,acut care bed,acut care bed bed accommod patient princip cli...,acut care bed altern defin bed accommod patien...,[Population and social conditions],[Health],2013,Acute care beds,health statistics,"[Acute care, Long term care beds in hospitals]",[https://stats.oecd.org/glossary/detail.asp?ID...,"[health statistics, health statistics]"


In [23]:
# Release the intermediate frames; only OECD_df2 is used from here on.
del results, OECD_df

## Facets
***
### The function filtering the results.

In [24]:

def isNaN(num):
    """Return the result of ``num != num``, which is True for a float NaN."""
    differs_from_itself = num != num
    return differs_from_itself

df1 = None  # result set of the most recent articles() call; global so it can be inspected afterwards


def _related_links_html(row):
    """Return the HTML fragment listing the related links of one result row."""
    import html

    related_terms = row["related_terms"]
    if isNaN(related_terms):  # the article has no relations (missing value after the left join)
        return ''
    if len(related_terms) == 0:
        return ' None'

    related_urls = row["related_urls"]
    related_themes = row["related_OECD_themes"]
    parts = ['<blockquote>Related links:</blockquote>']
    for k, title in enumerate(related_terms):
        theme = 'None'
        if not isNaN(related_themes) and not isNaN(related_themes[k]):
            # theme titles are capitalized word by word for display
            theme = ' '.join([y.capitalize() for y in related_themes[k].split()])
        parts.append('<blockquote><blockquote><a href="' + html.escape(str(related_urls[k])) +
                     '" target="_blank">' + html.escape(str(title)) +
                     '</a>    (Theme: ' + html.escape(theme) + ')</blockquote></blockquote>')
    return ''.join(parts)


def articles(Top_articles, Keywords, themes_dd, sub_themes_dd,year,related): 
    """Filter the glossary articles by the selected facets and display the hits as HTML links.

    Parameters
    ----------
    Top_articles : int
        Maximum number of articles to display.
    Keywords : str
        Free text; it is tokenized and stemmed with text_to_words() and searched as one
        literal phrase in the term, definition and context tokens.
    themes_dd : str
        Eurostat theme, or "All themes" for no theme filter.
    sub_themes_dd : str or None
        Eurostat sub-theme; None, "" and "All sub-themes" mean no sub-theme filter.
    year : str
        Year to look for in the `year` column, or "All years".
    related : bool
        When True, the related links of every displayed article are listed as well.

    The filtered frame is also stored in the global `df1`. Nothing is returned.
    """
    global df1
    import html

    Keywords = text_to_words(Keywords) 
    print('Searching for keywords: ',Keywords)
    keyword_hit = OECD_df2['term tokens'].str.contains(Keywords,regex=False) | \
                  OECD_df2['definition tokens'].str.contains(Keywords,regex=False) | \
                  OECD_df2['context tokens'].str.contains(Keywords,regex=False)
    df1 = OECD_df2[keyword_hit]
    if len(df1) == 0:
        print("No matches found")
        return

    if year != "All years":
        df1 = df1[df1['year'].str.contains(year)]
    if len(df1) == 0:
        print("No matches found")
        return

    if themes_dd != "All themes":
        df1 = df1[df1['ESTAT_theme'].apply(lambda x: themes_dd in x)]
    # Stop here as well, so that the sub-theme filter never runs on an empty frame.
    if len(df1) == 0:
        print("No matches found")
        return

    if sub_themes_dd is not None and sub_themes_dd != "All sub-themes" and sub_themes_dd != "":
        df1 = df1[df1['ESTAT_sub_theme'].apply(lambda x: sub_themes_dd in x)]
    if len(df1) == 0:
        print("No matches found")
        return

    # Renumber the hits 0..n-1 (the previous index is kept in an `index` column).
    df1 = df1.reset_index()
    print(df1.term.count()," articles found")

    # Values coming from the database are HTML-escaped before being put into the markup.
    parts = []
    for i in range(min(Top_articles,len(df1))):
        row = df1.loc[i]
        parts.append('<br/><b><u><a href="' + html.escape(str(row["url"])) + '" target="_blank">' +
                     html.escape(str(row["term"])) + '</a></u></b>    (Theme: ' +
                     html.escape(str(row["theme"])) + ')')
        if related:
            parts.append(_related_links_html(row))

    display(HTML(''.join(parts)))
  
    

### The widgets.

In [25]:
from IPython.display import display
from ipywidgets import HTML
# NOTE(review): `layout` is not referenced by any cell visible in this notebook — confirm before removing.
layout = widgets.Layout(width='500px', height='300px')

In [26]:
def query_build2(themes):
    """Build the faceted-search form, display it and connect it to articles().

    `themes` maps each theme name to the list of its sub-themes; the keys fill the
    theme dropdown and the values the sub-theme dropdown.
    """
    label_style = {'description_width': 'initial'}

    # --- facet selectors -------------------------------------------------
    theme_picker = widgets.Dropdown(
        description='Select theme:',
        options=['All themes'] + sorted(themes),
        style=label_style
    )
    sub_theme_picker = widgets.Dropdown(
        description='Select sub-theme:',
        options=[''],
        style=label_style
    )

    def refresh_sub_themes(change):
        # React only to changes of the selected value.
        if change['type'] != 'change' or change['name'] != 'value':
            return
        if change['new'] == 'All themes':
            sub_theme_picker.options = []
        else:
            sub_theme_picker.options = ['All sub-themes'] + themes[theme_picker.value]

    theme_picker.observe(refresh_sub_themes)

    # --- remaining inputs ------------------------------------------------
    max_articles = widgets.IntSlider(
        description='Display',
        tooltip='maximum:',
        value=50,
        min=1,
        max=200,
        style={'description_width': 'initial'}
    )
    keyword_box = widgets.Text(
        value='',
        placeholder='Type something',
        description='Keywords:',
        disabled=False
    )
    year_picker = widgets.Dropdown(
        options=['All years','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','Not found'],
        value='All years',
        description='Year:',
        disabled=False
    )
    related_toggle = widgets.Checkbox(
        value=False,
        description='Show related links',
        disabled=False,
        indent=True
    )

    # The dictionary keys are the parameter names of articles().
    out = widgets.interactive_output(articles, {
        'Top_articles': max_articles,
        'Keywords': keyword_box,
        'themes_dd': theme_picker,
        'sub_themes_dd': sub_theme_picker,
        'year': year_picker,
        'related': related_toggle,
    })

    # --- layout ----------------------------------------------------------
    facets_column = widgets.VBox([theme_picker, sub_theme_picker])
    search_column = widgets.VBox([keyword_box, related_toggle])
    display(widgets.HBox([facets_column, search_column]))

    display(year_picker, max_articles, out)


query_build2(themes)



HBox(children=(VBox(children=(Dropdown(description='Select theme:', options=('All themes', 'Agriculture, fores…

Dropdown(description='Year:', options=('All years', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2…

IntSlider(value=50, description='Display', max=200, min=1, style=SliderStyle(description_width='initial'))

Output()