# Use Case B - Faceted search 
***
## Using Eurostat themes and sub-themes to search articles from the OECD's Glossary of Statistical Terms: https://stats.oecd.org/glossary/

In [1]:
import pandas as pd
import numpy as np

import ipywidgets as widgets

import gensim


In [2]:
import datetime

def file_name(pre,ext):
    current_time = datetime.datetime.now() 
    return pre + '_'+ str(current_time.month)+ '_' + str(current_time.day) + \
                 '_' + str(current_time.hour)+ '_' + str(current_time.minute)  +'.'+ext
    

### The data cleansing function

In [3]:
import re
import unicodedata as ud

def clean(x, quotes=True):
    if pd.isnull(x): return x  
    x = x.strip()
    
    ## make letter-question mark-letter -> letter-quote-space-letter !!! but NOT in the lists of URLs!!!
    if quotes:
        x = re.sub(r'([A-Za-z])\?([A-Za-z])','\\1\' \\2',x) ## NEW
    
    ## make letter-question mark-space lower case letter letter-quote-space letter
    x = re.sub(r'([A-Za-z])\? ([a-z])','\\1\' \\2',x) ## NEW

    ## delete ,000 commas in numbers    
    x = re.sub(r'\b(\d+),(\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## delete  000 spaces in numbers
    x = re.sub(r'\b(\d+) (\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## remove more than one spaces
    x = re.sub(r' +', ' ',x)
    
    ## remove start and end spaces
    x = re.sub(r'^ +| +$', '',x,flags=re.MULTILINE) 
    
    ## space-comma -> comma
    x = re.sub(r' \,',',',x)
    
    ## space-dot -> dot
    x = re.sub(r' \.','.',x)
    
    #x = x.encode('latin1').decode('utf-8') ## â\x80\x99
    x = ud.normalize('NFKD',x).encode('ascii', 'ignore').decode()
    
    return x

### Create a dictionary with Eurostat's themes and sub-themes

* Include some artificial ones (theme: 'Other') to match some OECD's Glossary themes.

In [4]:

themes = {'General and regional statistics/EU policies':
          ['Non-EU countries','Regions and cities','Sustainable development goals',
          'Policy indicators'],
          'Economy and finance': 
          ['Balance of payments','Comparative price levels (PPPs)','Consumer prices',
           'Exchange rates and interest rates','Government finance','National accounts (incl. GDP)'],
          'Population and social conditions':
          ['Asylum and migration','Crime','Culture','Education and training','Health',
           'Labour market','Living conditions','Population','Social protection','Sport','Youth'],
          'Industry and services': ['Short-term business statistics','Structural business statistics',
                                    'Business registers','Globalisation in businesses','Production statistics',
                                    'Tourism'],
          'Agriculture, forestry and fisheries':['Agriculture','Fisheries','Forestry'],
          'International trade':['Goods','Services'],
          'Transport':[],
          'Environment and energy':['Energy','Environment'],
          'Science, technology and digital society':['Digital economy and society','Science and technology'],
          'Other':['Methodology','Other']}



### Read the file with OECD's terms and definitions
* Column 'related' has the cross-references separated by semicolons and also with some invalid ones (not valid URL in 'related_URL' removed).


In [5]:
OECD_df = pd.read_excel('OECD_final_results_2.xlsx')
OECD_df.drop(columns=['Unnamed: 0','Cross References:','French Equivalent:','French Definition:',
                     'Glossary Output Segments:','Classification Indicator:','Version Indicator:',
                     'Created on','Source Publication:','Hyperlink:'],inplace=True)
OECD_df.rename(columns={'Term':'term','Definition:':'definition',
                        'Statistical Theme:':'theme','Cross_References_2':'related','Context:':'context',
                       'URL:Cross References':'related_URL','Last updated on':'last_update'},inplace=True)
OECD_df

Unnamed: 0,ID,URL,term,related_URL,definition,theme,last_update,context,related
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,See Pollution abatement.,Environmental statistics,"Thursday, March 14, 2002",,Pollution abatement
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,Absence from work due to illness refers to the...,Health statistics,"Thursday, November 22, 2001",,
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,Functional limitation-free life expectancy is ...,Health statistics,"Wednesday, October 31, 2001",,
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care is one in which the principal inten...,Health statistics,"Thursday, April 25, 2013",,Acute care beds;Acute care hospital staff rati...
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care beds are beds accommodating patient...,Health statistics,"Thursday, April 25, 2013",Acute care beds have alternatively been define...,Acute care;Long-term care beds in hospitals
...,...,...,...,...,...,...,...,...,...
6934,7353,https://stats.oecd.org/glossary/detail.asp?ID=...,CO2,https://stats.oecd.org/glossary/detail.asp?ID=284,,,"Thursday, April 4, 2013",,Carbon dioxide (CO2)
6935,7354,https://stats.oecd.org/glossary/detail.asp?ID=...,Carbon market,https://stats.oecd.org/glossary/detail.asp?ID=...,A popular (but misleading) term for a trading ...,,"Thursday, April 4, 2013",,Greenhouse gases
6936,7355,https://stats.oecd.org/glossary/detail.asp?ID=...,Classification structure,https://stats.oecd.org/glossary/detail.asp?ID=350,Refers to how the categories of a classificati...,,"Tuesday, April 9, 2013",,Classification
6937,7356,https://stats.oecd.org/glossary/detail.asp?ID=...,United Nation Framework Convention on Climate ...,https://stats.oecd.org/glossary/detail.asp?ID=...,The United Nations Framework Convention on Cli...,,"Friday, April 26, 2013","The other “Rio Conventions”, also negotiated a...",United Nations Conference on Environment and D...


* Drop records with missing values and apply data cleansing.

In [6]:
print(OECD_df.isnull().sum())
OECD_df.dropna(subset=['term','definition'],inplace=True)
OECD_df.reset_index(drop=True, inplace=True)
print(OECD_df.isnull().sum())

OECD_df['term'] = OECD_df['term'].apply(clean)
OECD_df['definition'] = OECD_df['definition'].apply(clean)
OECD_df['context'] = OECD_df['context'].apply(clean)
OECD_df.head(5)

ID                0
URL               0
term              4
related_URL    4374
definition        3
theme            37
last_update    1764
context        5541
related        4374
dtype: int64
ID                0
URL               0
term              0
related_URL    4371
definition        0
theme            35
last_update    1761
context        5536
related        4371
dtype: int64


Unnamed: 0,ID,URL,term,related_URL,definition,theme,last_update,context,related
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,See Pollution abatement.,Environmental statistics,"Thursday, March 14, 2002",,Pollution abatement
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,Absence from work due to illness refers to the...,Health statistics,"Thursday, November 22, 2001",,
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,Functional limitation-free life expectancy is ...,Health statistics,"Wednesday, October 31, 2001",,
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care is one in which the principal inten...,Health statistics,"Thursday, April 25, 2013",,Acute care beds;Acute care hospital staff rati...
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care beds are beds accommodating patient...,Health statistics,"Thursday, April 25, 2013",Acute care beds have alternatively been define...,Acute care;Long-term care beds in hospitals


### Tokenize and stem the articles terms, definitions and contexts

* Also remove stop-words.
* Create columns _term tokens_, _definition tokens_, _context tokens_.

In [7]:
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS
all_stopwords_gensim = STOPWORDS

p = PorterStemmer()

def text_to_words(text):
    words = str(gensim.utils.simple_preprocess(text, deacc=True))
    words = gensim.utils.tokenize(words)
    words = [word for word in words if not word in all_stopwords_gensim]
        
    words = [p.stem(token) for token in words]  
    return ' '.join(words)        

texts=list()

for i in range(len(OECD_df)):
    OECD_df.loc[i,'term tokens']=text_to_words(OECD_df.loc[i,'term'])
    OECD_df.loc[i,'definition tokens']=text_to_words(OECD_df.loc[i,'definition'])
    if not pd.isnull(OECD_df.loc[i,'context']):        
        OECD_df.loc[i,'context tokens']=text_to_words(OECD_df.loc[i,'context'])
    else:
        OECD_df.loc[i,'context tokens']=''

OECD_df

Unnamed: 0,ID,URL,term,related_URL,definition,theme,last_update,context,related,term tokens,definition tokens,context tokens
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,See Pollution abatement.,Environmental statistics,"Thursday, March 14, 2002",,Pollution abatement,abat,pollut abat,
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,Absence from work due to illness refers to the...,Health statistics,"Thursday, November 22, 2001",,,absenc work ill,absenc work ill refer number work dai lost yea...,
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,Functional limitation-free life expectancy is ...,Health statistics,"Wednesday, October 31, 2001",,,activ restrict free expect,function limit free life expect averag number ...,
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care is one in which the principal inten...,Health statistics,"Thursday, April 25, 2013",,Acute care beds;Acute care hospital staff rati...,acut care,acut care princip intent follow manag labour o...,
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care beds are beds accommodating patient...,Health statistics,"Thursday, April 25, 2013",Acute care beds have alternatively been define...,Acute care;Long-term care beds in hospitals,acut care bed,acut care bed bed accommod patient princip cli...,acut care bed altern defin bed accommod patien...
...,...,...,...,...,...,...,...,...,...,...,...,...
6928,7352,https://stats.oecd.org/glossary/detail.asp?ID=...,European Agricultural Fund for Rural Developme...,https://stats.oecd.org/glossary/detail.asp?ID=...,The Common Agricultural Policy (CAP) is financ...,,"Wednesday, April 3, 2013",,Common Agricultural Policy (CAP);European Agri...,european agricultur fund rural develop eafrd,common agricultur polici cap financ fund europ...,
6929,7354,https://stats.oecd.org/glossary/detail.asp?ID=...,Carbon market,https://stats.oecd.org/glossary/detail.asp?ID=...,A popular (but misleading) term for a trading ...,,"Thursday, April 4, 2013",,Greenhouse gases,carbon market,popular mislead term trade countri bui sell un...,
6930,7355,https://stats.oecd.org/glossary/detail.asp?ID=...,Classification structure,https://stats.oecd.org/glossary/detail.asp?ID=350,Refers to how the categories of a classificati...,,"Tuesday, April 9, 2013",,Classification,classif structur,refer categori classif arrang group sub divid ...,
6931,7356,https://stats.oecd.org/glossary/detail.asp?ID=...,United Nation Framework Convention on Climate ...,https://stats.oecd.org/glossary/detail.asp?ID=...,The United Nations Framework Convention on Cli...,,"Friday, April 26, 2013","The other Rio Conventions, also negotiated at ...",United Nations Conference on Environment and D...,unit nation framework convent climat chang unfccc,unit nation framework convent climat chang unf...,rio convent negoti unit nation confer environ ...


### Read the file with the correspondence between a) Eurostat's themes and sub-themes b) OECD's Glossary themes

* There may be more than one OECD's themes corresponding to a Eurostat's theme and sub-theme combination.

In [8]:
corresp_df =  pd.read_excel('themes_eurostat_oecd_v2.xlsx')
corresp_df.drop(columns=['Unnamed: 0','id'],inplace=True)
corresp_df.rename(columns={'Eurostat_Themes':'ESTAT_theme','Subthemes':'ESTAT_sub_theme','OECD_Themes':'OECD_theme'},inplace=True)
corresp_df.replace(np.nan,value='',inplace=True)
corresp_df['OECD_theme'] = corresp_df['OECD_theme'].apply(lambda x: x.split(';'))
print(corresp_df.isnull().sum())
corresp_df


ESTAT_theme        0
ESTAT_sub_theme    0
OECD_theme         0
dtype: int64


Unnamed: 0,ESTAT_theme,ESTAT_sub_theme,OECD_theme
0,General and regional statistics/EU policies,Non-EU countries,[]
1,General and regional statistics/EU policies,Regions and cities,[]
2,General and regional statistics/EU policies,Sustainable development goals,[]
3,General and regional statistics/EU policies,Policy indicators,[]
4,Economy and finance,Balance of payments,[Financial statistics - Balance of payments]
5,Economy and finance,Comparative price levels (PPPs),[Prices and purchasing power parities - Price ...
6,Economy and finance,Consumer prices,[Prices and purchasing power parities - Price ...
7,Economy and finance,Exchange rates and interest rates,[Financial statistics - Exchange rates]
8,Economy and finance,Government finance,[Financial statistics - Government finance and...
9,Economy and finance,National accounts (incl. GDP),"[National accounts - Input-output tables, Nati..."


### Insert Eurostat's themes - sub-themes information into OECD Glossary articles dataframe

In [9]:
OECD_df['ESTAT_theme']=pd.Series(list() for i in range(len(OECD_df)))
OECD_df['ESTAT_sub_theme']=pd.Series(list() for i in range(len(OECD_df)))
for i in range(len(OECD_df)):
    theme = OECD_df.loc[i,'theme']
    #print(theme)
    for j in range(len(corresp_df)):
        if theme in corresp_df.loc[j,'OECD_theme']:
            if corresp_df.loc[j,'ESTAT_theme'] not in OECD_df.loc[i,'ESTAT_theme']: ## avoid duplicates
                OECD_df.loc[i,'ESTAT_theme'].append(corresp_df.loc[j,'ESTAT_theme'])
            if corresp_df.loc[j,'ESTAT_sub_theme'] not in OECD_df.loc[i,'ESTAT_sub_theme']: ## avoid duplicates               
                OECD_df.loc[i,'ESTAT_sub_theme'].append(corresp_df.loc[j,'ESTAT_sub_theme'])
            

idx=OECD_df[OECD_df['ESTAT_theme'].apply(len)==0].index
OECD_df.drop(index=idx,inplace=True)
OECD_df.reset_index()
OECD_df

Unnamed: 0,ID,URL,term,related_URL,definition,theme,last_update,context,related,term tokens,definition tokens,context tokens,ESTAT_theme,ESTAT_sub_theme
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,See Pollution abatement.,Environmental statistics,"Thursday, March 14, 2002",,Pollution abatement,abat,pollut abat,,[Environment and energy],"[Energy, Environment]"
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,Absence from work due to illness refers to the...,Health statistics,"Thursday, November 22, 2001",,,absenc work ill,absenc work ill refer number work dai lost yea...,,[Population and social conditions],[Health]
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,Functional limitation-free life expectancy is ...,Health statistics,"Wednesday, October 31, 2001",,,activ restrict free expect,function limit free life expect averag number ...,,[Population and social conditions],[Health]
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care is one in which the principal inten...,Health statistics,"Thursday, April 25, 2013",,Acute care beds;Acute care hospital staff rati...,acut care,acut care princip intent follow manag labour o...,,[Population and social conditions],[Health]
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care beds are beds accommodating patient...,Health statistics,"Thursday, April 25, 2013",Acute care beds have alternatively been define...,Acute care;Long-term care beds in hospitals,acut care bed,acut care bed bed accommod patient princip cli...,acut care bed altern defin bed accommod patien...,[Population and social conditions],[Health]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6919,7343,https://stats.oecd.org/glossary/detail.asp?ID=...,Statistical products,https://stats.oecd.org/glossary/detail.asp?ID=...,"Statistical products are, generally, informati...",Methodological information (metadata),"Monday, October 1, 2007",Statistical products include general-purpose t...,Statistical data,statist product,statist product gener inform dissemin product ...,statist product includ gener purpos tabul anal...,[Other],[Methodology]
6920,7344,https://stats.oecd.org/glossary/detail.asp?ID=...,Statistical press release,https://stats.oecd.org/glossary/detail.asp?ID=...,Is an announcement to media of statistical pro...,Methodological information (metadata),"Monday, October 1, 2007",,Statistical products,statist press releas,announc media statist product releas contain t...,,[Other],[Methodology]
6921,7345,https://stats.oecd.org/glossary/detail.asp?ID=...,Press release,https://stats.oecd.org/glossary/detail.asp?ID=...,See Statistical press release.,Methodological information (metadata),,,Statistical press release,press releas,statist press releas,,[Other],[Methodology]
6922,7346,https://stats.oecd.org/glossary/detail.asp?ID=...,APW - Average Production Worker,,An adult full-time worker directly engaged in ...,Tax policy & analysis - Taxing Wages,"Thursday, January 13, 2011",This definition was last used in tax calculati...,,apw averag product worker,adult time worker directli engag product activ...,definit tax calcul tax wage public averag work...,[Economy and finance],[Government finance]


In [10]:
def my_split(x):
    if pd.isna(x):
        return []
    else:
        return x.split(';')

OECD_df['related'] = OECD_df['related'].apply(my_split)
OECD_df[["day", "month", "year"]] =OECD_df["last_update"].str.split(",", expand = True)
OECD_df['year'] =OECD_df["year"].astype(str)

OECD_df.loc[OECD_df['year'] == 'nan', 'year'] = np.nan 

OECD_df['year'].fillna(value="Not found", inplace=True)
OECD_df.reset_index(drop=True,inplace=True)
OECD_df

Unnamed: 0,ID,URL,term,related_URL,definition,theme,last_update,context,related,term tokens,definition tokens,context tokens,ESTAT_theme,ESTAT_sub_theme,day,month,year
0,1,https://stats.oecd.org/glossary/detail.asp?ID=1,Abatement,https://stats.oecd.org/glossary/detail.asp?ID=...,See Pollution abatement.,Environmental statistics,"Thursday, March 14, 2002",,[Pollution abatement],abat,pollut abat,,[Environment and energy],"[Energy, Environment]",Thursday,March 14,2002
1,2,https://stats.oecd.org/glossary/detail.asp?ID=2,Absence from work due to illness,,Absence from work due to illness refers to the...,Health statistics,"Thursday, November 22, 2001",,[],absenc work ill,absenc work ill refer number work dai lost yea...,,[Population and social conditions],[Health],Thursday,November 22,2001
2,3,https://stats.oecd.org/glossary/detail.asp?ID=3,Activity restriction - free expectancy,,Functional limitation-free life expectancy is ...,Health statistics,"Wednesday, October 31, 2001",,[],activ restrict free expect,function limit free life expect averag number ...,,[Population and social conditions],[Health],Wednesday,October 31,2001
3,4,https://stats.oecd.org/glossary/detail.asp?ID=4,Acute care,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care is one in which the principal inten...,Health statistics,"Thursday, April 25, 2013",,"[Acute care beds, Acute care hospital staff ra...",acut care,acut care princip intent follow manag labour o...,,[Population and social conditions],[Health],Thursday,April 25,2013
4,5,https://stats.oecd.org/glossary/detail.asp?ID=5,Acute care beds,https://stats.oecd.org/glossary/detail.asp?ID=...,Acute care beds are beds accommodating patient...,Health statistics,"Thursday, April 25, 2013",Acute care beds have alternatively been define...,"[Acute care, Long-term care beds in hospitals]",acut care bed,acut care bed bed accommod patient princip cli...,acut care bed altern defin bed accommod patien...,[Population and social conditions],[Health],Thursday,April 25,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5667,7343,https://stats.oecd.org/glossary/detail.asp?ID=...,Statistical products,https://stats.oecd.org/glossary/detail.asp?ID=...,"Statistical products are, generally, informati...",Methodological information (metadata),"Monday, October 1, 2007",Statistical products include general-purpose t...,[Statistical data],statist product,statist product gener inform dissemin product ...,statist product includ gener purpos tabul anal...,[Other],[Methodology],Monday,October 1,2007
5668,7344,https://stats.oecd.org/glossary/detail.asp?ID=...,Statistical press release,https://stats.oecd.org/glossary/detail.asp?ID=...,Is an announcement to media of statistical pro...,Methodological information (metadata),"Monday, October 1, 2007",,[Statistical products],statist press releas,announc media statist product releas contain t...,,[Other],[Methodology],Monday,October 1,2007
5669,7345,https://stats.oecd.org/glossary/detail.asp?ID=...,Press release,https://stats.oecd.org/glossary/detail.asp?ID=...,See Statistical press release.,Methodological information (metadata),,,[Statistical press release],press releas,statist press releas,,[Other],[Methodology],,,Not found
5670,7346,https://stats.oecd.org/glossary/detail.asp?ID=...,APW - Average Production Worker,,An adult full-time worker directly engaged in ...,Tax policy & analysis - Taxing Wages,"Thursday, January 13, 2011",This definition was last used in tax calculati...,[],apw averag product worker,adult time worker directli engag product activ...,definit tax calcul tax wage public averag work...,[Economy and finance],[Government finance],Thursday,January 13,2011


### Produce also file for input to Power BI

In [11]:
OECD_content = OECD_df.copy()
OECD_content['ESTAT_theme'] = OECD_content['ESTAT_theme'].apply(lambda x: ';'.join(x))
OECD_content['ESTAT_sub_theme'] = OECD_content['ESTAT_sub_theme'].apply(lambda x: ';'.join(x))
OECD_content
OECD_content.to_excel('OECD_content.xlsx')

## Facets
***
### The function filtering the results.

In [12]:

df1 = None

def articles(Top_articles, Keywords, themes_dd, sub_themes_dd,year,related): 

    global df1
    
    Keywords = text_to_words(Keywords) 
    df1=OECD_df[OECD_df['term tokens'].str.contains(Keywords,regex=False) | OECD_df['definition tokens'].str.contains(Keywords,regex=False) | OECD_df['context tokens'].str.contains(Keywords,regex=False)]

    if year != "All years":    
        df1 = df1[df1['year'].str.contains(year)]    
        
        
    if themes_dd != "All themes":
        #df1 = df1[df1['ESTAT_theme'].str.contains(themes_dd,regex=False)]
        df1=df1[df1['ESTAT_theme'].apply(lambda x: themes_dd in x)]
        

    if sub_themes_dd is not None:
        if sub_themes_dd != "All sub-themes" and sub_themes_dd != "":    
            #df1 = df1[df1['ESTAT_sub_theme'].str.contains(sub_themes_dd,regex=False)]
            df1=df1[df1['ESTAT_sub_theme'].apply(lambda x: sub_themes_dd in x)]

    if len(df1) == 0:
        print("No matches found")
    else:
        df1.reset_index(inplace=True)
        print(df1.term.count()," articles found")
        h = ''
        for i in range(min(Top_articles,len(df1))):
            l,n = df1.loc[i,["URL","term"]].values
            h += '<br/><u><a href="' + l + '" target="_blank">'+ n + '</a></u>'
            if related:
                h += '<blockquote>Related links:'
                if len(df1.loc[i,"related"]) > 0:
                    # print(df1.loc[i,"related"])
                    #print(len(df1.loc[i,"related"]))
                    #if not np.isnan(df1.loc[i,"related"]):
                    for k in range(len(df1.loc[i,"related"])):
                        #print(df1.loc[i,"related"])
                        title = df1.loc[i,"related"][k]
                        url = df1.loc[i,"related_URL"][k]
                        h += '<br><a href="' + url + '" target="_blank">'+ title + '</a>'
                else:
                    h += ' None'
                h += '</blockquote>'
        display(HTML( h))
  
    

### The widgets.

In [13]:
from IPython.display import display
from ipywidgets import HTML
layout = widgets.Layout(width='500px', height='30px')

In [14]:
def query_build2(themes):
    style = {'description_width': 'initial'}
      
    
    
    themes_dd = widgets.Dropdown(
        description='Select theme:',
        options=['All themes']+sorted([k for k in themes.keys()]),
        style=style
    )    
    

    def on_change_theme(change):
        if change['type'] == 'change' and change['name'] == 'value':
            if change['new'] == 'All themes':
                sub_themes_dd.options = []
            else:    
                sub_themes_dd.options = ['All sub-themes']+themes[themes_dd.value]
                
                

    themes_dd.observe(on_change_theme)
    
    sub_themes_dd = widgets.Dropdown(
        description='Select sub-theme:',
        options= [''],
        style=style
    )    
    

 
   

    Top_articles = widgets.IntSlider(
        description='Display',
        tooltip='maximum:',
        value=50,
        min=1, 
        max = 200,
        style={'description_width': 'initial'}
    )


    Keywords = widgets.Text(
        value='',
        placeholder='Type something',
        description='Keywords:',
        disabled=False
    )
    

    year = widgets.Dropdown(
        options=['All years','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','Not found'],
        value='All years',
        description='Year:',
        disabled=False)


    
    related = widgets.Checkbox(
        value=False,
        description='Show related links',
        disabled=False,
        indent=True
     )    
    
    out = widgets.interactive_output(articles, {'Top_articles': Top_articles, 'Keywords': Keywords,'themes_dd':themes_dd,'sub_themes_dd':sub_themes_dd,'year':year,'related':related})


    left_box = widgets.VBox([themes_dd, sub_themes_dd])
    right_box = widgets.VBox([Keywords,related])
    box = widgets.HBox([left_box, right_box])
    display(box)
    
    display(year,Top_articles,out)
    
        

query_build2(themes)



HBox(children=(VBox(children=(Dropdown(description='Select theme:', options=('All themes', 'Agriculture, fores…

Dropdown(description='Year:', options=('All years', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2…

IntSlider(value=50, description='Display', max=200, min=1, style=SliderStyle(description_width='initial'))

Output()