# Use Case B - Query builder 
***
### Using 2-grams, 3-grams, 4-grams with content from the SE Glossary articles, the Statistics Explained articles and OECD's Glossary of Statistical Terms: https://stats.oecd.org/glossary/¶


In [21]:
import pandas as pd
import numpy as np

import pyodbc

import gensim

pd.set_option('display.max_colwidth', 40)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from datetime import datetime

def file_name(pre,ext):
    current_time = datetime.now() 
    return pre + '_'+ str(current_time.month)+ '_' + str(current_time.day) + \
                 '_' + str(current_time.hour)+ '_' + str(current_time.minute)  +'.'+ext
    

### Connect to the database

In [22]:
c = pyodbc.connect('DSN=Virtuoso All;DBA=ESTAT;UID=kimon;PWD=RkhvQYZ442e2JVXLHdtW')
cursor = c.cursor()

### Glossary articles  

* Definitions from dat_glossary.
* Titles and URLs from dat_link_info (with resource_information_id=1, i.e. Eurostat, see ESTAT.V1.mod_resource_information).
* Match above on id.
* Errors from the scraping process are corrected in a later chunk.

In [23]:
SQLCommand = """SELECT T1.id, T1.definition, T2.title, T2.url 
                FROM ESTAT.V1.dat_glossary as T1 
                INNER JOIN ESTAT.V1.dat_link_info as T2  
                  ON T1.id=T2.id 
                WHERE T2.resource_information_id=1 """

GL_df = pd.read_sql(SQLCommand,c)
GL_df = GL_df[['id', 'title', 'definition', 'url']]
GL_df.head(5)

Unnamed: 0,id,title,definition,url
0,1,Accident at work,An accident at work in the ...,https://ec.europa.eu/eurostat/statis...
1,5,Fatal accident at work,A fatal accident at work re...,https://ec.europa.eu/eurostat/statis...
2,6,Non-fatal accident at work,A non-fatal accident at w...,https://ec.europa.eu/eurostat/statis...
3,8,Aggregate demand,Aggregate demand is the total ...,https://ec.europa.eu/eurostat/statis...
4,9,Goods and services account,The goods and services account...,https://ec.europa.eu/eurostat/statis...


### Check for missing information

In [24]:
GL_df = GL_df.replace('', np.nan) 
print(GL_df.isnull().sum())


id            0
title         0
definition    2
url           0
dtype: int64


### Delete records with empty definitions and carry out data cleansing

* Also put the URLs in lists. This is required later.

In [25]:
## drop records with any field empty
GL_df = GL_df.dropna(axis=0,how='any')
print(GL_df.isnull().sum())
GL_df.reset_index(drop=True, inplace=True)

import re
#import unicodedata as ud

def clean(x, quotes=True):
    if pd.isnull(x): return x  
    x = x.strip()
    
    ## make letter-question mark-letter -> letter-quote-space-letter !!! but NOT in the lists of URLs!!!
    if quotes:
        x = re.sub(r'([A-Za-z])\?([A-Za-z])','\\1\' \\2',x) 
    
    ## make letter-question mark-space lower case letter letter-quote-space letter
    x = re.sub(r'([A-Za-z])\? ([a-z])','\\1\' \\2',x) 

    ## delete ,000 commas in numbers    
    x = re.sub(r'\b(\d+),(\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## delete  000 spaces in numbers
    x = re.sub(r'\b(\d+) (\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## remove more than one spaces
    x = re.sub(r' +', ' ',x)
    
    ## remove start and end spaces
    x = re.sub(r'^ +| +$', '',x,flags=re.MULTILINE) 
    
    ## space-comma -> comma
    x = re.sub(r' \,',',',x)
    
    ## space-dot -> dot
    x = re.sub(r' \.','.',x)
    
    x = re.sub(r'â.{2}',"'",x) ### !!! NEW: single quotes are read as: âXX
    
    #x = x.encode('latin1').decode('utf-8') ## â\x80\x99
    #x = ud.normalize('NFKD',x).encode('ascii', 'ignore').decode()
    
    return x


GL_df['title'] = GL_df['title'].apply(clean)
GL_df['title'] = GL_df['title'].apply(lambda x: re.sub(r'\?','-',x)) ## also replace question marks by dashes
GL_df['definition'] = GL_df['definition'].apply(clean)
GL_df['url'] = GL_df['url'].apply(clean,quotes=False)  
GL_df['url'] = GL_df['url'].apply(lambda x: [x]) ## also put each URL in a list - required later

GL_df.head(5)

id            0
title         0
definition    0
url           0
dtype: int64


Unnamed: 0,id,title,definition,url
0,1,Accident at work,An accident at work in the framework...,[https://ec.europa.eu/eurostat/stati...
1,5,Fatal accident at work,A fatal accident at work refers to a...,[https://ec.europa.eu/eurostat/stati...
2,6,Non-fatal accident at work,A non-fatal accident at work is an a...,[https://ec.europa.eu/eurostat/stati...
3,8,Aggregate demand,Aggregate demand is the total amount...,[https://ec.europa.eu/eurostat/stati...
4,9,Goods and services account,The goods and services account shows...,[https://ec.europa.eu/eurostat/stati...


### Delete "special" records

* i.e. redirections.

In [26]:
## Drop The records with definitions "The revision ..." and "Redirect to ..." 

idx = GL_df[GL_df['definition'].str.startswith('The revision #')].index
print(idx)
GL_df.drop(idx , inplace=True)
idx = GL_df[GL_df['definition'].str.startswith('Redirect to')].index
print(idx)
GL_df.drop(idx , inplace=True)
GL_df.reset_index(drop=True, inplace=True)

Int64Index([ 230,  291,  383,  385,  432,  435,  437,  438,  503,  518,  529,
             556,  587,  728,  741,  774,  825,  888,  890,  911,  959,  960,
             968, 1001, 1005, 1131, 1142, 1180, 1229],
           dtype='int64')
Int64Index([], dtype='int64')


### Check for duplicate titles AND definitions

In [27]:
idx = GL_df[GL_df.duplicated(subset=['title','definition'], keep=False)].sort_values(by=['title','definition']).index
print(idx)
GL_df.drop(idx , inplace=True)
GL_df.reset_index(drop=True, inplace=True)

Int64Index([1217, 1218], dtype='int64')


### Statistics explained articles

* IDs, titles and urls from dat_link_info, with resource_information_id=1, i.e. Eurostat (see ESTAT.V1.mod_resource_information) and matching IDs from dat_article.
* Carry out data cleansing on titles and URLs.
* Also put the URLs in lists. This is required later.

In [28]:
SQLCommand = """SELECT id, title, url 
                FROM ESTAT.V1.dat_link_info 
                WHERE resource_information_id=1 AND id IN (SELECT id FROM ESTAT.V1.dat_article) """

SE_df = pd.read_sql(SQLCommand,c)

SE_df['title'] = SE_df['title'].apply(clean)
SE_df['url'] = SE_df['url'].apply(clean,quotes=False)
SE_df['url'] = SE_df['url'].apply(lambda x: [x])
SE_df.head(5)

Unnamed: 0,id,title,url
0,7,Accidents at work statistics,[https://ec.europa.eu/eurostat/stati...
1,13,National accounts and GDP,[https://ec.europa.eu/eurostat/stati...
2,16,Railway safety statistics in the EU,[https://ec.europa.eu/eurostat/stati...
3,17,Railway freight transport statistics,[https://ec.europa.eu/eurostat/stati...
4,18,Railway passenger transport statisti...,[https://ec.europa.eu/eurostat/stati...


### Add paragraphs titles and contents

* From dat_article_paragraph with abstract=0 (i.e. "no").
* Match article_id from dat_article_paragraph with id from dat_article.
* Carry out data cleansing on titles and paragraph contents.

In [29]:
SQLCommand = """SELECT article_id, title, content 
                FROM ESTAT.V1.dat_article_paragraph
                WHERE abstract=0 AND article_id IN (SELECT id FROM ESTAT.V1.dat_article) """

add_content = pd.read_sql(SQLCommand,c)
add_content['title'] = add_content['title'].apply(clean)
add_content['content'] = add_content['content'].apply(clean)
add_content

Unnamed: 0,article_id,title,content
0,2905,Absences from work sharply increase ...,Absences from work recorded unpreced...
1,2905,Absences: 9.5 % of employment in Q4 ...,The article's next figure (Figure 4)...
2,2905,Higher share of absences from work a...,Considering all four quarters of 202...
3,2905,Absences from work due to own illnes...,"From Q4 2019 to Q4 2020, the number ..."
4,2905,Absences from work due to holidays,Expressed as a share of employed peo...
...,...,...,...
3854,10539,General presentation and definition,Scope of asylum statistics and Dubli...
3855,10539,Methodological aspects in asylum sta...,Annual aggregate of the number of as...
3856,10539,Methodological aspects in Dublin sta...,Asymmetries For most of the collecte...
3857,10539,What questions can or cannot be answ...,How many asylum seekers are entering...


### Aggregate above paragraph titles and contents  from SE articles paragraphs by article id

* Create a column _raw content_ which gathers all paragraph titles and contents in one text per article.

In [30]:

add_content_grouped = add_content.groupby(['article_id'])[['title','content']].aggregate(lambda x: list(x))
add_content_grouped.reset_index(drop=False, inplace=True)
for i in range(len(add_content_grouped)):
    add_content_grouped.loc[i,'raw content'] = ''
    for (a,b) in zip(add_content_grouped.loc[i,'title'],add_content_grouped.loc[i,'content']):
        add_content_grouped.loc[i,'raw content'] += ' '+a + ' ' + b
add_content_grouped = add_content_grouped[['article_id','raw content']]    

add_content_grouped

Unnamed: 0,article_id,raw content
0,7,"Number of accidents In 2018, there ..."
1,13,Developments for GDP in the EU-27: ...
2,16,Fall in the number of railway accid...
3,17,Downturn for EU transport performan...
4,18,Rail passenger transport performanc...
...,...,...
860,10456,Problem After successfully identify...
861,10470,"Problem In France, there was signif..."
862,10506,General overview Nine PEEIs concern...
863,10531,What are administrative sources? Th...


### Merge raw content of SE articles with main file

In [31]:
SE_df = pd.merge(SE_df,add_content_grouped,left_on='id',right_on='article_id',how='inner')
SE_df.drop(['article_id'],axis=1,inplace=True)

SE_df.head(5)

Unnamed: 0,id,title,url,raw content
0,7,Accidents at work statistics,[https://ec.europa.eu/eurostat/stati...,"Number of accidents In 2018, there ..."
1,13,National accounts and GDP,[https://ec.europa.eu/eurostat/stati...,Developments for GDP in the EU-27: ...
2,16,Railway safety statistics in the EU,[https://ec.europa.eu/eurostat/stati...,Fall in the number of railway accid...
3,17,Railway freight transport statistics,[https://ec.europa.eu/eurostat/stati...,Downturn for EU transport performan...
4,18,Railway passenger transport statisti...,[https://ec.europa.eu/eurostat/stati...,Rail passenger transport performanc...


### Read file with OECD's terms and definitions
* Column 'Cross_References_2' has the correct cross-references from 'Cross References:', separated by semicolons and also with some invalid ones (not valid URL in 'URL:Cross References' removed.
* Above not used in this code though.
* Also put the URLs in lists. This is required later.

In [32]:
OECD_df = pd.read_excel('OECD_final_results_2.xlsx')
OECD_df.drop(columns=['Unnamed: 0'],inplace=True)
OECD_df['URL'] = OECD_df['URL'].apply(lambda x: [x]) ## also put each URL in a list - required later
OECD_df

Unnamed: 0,ID,URL,Term,URL:Cross References,French Equivalent:,Definition:,Cross References:,Statistical Theme:,Created on,Last updated on,Source Publication:,Context:,Hyperlink:,Glossary Output Segments:,Classification Indicator:,Version Indicator:,French Definition:,Cross_References_2
0,1,[https://stats.oecd.org/glossary/det...,Abatement,https://stats.oecd.org/glossary/deta...,Réduction,See Pollution abatement.,Pollution abatement,Environmental statistics,"Tuesday, September 25, 2001","Thursday, March 14, 2002",,,,,,,,Pollution abatement
1,2,[https://stats.oecd.org/glossary/det...,Absence from work due to illness,,,Absence from work due to illness ref...,,Health statistics,"Tuesday, September 25, 2001","Thursday, November 22, 2001",OECD Health Data 2001: A Comparative...,,,,,,,
2,3,[https://stats.oecd.org/glossary/det...,Activity restriction - free expectancy,,,Functional limitation-free life expe...,,Health statistics,"Tuesday, September 25, 2001","Wednesday, October 31, 2001",OECD Health Data 2001: A Comparative...,,,,,,,
3,4,[https://stats.oecd.org/glossary/det...,Acute care,https://stats.oecd.org/glossary/deta...,,Acute care is one in which the princ...,Acute care beds Acute care hospital ...,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",OECD Health Data 2001: A Comparative...,,,,,,,Acute care beds;Acute care hospital ...
4,5,[https://stats.oecd.org/glossary/det...,Acute care beds,https://stats.oecd.org/glossary/deta...,,Acute care beds are beds accommodati...,Acute care Long-term care beds in ho...,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",2001 Data Collection on Education Sy...,Acute care beds have alternatively b...,,,,,,Acute care;Long-term care beds in ho...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6934,7353,[https://stats.oecd.org/glossary/det...,CO2,https://stats.oecd.org/glossary/deta...,,,Carbon dioxide (CO2),,"Thursday, April 4, 2013","Thursday, April 4, 2013",,,,,,,,Carbon dioxide (CO2)
6935,7354,[https://stats.oecd.org/glossary/det...,Carbon market,https://stats.oecd.org/glossary/deta...,,A popular (but misleading) term for ...,Greenhouse gases,,"Thursday, April 4, 2013","Thursday, April 4, 2013",United Nations Framework Convention ...,,http://unfccc.int/essential_backgrou...,,,,,Greenhouse gases
6936,7355,[https://stats.oecd.org/glossary/det...,Classification structure,https://stats.oecd.org/glossary/deta...,,Refers to how the categories of a cl...,Classification,,"Tuesday, April 9, 2013","Tuesday, April 9, 2013","United Nations Statistics Division, ...",,http://unstats.un.org/unsd/class/fam...,,,,,Classification
6937,7356,[https://stats.oecd.org/glossary/det...,United Nation Framework Convention o...,https://stats.oecd.org/glossary/deta...,,The United Nations Framework Convent...,United Nations Conference on Environ...,,"Tuesday, April 9, 2013","Friday, April 26, 2013",United Nations Framework Convention ...,"The other “Rio Conventions”, also ne...",http://unfccc.int/2860.php,,,,,United Nations Conference on Environ...


* Drop records with missing values and apply data cleansing.

In [33]:
print(OECD_df.isnull().sum())
OECD_df.dropna(subset=['Term','Definition:'],inplace=True)
OECD_df.reset_index(drop=True, inplace=True)
print(OECD_df.isnull().sum())

OECD_df['Term'] = OECD_df['Term'].apply(clean)
OECD_df['Definition:'] = OECD_df['Definition:'].apply(clean)
OECD_df['Context:'] = OECD_df['Context:'].apply(clean)
OECD_df.head(5)

OECD_df.to_excel('OECD_df.xlsx')

ID                              0
URL                             0
Term                            4
URL:Cross References         4374
French Equivalent:           4813
Definition:                     3
Cross References:            4377
Statistical Theme:             37
Created on                      0
Last updated on              1764
Source Publication:           869
Context:                     5541
Hyperlink:                   4650
Glossary Output Segments:    6369
Classification Indicator:    6862
Version Indicator:           6632
French Definition:           6935
Cross_References_2           4374
dtype: int64
ID                              0
URL                             0
Term                            0
URL:Cross References         4371
French Equivalent:           4809
Definition:                     0
Cross References:            4374
Statistical Theme:             35
Created on                      0
Last updated on              1761
Source Publication:           866
C

Unnamed: 0,ID,URL,Term,URL:Cross References,French Equivalent:,Definition:,Cross References:,Statistical Theme:,Created on,Last updated on,Source Publication:,Context:,Hyperlink:,Glossary Output Segments:,Classification Indicator:,Version Indicator:,French Definition:,Cross_References_2
0,1,[https://stats.oecd.org/glossary/det...,Abatement,https://stats.oecd.org/glossary/deta...,Réduction,See Pollution abatement.,Pollution abatement,Environmental statistics,"Tuesday, September 25, 2001","Thursday, March 14, 2002",,,,,,,,Pollution abatement
1,2,[https://stats.oecd.org/glossary/det...,Absence from work due to illness,,,Absence from work due to illness ref...,,Health statistics,"Tuesday, September 25, 2001","Thursday, November 22, 2001",OECD Health Data 2001: A Comparative...,,,,,,,
2,3,[https://stats.oecd.org/glossary/det...,Activity restriction - free expectancy,,,Functional limitation-free life expe...,,Health statistics,"Tuesday, September 25, 2001","Wednesday, October 31, 2001",OECD Health Data 2001: A Comparative...,,,,,,,
3,4,[https://stats.oecd.org/glossary/det...,Acute care,https://stats.oecd.org/glossary/deta...,,Acute care is one in which the princ...,Acute care beds Acute care hospital ...,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",OECD Health Data 2001: A Comparative...,,,,,,,Acute care beds;Acute care hospital ...
4,5,[https://stats.oecd.org/glossary/det...,Acute care beds,https://stats.oecd.org/glossary/deta...,,Acute care beds are beds accommodati...,Acute care Long-term care beds in ho...,Health statistics,"Tuesday, September 25, 2001","Thursday, April 25, 2013",2001 Data Collection on Education Sy...,Acute care beds have alternatively b...,,,,,,Acute care;Long-term care beds in ho...


### Tokenize, remove stop-words and stem; keep also the original terms

* Use titles and definitions from the Glossary articles.
* Use titles and raw content from the SE articles
* Use terms, definitions and contexts from OECD's glossary entries.
* _texts_ is a list containing lists. Each sub-list has the stemmed term, the original term and the URL where the term was found. The URL is itself put in a list.

In [34]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS

all_stopwords_gensim = STOPWORDS


p = PorterStemmer()

def text_to_words(text,url):
    words = str(gensim.utils.simple_preprocess(text, deacc=True))
    words = gensim.utils.tokenize(words)
    words = [word for word in words if not word in all_stopwords_gensim]    
        
    ## keep also original token!!! 
    words = [[p.stem(token),token,url] for token in words if len(p.stem(token)) >= 5] ##minimum length = 5 
    yield words        

texts=list()   

for i in range(len(GL_df)):
    texts.extend(text_to_words(GL_df.loc[i,'definition'],GL_df.loc[i,'url']))
    texts.extend(text_to_words(GL_df.loc[i,'title'],GL_df.loc[i,'url'])) 
for i in range(len(SE_df)):    
    texts.extend(text_to_words(SE_df.loc[i,'title'],SE_df.loc[i,'url'])) 
    texts.extend(text_to_words(SE_df.loc[i,'raw content'],SE_df.loc[i,'url'])) 
for i in range(len(OECD_df)):    
    texts.extend(text_to_words(OECD_df.loc[i,'Term'],OECD_df.loc[i,'URL'])) 
    texts.extend(text_to_words(OECD_df.loc[i,'Definition:'],OECD_df.loc[i,'URL'])) 
    OECD_rec = OECD_df.loc[i,'Context:']
    if not pd.isna(OECD_rec):
        texts.extend(text_to_words(OECD_rec,OECD_df.loc[i,'URL'])) 


* Example: the first 5 sub-lists in the first list in _texts_.

In [35]:
print(len(texts))
texts[0][:5]

19555


[['accid',
  'accident',
  ['https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Accident_at_work']],
 ['framework',
  'framework',
  ['https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Accident_at_work']],
 ['administr',
  'administrative',
  ['https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Accident_at_work']],
 ['collect',
  'collection',
  ['https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Accident_at_work']],
 ['european',
  'european',
  ['https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Accident_at_work']]]

### Co-occurences: keys in n-grams are (n-1) tuples of stemmed tokens 

* Three dictionaries, for 2-,3-,and 4-grams. The corresponding keys are single stemmed terms, pairs of stemmed terms and triplets, respectively.
* For each key in a dictionary, the value is another (nested) dictionary with the **original terms**, their counts and the relevant URLs. In the end, the counts are used to calculate probabilities.
* Below all three dictionaries are constructed from the sequences of 4-grams.


In [36]:
##%%script false --no-raise-error
## Check also COLLOCATIONS: http://www.nltk.org/howto/collocations.html and http://www.nltk.org/api/nltk.html?highlight=ngram


from nltk import bigrams, trigrams, ngrams
#from collections import Counter, defaultdict


model2=dict()
model3=dict()
model4=dict()

def dict_insert(model,entered,proposed,new_urls_to_check):
    key_1 = model.get(entered)
    if key_1:
        key_2 = key_1.get(proposed)
        if key_2:
            key_2[0] +=1
            existing_urls = key_2[1]
            add_urls = [u for u in new_urls_to_check if u not in existing_urls]
            key_2.extend(add_urls) 
        else:    
            key_1[proposed]= [1,new_urls_to_check]
    else:
        model[entered]={proposed:[1,new_urls_to_check]}
    return model            


# Co-occurences

for sentence in texts:
    pairs = [elem for elem in sentence] ## a list of 3-tuples (stemmed term, original term, list of URLs)
    if len(pairs) == 0: continue
    
    for first, second, third, fourth in ngrams(pairs,4): ## quadruplets of 3-tuples (stemmed term, original term, list of URLs)
        first_stem, first_orig, first_url = first
        second_stem, second_orig, second_url = second
        third_stem, third_orig, third_url = third
        fourth_stem, fourth_orig, fourth_url = fourth
       
        model2 = dict_insert(model2, first_stem, second_orig,list(set(first_url).intersection(second_url)))
        model2 = dict_insert(model2, second_stem, third_orig,list(set(second_url).intersection(third_url)))
        model2 = dict_insert(model2, third_stem, fourth_orig,list(set(third_url).intersection(fourth_url)))
        
        model3 = dict_insert(model3,(first_stem,second_stem),third_orig,list(set(first_url).intersection(*[second_url,third_url])))
        model3 = dict_insert(model3,(second_stem,third_stem),fourth_orig,list(set(second_url).intersection(*[third_url,fourth_url])))
        
        model4 = dict_insert(model4,(first_stem, second_stem, third_stem),fourth_orig,fourth_url)
   
        
## Transform counts to probabilities

for w1 in model2.keys():
    ssum = sum(model2[w1][w2][0] for w2 in model2[w1].keys())
    for w2 in model2[w1].keys():
        model2[w1][w2][0] /= ssum

for w1_w2 in model3.keys():
    ssum = sum(model3[w1_w2][w3][0] for w3 in model3[w1_w2].keys())
    for w3 in model3[w1_w2].keys():
        model3[w1_w2][w3][0] /= ssum

for w1_w2_w3 in model4.keys():
    ssum = sum(model4[w1_w2_w3][w4][0] for w4 in model4[w1_w2_w3].keys())
    for w4 in model4[w1_w2_w3].keys():
        model4[w1_w2_w3][w4][0] /= ssum


### Examples of keys and values in the three dictionaries

* We do not show an example of the 2-grams dictionary because the values are too many.


In [38]:

print('\nExample, model with 3-grams, key = (household,expenditur):\n')
print(model3[('household','expenditur')])

print('\nExample, model with 4-grams, key = (survei,structur,agricultur):\n')
print(model4[('household', 'incom', 'expenditur')])



Example, model with 3-grams, key = (household,expenditur):

{'services': [0.06, ['https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Household_budget_survey_(HBS)'], 'https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Ageing_Europe_-_statistics_on_pensions,_income_and_expenditure', 'https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Ageing_Europe_-_statistics_on_pensions,_income_and_expenditure', 'https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Ageing_Europe_-_statistics_on_pensions,_income_and_expenditure', 'https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Ageing_Europe_-_statistics_on_pensions,_income_and_expenditure'], 'increasingly': [0.02, ['https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Living_conditions_in_Europe_-_housing']], 'respectively': [0.08, ['https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Comparative_price_levels_of_consumer_goods_and_se

### The widgets
***

In [39]:
import ipywidgets as widgets
layout = widgets.Layout(width='600px', height='30px')

In [40]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [42]:

def change_top_articles( Keywords, Glossaries):

    from operator import itemgetter
    p = PorterStemmer()
    
    last_match = ''
    
    #if not Keywords.endswith(' '):
    #    return None
    
    def test_and_back_step(x):
        mod_index = -1
        models = [model2,model3,model4]
        if len(x)==1: 
            x=x[0] ; model=models[0]
        elif len(x) ==2 or len(x) == 3:
            x=tuple(x) ; mod_index=len(x)-1; model=models[mod_index]
        elif len(x) >=4:
            x=tuple(x[:3]) ; model=models[2]
#        else:
#            return None
        while not models[mod_index].get(x) and mod_index >=0:
            x=x[:-1]
            if len(x)==1 : x=x[0]
            mod_index -=1 ; model=models[mod_index] 
        return (model,x)    
            

    x = Keywords.split() 
    if len(x) ==0: 
        print()
        return
    x = [p.stem(el) for el in x]
    
    
    model,x = test_and_back_step(x)
    if not model.get(x):
        return None
    

    print()
    print('Based on last match: ',x,'\n')
    print('Suggestions, probabilities (in descending order) and relevant URLs: ')
    proposals = sorted([(k,v) for (k,v) in model[x].items()],key=itemgetter(1),reverse=True)
    
    ## Adjust sum of probabilities
    if Glossaries == 'OECD':
        valid_urls = [(v[0],list(filter(lambda x: re.match(r'^https://stats.oecd.org/',x),v[1]))) for (k,v) in proposals]
        valid_urls_sum = sum([v for (v,l) in valid_urls if len(l)>0])
    elif Glossaries == 'Eurostat':    
        valid_urls = [(v[0],list(filter(lambda x: re.match(r'^https://ec.europa.eu/eurostat/',x),v[1]))) for (k,v) in proposals]
        valid_urls_sum = sum([v for (v,l) in valid_urls if len(l)>0])
    else:
        valid_urls_sum = 1.0
    #print('valid_sum: ',valid_urls_sum)

    last_match = x
    sump = 0
    for key, value in proposals:
        urls = value[1]
        if Glossaries == 'OECD':
            urls = [url for url in urls if re.match(r'^https://stats.oecd.org/',url)]
        elif Glossaries == 'Eurostat':   
            urls = [url for url in urls if re.match(r'^https://ec.europa.eu/eurostat/',url)]            
        
        if len(urls) > 0: 
            print()
            print(key,': ',value[0]/valid_urls_sum)
            sump += value[0]/valid_urls_sum
            for url in urls:
                print(url)
    #print('sump= ',sump)    
   
    
def query_build1(value):
    style = {'description_width': 'initial'}
    
    Keywords = widgets.Text(
        value=value,
        placeholder='Type something',
        description='Keywords:',
        disabled=False
  )

    Glossaries= widgets.RadioButtons(
        options=['All vocabularies','Eurostat', 'OECD'],
        description='Select:',
        value='OECD',
        disabled=False
  )

  #ui = widgets.HBox([Keywords])
    ui = widgets.HBox([Keywords,Glossaries])
    out = widgets.interactive_output(change_top_articles, {'Keywords': Keywords,'Glossaries' : Glossaries})
    display(ui, out)
    
query_build1(value='Household income')  

HBox(children=(Text(value='Household income', description='Keywords:', placeholder='Type something'), RadioBut…

Output()