# Use Case A - Faceted search (in progress)
***

In [1]:
import pandas as pd
import numpy as np

import ipywidgets as widgets

import pyodbc

import gensim


## A. Import Statistics Explained data from the database
***

T1: id, context and last update.  
T2: ids, titles and URLs.  
Merge above (inner join on id).

T3: ids, titles and abstracts: from the ESTAT.V1.dat_article_paragraph table with abstract=1.  
Merge above (inner join on id, article_id).


In [2]:


c = pyodbc.connect('DSN=Virtuoso All;DBA=ESTAT;UID=user_name;PWD=password')
cursor = c.cursor()

SQLCommand = """SELECT T1.id, T1.context, T1.last_update, T2.title, T2.url, T3.article_id, T3.content 
                FROM ESTAT.V1.dat_article as T1 
                INNER JOIN ESTAT.V1.dat_link_info as T2  
                  ON T1.id=T2.id  
                INNER JOIN ESTAT.V1.dat_article_paragraph as T3  
                  ON T2.id=T3.article_id  
                WHERE T3.abstract=1"""

dat3 = pd.read_sql(SQLCommand,c)
dat3.rename(columns={'content':'abstract'},inplace=True)
dat3 = dat3[['id','context','title','abstract','url','last_update']]

dat3


Unnamed: 0,id,context,title,abstract,url,last_update
0,1,,Accessing European statistics,The simplest way to obtain Eurostat âs wide ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-02 16:48:00
1,8,Eurostat publishes data on education and trai...,Adult Education Survey (AES) methodolo...,,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-24 19:19:00
2,13,Lifelong learning can take place in a variety ...,Adult learning statistics,This article provides an overview of adult le...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 11:25:00
3,14,Adults with a low level of educational attainm...,Adult learning statistics - characteristics of...,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 14:20:00
4,17,The concept of HNV farmland has been emerging ...,Agri-environmental indicator - High Na...,This article provides a fact sheet of the Euro...,https://ec.europa.eu/eurostat/statistics-expla...,2020-04-15 10:34:00
...,...,...,...,...,...,...
886,6169,In his first State of the Union speech in Sept...,Living conditions statistics at region...,"By global standards, most Europeans are relati...",https://ec.europa.eu/eurostat/statistics-expla...,2021-07-05 09:43:00
887,6188,"Universal access to good healthcare, at an aff...",Living conditions in Europe - health c...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-15 14:13:00
888,6191,"Among other objectives, EU policy initiatives ...",Living conditions in Europe - childcar...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-30 14:23:00
889,6194,The content of this statistical article is bas...,Maritime passenger statistics,This article presents the latest statistical ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:21:00


### Content
ids, titles and contents.  
From the ESTAT.V1.dat_article_paragraph table with abstract=0.

In [3]:
add_content = pd.read_sql("SELECT id,article_id,title,content,abstract FROM ESTAT.V1.dat_article_paragraph WHERE abstract=0",c)
add_content

Unnamed: 0,id,article_id,title,content,abstract
0,1,1,Statistical themes,The information on Eurostatâs website under...,0
1,2,1,Access to data,Data navigation tree The majority of Euros...,0
2,3,1,Publications,Eurostat produces a variety of publications. ...,0
3,4,1,Reference metadata,Metadata dissemination Statistical data an...,0
4,5,1,User support,Eurostat and the other members of the ESS hav...,0
...,...,...,...,...,...
3842,4731,6194,Most EU seaborne passenger transport is within...,Table 1 and Figure 5 show the breakdown of se...,0
3843,4733,6199,Structural profile,Around 1 in 10 (8.9 %) of all enterprises ...,0
3844,4734,6199,Sectoral analysis,At the NACE division level the manufacturing ...,0
3845,4735,6199,Country overview,Because of the tradable (export and import) n...,0


### Aggregate above by article id

Aggregate titles and contents in one string.

In [4]:
add_content_grouped = add_content.groupby(['article_id'])['title','content'].aggregate(lambda x: list(x))
add_content_grouped.reset_index(inplace=True)
for i in range(len(add_content_grouped)):
    add_content_grouped.loc[i,'raw content'] = ''
    for (a,b) in zip(add_content_grouped.loc[i,'title'],add_content_grouped.loc[i,'content']):
        add_content_grouped.loc[i,'raw content'] += ' '+a + ' ' + b
add_content_grouped = add_content_grouped[['article_id','raw content']]    

add_content_grouped

  add_content_grouped = add_content.groupby(['article_id'])['title','content'].aggregate(lambda x: list(x))


Unnamed: 0,article_id,raw content
0,1,Statistical themes The information on Eurost...
1,8,Introduction The AES provides an overview of...
2,13,Participation rate of adults in learning in t...
3,14,Formal and non-formal adult education and tra...
4,17,Key messages The concept of high nature valu...
...,...,...
860,6169,Poverty and deprivation There are two princi...
861,6188,Key findings This article presents statistic...
862,6191,Childcare and education arrangements The une...
863,6194,Number of seaborne passengers reaches 419 mil...


In [5]:

dat4 = pd.merge(dat3,add_content_grouped,left_on='id',right_on='article_id',how='inner')
dat4.drop(['article_id'],axis=1,inplace=True)

del(dat3, add_content, add_content_grouped)

dat4


Unnamed: 0,id,context,title,abstract,url,last_update,raw content
0,1,,Accessing European statistics,The simplest way to obtain Eurostat âs wide ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-02 16:48:00,Statistical themes The information on Eurost...
1,8,Eurostat publishes data on education and trai...,Adult Education Survey (AES) methodolo...,,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-24 19:19:00,Introduction The AES provides an overview of...
2,13,Lifelong learning can take place in a variety ...,Adult learning statistics,This article provides an overview of adult le...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 11:25:00,Participation rate of adults in learning in t...
3,14,Adults with a low level of educational attainm...,Adult learning statistics - characteristics of...,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 14:20:00,Formal and non-formal adult education and tra...
4,17,The concept of HNV farmland has been emerging ...,Agri-environmental indicator - High Na...,This article provides a fact sheet of the Euro...,https://ec.europa.eu/eurostat/statistics-expla...,2020-04-15 10:34:00,Key messages The concept of high nature valu...
...,...,...,...,...,...,...,...
859,6169,In his first State of the Union speech in Sept...,Living conditions statistics at region...,"By global standards, most Europeans are relati...",https://ec.europa.eu/eurostat/statistics-expla...,2021-07-05 09:43:00,Poverty and deprivation There are two princi...
860,6188,"Universal access to good healthcare, at an aff...",Living conditions in Europe - health c...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-15 14:13:00,Key findings This article presents statistic...
861,6191,"Among other objectives, EU policy initiatives ...",Living conditions in Europe - childcar...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-30 14:23:00,Childcare and education arrangements The une...
862,6194,The content of this statistical article is bas...,Maritime passenger statistics,This article presents the latest statistical ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:21:00,Number of seaborne passengers reaches 419 mil...


### Related links

From the ESTAT.V1.dat_article_shared_link with article_division=2 ("Other articles", see ESTAT.V1.mod_article_division).  
link_id points to id in FROM ESTAT.V1.dat_link_info (where we select resource_information_id=1).


In [6]:
SQLCommand = """SELECT T1.article_id, T1.link_id, T2.title, T2.url 
                FROM dat_article_shared_link as T1 
                INNER JOIN ESTAT.V1.dat_link_info as T2  
                  ON T1.link_id=T2.id  
                WHERE T1.article_division_id=2 AND T2.resource_information_id=1
                ORDER BY T1.article_id, T1.link_id """

add_related_links = pd.read_sql(SQLCommand,c)
add_related_links

Unnamed: 0,article_id,link_id,title,url
0,8,10,Education and training in the EU - facts and f...,https://ec.europa.eu/eurostat/statistics-expla...
1,13,14,Adult learning statistics - characteristics of...,https://ec.europa.eu/eurostat/statistics-expla...
2,13,287,Vocational education and training statistics,https://ec.europa.eu/eurostat/statistics-expla...
3,13,931,Education and training statistics introduced,https://ec.europa.eu/eurostat/statistics-expla...
4,13,2241,Early leavers from education and training,https://ec.europa.eu/eurostat/statistics-expla...
...,...,...,...,...
2891,6194,2832,Maritime freight and vessels statistics,https://ec.europa.eu/eurostat/statistics-expla...
2892,6194,2833,Maritime transport of goods - quarterly data,https://ec.europa.eu/eurostat/statistics-expla...
2893,6194,2834,Maritime transport statistics - short sea ship...,https://ec.europa.eu/eurostat/statistics-expla...
2894,6199,1210,Structural business statistics introdu...,https://ec.europa.eu/eurostat/statistics-expla...


### Aggregate above by article id

Aggregate related titles and URLs in one string.

In [7]:
add_related_grouped = pd.DataFrame(add_related_links.groupby(['article_id'])['title','url'].aggregate(lambda x: list(x)))
add_related_grouped.reset_index(drop=False, inplace=True)
add_related_grouped.rename(columns={'title':'related_titles','url':'related_urls'},inplace=True)
add_related_grouped



  add_related_grouped = pd.DataFrame(add_related_links.groupby(['article_id'])['title','url'].aggregate(lambda x: list(x)))


Unnamed: 0,article_id,related_titles,related_urls
0,8,[Education and training in the EU - facts and ...,[https://ec.europa.eu/eurostat/statistics-expl...
1,13,[Adult learning statistics - characteristics o...,[https://ec.europa.eu/eurostat/statistics-expl...
2,14,[Education and training in the EU - facts and ...,[https://ec.europa.eu/eurostat/statistics-expl...
3,17,[Agri-environmental indicators],[https://ec.europa.eu/eurostat/statistics-expl...
4,28,[Ageing Europe ? looking at the lives of older...,[https://ec.europa.eu/eurostat/statistics-expl...
...,...,...,...
809,6169,[Living conditions in Europe - poverty and soc...,[https://ec.europa.eu/eurostat/statistics-expl...
810,6188,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...
811,6191,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...
812,6194,[ Freight transport statistics - modal ...,[https://ec.europa.eu/eurostat/statistics-expl...


### Merge above

* Inner join on id, article_id.

In [8]:
dat5 = pd.merge(dat4,add_related_grouped,left_on='id',right_on='article_id',how='inner')

del(dat4,add_related_links,add_related_grouped)

dat5

Unnamed: 0,id,context,title,abstract,url,last_update,raw content,article_id,related_titles,related_urls
0,8,Eurostat publishes data on education and trai...,Adult Education Survey (AES) methodolo...,,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-24 19:19:00,Introduction The AES provides an overview of...,8,[Education and training in the EU - facts and ...,[https://ec.europa.eu/eurostat/statistics-expl...
1,13,Lifelong learning can take place in a variety ...,Adult learning statistics,This article provides an overview of adult le...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 11:25:00,Participation rate of adults in learning in t...,13,[Adult learning statistics - characteristics o...,[https://ec.europa.eu/eurostat/statistics-expl...
2,14,Adults with a low level of educational attainm...,Adult learning statistics - characteristics of...,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 14:20:00,Formal and non-formal adult education and tra...,14,[Education and training in the EU - facts and ...,[https://ec.europa.eu/eurostat/statistics-expl...
3,17,The concept of HNV farmland has been emerging ...,Agri-environmental indicator - High Na...,This article provides a fact sheet of the Euro...,https://ec.europa.eu/eurostat/statistics-expla...,2020-04-15 10:34:00,Key messages The concept of high nature valu...,17,[Agri-environmental indicators],[https://ec.europa.eu/eurostat/statistics-expl...
4,28,,Ageing Europe - introduction,This introduction is one of a set of statistic...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-25 17:52:00,Why does population ageing matter? There wer...,28,[Ageing Europe ? looking at the lives of older...,[https://ec.europa.eu/eurostat/statistics-expl...
...,...,...,...,...,...,...,...,...,...,...
796,6169,In his first State of the Union speech in Sept...,Living conditions statistics at region...,"By global standards, most Europeans are relati...",https://ec.europa.eu/eurostat/statistics-expla...,2021-07-05 09:43:00,Poverty and deprivation There are two princi...,6169,[Living conditions in Europe - poverty and soc...,[https://ec.europa.eu/eurostat/statistics-expl...
797,6188,"Universal access to good healthcare, at an aff...",Living conditions in Europe - health c...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-15 14:13:00,Key findings This article presents statistic...,6188,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...
798,6191,"Among other objectives, EU policy initiatives ...",Living conditions in Europe - childcar...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-30 14:23:00,Childcare and education arrangements The une...,6191,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...
799,6194,The content of this statistical article is bas...,Maritime passenger statistics,This article presents the latest statistical ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:21:00,Number of seaborne passengers reaches 419 mil...,6194,[ Freight transport statistics - modal ...,[https://ec.europa.eu/eurostat/statistics-expl...


## B. Data cleansing
***


In [9]:
import re
import unicodedata as ud

def clean(x, quotes=True):
    if pd.isnull(x): return x  
    x = x.strip()
    
    ## make letter-question mark-letter -> letter-quote-space-letter !!! but NOT in the lists of URLs!!!
    if quotes:
        x = re.sub(r'([A-Za-z])\?([A-Za-z])','\\1\' \\2',x) ## NEW
    
    ## make letter-question mark-space lower case letter letter-quote-space letter
    x = re.sub(r'([A-Za-z])\? ([a-z])','\\1\' \\2',x) ## NEW

    ## delete ,000 commas in numbers    
    x = re.sub(r'\b(\d+),(\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## delete  000 spaces in numbers
    x = re.sub(r'\b(\d+) (\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## remove more than one spaces
    x = re.sub(r' +', ' ',x)
    
    ## remove start and end spaces
    x = re.sub(r'^ +| +$', '',x,flags=re.MULTILINE) 
    
    ## space-comma -> comma
    x = re.sub(r' \,',',',x)
    
    ## space-dot -> dot
    x = re.sub(r' \.','.',x)
    
    #x = x.encode('latin1').decode('utf-8') ## â\x80\x99
    x = ud.normalize('NFKD',x).encode('ascii', 'ignore').decode()
    
    return x

In [10]:
dat5['title'] = dat5['title'].apply(clean)
dat5['context'] = dat5['context'].apply(clean)
dat5['abstract']= dat5['abstract'].apply(clean)
dat5['raw content'] = dat5['raw content'].apply(clean)
dat5['url'] = dat5['url'].apply(clean,quotes=False)

dat5

Unnamed: 0,id,context,title,abstract,url,last_update,raw content,article_id,related_titles,related_urls
0,8,Eurostat publishes data on education and train...,Adult Education Survey (AES) methodology,,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-24 19:19:00,Introduction The AES provides an overview of t...,8,[Education and training in the EU - facts and ...,[https://ec.europa.eu/eurostat/statistics-expl...
1,13,Lifelong learning can take place in a variety ...,Adult learning statistics,This article provides an overview of adult lea...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 11:25:00,Participation rate of adults in learning in th...,13,[Adult learning statistics - characteristics o...,[https://ec.europa.eu/eurostat/statistics-expl...
2,14,Adults with a low level of educational attainm...,Adult learning statistics - characteristics of...,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 14:20:00,Formal and non-formal adult education and trai...,14,[Education and training in the EU - facts and ...,[https://ec.europa.eu/eurostat/statistics-expl...
3,17,The concept of HNV farmland has been emerging ...,Agri-environmental indicator - High Nature Val...,This article provides a fact sheet of the Euro...,https://ec.europa.eu/eurostat/statistics-expla...,2020-04-15 10:34:00,Key messages The concept of high nature value ...,17,[Agri-environmental indicators],[https://ec.europa.eu/eurostat/statistics-expl...
4,28,,Ageing Europe - introduction,This introduction is one of a set of statistic...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-25 17:52:00,Why does population ageing matter? There were ...,28,[Ageing Europe ? looking at the lives of older...,[https://ec.europa.eu/eurostat/statistics-expl...
...,...,...,...,...,...,...,...,...,...,...
796,6169,In his first State of the Union speech in Sept...,Living conditions statistics at regional level,"By global standards, most Europeans are relati...",https://ec.europa.eu/eurostat/statistics-expla...,2021-07-05 09:43:00,Poverty and deprivation There are two principa...,6169,[Living conditions in Europe - poverty and soc...,[https://ec.europa.eu/eurostat/statistics-expl...
797,6188,"Universal access to good healthcare, at an aff...",Living conditions in Europe - health conditions,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-15 14:13:00,Key findings This article presents statistics ...,6188,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...
798,6191,"Among other objectives, EU policy initiatives ...",Living conditions in Europe - childcare arrang...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-30 14:23:00,Childcare and education arrangements The unequ...,6191,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...
799,6194,The content of this statistical article is bas...,Maritime passenger statistics,This article presents the latest statistical d...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:21:00,Number of seaborne passengers reaches 419 mill...,6194,[ Freight transport statistics - modal ...,[https://ec.europa.eu/eurostat/statistics-expl...


## C. Read categories from external file
***

In a later version, this will be done through a connection with the knowledge database.

In [11]:
import ast

categories = pd.read_excel('articles_6_25_19_30.xlsx',index_col=None) ## use the default index
categories['url']=categories['url'].apply(clean,quotes=False)
categories['title']=categories['title'].apply(clean,quotes=False)
categories['categories']=categories['categories'].apply(ast.literal_eval)
categories

Unnamed: 0.1,Unnamed: 0,abstract,alerts,categories,context,data_sources,excel,full_article,last_update,title,url,Titles,Raw content
0,0,This article presents an overview of European...,,"[Education and training, Participation in educ...",Adults with a low level of educational attainm...,The adult education survey (AES) is the sour...,[{'title': 'Adult learning statistics ET2018.x...,[{'content': ' About 44 % of adults aged ...,,Adult learning statistics - characteristics of...,https://ec.europa.eu/eurostat/statistics-expla...,Formal and non-formal adult education and trai...,Formal and non-formal adult education and trai...
1,1,Leaving the parental home is considered as a ...,,"[Household composition and family situation, L...","In addition to the Labour Force Survey (LFS), ...",Source: Statistics presented in this article ...,"[{'title': 'Map, Tables and figures.xlsx',\n '...",[{'content': ' Map 1 indicates that in 2019...,,Age of young people leaving their parental hou...,https://ec.europa.eu/eurostat/statistics-expla...,Geographical differences. Gender differences. ...,Geographical differences. Map 1 indicates that...
2,2,This article presents an overview of statistic...,,"[Services, Statistical article, Structural bus...",The freedom to provide services and the freedo...,Coverage Administrative and support services ...,,[{'content': ' In 2017 there were 1.4 milli...,,Administrative and support service statistics ...,https://ec.europa.eu/eurostat/statistics-expla...,Structural profile. Sectoral analysis. Country...,Structural profile. In 2017 there were 1.4 mil...
3,3,This article provides a picture of internation...,,"[Non-EU countries, Trade in goods, Statistical...",Trade is an important indicator of Europe’s pr...,EU data is taken from Eurostat's COMEXT da...,"[{'title': 'Africa 2021.xlsx',\n 'url': '/euro...","[{'content': ' In 2020, the largest trade p...",,Africa-EU - international trade in goods stati...,https://ec.europa.eu/eurostat/statistics-expla...,Africa’s main trade in goods partner is the EU...,Africa’s main trade in goods partner is the EU...
4,4,This article presents recent statistics on th...,"[{'content': '', 'title': 'Table 4 is availa...","[Asylum and migration, Population, Acquisition...","Within the European Commission, the Directorat...",Eurostat produces statistics on a range of is...,[{'title': 'Acquisitions of citizenship 15 Mar...,"[{'content': ' In 2019, 706 400 people obta...",,Acquisition of citizenship statistics,https://ec.europa.eu/eurostat/statistics-expla...,EU-27 Member States granted citizenship to 706...,EU-27 Member States granted citizenship to 706...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,620,Ageing Europe — looking at the lives of older ...,[{'content': 'Within the EU survey on income a...,"[Statistical article, Poverty and social exclu...",,,[{'title': '06 Ageing Europe Social life and o...,[{'content': ' People at work often exert t...,,Ageing Europe - statistics on social life and ...,https://ec.europa.eu/eurostat/statistics-expla...,Physical activity of older people. Older peopl...,Physical activity of older people. People at w...
621,621,Ageing Europe — looking at the lives of older ...,,"[Statistical article, Labour market, Accidents...",,,[{'title': '04 Ageing Europe Working and movin...,"[{'content': ' In 2019, there were 200.0 mi...",,Ageing Europe - statistics on working and movi...,https://ec.europa.eu/eurostat/statistics-expla...,Employment patterns among older people. Focus ...,Employment patterns among older people. In 201...
622,622,Ageing Europe — looking at the lives of older ...,,"[Statistical article, Population, Population a...",,,[{'title': '01 Ageing Europe Population develo...,[{'content': ' Population ageing will rapi...,,Ageing Europe - statistics on population devel...,https://ec.europa.eu/eurostat/statistics-expla...,Older people — population overview. Older peop...,Older people — population overview. Population...
623,623,Ageing Europe — looking at the lives of older ...,[{'content': 'Material deprivation is the enfo...,"[Statistical article, Housing, Living conditions]",,,[{'title': '02 Ageing Europe Housing and livin...,[{'content': ' Recent decades have been cha...,,Ageing Europe - statistics on housing and livi...,https://ec.europa.eu/eurostat/statistics-expla...,Household composition among older people. Olde...,Household composition among older people. Rece...


### Merge with main file

Merge on URL (inner join).

In [12]:
dat6 = pd.merge(dat5,categories[['categories','url']],on='url',how='inner')

del(dat5)


dat6

Unnamed: 0,id,context,title,abstract,url,last_update,raw content,article_id,related_titles,related_urls,categories
0,13,Lifelong learning can take place in a variety ...,Adult learning statistics,This article provides an overview of adult lea...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 11:25:00,Participation rate of adults in learning in th...,13,[Adult learning statistics - characteristics o...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Education and training, Lifelong learning, St..."
1,14,Adults with a low level of educational attainm...,Adult learning statistics - characteristics of...,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 14:20:00,Formal and non-formal adult education and trai...,14,[Education and training in the EU - facts and ...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Education and training, Participation in educ..."
2,57,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Developments for GDP in the EU-27: growth sinc...,57,[European system of national and regional acco...,[https://ec.europa.eu/eurostat/statistics-expl...,"[National accounts (incl. GDP), Statistical ar..."
3,95,Trade is an important indicator of Europeas pr...,South Korea-EU - international trade in goods ...,This article provides a picture of the interna...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-21 14:25:00,"Recent developments, impact of COVID-19 The CO...",95,"[International trade in goods, Extra-EU trade ...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Non-EU countries, Trade in goods, Statistical..."
4,225,The Europe 2020 strategy is the EUas growth st...,Statistics on European cities,More than half the worldas population resides ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-03 11:18:00,Population Population statistics presented in ...,225,[ Territorial typologies for European c...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Cities, Regional yearbook, Regions and cities..."
...,...,...,...,...,...,...,...,...,...,...,...
604,6169,In his first State of the Union speech in Sept...,Living conditions statistics at regional level,"By global standards, most Europeans are relati...",https://ec.europa.eu/eurostat/statistics-expla...,2021-07-05 09:43:00,Poverty and deprivation There are two principa...,6169,[Living conditions in Europe - poverty and soc...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Living conditions, Poverty and social exclusi..."
605,6188,"Universal access to good healthcare, at an aff...",Living conditions in Europe - health conditions,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-15 14:13:00,Key findings This article presents statistics ...,6188,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Population and social conditions, Living cond..."
606,6191,"Among other objectives, EU policy initiatives ...",Living conditions in Europe - childcare arrang...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-30 14:23:00,Childcare and education arrangements The unequ...,6191,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Population and social conditions, Living cond..."
607,6194,The content of this statistical article is bas...,Maritime passenger statistics,This article presents the latest statistical d...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:21:00,Number of seaborne passengers reaches 419 mill...,6194,[ Freight transport statistics - modal ...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Maritime, Passengers, Statistical article, Tr..."


### Create date information

And some more data cleansing.

In [13]:
dat6['new_date'] = [d.date() for d in dat6['last_update']]  
dat6['year'] = dat6['last_update'].dt.year
dat6['year'] =dat6["year"].astype(str)

dat6.replace('', np.nan, inplace=True)
dat6['related_titles'] = dat6['related_titles'].apply(lambda x: [re.sub(r'\?','-',el) for el in x])
#dat6['related_urls'] = dat6['related_urls'].apply(lambda x: ['https://ec.europa.eu'+el for el in x if el.startswith('/eurostat/')])

dat6['year'].fillna(value="Not found", inplace=True)

print(dat6.isnull().sum(),'\n')

dat6.reset_index(drop=True,inplace=True)
dat6

id                 0
context           60
title              0
abstract           7
url                0
last_update        0
raw content        0
article_id         0
related_titles     0
related_urls       0
categories         0
new_date           0
year               0
dtype: int64 



Unnamed: 0,id,context,title,abstract,url,last_update,raw content,article_id,related_titles,related_urls,categories,new_date,year
0,13,Lifelong learning can take place in a variety ...,Adult learning statistics,This article provides an overview of adult lea...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 11:25:00,Participation rate of adults in learning in th...,13,[Adult learning statistics - characteristics o...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Education and training, Lifelong learning, St...",2021-07-01,2021
1,14,Adults with a low level of educational attainm...,Adult learning statistics - characteristics of...,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 14:20:00,Formal and non-formal adult education and trai...,14,[Education and training in the EU - facts and ...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Education and training, Participation in educ...",2021-07-01,2021
2,57,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Developments for GDP in the EU-27: growth sinc...,57,[European system of national and regional acco...,[https://ec.europa.eu/eurostat/statistics-expl...,"[National accounts (incl. GDP), Statistical ar...",2021-06-28,2021
3,95,Trade is an important indicator of Europeas pr...,South Korea-EU - international trade in goods ...,This article provides a picture of the interna...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-21 14:25:00,"Recent developments, impact of COVID-19 The CO...",95,"[International trade in goods, Extra-EU trade ...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Non-EU countries, Trade in goods, Statistical...",2021-04-21,2021
4,225,The Europe 2020 strategy is the EUas growth st...,Statistics on European cities,More than half the worldas population resides ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-03 11:18:00,Population Population statistics presented in ...,225,[ Territorial typologies for European c...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Cities, Regional yearbook, Regions and cities...",2020-12-03,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,6169,In his first State of the Union speech in Sept...,Living conditions statistics at regional level,"By global standards, most Europeans are relati...",https://ec.europa.eu/eurostat/statistics-expla...,2021-07-05 09:43:00,Poverty and deprivation There are two principa...,6169,[Living conditions in Europe - poverty and soc...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Living conditions, Poverty and social exclusi...",2021-07-05,2021
605,6188,"Universal access to good healthcare, at an aff...",Living conditions in Europe - health conditions,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-15 14:13:00,Key findings This article presents statistics ...,6188,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Population and social conditions, Living cond...",2020-12-15,2020
606,6191,"Among other objectives, EU policy initiatives ...",Living conditions in Europe - childcare arrang...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-30 14:23:00,Childcare and education arrangements The unequ...,6191,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Population and social conditions, Living cond...",2021-03-30,2021
607,6194,The content of this statistical article is bas...,Maritime passenger statistics,This article presents the latest statistical d...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:21:00,Number of seaborne passengers reaches 419 mill...,6194,[ Freight transport statistics - modal ...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Maritime, Passengers, Statistical article, Tr...",2021-07-07,2021


## D. Add themes / sub-themes information in the articles
***

* We create dictionary _themes_ manually.
* Dictionary _dict_categories_ is used for debugging. The keys are the categories found in the SE articles and the values are the corresponding article ids.
* Each article will have a list of themes and corresponding sub-themes, potentially empty. If the article has a category which is a key of _themes_ the theme is added to the first list. If it has a category which is in one of the values of _themes_ i.e. it is a sub-theme, the corresponding key (theme) is added to the first list and the sub-theme is added to the second list.
* There are relatively few articles without such information:
    * No info in themes:  51
    * No info in sub_themes:  90

In [14]:
import ast

themes = {'General and regional statistics/EU policies':
          ['Non-EU countries','Regions and cities','Sustainable development goals',
          'Policy indicators'],
          'Economy and finance': 
          ['Balance of payments','Comparative price levels (PPPs)','Consumer prices',
           'Exchange rates and interest rates','Government finance','National accounts (incl. GDP)'],
          'Population and social conditions':
          ['Asylum and migration','Crime','Culture','Education and training','Health',
           'Labour market','Living conditions','Population','Social protection','Sport','Youth'],
          'Industry and services': ['Short-term business statistics','Structural business statistics',
                                    'Business registers','Globalisation in businesses','Production statistics',
                                    'Tourism'],
          'Agriculture, forestry and fisheries':['Agriculture','Fisheries','Forestry'],
          'International trade':['Goods','Services'],
          'Transport':[],
          'Environment and energy':['Energy','Environment'],
          'Science, technology and digital society':['Digital economy and society','Science and technology']}

dict_categories=dict()

for i in range(len(dat6)):
    
    cats=dat6.loc[i,'categories']
    cats = [cat.strip() for cat in cats]
        
    for cat in cats:
        if cat in dict_categories.keys():
            dict_categories[cat].append(dat6.loc[i,'id'])
        else:
            dict_categories[cat] = [dat6.loc[i,'id']]



dat6['themes'] = pd.Series([set() for i in range(len(dat6))])
dat6['sub_themes'] = pd.Series([set() for i in range(len(dat6))])
for i in range(len(dat6)):
    
    cats=dat6.loc[i,'categories']
    cats = [cat.strip() for cat in cats]

    for cat in cats:
        if cat in themes.keys():
            dat6.loc[i,'themes'].add(cat)
        else:
            for theme in themes.keys():
                if cat in themes[theme]:
                    dat6.loc[i,'themes'].add(theme)
                    dat6.loc[i,'sub_themes'].add(cat)
    
dat6['themes'] = dat6['themes'].apply(lambda x: ';'.join(x))    
dat6['sub_themes'] = dat6['sub_themes'].apply(lambda x: ';'.join(x))    

#dat6['categories']= dat6['categories'].apply(lambda x: ';'.join(x))  ## de-comment to produce the input file for R Shiny, 
## i.e. categories not in list but separated by semicolon    

print(dat6.isnull().sum(),'\n')
dat6

id                 0
context           60
title              0
abstract           7
url                0
last_update        0
raw content        0
article_id         0
related_titles     0
related_urls       0
categories         0
new_date           0
year               0
themes             0
sub_themes         0
dtype: int64 



Unnamed: 0,id,context,title,abstract,url,last_update,raw content,article_id,related_titles,related_urls,categories,new_date,year,themes,sub_themes
0,13,Lifelong learning can take place in a variety ...,Adult learning statistics,This article provides an overview of adult lea...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 11:25:00,Participation rate of adults in learning in th...,13,[Adult learning statistics - characteristics o...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Education and training, Lifelong learning, St...",2021-07-01,2021,Population and social conditions,Education and training
1,14,Adults with a low level of educational attainm...,Adult learning statistics - characteristics of...,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 14:20:00,Formal and non-formal adult education and trai...,14,[Education and training in the EU - facts and ...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Education and training, Participation in educ...",2021-07-01,2021,Population and social conditions,Education and training
2,57,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Developments for GDP in the EU-27: growth sinc...,57,[European system of national and regional acco...,[https://ec.europa.eu/eurostat/statistics-expl...,"[National accounts (incl. GDP), Statistical ar...",2021-06-28,2021,Economy and finance,National accounts (incl. GDP)
3,95,Trade is an important indicator of Europeas pr...,South Korea-EU - international trade in goods ...,This article provides a picture of the interna...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-21 14:25:00,"Recent developments, impact of COVID-19 The CO...",95,"[International trade in goods, Extra-EU trade ...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Non-EU countries, Trade in goods, Statistical...",2021-04-21,2021,International trade;General and regional stati...,Non-EU countries
4,225,The Europe 2020 strategy is the EUas growth st...,Statistics on European cities,More than half the worldas population resides ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-03 11:18:00,Population Population statistics presented in ...,225,[ Territorial typologies for European c...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Cities, Regional yearbook, Regions and cities...",2020-12-03,2020,General and regional statistics/EU policies,Regions and cities
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,6169,In his first State of the Union speech in Sept...,Living conditions statistics at regional level,"By global standards, most Europeans are relati...",https://ec.europa.eu/eurostat/statistics-expla...,2021-07-05 09:43:00,Poverty and deprivation There are two principa...,6169,[Living conditions in Europe - poverty and soc...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Living conditions, Poverty and social exclusi...",2021-07-05,2021,Population and social conditions;General and r...,Living conditions;Regions and cities
605,6188,"Universal access to good healthcare, at an aff...",Living conditions in Europe - health conditions,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-15 14:13:00,Key findings This article presents statistics ...,6188,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Population and social conditions, Living cond...",2020-12-15,2020,Population and social conditions,Living conditions;Health
606,6191,"Among other objectives, EU policy initiatives ...",Living conditions in Europe - childcare arrang...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-30 14:23:00,Childcare and education arrangements The unequ...,6191,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Population and social conditions, Living cond...",2021-03-30,2021,Population and social conditions,Living conditions
607,6194,The content of this statistical article is bas...,Maritime passenger statistics,This article presents the latest statistical d...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:21:00,Number of seaborne passengers reaches 419 mill...,6194,[ Freight transport statistics - modal ...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Maritime, Passengers, Statistical article, Tr...",2021-07-07,2021,Transport,


## E. Tokenize and stem the articles titles, contexts, abstracts and contents
***

In [15]:
#Stemming.

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.porter import PorterStemmer

p = PorterStemmer()

def text_to_words(text):
    words = str(gensim.utils.simple_preprocess(text, deacc=True))
    words = remove_stopwords(words) 
    words = gensim.utils.tokenize(words)
        
    ## keep also original token!!! 
    words = [p.stem(token) for token in words]  
    #print(words)
    return ' '.join(words)        

for i in range(len(dat6)):
    dat6.loc[i,'title tokens']=text_to_words(dat6.loc[i,'title'])
    if not pd.isnull(dat6.loc[i,'context']):
        dat6.loc[i,'context tokens']=text_to_words(dat6.loc[i,'context'])
    else:
        dat6.loc[i,'context tokens']=''
    if not pd.isnull(dat6.loc[i,'abstract']):        
        dat6.loc[i,'abstract tokens']=text_to_words(dat6.loc[i,'abstract'])
    else:
        dat6.loc[i,'abstract tokens']=''
    dat6.loc[i,'raw content tokens']=text_to_words(dat6.loc[i,'raw content'])


dat6

Unnamed: 0,id,context,title,abstract,url,last_update,raw content,article_id,related_titles,related_urls,categories,new_date,year,themes,sub_themes,title tokens,context tokens,abstract tokens,raw content tokens
0,13,Lifelong learning can take place in a variety ...,Adult learning statistics,This article provides an overview of adult lea...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 11:25:00,Participation rate of adults in learning in th...,13,[Adult learning statistics - characteristics o...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Education and training, Lifelong learning, St...",2021-07-01,2021,Population and social conditions,Education and training,adult learn statist,lifelong learn can take place in varieti of en...,thi articl provid an overview of adult learn s...,particip rate of adult in learn in the last fo...
1,14,Adults with a low level of educational attainm...,Adult learning statistics - characteristics of...,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-01 14:20:00,Formal and non-formal adult education and trai...,14,[Education and training in the EU - facts and ...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Education and training, Participation in educ...",2021-07-01,2021,Population and social conditions,Education and training,adult learn statist characterist of educ and t...,adult with low level of educ attain and lack o...,thi articl present an overview of european uni...,formal and non formal adult educ and train abo...
2,57,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Developments for GDP in the EU-27: growth sinc...,57,[European system of national and regional acco...,[https://ec.europa.eu/eurostat/statistics-expl...,"[National accounts (incl. GDP), Statistical ar...",2021-06-28,2021,Economy and finance,National accounts (incl. GDP),nation account and gdp,european institut govern central bank as well ...,nation account ar the sourc for multitud of we...,develop for gdp in the eu growth sinc the glob...
3,95,Trade is an important indicator of Europeas pr...,South Korea-EU - international trade in goods ...,This article provides a picture of the interna...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-21 14:25:00,"Recent developments, impact of COVID-19 The CO...",95,"[International trade in goods, Extra-EU trade ...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Non-EU countries, Trade in goods, Statistical...",2021-04-21,2021,International trade;General and regional stati...,Non-EU countries,south korea eu intern trade in good statist,trade is an import indic of europea prosper an...,thi articl provid pictur of the intern trade i...,recent develop impact of covid the covid crisi...
4,225,The Europe 2020 strategy is the EUas growth st...,Statistics on European cities,More than half the worldas population resides ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-03 11:18:00,Population Population statistics presented in ...,225,[ Territorial typologies for European c...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Cities, Regional yearbook, Regions and cities...",2020-12-03,2020,General and regional statistics/EU policies,Regions and cities,statist on european citi,the europ strategi is the eua growth strategi ...,more than half the worlda popul resid in urban...,popul popul statist present in thi chapter ar ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,6169,In his first State of the Union speech in Sept...,Living conditions statistics at regional level,"By global standards, most Europeans are relati...",https://ec.europa.eu/eurostat/statistics-expla...,2021-07-05 09:43:00,Poverty and deprivation There are two principa...,6169,[Living conditions in Europe - poverty and soc...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Living conditions, Poverty and social exclusi...",2021-07-05,2021,Population and social conditions;General and r...,Living conditions;Regions and cities,live condit statist at region level,in hi first state of the union speech in septe...,by global standard most european ar rel prospe...,poverti and depriv there ar two princip measur...
605,6188,"Universal access to good healthcare, at an aff...",Living conditions in Europe - health conditions,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-15 14:13:00,Key findings This article presents statistics ...,6188,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Population and social conditions, Living cond...",2020-12-15,2020,Population and social conditions,Living conditions;Health,live condit in europ health condit,univers access to good healthcar at an afford ...,thi articl is part of set of statist articl th...,kei find thi articl present statist relat to l...
606,6191,"Among other objectives, EU policy initiatives ...",Living conditions in Europe - childcare arrang...,This article is part of a set of statistical a...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-30 14:23:00,Childcare and education arrangements The unequ...,6191,[All articles from the publication Being young...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Population and social conditions, Living cond...",2021-03-30,2021,Population and social conditions,Living conditions,live condit in europ childcar arrang,among other object eu polici initi for exampl ...,thi articl is part of set of statist articl th...,childcar and educ arrang the unequ divis of ho...
607,6194,The content of this statistical article is bas...,Maritime passenger statistics,This article presents the latest statistical d...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:21:00,Number of seaborne passengers reaches 419 mill...,6194,[ Freight transport statistics - modal ...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Maritime, Passengers, Statistical article, Tr...",2021-07-07,2021,Transport,,maritim passeng statist,the content of thi statist articl is base on d...,thi articl present the latest statist data on ...,number of seaborn passeng reach million the nu...


In [18]:
dat6['related_urls'].apply(lambda x: len(x))

0      9
1      4
2      4
3      2
4      1
      ..
604    3
605    3
606    2
607    4
608    2
Name: related_urls, Length: 609, dtype: int64

In [19]:
print('No info in themes: ',sum(dat6['themes']==''))
print('No info in sub_themes: ',sum(dat6['sub_themes']==''))

import datetime

def file_name(pre,ext):
    current_time = datetime.datetime.now() 
    return pre + '_'+ str(current_time.month)+ '_' + str(current_time.day) + \
                 '_' + str(current_time.hour)+ '_' + str(current_time.minute)  +'.'+ext
    

outfile = file_name('dat6','xlsx')
dat6.to_excel(outfile)


No info in themes:  50
No info in sub_themes:  86


## F. Facets
***
### The function filtering the results.

In [22]:

df1 = None

def articles(Top_articles, Keywords, Categories, themes_dd, sub_themes_dd,year,related): 

    global df1
    
    Keywords = text_to_words(Keywords) 
    df1 = dat6[dat6['title tokens'].str.contains(Keywords,regex=False) | dat6['raw content tokens'].str.contains(Keywords,regex=False)]
    df1 = df1[df1['context tokens'].str.contains(Keywords,regex=False) | df1['abstract tokens'].str.contains(Keywords,regex=False)]
    
    if year != "All years":    
        df1 = df1[df1['year'].str.contains(year)]    
        
        
    if themes_dd != "All themes":
        df1 = df1[df1['themes'].str.contains(themes_dd,regex=False)]
        

    if sub_themes_dd is not None:
        if sub_themes_dd != "All sub-themes" and sub_themes_dd != "":    
            df1 = df1[df1['sub_themes'].str.contains(sub_themes_dd,regex=False)]
    
    if Categories is not None:
        if Categories != "All categories":
            
            df1 = df1[df1['categories'].apply(lambda x: Categories in x)]
        

    if len(df1) == 0:
        print("No matches found")
    else:
        df1.reset_index(inplace=True)
        print(df1.title.count()," articles found")
        h = ''
        for i in range(min(Top_articles,len(df1))):
            l,n = df1.loc[i,["url","title"]].values
            h += '<br/><u><a href="' + l + '" target="_blank">'+ n + '</a></u>'
            if related:
                h += '<blockquote>Related links:'
                for k in range(len(df1.loc[i,"related_titles"])):
                    title = df1.loc[i,"related_titles"][k]
                    url = df1.loc[i,"related_urls"][k]
                    h += '<br><a href="' + url + '" target="_blank">'+ title + '</a>'
                h += '</blockquote>'
        display(HTML( h))
  
    

### The widgets.

In [23]:
from IPython.display import display
from ipywidgets import HTML
layout = widgets.Layout(width='500px', height='30px')

def new_categories():
    if len(df1) > 0:
        res=set()
        for i in range(len(df1)):
            res.update(df1.loc[i,'categories'])
        return sorted(list(res))
    else:
        return []
    
    

def query_build2(themes):
    style = {'description_width': 'initial'}

    Categories = widgets.Dropdown(
        description='Select category:',
        options=['All categories'] + sorted(list(dict_categories.keys())),
        style=style
    )
    
    
    themes_dd = widgets.Dropdown(
        description='Select theme:',
        options=['All themes']+sorted([k for k in themes.keys()]),
        style=style
    )    
    

    def on_change_theme(change):
        if change['type'] == 'change' and change['name'] == 'value':
            if change['new'] == 'All themes':
                sub_themes_dd.options = []
                Categories.options = ['All categories']+sorted(list(dict_categories.keys()))
            else:    
                sub_themes_dd.options = ['All sub-themes']+themes[themes_dd.value]
                
                

    themes_dd.observe(on_change_theme)
    
    sub_themes_dd = widgets.Dropdown(
        description='Select sub-theme:',
        options= [''],
        style=style
    )    
    
    def on_change_subtheme(change):
        if change['type'] == 'change' and change['name'] == 'value':
            Categories.options= []
            Categories.options = ['All categories']+new_categories()

    sub_themes_dd.observe(on_change_subtheme)
 
    Top_articles = widgets.IntSlider(
        description='Display',
        tooltip='maximum:',
        value=20,
        min=1, 
        max = 30,
        style={'description_width': 'initial'}
    )


    Keywords = widgets.Text(
        value='',
        placeholder='Type something',
        description='Keywords:',
        disabled=False
    )
    

    year = widgets.Dropdown(
        options=['All years','2021','2020','2019','2018','Not found'],
        value='All years',
        description='Year:',
        disabled=False)


    
    related = widgets.Checkbox(
        value=False,
        description='Show related links',
        disabled=False,
        indent=True
     )    
    
    out = widgets.interactive_output(articles, {'Top_articles': Top_articles, 'Keywords': Keywords, 'Categories':Categories,
                                              'themes_dd':themes_dd,'sub_themes_dd':sub_themes_dd,'year':year,'related':related})




    left_box = widgets.VBox([themes_dd, sub_themes_dd])
    right_box = widgets.VBox([Keywords,Categories,related])
    box = widgets.HBox([left_box, right_box])
    display(box)
    
    display(year,Top_articles,out)
    
        

query_build2(themes)



HBox(children=(VBox(children=(Dropdown(description='Select theme:', options=('All themes', 'Agriculture, fores…

Dropdown(description='Year:', options=('All years', '2021', '2020', '2019', '2018', 'Not found'), value='All y…

IntSlider(value=20, description='Display', max=30, min=1, style=SliderStyle(description_width='initial'))

Output()