# Use Case A - Faceted search (in progress)
***

In [1]:
import pandas as pd
import numpy as np

import ipywidgets as widgets

import pyodbc

import gensim


## A. Import Statistics Explained data from the database
***

T1: id, context and last update.  
T2: ids, titles and URLs.  
Merge above (inner join on id).

T3: ids, titles and abstracts: from the ESTAT.V1.dat_article_paragraph table with abstract=1.  
Merge above (inner join on id, article_id).


In [2]:


c = pyodbc.connect('DSN=Virtuoso All;DBA=ESTAT;UID=user_name;PWD=password')
cursor = c.cursor()

SQLCommand = """SELECT T1.id, T1.context, T1.last_update, T2.title, T2.url, T3.article_id, T3.content 
                FROM ESTAT.V1.dat_article as T1 
                INNER JOIN ESTAT.V1.dat_link_info as T2  
                  ON T1.id=T2.id  
                INNER JOIN ESTAT.V1.dat_article_paragraph as T3  
                  ON T2.id=T3.article_id  
                WHERE T3.abstract=1"""

dat3 = pd.read_sql(SQLCommand,c)
dat3.rename(columns={'content':'abstract'},inplace=True)
dat3 = dat3[['id','context','title','abstract','url','last_update']]

dat3


Unnamed: 0,id,context,title,abstract,url,last_update
0,1,The COVID-19 pandemic hit Europe in January an...,Absences from work - quarterly statist...,Absences from work can be classified into two...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-14 16:09:00
1,10,The importance of action to prevent accidents ...,Accidents and injuries statistics,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-16 14:36:00
2,39,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00
3,50,"A safe, healthy working environment is a cruci...",Accidents at work - statistics by econ...,This article presents a set of main statistic...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-07 17:31:00
4,56,Trade is an important indicator of Europeâs ...,Africa-EU - international trade in goo...,This article provides a picture of internation...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-29 10:45:00
...,...,...,...,...,...,...
638,4970,Trade is an important indicator of Europeâs ...,EU trade in COVID-19 related products ...,To help prevent the spread of the COVID-19 pa...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-31 13:04:00
639,4972,Having a secure supply of energy is crucial fo...,EU imports of energy products - recent...,This article provides a picture of trade in en...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-10 11:23:00
640,4987,Economic and financial statistics have become ...,European Neighbourhood Policy - South ...,This article is part of an online publication ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-22 11:47:00
641,4989,,Ageing Europe - statistics on pensions...,Ageing Europe â looking at the lives of olde...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-28 08:48:00


### Content
ids, titles and contents.  
From the ESTAT.V1.dat_article_paragraph table with abstract=0.

In [3]:
add_content = pd.read_sql("SELECT id,article_id,title,content,abstract FROM ESTAT.V1.dat_article_paragraph WHERE abstract=0",c)
add_content

Unnamed: 0,id,article_id,title,content,abstract
0,1,1,Absences from work sharply increase in first h...,Absences from work recorded unprecedented hig...,0
1,2,1,Absences: 9.5 % of employment in Q4 2019 and 1...,The article's next figure (Figure 4) compares...,0
2,3,1,Higher share of absences from work among women...,"Considering all four quarters of 2020, the sh...",0
3,4,1,Absences from work due to own illness or disab...,"From Q4 2019 to Q4 2020, the number of people...",0
4,5,1,Absences from work due to holidays,"Expressed as a share of employed people, abse...",0
...,...,...,...,...,...
2723,3366,4989,Measuring expenditure patterns,Household budget surveys (HBS) focus on collec...,0
2724,3367,4997,Participation rate of adults in learning in th...,The strategic framework for European cooperat...,0
2725,3368,4997,Participation rate of adults in learning in th...,In addition to the data from the labour force...,0
2726,3369,4997,Providers of non-formal education and training...,Employers were the most common providers of n...,0


### Aggregate above by article id

Aggregate titles and contents in one string.

In [4]:
add_content_grouped = add_content.groupby(['article_id'])['title','content'].aggregate(lambda x: list(x))
add_content_grouped.reset_index(inplace=True)
for i in range(len(add_content_grouped)):
    add_content_grouped.loc[i,'raw content'] = ''
    for (a,b) in zip(add_content_grouped.loc[i,'title'],add_content_grouped.loc[i,'content']):
        add_content_grouped.loc[i,'raw content'] += ' '+a + ' ' + b
add_content_grouped = add_content_grouped[['article_id','raw content']]    

add_content_grouped

  add_content_grouped = add_content.groupby(['article_id'])['title','content'].aggregate(lambda x: list(x))


Unnamed: 0,article_id,raw content
0,1,Absences from work sharply increase in first ...
1,10,"Deaths from accidents, injuries and assault ..."
2,39,"Number of accidents In 2018, there were 3.1 ..."
3,50,Developments over time Non-fatal accidents ...
4,56,Africaâs main trade in goods partner is the...
...,...,...
615,4970,Sharp increase in COVID-19 related imports in...
616,4972,Overview The latest figures show the upward ...
617,4987,Current account The current account of the ...
618,4989,Pensions The transition for individuals from...


### Merge above

* Inner join on id, article_id.

In [5]:

dat4 = pd.merge(dat3,add_content_grouped,left_on='id',right_on='article_id',how='inner')
dat4.drop(['article_id'],axis=1,inplace=True)

dat4


Unnamed: 0,id,context,title,abstract,url,last_update,raw content
0,1,The COVID-19 pandemic hit Europe in January an...,Absences from work - quarterly statist...,Absences from work can be classified into two...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-14 16:09:00,Absences from work sharply increase in first ...
1,10,The importance of action to prevent accidents ...,Accidents and injuries statistics,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-16 14:36:00,"Deaths from accidents, injuries and assault ..."
2,39,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 ..."
3,50,"A safe, healthy working environment is a cruci...",Accidents at work - statistics by econ...,This article presents a set of main statistic...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-07 17:31:00,Developments over time Non-fatal accidents ...
4,56,Trade is an important indicator of Europeâs ...,Africa-EU - international trade in goo...,This article provides a picture of internation...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-29 10:45:00,Africaâs main trade in goods partner is the...
...,...,...,...,...,...,...,...
615,4970,Trade is an important indicator of Europeâs ...,EU trade in COVID-19 related products ...,To help prevent the spread of the COVID-19 pa...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-31 13:04:00,Sharp increase in COVID-19 related imports in...
616,4972,Having a secure supply of energy is crucial fo...,EU imports of energy products - recent...,This article provides a picture of trade in en...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-10 11:23:00,Overview The latest figures show the upward ...
617,4987,Economic and financial statistics have become ...,European Neighbourhood Policy - South ...,This article is part of an online publication ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-22 11:47:00,Current account The current account of the ...
618,4989,,Ageing Europe - statistics on pensions...,Ageing Europe â looking at the lives of olde...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-28 08:48:00,Pensions The transition for individuals from...


## B. Data cleansing
***


In [6]:
import re

def clean(x, quotes=True):
    if pd.isnull(x): return x  
    x = x.strip()
    
    ## make letter-question mark-letter -> letter-quote-space-letter !!! but NOT in the lists of URLs!!!
    if quotes:
        x = re.sub(r'([A-Za-z])\?([A-Za-z])','\\1\' \\2',x) ## NEW
    
    ## make letter-question mark-space lower case letter letter-quote-space letter
    x = re.sub(r'([A-Za-z])\? ([a-z])','\\1\' \\2',x) ## NEW

    ## delete ,000 commas in numbers    
    x = re.sub(r'\b(\d+),(\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## delete  000 spaces in numbers
    x = re.sub(r'\b(\d+) (\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## remove more than one spaces
    x = re.sub(r' +', ' ',x)
    
    ## remove start and end spaces
    x = re.sub(r'^ +| +$', '',x,flags=re.MULTILINE) 
    
    ## space-comma -> comma
    x = re.sub(r' \,',',',x)
    
    ## space-dot -> dot
    x = re.sub(r' \.','.',x)
    
    #x = x.encode('latin1').decode('utf-8') ## â\x80\x99
    
    return x

In [7]:
dat4['title'].apply(clean)
dat4['abstract'].apply(clean)
dat4['raw content'].apply(clean)
dat4['url'].apply(clean,quotes=False)

dat4

Unnamed: 0,id,context,title,abstract,url,last_update,raw content
0,1,The COVID-19 pandemic hit Europe in January an...,Absences from work - quarterly statist...,Absences from work can be classified into two...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-14 16:09:00,Absences from work sharply increase in first ...
1,10,The importance of action to prevent accidents ...,Accidents and injuries statistics,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-16 14:36:00,"Deaths from accidents, injuries and assault ..."
2,39,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 ..."
3,50,"A safe, healthy working environment is a cruci...",Accidents at work - statistics by econ...,This article presents a set of main statistic...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-07 17:31:00,Developments over time Non-fatal accidents ...
4,56,Trade is an important indicator of Europeâs ...,Africa-EU - international trade in goo...,This article provides a picture of internation...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-29 10:45:00,Africaâs main trade in goods partner is the...
...,...,...,...,...,...,...,...
615,4970,Trade is an important indicator of Europeâs ...,EU trade in COVID-19 related products ...,To help prevent the spread of the COVID-19 pa...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-31 13:04:00,Sharp increase in COVID-19 related imports in...
616,4972,Having a secure supply of energy is crucial fo...,EU imports of energy products - recent...,This article provides a picture of trade in en...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-10 11:23:00,Overview The latest figures show the upward ...
617,4987,Economic and financial statistics have become ...,European Neighbourhood Policy - South ...,This article is part of an online publication ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-22 11:47:00,Current account The current account of the ...
618,4989,,Ageing Europe - statistics on pensions...,Ageing Europe â looking at the lives of olde...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-28 08:48:00,Pensions The transition for individuals from...


## C. Read categories from external file
***

In a later version, this will be done through a connection with the knowledge database.

In [8]:
categories = pd.read_excel('articles_6_25_19_30.xlsx',index_col=None) ## use the default index
categories['url'].apply(clean,quotes=False)
categories['title'].apply(clean,quotes=False)
categories

Unnamed: 0.1,Unnamed: 0,abstract,alerts,categories,context,data_sources,excel,full_article,last_update,title,url,Titles,Raw content
0,0,This article presents an overview of European...,,"['Education and training', 'Participation in e...",Adults with a low level of educational attainm...,The adult education survey (AES) is the sour...,[{'title': 'Adult learning statistics ET2018.x...,[{'content': ' About 44 % of adults aged ...,,Adult learning statistics - cha...,https://ec.europa.eu/eurostat/statistics-expla...,Formal and non-formal adult education and trai...,Formal and non-formal adult education and trai...
1,1,Leaving the parental home is considered as a ...,,"['Household composition and family situation',...","In addition to the Labour Force Survey (LFS), ...",Source: Statistics presented in this article ...,"[{'title': 'Map, Tables and figures.xlsx',\n '...",[{'content': ' Map 1 indicates that in 2019...,,Age of young people leaving the...,https://ec.europa.eu/eurostat/statistics-expla...,Geographical differences. Gender differences. ...,Geographical differences. Map 1 indicates that...
2,2,This article presents an overview of statistic...,,"['Services', 'Statistical article', 'Structura...",The freedom to provide services and the freedo...,Coverage Administrative and support services ...,,[{'content': ' In 2017 there were 1.4 milli...,,Administrative and support serv...,https://ec.europa.eu/eurostat/statistics-expla...,Structural profile. Sectoral analysis. Country...,Structural profile. In 2017 there were 1.4 mil...
3,3,This article provides a picture of internation...,,"['Non-EU countries', 'Trade in goods', 'Statis...",Trade is an important indicator of Europe’s pr...,EU data is taken from Eurostat's COMEXT da...,"[{'title': 'Africa 2021.xlsx',\n 'url': '/euro...","[{'content': ' In 2020, the largest trade p...",,Africa-EU - international trade...,https://ec.europa.eu/eurostat/statistics-expla...,Africa’s main trade in goods partner is the EU...,Africa’s main trade in goods partner is the EU...
4,4,This article presents recent statistics on th...,"[{'content': '', 'title': 'Table 4 is availa...","['Asylum and migration', 'Population', 'Acquis...","Within the European Commission, the Directorat...",Eurostat produces statistics on a range of is...,[{'title': 'Acquisitions of citizenship 15 Mar...,"[{'content': ' In 2019, 706 400 people obta...",,Acquisition of citizenship stat...,https://ec.europa.eu/eurostat/statistics-expla...,EU-27 Member States granted citizenship to 706...,EU-27 Member States granted citizenship to 706...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,620,Ageing Europe — looking at the lives of older ...,[{'content': 'Within the EU survey on income a...,"['Statistical article', 'Poverty and social ex...",,,[{'title': '06 Ageing Europe Social life and o...,[{'content': ' People at work often exert t...,,Ageing Europe - statistics on s...,https://ec.europa.eu/eurostat/statistics-expla...,Physical activity of older people. Older peopl...,Physical activity of older people. People at w...
621,621,Ageing Europe — looking at the lives of older ...,,"['Statistical article', 'Labour market', 'Acci...",,,[{'title': '04 Ageing Europe Working and movin...,"[{'content': ' In 2019, there were 200.0 mi...",,Ageing Europe - statistics on w...,https://ec.europa.eu/eurostat/statistics-expla...,Employment patterns among older people. Focus ...,Employment patterns among older people. In 201...
622,622,Ageing Europe — looking at the lives of older ...,,"['Statistical article', 'Population', 'Populat...",,,[{'title': '01 Ageing Europe Population develo...,[{'content': ' Population ageing will rapi...,,Ageing Europe - statistics on p...,https://ec.europa.eu/eurostat/statistics-expla...,Older people — population overview. Older peop...,Older people — population overview. Population...
623,623,Ageing Europe — looking at the lives of older ...,[{'content': 'Material deprivation is the enfo...,"['Statistical article', 'Housing', 'Living con...",,,[{'title': '02 Ageing Europe Housing and livin...,[{'content': ' Recent decades have been cha...,,Ageing Europe - statistics on h...,https://ec.europa.eu/eurostat/statistics-expla...,Household composition among older people. Olde...,Household composition among older people. Rece...


### Merge with main file

Merge on URL (inner join).

In [9]:
dat5 = pd.merge(dat4,categories[['categories','url']],on='url',how='inner')

import datetime

def file_name(pre,ext):
    current_time = datetime.datetime.now() 
    return pre + '_'+ str(current_time.month)+ '_' + str(current_time.day) + \
                 '_' + str(current_time.hour)+ '_' + str(current_time.minute)  +'.'+ext
    
dat5

Unnamed: 0,id,context,title,abstract,url,last_update,raw content,categories
0,1,The COVID-19 pandemic hit Europe in January an...,Absences from work - quarterly statist...,Absences from work can be classified into two...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-14 16:09:00,Absences from work sharply increase in first ...,"['Employment', 'Labour market', 'Statistical a..."
1,10,The importance of action to prevent accidents ...,Accidents and injuries statistics,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-16 14:36:00,"Deaths from accidents, injuries and assault ...","['Health', 'Health status', 'Statistical artic..."
2,39,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 ...","['Accidents at work', 'Health', 'Health and sa..."
3,50,"A safe, healthy working environment is a cruci...",Accidents at work - statistics by econ...,This article presents a set of main statistic...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-07 17:31:00,Developments over time Non-fatal accidents ...,"['Accidents at work', 'Health', 'Health and sa..."
4,56,Trade is an important indicator of Europeâs ...,Africa-EU - international trade in goo...,This article provides a picture of internation...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-29 10:45:00,Africaâs main trade in goods partner is the...,"['Non-EU countries', 'Trade in goods', 'Statis..."
...,...,...,...,...,...,...,...,...
614,4970,Trade is an important indicator of Europeâs ...,EU trade in COVID-19 related products ...,To help prevent the spread of the COVID-19 pa...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-31 13:04:00,Sharp increase in COVID-19 related imports in...,"['International trade', 'Trade in goods', 'Tra..."
615,4972,Having a secure supply of energy is crucial fo...,EU imports of energy products - recent...,This article provides a picture of trade in en...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-10 11:23:00,Overview The latest figures show the upward ...,"['Energy', 'Trade in goods', 'Trade in goods b..."
616,4987,Economic and financial statistics have become ...,European Neighbourhood Policy - South ...,This article is part of an online publication ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-22 11:47:00,Current account The current account of the ...,"['Balance of payments', 'European Neighbourhoo..."
617,4989,,Ageing Europe - statistics on pensions...,Ageing Europe â looking at the lives of olde...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-28 08:48:00,Pensions The transition for individuals from...,"['Statistical article', 'Social protection', '..."


In [10]:
del(add_content)
del(dat3,add_content_grouped)
del(dat4,categories)

### Create date information

In [11]:
dat5['new_date'] = [d.date() for d in dat5['last_update']]  
dat5['year'] = dat5['last_update'].dt.year
dat5['year'] =dat5["year"].astype(str)
#pd.set_option('precision', 0)

dat5.replace('', np.nan, inplace=True)

dat5['year'].fillna(value="Not found", inplace=True)

print(dat5.isnull().sum(),'\n')

dat5.reset_index(drop=True,inplace=True)
dat5

id              0
context        65
title           0
abstract        0
url             0
last_update     0
raw content     0
categories      0
new_date        0
year            0
dtype: int64 



Unnamed: 0,id,context,title,abstract,url,last_update,raw content,categories,new_date,year
0,1,The COVID-19 pandemic hit Europe in January an...,Absences from work - quarterly statist...,Absences from work can be classified into two...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-14 16:09:00,Absences from work sharply increase in first ...,"['Employment', 'Labour market', 'Statistical a...",2021-04-14,2021
1,10,The importance of action to prevent accidents ...,Accidents and injuries statistics,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-16 14:36:00,"Deaths from accidents, injuries and assault ...","['Health', 'Health status', 'Statistical artic...",2020-09-16,2020
2,39,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 ...","['Accidents at work', 'Health', 'Health and sa...",2020-11-26,2020
3,50,"A safe, healthy working environment is a cruci...",Accidents at work - statistics by econ...,This article presents a set of main statistic...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-07 17:31:00,Developments over time Non-fatal accidents ...,"['Accidents at work', 'Health', 'Health and sa...",2020-12-07,2020
4,56,Trade is an important indicator of Europeâs ...,Africa-EU - international trade in goo...,This article provides a picture of internation...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-29 10:45:00,Africaâs main trade in goods partner is the...,"['Non-EU countries', 'Trade in goods', 'Statis...",2021-04-29,2021
...,...,...,...,...,...,...,...,...,...,...
614,4970,Trade is an important indicator of Europeâs ...,EU trade in COVID-19 related products ...,To help prevent the spread of the COVID-19 pa...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-31 13:04:00,Sharp increase in COVID-19 related imports in...,"['International trade', 'Trade in goods', 'Tra...",2021-03-31,2021
615,4972,Having a secure supply of energy is crucial fo...,EU imports of energy products - recent...,This article provides a picture of trade in en...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-10 11:23:00,Overview The latest figures show the upward ...,"['Energy', 'Trade in goods', 'Trade in goods b...",2021-06-10,2021
616,4987,Economic and financial statistics have become ...,European Neighbourhood Policy - South ...,This article is part of an online publication ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-22 11:47:00,Current account The current account of the ...,"['Balance of payments', 'European Neighbourhoo...",2021-06-22,2021
617,4989,,Ageing Europe - statistics on pensions...,Ageing Europe â looking at the lives of olde...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-28 08:48:00,Pensions The transition for individuals from...,"['Statistical article', 'Social protection', '...",2020-09-28,2020


## D. Add themes / sub-themes information in the articles
***

* We create dictionary _themes_ manually.
* Dictionary _dict_categories_ is used for debugging. The keys are the categories found in the SE articles and the values are the corresponding article ids.
* Each article will have a list of themes and corresponding sub-themes, potentially empty. If the article has a category which is a key of _themes_ the theme is added to the first list. If it has a category which is in one of the values of _themes_ i.e. it is a sub-theme, the corresponding key (theme) is added to the first list and the sub-theme is added to the second list.
* There are relatively few articles without such information:
    * No info in themes:  51
    * No info in sub_themes:  90

In [12]:
import ast

themes = {'General and regional statistics/EU policies':
          ['Non-EU countries','Regions and cities','Sustainable development goals',
          'Policy indicators'],
          'Economy and finance': 
          ['Balance of payments','Comparative price levels (PPPs)','Consumer prices',
           'Exchange rates and interest rates','Government finance','National accounts (incl. GDP)'],
          'Population and social conditions':
          ['Asylum and migration','Crime','Culture','Education and training','Health',
           'Labour market','Living conditions','Population','Social protection','Sport','Youth'],
          'Industry and services': ['Short-term business statistics','Structural business statistics',
                                    'Business registers','Globalisation in businesses','Production statistics',
                                    'Tourism'],
          'Agriculture, forestry and fisheries':['Agriculture','Fisheries','Forestry'],
          'International trade':['Goods','Services'],
          'Transport':['Transport'],
          'Environment and energy':['Energy','Environment'],
          'Science, technology and digital society':['Digital economy and society','Science and technology']}

dict_categories=dict()

for i in range(len(dat5)):
    
    cats=dat5.loc[i,'categories']
    cats = [i.split('/')[-1].strip() for i in ast.literal_eval(cats)]
    cats = [cat.strip() for cat in cats]
        
    for cat in cats:
        if cat in dict_categories.keys():
            dict_categories[cat].append(dat5.loc[i,'id'])
        else:
            dict_categories[cat] = [dat5.loc[i,'id']]



dat5['themes'] = pd.Series([set() for i in range(len(dat5))])
dat5['sub_themes'] = pd.Series([set() for i in range(len(dat5))])
for i in range(len(dat5)):
    
    cats=dat5.loc[i,'categories']
    cats = [i.split('/')[-1].strip() for i in ast.literal_eval(cats)]
    cats = [cat.strip() for cat in cats]
    #dat5.loc[i,'themes'] = set()
    #dat5.loc[i,'sub_themes'] = set()

    for cat in cats:
        if cat in themes.keys():
            #print('found theme: ',cat)
            dat5.loc[i,'themes'].add(cat)
            #dat5.loc[i,'sub_themes'].add(cat)
        else:
            for theme in themes.keys():
                if cat in themes[theme]:
                    #print('i=',i,' cat = ',cat,' theme= ',theme)
                    dat5.loc[i,'themes'].add(theme)
                    dat5.loc[i,'sub_themes'].add(cat)
    
dat5['themes'] = dat5['themes'].apply(lambda x: ';'.join(x))    
dat5['sub_themes'] = dat5['sub_themes'].apply(lambda x: ';'.join(x))    

dat5

Unnamed: 0,id,context,title,abstract,url,last_update,raw content,categories,new_date,year,themes,sub_themes
0,1,The COVID-19 pandemic hit Europe in January an...,Absences from work - quarterly statist...,Absences from work can be classified into two...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-14 16:09:00,Absences from work sharply increase in first ...,"['Employment', 'Labour market', 'Statistical a...",2021-04-14,2021,Population and social conditions,Labour market
1,10,The importance of action to prevent accidents ...,Accidents and injuries statistics,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-16 14:36:00,"Deaths from accidents, injuries and assault ...","['Health', 'Health status', 'Statistical artic...",2020-09-16,2020,Population and social conditions,Health
2,39,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 ...","['Accidents at work', 'Health', 'Health and sa...",2020-11-26,2020,Population and social conditions,Health;Labour market
3,50,"A safe, healthy working environment is a cruci...",Accidents at work - statistics by econ...,This article presents a set of main statistic...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-07 17:31:00,Developments over time Non-fatal accidents ...,"['Accidents at work', 'Health', 'Health and sa...",2020-12-07,2020,Population and social conditions,Health;Labour market
4,56,Trade is an important indicator of Europeâs ...,Africa-EU - international trade in goo...,This article provides a picture of internation...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-29 10:45:00,Africaâs main trade in goods partner is the...,"['Non-EU countries', 'Trade in goods', 'Statis...",2021-04-29,2021,International trade;General and regional stati...,Non-EU countries
...,...,...,...,...,...,...,...,...,...,...,...,...
614,4970,Trade is an important indicator of Europeâs ...,EU trade in COVID-19 related products ...,To help prevent the spread of the COVID-19 pa...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-31 13:04:00,Sharp increase in COVID-19 related imports in...,"['International trade', 'Trade in goods', 'Tra...",2021-03-31,2021,International trade,
615,4972,Having a secure supply of energy is crucial fo...,EU imports of energy products - recent...,This article provides a picture of trade in en...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-10 11:23:00,Overview The latest figures show the upward ...,"['Energy', 'Trade in goods', 'Trade in goods b...",2021-06-10,2021,Environment and energy,Energy
616,4987,Economic and financial statistics have become ...,European Neighbourhood Policy - South ...,This article is part of an online publication ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-22 11:47:00,Current account The current account of the ...,"['Balance of payments', 'European Neighbourhoo...",2021-06-22,2021,General and regional statistics/EU policies;Ec...,Balance of payments;Non-EU countries
617,4989,,Ageing Europe - statistics on pensions...,Ageing Europe â looking at the lives of olde...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-28 08:48:00,Pensions The transition for individuals from...,"['Statistical article', 'Social protection', '...",2020-09-28,2020,Population and social conditions,Social protection


## E. Tokenize and stem the articles titles, contexts, abstracts and contents
***

In [13]:
#Stemming.

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.porter import PorterStemmer

p = PorterStemmer()

def text_to_words(text):
    words = str(gensim.utils.simple_preprocess(text, deacc=True))
    words = remove_stopwords(words) 
    words = gensim.utils.tokenize(words)
        
    ## keep also original token!!! 
    words = [p.stem(token) for token in words]  
    #print(words)
    return ' '.join(words)        

for i in range(len(dat5)):
    dat5.loc[i,'title tokens']=text_to_words(dat5.loc[i,'title'])
    if not pd.isnull(dat5.loc[i,'context']):
        dat5.loc[i,'context tokens']=text_to_words(dat5.loc[i,'context'])
    else:
        dat5.loc[i,'context tokens']=''
    dat5.loc[i,'abstract tokens']=text_to_words(dat5.loc[i,'abstract'])
    dat5.loc[i,'raw content tokens']=text_to_words(dat5.loc[i,'raw content'])

dat5

Unnamed: 0,id,context,title,abstract,url,last_update,raw content,categories,new_date,year,themes,sub_themes,title tokens,context tokens,abstract tokens,raw content tokens
0,1,The COVID-19 pandemic hit Europe in January an...,Absences from work - quarterly statist...,Absences from work can be classified into two...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-14 16:09:00,Absences from work sharply increase in first ...,"['Employment', 'Labour market', 'Statistical a...",2021-04-14,2021,Population and social conditions,Labour market,absenc from work quarterli statist,the covid pandem hit europ in januari and febr...,absenc from work can be classifi into two grou...,absenc from work sharpli increas in first half...
1,10,The importance of action to prevent accidents ...,Accidents and injuries statistics,This article presents an overview of European ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-16 14:36:00,"Deaths from accidents, injuries and assault ...","['Health', 'Health status', 'Statistical artic...",2020-09-16,2020,Population and social conditions,Health,accid and injuri statist,the import of action to prevent accid and inju...,thi articl present an overview of european uni...,death from accid injuri and assault in there w...
2,39,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 ...","['Accidents at work', 'Health', 'Health and sa...",2020-11-26,2020,Population and social conditions,Health;Labour market,accid at work statist,safe healthi work environ is crucial factor in...,thi articl present set of main statist find in...,number of accid in there were million non fata...
3,50,"A safe, healthy working environment is a cruci...",Accidents at work - statistics by econ...,This article presents a set of main statistic...,https://ec.europa.eu/eurostat/statistics-expla...,2020-12-07 17:31:00,Developments over time Non-fatal accidents ...,"['Accidents at work', 'Health', 'Health and sa...",2020-12-07,2020,Population and social conditions,Health;Labour market,accid at work statist by econom activ,safe healthi work environ is crucial factor in...,thi articl present set of main statist find in...,develop over time non fatal accid in there wer...
4,56,Trade is an important indicator of Europeâs ...,Africa-EU - international trade in goo...,This article provides a picture of internation...,https://ec.europa.eu/eurostat/statistics-expla...,2021-04-29 10:45:00,Africaâs main trade in goods partner is the...,"['Non-EU countries', 'Trade in goods', 'Statis...",2021-04-29,2021,International trade;General and regional stati...,Non-EU countries,africa eu intern trade in good statist,trade is an import indic of europea prosper an...,thi articl provid pictur of intern trade in go...,africaa main trade in good partner is the eu i...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614,4970,Trade is an important indicator of Europeâs ...,EU trade in COVID-19 related products ...,To help prevent the spread of the COVID-19 pa...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-31 13:04:00,Sharp increase in COVID-19 related imports in...,"['International trade', 'Trade in goods', 'Tra...",2021-03-31,2021,International trade,,eu trade in covid relat product,trade is an import indic of europea prosper an...,to help prevent the spread of the covid pandem...,sharp increas in covid relat import in april a...
615,4972,Having a secure supply of energy is crucial fo...,EU imports of energy products - recent...,This article provides a picture of trade in en...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-10 11:23:00,Overview The latest figures show the upward ...,"['Energy', 'Trade in goods', 'Trade in goods b...",2021-06-10,2021,Environment and energy,Energy,eu import of energi product recent develop,have secur suppli of energi is crucial for the...,thi articl provid pictur of trade in energi pr...,overview the latest figur show the upward tren...
616,4987,Economic and financial statistics have become ...,European Neighbourhood Policy - South ...,This article is part of an online publication ...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-22 11:47:00,Current account The current account of the ...,"['Balance of payments', 'European Neighbourhoo...",2021-06-22,2021,General and regional statistics/EU policies;Ec...,Balance of payments;Non-EU countries,european neighbourhood polici south statist on...,econom and financi statist have becom on of th...,thi articl is part of an onlin public and prov...,current account the current account of the eu ...
617,4989,,Ageing Europe - statistics on pensions...,Ageing Europe â looking at the lives of olde...,https://ec.europa.eu/eurostat/statistics-expla...,2020-09-28 08:48:00,Pensions The transition for individuals from...,"['Statistical article', 'Social protection', '...",2020-09-28,2020,Population and social conditions,Social protection,ag europ statist on pension incom and expenditur,,ag europ look at the live of older peopl in th...,pension the transit for individu from work to ...


In [14]:
print('No info in themes: ',sum(dat5['themes']==''))
print('No info in sub_themes: ',sum(dat5['sub_themes']==''))

outfile = file_name('dat5','xlsx')
dat5.to_excel(outfile)


No info in themes:  51
No info in sub_themes:  90


## F. Facets
***
### The function filtering the results.

In [15]:

def articles(Top_articles, Keywords, Categories, themes_dd, sub_themes_dd,year):

    Keywords = text_to_words(Keywords) 
    print(Keywords)
    
    idx = np.where(dat5['title tokens'].str.contains(Keywords) | dat5['raw content tokens'].str.contains(Keywords) |  
               dat5['context tokens'].str.contains(Keywords) | dat5['abstract tokens'].str.contains(Keywords))
    df1 = dat5.loc[idx]
    
    if year != "All years":    
        df1 = df1[df1['year'].str.contains(year)]    

    if Categories != "All categories":
        df1 = df1[df1['categories'].str.contains(Categories)]
        
        
    if themes_dd != "All themes":
        df1 = df1[df1['themes'].str.contains(themes_dd)]

    if sub_themes_dd:    
        df1 = df1[df1['sub_themes'].str.contains(sub_themes_dd)]

    if df1.title.count() == 0:
        print("No matches found")
    else:
        print(df1.title.count()," articles found")
        h = '<br/>'.join(['<a href="' + l + '" target="_blank">'+ n for l,n in df1[["url","title"]][0:Top_articles].values])
        
        return display(HTML( h))
  
    

### The widgets.

In [16]:
from IPython.display import display
from ipywidgets import HTML
layout = widgets.Layout(width='500px', height='30px')


def query_build2(themes):
    style = {'description_width': 'initial'}

    Categories = widgets.Dropdown(
        description='Select Category:',
        options=['All categories'] + sorted(list(dict_categories.keys())),
        style=style
    )
    
    themes_dd = widgets.Dropdown(
        description='Select Theme:',
        options=['All themes']+sorted([k for k in themes.keys()]),
        style=style
    )    
    
    sub_themes_dd = widgets.Dropdown(
        description='Select Sub-theme:',
        options= [''],
        style=style
    )    
    

    def on_change(change):
        if change['type'] == 'change' and change['name'] == 'value':
            #print("changed to %s" % change['new'])
            if change['new'] == 'All themes':
                sub_themes_dd.options = []
            else:    
                sub_themes_dd.options = themes[themes_dd.value]
                #Article_ids.value = str(np.where(dat5['themes']==themes_dd.value))
                
            

    themes_dd.observe(on_change)

 
    Top_articles = widgets.IntSlider(
        description='Display',
        tooltip='maximum:',
        value=20,
        min=1, 
        max = 30,
        style={'description_width': 'initial'}
    )


    Keywords = widgets.Text(
        value='',
        placeholder='Type something',
        description='Keywords:',
        disabled=False
    )
    

    year = widgets.Dropdown(
        options=['All years','2021','2020','2019','2018','Not found'],
        value='All years',
        description='Year:',
        disabled=False)

    out = widgets.interactive_output(articles, {'Top_articles': Top_articles, 'Keywords': Keywords, 'Categories':Categories,
                                              'themes_dd':themes_dd,'sub_themes_dd':sub_themes_dd,'year':year})

    left_box = widgets.VBox([themes_dd, sub_themes_dd])
    right_box = widgets.VBox([Keywords,Categories])
    box = widgets.HBox([left_box, right_box])
    display(box)
    
    display(year, out, Top_articles)

query_build2(themes)

HBox(children=(VBox(children=(Dropdown(description='Select Theme:', options=('All themes', 'Agriculture, fores…

Dropdown(description='Year:', options=('All years', '2021', '2020', '2019', '2018', 'Not found'), value='All y…

Output()

IntSlider(value=20, description='Display', max=30, min=1, style=SliderStyle(description_width='initial'))