# Use Case A - Faceted search 
***
### In the Statistics Explained articles using also themes, sub-themes and categories

### Adjusted (May 2022) to read all related resources from the Knowledge Database (and not only related SE articles)

### Installation instructions

* This is a Google Colab notebook. You must have a Google account. 
* Launch the notebook and put your own credentials in the chunk with title "Connect to the Virtuoso database".

In [1]:
!pip install pyodbc

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyodbc
  Downloading pyodbc-4.0.32.tar.gz (280 kB)
[K     |████████████████████████████████| 280 kB 5.1 MB/s 
[?25hBuilding wheels for collected packages: pyodbc
  Building wheel for pyodbc (setup.py) ... [?25l[?25hdone
  Created wheel for pyodbc: filename=pyodbc-4.0.32-cp37-cp37m-linux_x86_64.whl size=287352 sha256=cc44d0ef083262cded05df8e80e226b9670ad0cb98f6732f8106a1ddf36df746
  Stored in directory: /root/.cache/pip/wheels/2e/9c/da/8652fd42e0f662015554f00a9e96fe4f438dfd1ef59787879e
Successfully built pyodbc
Installing collected packages: pyodbc
Successfully installed pyodbc-4.0.32


In [2]:
!pip install SPARQLWrapper
!pip install sparql_dataframe

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1
  Downloading rdflib-6.1.1-py3-none-any.whl (482 kB)
[K     |████████████████████████████████| 482 kB 8.1 MB/s 
Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 545 kB/s 
Installing collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 rdflib-6.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sparql_dataframe
  Downloading sparql_dataframe-0.4-py3-none-any.whl (3.5 kB)
Installing collected packages: sparql-dataframe
Successfully installed sparql-dataframe-0.4


In [3]:
!apt-get install virtuoso-opensource

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libvirtodbc0 virtuoso-opensource-6.1 virtuoso-opensource-6.1-bin
  virtuoso-opensource-6.1-common virtuoso-server virtuoso-vad-conductor
  virtuoso-vsp-startpage
Suggested packages:
  virtuoso-vad-doc virtuoso-vad-demo virtuoso-vad-tutorial
  virtuoso-vad-rdfmappers virtuoso-vad-sparqldemo virtuoso-vad-syncml
  virtuoso-vad-bpel virtuoso-vad-isparql virtuoso-vad-ods virtuoso-vad-dbpedia
  virtuoso-vad-facetedbrowser
The following NEW packages will be installed:
  libvirtodbc0 virtuoso-opensource virtuoso-opensource-6.1
  virtuoso-opensource-6.1-bin virtuoso-opensource-6.1-common virtuoso-server
  virtuoso-vad-conductor virtuoso-vsp-startpage
0 upgraded, 8 newly installed, 0 to remove and 49 not upgraded.


In [4]:
import pandas as pd
import numpy as np

import ipywidgets as widgets

import pyodbc
import gensim

import os 
import re
import logging
import sys
import hashlib
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe

### The data cleansing function

In [5]:
import re
import unicodedata as ud

def clean(x, quotes=True):
    if pd.isnull(x): return x  
    x = x.strip()
    
    ## make letter-question mark-letter -> letter-quote-space-letter !!! but NOT in the lists of URLs!!!
    if quotes:
        x = re.sub(r'([A-Za-z])\?([A-Za-z])','\\1\' \\2',x) ## NEW
    
    ## make letter-question mark-space lower case letter letter-quote-space letter
    x = re.sub(r'([A-Za-z])\? ([a-z])','\\1\' \\2',x) ## NEW

    ## delete ,000 commas in numbers    
    x = re.sub(r'\b(\d+),(\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## delete  000 spaces in numbers
    x = re.sub(r'\b(\d+) (\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## remove more than one spaces
    x = re.sub(r' +', ' ',x)
    
    ## remove start and end spaces
    x = re.sub(r'^ +| +$', '',x,flags=re.MULTILINE) 
    
    ## space-comma -> comma
    x = re.sub(r' \,',',',x)
    
    ## space-dot -> dot
    x = re.sub(r' \.','.',x)
    
    #x = x.encode('latin1').decode('utf-8') ## â\x80\x99
    x = ud.normalize('NFKD',x).encode('ascii', 'ignore').decode()
    
    return x

###Connect to the Virtuoso database

In [6]:
user = 'xxxxx'
passw = 'xxxxx'

In [7]:
c = pyodbc.connect('DRIVER=/usr/lib/odbc/virtodbc.so;HOST=lod.csd.auth.gr:1111;UID='+user+';PWD='+passw+';DATABASE=ESTAT')

In [8]:
#set encoding
c.setdecoding(pyodbc.SQL_CHAR, encoding='latin-1')
c.setencoding(encoding="latin-1")
cursor = c.cursor()

In [9]:
def load_table(cursor,query):
  cursor.execute(query)
  t1 = cursor.fetchall()
  df = pd.DataFrame.from_records(t1, columns=[x[0] for x in cursor.description])
  return df

In [10]:
def connect_virtuoso(DSN, UID, PWD):

    sparql = SPARQLWrapper(DSN)
    sparql.setHTTPAuth(DIGEST)
    sparql.setCredentials(UID, PWD)
    sparql.setMethod(GET)

    return sparql

# Connection to the KDB 
endpoint = "http://lod.csd.auth.gr:8890/sparql/"
sparql = connect_virtuoso(endpoint,user,passw)

## A. Import Statistics Explained data from the database
***

* Id, context and last update from table dat_article.  
* Title and url from table dat_link_info, on matching id and resource_information_id=1 (i.e. Eurostat).
* Abstract from field content in table dat_article_paragraph, on matching article_id and abstract=1 ("yes").
* Apply data cleansing.

In [11]:
query =      """SELECT T1.id, T1.context, T1.last_update, T2.title, T2.url, T3.content 
                FROM ESTAT.V1.dat_article as T1 
                INNER JOIN ESTAT.V1.dat_link_info as T2  
                  ON T1.id=T2.id  
                INNER JOIN ESTAT.V1.dat_article_paragraph as T3  
                  ON T2.id=T3.article_id  
                WHERE T2.resource_information_id=1 AND T3.abstract=1"""

SE_df = load_table(cursor,query)
SE_df.rename(columns={'content':'abstract'},inplace=True)
SE_df = SE_df[['id','context','title','abstract','url','last_update']]

SE_df['context'] = SE_df['context'].apply(clean)
SE_df['title'] = SE_df['title'].apply(clean)
SE_df['title'] = SE_df['title'].apply(lambda x: re.sub(r'\?','-',x)) ## also replace question marks by dashes in titles
SE_df['abstract'] = SE_df['abstract'].apply(clean)

SE_df.head(5)

Unnamed: 0,id,context,title,abstract,url,last_update
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,2021-06-25 18:31:00
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-27 18:19:00
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:30:00


### Paragraph titles and contents

* From the dat_article_paragraph table with abstract=0 and matching article_id.
* Apply data cleansing.

In [12]:
query =      """SELECT article_id, title, content 
                FROM ESTAT.V1.dat_article_paragraph
                WHERE abstract=0 AND article_id IN (SELECT id FROM ESTAT.V1.dat_article) """

add_content = load_table(cursor,query)
add_content.sort_values(by=['article_id'],inplace=True)
add_content['title'] = add_content['title'].apply(clean)
add_content['content'] = add_content['content'].apply(clean)
add_content.head(5)

Unnamed: 0,article_id,title,content
9,7,Number of accidents,"In 2018, there were 3.1 million non-fatal acci..."
10,7,Incidence rates,An alternative way to analyse the information ...
11,7,Standardised incidence rates,"When comparing data between countries, inciden..."
12,7,Analysis by activity,"As noted above, one of the main reasons why th..."
13,7,Analysis by type of injury,Figure 6 presents an analysis of data accordin...


### Aggregate the above paragraph titles and contents  

* Create a column _raw content_ which gathers all paragraph titles and contents in one text per article.

In [13]:
add_content_grouped = add_content.groupby(['article_id'])[['title','content']].aggregate(lambda x: list(x))
add_content_grouped.reset_index(drop=False, inplace=True)
for i in range(len(add_content_grouped)):
    add_content_grouped.loc[i,'raw content'] = ''
    for (a,b) in zip(add_content_grouped.loc[i,'title'],add_content_grouped.loc[i,'content']):
        add_content_grouped.loc[i,'raw content'] += ' '+a + ' ' + b
add_content_grouped = add_content_grouped[['article_id','raw content']]    

add_content_grouped.head(5)

Unnamed: 0,article_id,raw content
0,7,"Number of accidents In 2018, there were 3.1 m..."
1,13,Household consumption Consumption expenditure...
2,16,Suicides on railways Suicides occurring on th...
3,17,Geographical location plays a key role in the...
4,18,Number of passengers transported by rail incr...


### Merge the raw content of the SE articles with the main file

In [14]:
SE_df = pd.merge(SE_df,add_content_grouped,left_on='id',right_on='article_id',how='inner')
SE_df.drop(['article_id'],axis=1,inplace=True)

SE_df.head(5)

Unnamed: 0,id,context,title,abstract,url,last_update,raw content
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 m..."
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Household consumption Consumption expenditure...
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,2021-06-25 18:31:00,Suicides on railways Suicides occurring on th...
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-27 18:19:00,Geographical location plays a key role in the...
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:30:00,Number of passengers transported by rail incr...


In [15]:
del(add_content_grouped)

## B. Read categories from the database
***


In [16]:
import ast

query =      """SELECT article_id, categories 
                FROM ESTAT.V1.SE_articles_categories """

categories = load_table(cursor,query)
categories['categories']=categories['categories'].apply(ast.literal_eval)
categories

Unnamed: 0,article_id,categories
0,7,"[Accidents at work, Health, Health and safety,..."
1,13,"[National accounts (incl. GDP), Statistical ar..."
2,16,"[Rail, Statistical article, Transport, Transpo..."
3,17,"[Freight, Rail, Statistical article, Transport]"
4,18,"[Passengers, Rail, Statistical article, Transp..."
...,...,...
600,9472,"[International trade, Trade in goods, Trade in..."
601,9477,"[Trade in goods, Statistical article]"
602,9479,"[Trade in goods, Statistical article, Internat..."
603,9492,"[Household composition and family situation, L..."


### Merge with the main file

In [17]:
SE_df = pd.merge(SE_df,categories,left_on='id',right_on='article_id',how='inner')
SE_df

Unnamed: 0,id,context,title,abstract,url,last_update,raw content,article_id,categories
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 m...",7,"[Accidents at work, Health, Health and safety,..."
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Household consumption Consumption expenditure...,13,"[National accounts (incl. GDP), Statistical ar..."
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,2021-06-25 18:31:00,Suicides on railways Suicides occurring on th...,16,"[Rail, Statistical article, Transport, Transpo..."
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-27 18:19:00,Geographical location plays a key role in the...,17,"[Freight, Rail, Statistical article, Transport]"
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:30:00,Number of passengers transported by rail incr...,18,"[Passengers, Rail, Statistical article, Transp..."
...,...,...,...,...,...,...,...,...,...
600,9472,Trade is an important indicator of Europeas pr...,EU trade in COVID-19 related products,To help prevent the spread of the COVID-19 pan...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-31 13:04:00,Sharp increase in COVID-19 related imports in...,9472,"[International trade, Trade in goods, Trade in..."
601,9477,Trade is an important indicator of Europeas pr...,EU international trade in goods - latest devel...,This article provides a picture of the interna...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-02 16:55:00,Extra-EU trade by product: Strongest fluctuat...,9477,"[Trade in goods, Statistical article]"
602,9479,Trade is an important indicator of Europeas pr...,EU and main world traders,International trade a especially the size and ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-10-07 15:19:00,"Main world traders: EU, USA and China In 2019...",9479,"[Trade in goods, Statistical article, Internat..."
603,9492,"In addition to the Labour Force Survey (LFS), ...",Age of young people leaving their parental hou...,Leaving the parental home is considered as a m...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-30 14:54:00,Geographical differences Map 1 indicates that...,9492,"[Household composition and family situation, L..."


In [18]:
del(categories)

### Exract last update year

* And check missing values.

In [19]:
SE_df['new_date'] = [d.date() for d in SE_df['last_update']]  
SE_df['year'] = SE_df['last_update'].dt.year
SE_df['year'] =SE_df["year"].astype(str)

SE_df.replace('', np.nan, inplace=True)

SE_df['year'].fillna(value="Not found", inplace=True)

print(SE_df.isnull().sum(),'\n')

SE_df.reset_index(drop=True,inplace=True)
SE_df.drop(columns=['last_update','new_date'],inplace=True)
SE_df.head(5)

id              0
context        64
title           0
abstract        9
url             0
last_update     0
raw content     0
article_id      0
categories      0
new_date        0
year            0
dtype: int64 



Unnamed: 0,id,context,title,abstract,url,raw content,article_id,categories,year
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,"Number of accidents In 2018, there were 3.1 m...",7,"[Accidents at work, Health, Health and safety,...",2020
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,Household consumption Consumption expenditure...,13,"[National accounts (incl. GDP), Statistical ar...",2021
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,Suicides on railways Suicides occurring on th...,16,"[Rail, Statistical article, Transport, Transpo...",2021
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,Geographical location plays a key role in the...,17,"[Freight, Rail, Statistical article, Transport]",2020
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,Number of passengers transported by rail incr...,18,"[Passengers, Rail, Statistical article, Transp...",2021


## C. Add themes / sub-themes information in the articles
***

* We create dictionary _themes_ manually.
* Dictionary _dict_categories_ is used for debugging. The keys are the categories found in the SE articles and the values are the corresponding article ids.
* Each article will have a list of themes and corresponding sub-themes, potentially empty. If an article has a category which is a key of _themes_ the theme is added to the first list. If it has a category which is in one of the values of _themes_ i.e. it is a sub-theme, the corresponding key (theme) is added to the first list and the sub-theme is added to the second list.
* There are relatively few articles without such information, see below.


In [20]:
import ast

themes = {'General and regional statistics/EU policies':
          ['Non-EU countries','Regions and cities','Sustainable development goals',
          'Policy indicators'],
          'Economy and finance': 
          ['Balance of payments','Comparative price levels (PPPs)','Consumer prices',
           'Exchange rates and interest rates','Government finance','National accounts (incl. GDP)'],
          'Population and social conditions':
          ['Asylum and migration','Crime','Culture','Education and training','Health',
           'Labour market','Living conditions','Population','Social protection','Sport','Youth'],
          'Industry and services': ['Short-term business statistics','Structural business statistics',
                                    'Business registers','Globalisation in businesses','Production statistics',
                                    'Tourism'],
          'Agriculture, forestry and fisheries':['Agriculture','Fisheries','Forestry'],
          'International trade':['Goods','Services'],
          'Transport':[],
          'Environment and energy':['Energy','Environment'],
          'Science, technology and digital society':['Digital economy and society','Science and technology']}

dict_categories=dict()

for i in range(len(SE_df)):
    
    cats=SE_df.loc[i,'categories']
    cats = [cat.strip() for cat in cats]
        
    for cat in cats:
        if cat in dict_categories.keys():
            dict_categories[cat].append(SE_df.loc[i,'id'])
        else:
            dict_categories[cat] = [SE_df.loc[i,'id']]



SE_df['themes'] = pd.Series([set() for i in range(len(SE_df))])
SE_df['sub_themes'] = pd.Series([set() for i in range(len(SE_df))])
for i in range(len(SE_df)):
    
    cats=SE_df.loc[i,'categories']
    cats = [cat.strip() for cat in cats]

    for cat in cats:
        if cat in themes.keys():
            SE_df.loc[i,'themes'].add(cat)
        else:
            for theme in themes.keys():
                if cat in themes[theme]:
                    SE_df.loc[i,'themes'].add(theme)
                    SE_df.loc[i,'sub_themes'].add(cat)
    
SE_df['themes'] = SE_df['themes'].apply(lambda x: ';'.join(x))    
SE_df['sub_themes'] = SE_df['sub_themes'].apply(lambda x: ';'.join(x))    

#SE_df['categories']= SE_df['categories'].apply(lambda x: ';'.join(x))  ## de-comment to produce the input file for R Shiny, 
## i.e. categories not in list but separated by semicolon    

print(SE_df.isnull().sum(),'\n')

print('No info in themes: ',sum(SE_df['themes']==''))
print('No info in sub_themes: ',sum(SE_df['sub_themes']==''))

#SE_df.sort_values(by=['title'],inplace=True)

SE_df.head(5)

id              0
context        64
title           0
abstract        9
url             0
raw content     0
article_id      0
categories      0
year            0
themes          0
sub_themes      0
dtype: int64 

No info in themes:  49
No info in sub_themes:  85


Unnamed: 0,id,context,title,abstract,url,raw content,article_id,categories,year,themes,sub_themes
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,"Number of accidents In 2018, there were 3.1 m...",7,"[Accidents at work, Health, Health and safety,...",2020,Population and social conditions,Labour market;Health
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,Household consumption Consumption expenditure...,13,"[National accounts (incl. GDP), Statistical ar...",2021,Economy and finance,National accounts (incl. GDP)
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,Suicides on railways Suicides occurring on th...,16,"[Rail, Statistical article, Transport, Transpo...",2021,Transport,
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,Geographical location plays a key role in the...,17,"[Freight, Rail, Statistical article, Transport]",2020,Transport,
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,Number of passengers transported by rail incr...,18,"[Passengers, Rail, Statistical article, Transp...",2021,Transport,


## D. Tokenize and stem the articles titles, contexts, abstracts and contents
***

* Also remove stop-words.
* Create columns _title tokens_, _context tokens_, _abstract tokens_, _raw content tokens_.

In [21]:
#Stemming.

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.porter import PorterStemmer

p = PorterStemmer()

def text_to_words(text):
    words = str(gensim.utils.simple_preprocess(text, deacc=True))
    words = remove_stopwords(words) 
    words = gensim.utils.tokenize(words)
        
    words = [p.stem(token) for token in words] 
    
    return ' '.join(words)        

for i in range(len(SE_df)):
    SE_df.loc[i,'title tokens']=text_to_words(SE_df.loc[i,'title'])
    if not pd.isnull(SE_df.loc[i,'context']):
        SE_df.loc[i,'context tokens']=text_to_words(SE_df.loc[i,'context'])
    else:
        SE_df.loc[i,'context tokens']=''
    if not pd.isnull(SE_df.loc[i,'abstract']):        
        SE_df.loc[i,'abstract tokens']=text_to_words(SE_df.loc[i,'abstract'])
    else:
        SE_df.loc[i,'abstract tokens']=''
    SE_df.loc[i,'raw content tokens']=text_to_words(SE_df.loc[i,'raw content'])


SE_df.head(5)

Unnamed: 0,id,context,title,abstract,url,raw content,article_id,categories,year,themes,sub_themes,title tokens,context tokens,abstract tokens,raw content tokens
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,"Number of accidents In 2018, there were 3.1 m...",7,"[Accidents at work, Health, Health and safety,...",2020,Population and social conditions,Labour market;Health,accid at work statist,safe healthi work environ is crucial factor in...,thi articl present set of main statist find in...,number of accid in there were million non fata...
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,Household consumption Consumption expenditure...,13,"[National accounts (incl. GDP), Statistical ar...",2021,Economy and finance,National accounts (incl. GDP),nation account and gdp,european institut govern central bank as well ...,nation account ar the sourc for multitud of we...,household consumpt consumpt expenditur of hous...
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,Suicides on railways Suicides occurring on th...,16,"[Rail, Statistical article, Transport, Transpo...",2021,Transport,,railwai safeti statist in the eu,nation rail network have differ technic specif...,in signific railwai accid were report in the e...,suicid on railwai suicid occur on the railwai ...
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,Geographical location plays a key role in the...,17,"[Freight, Rail, Statistical article, Transport]",2020,Transport,,railwai freight transport statist,the content of thi statist articl is base on d...,thi articl focus on recent rail freight transp...,geograph locat plai kei role in the share of i...
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,Number of passengers transported by rail incr...,18,"[Passengers, Rail, Statistical article, Transp...",2021,Transport,,railwai passeng transport statist quarterli an...,the content of thi statist articl is base on d...,thi articl take look at recent annual and quar...,number of passeng transport by rail increas in...


## E. All related resources from the Knowledge Database
***
* Drop duplicates due to reference type.
* Group by unique URI and put related titles and related URLs in columns with lists.

In [22]:
RelationsStatements = """
    DEFINE input:inference <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/>
    PREFIX estat: <https://ec.europa.eu/eurostat/NLP4StatRef/ontology/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    select ?a ?t ?url ?ref ?ref_type ?ref_t ?ref_uri ?ref_url where { 
    ?a rdf:type estat:StatisticsExplainedArticle .
    ?a estat:title ?t .
    ?a estat:hasURL ?url .
    ?a estat:hasReference ?ref .
    ?ref a ?ref_type .
    ?ref estat:title ?ref_t .
    OPTIONAL {?ref estat:hasURI ?ref_uri} .
    ?ref estat:hasURL ?ref_url .
} 
"""
  
sparql.setQuery(RelationsStatements)
sparql.method = "POST"
sparql.setReturnFormat(JSON)
results = sparql.query().convert()['results']['bindings']
results = pd.json_normalize(results)
print(len(results))
print(results.columns)

results.drop_duplicates(subset=['a.value','t.value','url.value','ref_t.value','ref_url.value'],inplace=True)
print(len(results))

results2 = results.groupby(['a.value']).agg({'t.value': lambda x: x.iloc[0],'url.value': lambda x: x.iloc[0], \
                                             'ref_t.value':lambda x: list(x), 'ref_url.value':lambda x: list(x)}).reset_index()

                                                                            
results2


25000
Index(['a.type', 'a.value', 't.type', 't.value', 'url.type', 'url.datatype',
       'url.value', 'ref.type', 'ref.value', 'ref_type.type', 'ref_type.value',
       'ref_t.type', 'ref_t.value', 'ref_uri.type', 'ref_uri.value',
       'ref_url.type', 'ref_url.datatype', 'ref_url.value'],
      dtype='object')
5982


Unnamed: 0,a.value,t.value,url.value,ref_t.value,ref_url.value
0,https://ec.europa.eu/eurostat/NLP4StatRef/know...,Merging statistics and geospatial information ...,https://ec.europa.eu/eurostat/statistics-expla...,[Final report],[https://circabc.europa.eu/sd/a/acd1a0f9-72c6-...
1,https://ec.europa.eu/eurostat/NLP4StatRef/know...,Statistics in development cooperation coordina...,https://ec.europa.eu/eurostat/statistics-expla...,"[Statistical cooperation, Statistics in develo...",[https://ec.europa.eu/eurostat/statistics-expl...
2,https://ec.europa.eu/eurostat/NLP4StatRef/know...,Statistics in development cooperation data ava...,https://ec.europa.eu/eurostat/statistics-expla...,"[Statistical cooperation, Statistics in develo...",[https://ec.europa.eu/eurostat/statistics-expl...
3,https://ec.europa.eu/eurostat/NLP4StatRef/know...,Statistics in development cooperation interven...,https://ec.europa.eu/eurostat/statistics-expla...,"[Statistical cooperation, Statistics in develo...",[https://ec.europa.eu/eurostat/statistics-expl...
4,https://ec.europa.eu/eurostat/NLP4StatRef/know...,Statistics in development cooperation national...,https://ec.europa.eu/eurostat/statistics-expla...,"[Statistical cooperation, Statistics in develo...",[https://ec.europa.eu/eurostat/statistics-expl...
...,...,...,...,...,...
369,https://ec.europa.eu/eurostat/NLP4StatRef/know...,European Neighbourhood Policy East statistics ...,https://ec.europa.eu/eurostat/statistics-expla...,[ENPE Science Technology Digital Society 2021 ...,[https://ec.europa.eu/eurostat/statistics-expl...
370,https://ec.europa.eu/eurostat/NLP4StatRef/know...,Measuring international trade in services from...,https://ec.europa.eu/eurostat/statistics-expla...,[Balance of Payments and International Investm...,[https://ec.europa.eu/eurostat/statistics-expl...
371,https://ec.europa.eu/eurostat/NLP4StatRef/know...,EU international trade in other business services,https://ec.europa.eu/eurostat/statistics-expla...,[Balance of Payments and International Investm...,[https://ec.europa.eu/eurostat/statistics-expl...
372,https://ec.europa.eu/eurostat/NLP4StatRef/know...,Balance of payments statistics,https://ec.europa.eu/eurostat/statistics-expla...,[Balance of Payments and International Investm...,[https://ec.europa.eu/eurostat/statistics-expl...


In [23]:
del(results)

### Merge with main file



In [None]:
print(SE_df.columns)
print(results2.columns)
SE_df2 = pd.merge(SE_df,results2,how='left',left_on='url',right_on='url.value')
print(SE_df2.columns)

SE_df2.drop(columns=['a.value','t.value','url.value'],inplace=True)
SE_df2.rename(columns={'ref_t.value':'related_titles','ref_url.value':'related_urls'},inplace=True)

SE_df2.sort_values(by=['title'],inplace=True)

SE_df2

In [25]:
del(SE_df,results2)

### Create a column *related_types* from the pattern in column *related_urls*

In [None]:
def recode_list(llist):
  
  new_list=[]
  if not isinstance(llist, list): return new_list
  for el in llist:
    
    if 'Glossary' in el:
      new_list.append('SE GL articles')
    elif 'statistics-explained/index.php' in el:  
      new_list.append('SE articles')
    elif 'eurostat/product' in el:        
      new_list.append('Publications')
    elif 'eur-lex.europa.eu' in el:        
      new_list.append('Legislation')
    else:
      new_list.append('Other')
  return new_list    

SE_df2['related_types'] = SE_df2['related_urls'].apply(lambda x: recode_list(x))
SE_df2


## F. Facets
***
### The function filtering the results.

In [28]:
df1 = None

def articles(Top_articles, Keywords, Categories, themes_dd, sub_themes_dd,year,related): 

    global df1
    Keywords = text_to_words(Keywords) 

    df1 = SE_df2[SE_df2['title tokens'].str.contains(Keywords,regex=False) | SE_df2['raw content tokens'].str.contains(Keywords,regex=False)]
    df1 = df1[df1['context tokens'].str.contains(Keywords,regex=False) | df1['abstract tokens'].str.contains(Keywords,regex=False)]
    
    if year != "All years":    
        df1 = df1[df1['year'].str.contains(year)]    
        
        
    if themes_dd != "All themes":
        df1 = df1[df1['themes'].str.contains(themes_dd,regex=False)]
        

    if sub_themes_dd is not None:
        if sub_themes_dd != "All sub-themes" and sub_themes_dd != "":    
            df1 = df1[df1['sub_themes'].str.contains(sub_themes_dd,regex=False)]
    
    if Categories is not None:
        if Categories != "All categories":
            
            df1 = df1[df1['categories'].apply(lambda x: Categories in x)]
 
          
    if len(df1) == 0:
        print("No matches found")
    else:
        df1.reset_index(inplace=True)
        print(df1.title.count()," articles found")
        h = ''
        for i in range(min(Top_articles,len(df1))):
            l,n = df1.loc[i,["url","title"]].values
            h += '<br><br/><u><b><a href="' + l + '" target="_blank">'+ n + '</a></b></u>'
            if related and len(df1.loc[i,"related_types"]) > 0:
                h += '<blockquote>Related links:'
                for type_group in ['SE articles','SE GL articles','Publications','Legislation','Other']:
                    found = False
                    
                    for k in range(len(df1.loc[i,"related_titles"])):  
                        type_k = df1.loc[i,"related_types"][k]
                        if type_k == type_group:
                            if not found: ## exists, first line
                                h += '<blockquote>'+type_k+'</blockquote>'
                                found = True
                            ## exists, first or next lines   
                            title = df1.loc[i,"related_titles"][k]
                            url = df1.loc[i,"related_urls"][k]
                            h += '<blockquote><blockquote><a href="' + url + '" target="_blank">'+ title + '</a></blockquote></blockquote>'
                h += '</blockquote>'
        display(HTML( h))
  

### The widgets.

In [29]:
from IPython.display import display
from ipywidgets import HTML
layout = widgets.Layout(width='500px', height='500px')

In [30]:
def new_categories():
    if len(df1) > 0:
        res=set()
        for i in range(len(df1)):
            res.update(df1.loc[i,'categories'])
        return sorted(list(res))
    else:
        return []
    
    

def query_build2(themes):
    style = {'description_width': 'initial'}

    Categories = widgets.Dropdown(
        description='Select category:',
        options=['All categories'] + sorted(list(dict_categories.keys())),
        style=style
    )
    
    
    themes_dd = widgets.Dropdown(
        description='Select theme:',
        options=['All themes']+sorted([k for k in themes.keys()]),
        style=style
    )    
    

    def on_change_theme(change):
        if change['type'] == 'change' and change['name'] == 'value':
            if change['new'] == 'All themes':
                sub_themes_dd.options = []
                Categories.options = ['All categories']+sorted(list(dict_categories.keys()))
            else:    
                sub_themes_dd.options = ['All sub-themes']+themes[themes_dd.value]
                
                

    themes_dd.observe(on_change_theme)
    
    sub_themes_dd = widgets.Dropdown(
        description='Select sub-theme:',
        options= [''],
        style=style
    )    
    
    def on_change_subtheme(change):
        if change['type'] == 'change' and change['name'] == 'value':
            Categories.options= []
            Categories.options = ['All categories']+new_categories()

    sub_themes_dd.observe(on_change_subtheme)
 
    Top_articles = widgets.IntSlider(
        description='Display',
        tooltip='maximum:',
        value=20,
        min=1, 
        max = 30,
        style={'description_width': 'initial'}
    )


    Keywords = widgets.Text(
        value='',
        placeholder='Type something',
        description='Keywords:',
        disabled=False
    )
    

    year = widgets.Dropdown(
        options=['All years','2021','2020','2019','2018','Not found'],
        value='All years',
        description='Year:',
        disabled=False)


    
    related = widgets.Checkbox(
        value=False,
        description='Show related links',
        disabled=False,
        indent=True
     )    
    
    out = widgets.interactive_output(articles, {'Top_articles': Top_articles, 'Keywords': Keywords, 'Categories':Categories,
                                              'themes_dd':themes_dd,'sub_themes_dd':sub_themes_dd,'year':year,'related':related})

    print(Top_articles)


    left_box = widgets.VBox([themes_dd, sub_themes_dd])
    right_box = widgets.VBox([Keywords,Categories,related])
    box = widgets.HBox([left_box, right_box])
    display(box)
    
    display(year,Top_articles,out)
    
        
query_build2(themes)

IntSlider(value=20, description='Display', max=30, min=1, style=SliderStyle(description_width='initial'))


HBox(children=(VBox(children=(Dropdown(description='Select theme:', options=('All themes', 'Agriculture, fores…

Dropdown(description='Year:', options=('All years', '2021', '2020', '2019', '2018', 'Not found'), value='All y…

IntSlider(value=20, description='Display', max=30, min=1, style=SliderStyle(description_width='initial'))

Output()