# Prepare data for R Shiny and Power BI
***

### Revised (January 2022) to read all data from the database

### Installation instructions
*    For the setup of the Virtuoso ODBC data source please see section 1a in https://github.com/eurostat/NLP4Stat/tree/testing/Software%20Environment
*    Download the notebook as "raw" file and save it with extension .ipynb (cut the .txt extension which is added)
*    Install the necessary libraries from your jupyter command prompt. These, together with the versions used, are:
    *    pyodbc==4.0.32
    *    pandas==1.3.5
    *    numpy==1.20.3
    *    ipywidgets==7.6.5
    *    gensim==4.1.2
*   Launch the notebook and put your own credentials for access to the Virtuoso database in the call to pyodbc.connect() in the chunk with title "A. Import Statistics Explained data from the database"  

In [1]:
import pandas as pd
import numpy as np

import ipywidgets as widgets

import pyodbc

import gensim


### The data cleansing function

In [2]:
import re
import unicodedata as ud

def clean(x, quotes=True):
    if pd.isnull(x): return x  
    x = x.strip()
    
    ## make letter-question mark-letter -> letter-quote-space-letter !!! but NOT in the lists of URLs!!!
    if quotes:
        x = re.sub(r'([A-Za-z])\?([A-Za-z])','\\1\' \\2',x) ## NEW
    
    ## make letter-question mark-space lower case letter letter-quote-space letter
    x = re.sub(r'([A-Za-z])\? ([a-z])','\\1\' \\2',x) ## NEW

    ## delete ,000 commas in numbers    
    x = re.sub(r'\b(\d+),(\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## delete  000 spaces in numbers
    x = re.sub(r'\b(\d+) (\d+)\b','\\1\\2',x) ## CORRECTED
    
    ## remove more than one spaces
    x = re.sub(r' +', ' ',x)
    
    ## remove start and end spaces
    x = re.sub(r'^ +| +$', '',x,flags=re.MULTILINE) 
    
    ## space-comma -> comma
    x = re.sub(r' \,',',',x)
    
    ## space-dot -> dot
    x = re.sub(r' \.','.',x)
    
    #x = x.encode('latin1').decode('utf-8') ## â\x80\x99
    x = ud.normalize('NFKD',x).encode('ascii', 'ignore').decode()
    
    return x

## A. Import Statistics Explained data from the database
***

* Id, context and last update from table dat_article.  
* Title and url from table dat_link_info, on matching id and resource_information_id=1 (i.e. Eurostat).
* Abstract from field content in table dat_article_paragraph, on matching article_id and abstract=1 ("yes").
* Apply data cleansing.


In [3]:


c = pyodbc.connect('DSN=Virtuoso All;DBA=ESTAT;UID=xxxxx;PWD=xxxxx')
cursor = c.cursor()

SQLCommand = """SELECT T1.id, T1.context, T1.last_update, T2.title, T2.url, T3.content 
                FROM ESTAT.V1.dat_article as T1 
                INNER JOIN ESTAT.V1.dat_link_info as T2  
                  ON T1.id=T2.id  
                INNER JOIN ESTAT.V1.dat_article_paragraph as T3  
                  ON T2.id=T3.article_id  
                WHERE T2.resource_information_id=1 AND T3.abstract=1"""

SE_df = pd.read_sql(SQLCommand,c)
SE_df.rename(columns={'content':'abstract'},inplace=True)
SE_df = SE_df[['id','context','title','abstract','url','last_update']]

SE_df['context'] = SE_df['context'].apply(clean)
SE_df['title'] = SE_df['title'].apply(clean)
SE_df['abstract'] = SE_df['abstract'].apply(clean)

SE_df.head(5)


Unnamed: 0,id,context,title,abstract,url,last_update
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,2021-06-25 18:31:00
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-27 18:19:00
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:30:00


### Paragraph titles and contents

* From the dat_article_paragraph table with abstract=0 and matching article_id.
* Apply data cleansing.

In [4]:
SQLCommand = """SELECT article_id, title, content 
                FROM ESTAT.V1.dat_article_paragraph
                WHERE abstract=0 AND article_id IN (SELECT id FROM ESTAT.V1.dat_article) """

add_content = pd.read_sql(SQLCommand,c)
add_content.sort_values(by=['article_id'],inplace=True)
add_content['title'] = add_content['title'].apply(clean)
add_content['content'] = add_content['content'].apply(clean)
add_content.head(5)

Unnamed: 0,article_id,title,content
9,7,Number of accidents,"In 2018, there were 3.1 million non-fatal acci..."
10,7,Incidence rates,An alternative way to analyse the information ...
11,7,Standardised incidence rates,"When comparing data between countries, inciden..."
12,7,Analysis by activity,"As noted above, one of the main reasons why th..."
13,7,Analysis by type of injury,Figure 6 presents an analysis of data accordin...


### Aggregate the above paragraph titles and contents  

* Create a column _raw content_ which gathers all paragraph titles and contents in one text per article.

In [5]:
add_content_grouped = add_content.groupby(['article_id'])[['title','content']].aggregate(lambda x: list(x))
add_content_grouped.reset_index(drop=False, inplace=True)
for i in range(len(add_content_grouped)):
    add_content_grouped.loc[i,'raw content'] = ''
    for (a,b) in zip(add_content_grouped.loc[i,'title'],add_content_grouped.loc[i,'content']):
        add_content_grouped.loc[i,'raw content'] += ' '+a + ' ' + b
add_content_grouped = add_content_grouped[['article_id','raw content']]    

add_content_grouped.head(5)

Unnamed: 0,article_id,raw content
0,7,"Number of accidents In 2018, there were 3.1 m..."
1,13,Household consumption Consumption expenditure...
2,16,Suicides on railways Suicides occurring on th...
3,17,Geographical location plays a key role in the...
4,18,Number of passengers transported by rail incr...


### Merge the raw content of the SE articles with the main file


In [6]:
SE_df = pd.merge(SE_df,add_content_grouped,left_on='id',right_on='article_id',how='inner')
SE_df.drop(['article_id'],axis=1,inplace=True)

SE_df.head(5)

Unnamed: 0,id,context,title,abstract,url,last_update,raw content
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 m..."
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Household consumption Consumption expenditure...
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,2021-06-25 18:31:00,Suicides on railways Suicides occurring on th...
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-27 18:19:00,Geographical location plays a key role in the...
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:30:00,Number of passengers transported by rail incr...


### Related links

* From the dat_article_shared_link table with article_division=2 ("Other articles", see mod_article_division table).  
* link_id points to id in dat_link_info (where we select resource_information_id=1).
* Apply data cleansing (with an additional step to replace question marks from the related titles).


In [7]:
SQLCommand = """SELECT T1.article_id, T1.link_id, T2.title, T2.url 
                FROM dat_article_shared_link as T1 
                INNER JOIN ESTAT.V1.dat_link_info as T2  
                  ON T1.link_id=T2.id  
                WHERE T1.article_division_id=2 AND T2.resource_information_id=1
                ORDER BY T1.article_id, T1.link_id """

add_related_links = pd.read_sql(SQLCommand,c)
add_related_links['title'] = add_related_links['title'] .apply(clean)
add_related_links['title'] = add_related_links['title'] .apply(lambda x: re.sub(r'\?','-',x))
add_related_links.head(5)

Unnamed: 0,article_id,link_id,title,url
0,7,229,Health in the European Union a facts and figures,https://ec.europa.eu/eurostat/statistics-expla...
1,7,1157,Health statistics introduced,https://ec.europa.eu/eurostat/statistics-expla...
2,7,2914,Accidents and injuries statistics,https://ec.europa.eu/eurostat/statistics-expla...
3,7,2946,Accidents at work - statistics by economic act...,https://ec.europa.eu/eurostat/statistics-expla...
4,7,2947,Accidents at work - statistics on causes and c...,https://ec.europa.eu/eurostat/statistics-expla...


### Aggregate above by article id

* Aggregate related titles and URLs in one string.

In [8]:
add_related_grouped = pd.DataFrame(add_related_links.groupby(['article_id'])[['title','url']].aggregate(lambda x: list(x)))
add_related_grouped.reset_index(drop=False, inplace=True)
add_related_grouped.rename(columns={'title':'related_titles','url':'related_urls'},inplace=True)
add_related_grouped.head(5)



Unnamed: 0,article_id,related_titles,related_urls
0,7,[Health in the European Union a facts and figu...,[https://ec.europa.eu/eurostat/statistics-expl...
1,13,"[Sector accounts, European system of national ...",[https://ec.europa.eu/eurostat/statistics-expl...
2,16,"[Railway freight transport statistics, Railway...",[https://ec.europa.eu/eurostat/statistics-expl...
3,17,"[Transport statistics at regional level, All a...",[https://ec.europa.eu/eurostat/statistics-expl...
4,18,"[Railway freight transport statistics, Freight...",[https://ec.europa.eu/eurostat/statistics-expl...


### Merge above with the main file


In [9]:
SE_df = pd.merge(SE_df,add_related_grouped,left_on='id',right_on='article_id',how='inner')
SE_df.drop(['article_id'],axis=1,inplace=True)


SE_df.head(5)

Unnamed: 0,id,context,title,abstract,url,last_update,raw content,related_titles,related_urls
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 m...",[Health in the European Union a facts and figu...,[https://ec.europa.eu/eurostat/statistics-expl...
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Household consumption Consumption expenditure...,"[Sector accounts, European system of national ...",[https://ec.europa.eu/eurostat/statistics-expl...
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,2021-06-25 18:31:00,Suicides on railways Suicides occurring on th...,"[Railway freight transport statistics, Railway...",[https://ec.europa.eu/eurostat/statistics-expl...
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-27 18:19:00,Geographical location plays a key role in the...,"[Transport statistics at regional level, All a...",[https://ec.europa.eu/eurostat/statistics-expl...
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:30:00,Number of passengers transported by rail incr...,"[Railway freight transport statistics, Freight...",[https://ec.europa.eu/eurostat/statistics-expl...


## B. Read categories from the database
***

* Apply data cleansing.

In [10]:
import ast

SQLCommand = """SELECT article_id, categories 
                FROM ESTAT.V1.SE_articles_categories """

categories = pd.read_sql(SQLCommand,c)
categories['categories']=categories['categories'].apply(ast.literal_eval)
categories

Unnamed: 0,article_id,categories
0,7,"[Accidents at work, Health, Health and safety,..."
1,13,"[National accounts (incl. GDP), Statistical ar..."
2,16,"[Rail, Statistical article, Transport, Transpo..."
3,17,"[Freight, Rail, Statistical article, Transport]"
4,18,"[Passengers, Rail, Statistical article, Transp..."
...,...,...
600,9472,"[International trade, Trade in goods, Trade in..."
601,9477,"[Trade in goods, Statistical article]"
602,9479,"[Trade in goods, Statistical article, Internat..."
603,9492,"[Household composition and family situation, L..."


### Merge with the main file


In [11]:
SE_df = pd.merge(SE_df,categories,left_on='id',right_on='article_id',how='inner')
SE_df.drop(['article_id'],axis=1,inplace=True)

SE_df

Unnamed: 0,id,context,title,abstract,url,last_update,raw content,related_titles,related_urls,categories
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 m...",[Health in the European Union a facts and figu...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Accidents at work, Health, Health and safety,..."
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Household consumption Consumption expenditure...,"[Sector accounts, European system of national ...",[https://ec.europa.eu/eurostat/statistics-expl...,"[National accounts (incl. GDP), Statistical ar..."
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,2021-06-25 18:31:00,Suicides on railways Suicides occurring on th...,"[Railway freight transport statistics, Railway...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Rail, Statistical article, Transport, Transpo..."
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-27 18:19:00,Geographical location plays a key role in the...,"[Transport statistics at regional level, All a...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Freight, Rail, Statistical article, Transport]"
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:30:00,Number of passengers transported by rail incr...,"[Railway freight transport statistics, Freight...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Passengers, Rail, Statistical article, Transp..."
...,...,...,...,...,...,...,...,...,...,...
587,9472,Trade is an important indicator of Europeas pr...,EU trade in COVID-19 related products,To help prevent the spread of the COVID-19 pan...,https://ec.europa.eu/eurostat/statistics-expla...,2021-03-31 13:04:00,Sharp increase in COVID-19 related imports in...,"[International trade in goods, Extra-EU trade ...",[https://ec.europa.eu/eurostat/statistics-expl...,"[International trade, Trade in goods, Trade in..."
588,9477,Trade is an important indicator of Europeas pr...,EU international trade in goods - latest devel...,This article provides a picture of the interna...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-02 16:55:00,Extra-EU trade by product: Strongest fluctuat...,"[International trade in goods, Extra-EU trade ...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Trade in goods, Statistical article]"
589,9479,Trade is an important indicator of Europeas pr...,EU and main world traders,International trade a especially the size and ...,https://ec.europa.eu/eurostat/statistics-expla...,2020-10-07 15:19:00,"Main world traders: EU, USA and China In 2019...","[International trade in goods, Extra-EU trade ...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Trade in goods, Statistical article, Internat..."
590,9492,"In addition to the Labour Force Survey (LFS), ...",Age of young people leaving their parental hou...,Leaving the parental home is considered as a m...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-30 14:54:00,Geographical differences Map 1 indicates that...,"[Labour market, EU labour force survey, Househ...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Household composition and family situation, L..."


### Exract last update year

* And check missing values.

In [12]:
SE_df['new_date'] = [d.date() for d in SE_df['last_update']]  
SE_df['year'] = SE_df['last_update'].dt.year
SE_df['year'] =SE_df["year"].astype(str)

SE_df.replace('', np.nan, inplace=True)

SE_df['year'].fillna(value="Not found", inplace=True)

print(SE_df.isnull().sum(),'\n')

SE_df.reset_index(drop=True,inplace=True)
SE_df.head(5)

id                 0
context           59
title              0
abstract           9
url                0
last_update        0
raw content        0
related_titles     0
related_urls       0
categories         0
new_date           0
year               0
dtype: int64 



Unnamed: 0,id,context,title,abstract,url,last_update,raw content,related_titles,related_urls,categories,new_date,year
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 m...",[Health in the European Union a facts and figu...,[https://ec.europa.eu/eurostat/statistics-expl...,"[Accidents at work, Health, Health and safety,...",2020-11-26,2020
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Household consumption Consumption expenditure...,"[Sector accounts, European system of national ...",[https://ec.europa.eu/eurostat/statistics-expl...,"[National accounts (incl. GDP), Statistical ar...",2021-06-28,2021
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,2021-06-25 18:31:00,Suicides on railways Suicides occurring on th...,"[Railway freight transport statistics, Railway...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Rail, Statistical article, Transport, Transpo...",2021-06-25,2021
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-27 18:19:00,Geographical location plays a key role in the...,"[Transport statistics at regional level, All a...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Freight, Rail, Statistical article, Transport]",2020-11-27,2020
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:30:00,Number of passengers transported by rail incr...,"[Railway freight transport statistics, Freight...",[https://ec.europa.eu/eurostat/statistics-expl...,"[Passengers, Rail, Statistical article, Transp...",2021-07-07,2021


## C. Add themes / sub-themes information in the articles
***

* We create dictionary _themes_ manually.
* Dictionary _dict_categories_ is used for debugging. The keys are the categories found in the SE articles and the values are the corresponding article ids.
* Each article will have a list of themes and corresponding sub-themes, potentially empty. If an article has a category which is a key of _themes_ the theme is added to the first list. If it has a category which is in one of the values of _themes_ i.e. it is a sub-theme, the corresponding key (theme) is added to the first list and the sub-theme is added to the second list.
* There are relatively few articles without such information, see below.


In [13]:
import ast

themes = {'General and regional statistics/EU policies':
          ['Non-EU countries','Regions and cities','Sustainable development goals',
          'Policy indicators'],
          'Economy and finance': 
          ['Balance of payments','Comparative price levels (PPPs)','Consumer prices',
           'Exchange rates and interest rates','Government finance','National accounts (incl. GDP)'],
          'Population and social conditions':
          ['Asylum and migration','Crime','Culture','Education and training','Health',
           'Labour market','Living conditions','Population','Social protection','Sport','Youth'],
          'Industry and services': ['Short-term business statistics','Structural business statistics',
                                    'Business registers','Globalisation in businesses','Production statistics',
                                    'Tourism'],
          'Agriculture, forestry and fisheries':['Agriculture','Fisheries','Forestry'],
          'International trade':['Goods','Services'],
          'Transport':[],
          'Environment and energy':['Energy','Environment'],
          'Science, technology and digital society':['Digital economy and society','Science and technology']}

dict_categories=dict()

for i in range(len(SE_df)):
    
    cats=SE_df.loc[i,'categories']
    cats = [cat.strip() for cat in cats]
        
    for cat in cats:
        if cat in dict_categories.keys():
            dict_categories[cat].append(SE_df.loc[i,'id'])
        else:
            dict_categories[cat] = [SE_df.loc[i,'id']]



SE_df['themes'] = pd.Series([set() for i in range(len(SE_df))])
SE_df['sub_themes'] = pd.Series([set() for i in range(len(SE_df))])
for i in range(len(SE_df)):
    
    cats=SE_df.loc[i,'categories']
    cats = [cat.strip() for cat in cats]

    for cat in cats:
        if cat in themes.keys():
            SE_df.loc[i,'themes'].add(cat)
        else:
            for theme in themes.keys():
                if cat in themes[theme]:
                    SE_df.loc[i,'themes'].add(theme)
                    SE_df.loc[i,'sub_themes'].add(cat)
    
SE_df['themes'] = SE_df['themes'].apply(lambda x: ';'.join(x))    
SE_df['sub_themes'] = SE_df['sub_themes'].apply(lambda x: ';'.join(x))    

SE_df['categories']= SE_df['categories'].apply(lambda x: ';'.join(x))  ## de-comment to produce the input file for R Shiny and Power BI 
## i.e. categories not in list but separated by semicolon    

print(SE_df.isnull().sum(),'\n')

print('No info in themes: ',sum(SE_df['themes']==''))
print('No info in sub_themes: ',sum(SE_df['sub_themes']==''))


SE_df.head(5)

id                 0
context           59
title              0
abstract           9
url                0
last_update        0
raw content        0
related_titles     0
related_urls       0
categories         0
new_date           0
year               0
themes             0
sub_themes         0
dtype: int64 

No info in themes:  48
No info in sub_themes:  83


Unnamed: 0,id,context,title,abstract,url,last_update,raw content,related_titles,related_urls,categories,new_date,year,themes,sub_themes
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 m...",[Health in the European Union a facts and figu...,[https://ec.europa.eu/eurostat/statistics-expl...,Accidents at work;Health;Health and safety;Lab...,2020-11-26,2020,Population and social conditions,Labour market;Health
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Household consumption Consumption expenditure...,"[Sector accounts, European system of national ...",[https://ec.europa.eu/eurostat/statistics-expl...,National accounts (incl. GDP);Statistical article,2021-06-28,2021,Economy and finance,National accounts (incl. GDP)
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,2021-06-25 18:31:00,Suicides on railways Suicides occurring on th...,"[Railway freight transport statistics, Railway...",[https://ec.europa.eu/eurostat/statistics-expl...,Rail;Statistical article;Transport;Transport s...,2021-06-25,2021,Transport,
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-27 18:19:00,Geographical location plays a key role in the...,"[Transport statistics at regional level, All a...",[https://ec.europa.eu/eurostat/statistics-expl...,Freight;Rail;Statistical article;Transport,2020-11-27,2020,Transport,
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:30:00,Number of passengers transported by rail incr...,"[Railway freight transport statistics, Freight...",[https://ec.europa.eu/eurostat/statistics-expl...,Passengers;Rail;Statistical article;Transport,2021-07-07,2021,Transport,


## D. Tokenize and stem the articles titles, contexts, abstracts and contents
***

* Also remove stop-words.
* Create columns _title tokens_, _context tokens_, _abstract tokens_, _raw content tokens_.

In [14]:
#Stemming.

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.porter import PorterStemmer

p = PorterStemmer()

def text_to_words(text):
    words = str(gensim.utils.simple_preprocess(text, deacc=True))
    words = remove_stopwords(words) 
    words = gensim.utils.tokenize(words)
        
    words = [p.stem(token) for token in words]  
    return ' '.join(words)        

for i in range(len(SE_df)):
    SE_df.loc[i,'title tokens']=text_to_words(SE_df.loc[i,'title'])
    if not pd.isnull(SE_df.loc[i,'context']):
        SE_df.loc[i,'context tokens']=text_to_words(SE_df.loc[i,'context'])
    else:
        SE_df.loc[i,'context tokens']=''
    if not pd.isnull(SE_df.loc[i,'abstract']):        
        SE_df.loc[i,'abstract tokens']=text_to_words(SE_df.loc[i,'abstract'])
    else:
        SE_df.loc[i,'abstract tokens']=''
    SE_df.loc[i,'raw content tokens']=text_to_words(SE_df.loc[i,'raw content'])

SE_df.rename(columns={'id':'article_id'},inplace=True)
SE_df.head(5)

Unnamed: 0,article_id,context,title,abstract,url,last_update,raw content,related_titles,related_urls,categories,new_date,year,themes,sub_themes,title tokens,context tokens,abstract tokens,raw content tokens
0,7,"A safe, healthy working environment is a cruci...",Accidents at work statistics,This article presents a set of main statistica...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-26 16:06:00,"Number of accidents In 2018, there were 3.1 m...",[Health in the European Union a facts and figu...,[https://ec.europa.eu/eurostat/statistics-expl...,Accidents at work;Health;Health and safety;Lab...,2020-11-26,2020,Population and social conditions,Labour market;Health,accid at work statist,safe healthi work environ is crucial factor in...,thi articl present set of main statist find in...,number of accid in there were million non fata...
1,13,"European institutions, governments, central ba...",National accounts and GDP,National accounts are the source for a multitu...,https://ec.europa.eu/eurostat/statistics-expla...,2021-06-28 16:29:00,Household consumption Consumption expenditure...,"[Sector accounts, European system of national ...",[https://ec.europa.eu/eurostat/statistics-expl...,National accounts (incl. GDP);Statistical article,2021-06-28,2021,Economy and finance,National accounts (incl. GDP),nation account and gdp,european institut govern central bank as well ...,nation account ar the sourc for multitud of we...,household consumpt consumpt expenditur of hous...
2,16,National rail networks have different technica...,Railway safety statistics in the EU,"In 2019, 1516 significant railway accidents we...",https://ec.europa.eu/eurostat/statistics-expla...,2021-06-25 18:31:00,Suicides on railways Suicides occurring on th...,"[Railway freight transport statistics, Railway...",[https://ec.europa.eu/eurostat/statistics-expl...,Rail;Statistical article;Transport;Transport s...,2021-06-25,2021,Transport,,railwai safeti statist in the eu,nation rail network have differ technic specif...,in signific railwai accid were report in the e...,suicid on railwai suicid occur on the railwai ...
3,17,The content of this statistical article is bas...,Railway freight transport statistics,This article focuses on recent rail freight tr...,https://ec.europa.eu/eurostat/statistics-expla...,2020-11-27 18:19:00,Geographical location plays a key role in the...,"[Transport statistics at regional level, All a...",[https://ec.europa.eu/eurostat/statistics-expl...,Freight;Rail;Statistical article;Transport,2020-11-27,2020,Transport,,railwai freight transport statist,the content of thi statist articl is base on d...,thi articl focus on recent rail freight transp...,geograph locat plai kei role in the share of i...
4,18,The content of this statistical article is bas...,Railway passenger transport statistics - quart...,This article takes a look at recent annual and...,https://ec.europa.eu/eurostat/statistics-expla...,2021-07-07 10:30:00,Number of passengers transported by rail incr...,"[Railway freight transport statistics, Freight...",[https://ec.europa.eu/eurostat/statistics-expl...,Passengers;Rail;Statistical article;Transport,2021-07-07,2021,Transport,,railwai passeng transport statist quarterli an...,the content of thi statist articl is base on d...,thi articl take look at recent annual and quar...,number of passeng transport by rail increas in...


## E. Produce the output file with the Statistics Explained articles data
***


In [15]:
import datetime

def file_name(pre,ext):
    current_time = datetime.datetime.now() 
    return pre + '_'+ str(current_time.month)+ '_' + str(current_time.day) + \
                 '_' + str(current_time.hour)+ '_' + str(current_time.minute)  +'.'+ext
    

outfile = file_name('SE_df','xlsx')
print('Writing to file: ',outfile)
SE_df.to_excel(outfile)


Writing to file:  SE_df_2_16_22_34.xlsx


## F. Also read the branches & datasets information from Eurostat's database and produce the input file required
***

This is the full description of the database tree from the parsing of [table_of_contents.xml](https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml)

In [16]:

SQLCommand = """SELECT id, number, codes, names, file_descr, file_code, level, link 
                FROM ESTAT.V1.dat_all_datasets """

crumbsDF = pd.read_sql(SQLCommand,c)

crumbsDF['codes'] = crumbsDF['codes'].apply(lambda x: ';'.join(ast.literal_eval(x)))
crumbsDF['names'] = crumbsDF['names'].apply(lambda x: ';'.join(ast.literal_eval(x)))
crumbsDF.rename(columns={'number':'Numbers','codes':'Codes','names':'Names','link':'Files','level':'Level'},inplace=True)
crumbsDF = crumbsDF[['Numbers','Codes','Names','Files','Level']]
crumbsDF

outfile = file_name('Crumbs','xlsx')
print('Writing to file: ',outfile)
crumbsDF.to_excel(outfile,index=False)
crumbsDF

Writing to file:  Crumbs_2_16_22_34.xlsx


Unnamed: 0,Numbers,Codes,Names,Files,Level
0,1,data,Database by themes,,0
1,1.1,data;general,Database by themes;General and regional statis...,,1
2,1.1.1,data;general;euroind,Database by themes;General and regional statis...,,2
3,1.1.1.1,data;general;euroind;ei_bcs,Database by themes;General and regional statis...,,3
4,1.1.1.1.1,data;general;euroind;ei_bcs;ei_bcs_cs,Database by themes;General and regional statis...,,4
...,...,...,...,...,...
10619,4.8.6.4.1,cc;sks;sks_dev;sks_devict;isoc_ske_ittn2,Cross cutting topics;Skills-related statistics...,https://ec.europa.eu/eurostat/estat-navtree-po...,4
10620,4.8.6.5,cc;sks;sks_dev;sks_devcvt,Cross cutting topics;Skills-related statistics...,,3
10621,4.8.6.5.1,cc;sks;sks_dev;sks_devcvt;trng_cvt_01s,Cross cutting topics;Skills-related statistics...,https://ec.europa.eu/eurostat/estat-navtree-po...,4
10622,4.8.6.5.2,cc;sks;sks_dev;sks_devcvt;trng_cvt_12s,Cross cutting topics;Skills-related statistics...,https://ec.europa.eu/eurostat/estat-navtree-po...,4


## G. Finally, load two topic modelling information tables from the database and produce the two input files required
***


* Table tm_articles_to_topics which contains the probabilities of each SE article belonging to a topic

In [17]:
SQLCommand = """SELECT id,article_id, topic_id, probability 
                FROM ESTAT.V1.tm_articles_to_topics """
tm_articles_topics_df = pd.read_sql(SQLCommand,c)
 
tm_articles_topics_df.to_excel('tm_articles_to_topics.xlsx') 
tm_articles_topics_df


Unnamed: 0,id,article_id,topic_id,probability
0,0,7,0,0.0
1,1,7,1,0.0
2,2,7,2,0.0
3,3,7,3,0.0
4,4,7,4,0.0
...,...,...,...,...
17295,17295,10539,15,0.0
17296,17296,10539,16,0.0
17297,17297,10539,17,0.0
17298,17298,10539,18,0.0


* Table tm_topics which contains the keywords and the description of each topic.

In [18]:
SQLCommand = """SELECT topic_id,topic_keywords, topic_name 
                FROM ESTAT.V1.tm_topics """
tm_topics_df = pd.read_sql(SQLCommand,c)
 
tm_topics_df.to_excel('tm_topics.xlsx') 
tm_topics_df



Unnamed: 0,topic_id,topic_keywords,topic_name
0,0,"popul, peopl, older, birth, project, chang, wo...",Population projections
1,1,"product, price, million, quarter, agricultur, ...",Products and prices
2,2,"energi, consumpt, product, electr, emiss, rene...","Energy production, consumption and the environ..."
3,3,"billion, export, trade, china, partner, millio...",EU exports to the world
4,4,"household, cultur, internet, adult, onlin, poi...",Internet in households
5,5,"region, sector, economi, employ, person, finan...",Regions and economic sectors
6,6,"person, group, emploi, health, children, women...",Employment and health
7,7,"activ, accid, servic, fatal, sector, emploi, p...",Accidents at work
8,8,"peopl, activ, popul, particip, adult, famili, ...",Participation in activities
9,9,"citizen, foreign, resid, nation, permit, first...",Citizenship
