In [1]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup

def get_tsa(country):
    country = country.title()
    url = f"https://travel.state.gov/content/travel/en/international-travel/International-Travel-Country-Information-Pages/{country}.html"

    url_html = requests.get(url).text

    soup = BeautifulSoup(url_html)

    return soup.find_all(class_='tsg-rwd-eab-title-frame')

In [2]:
print(get_tsa('canada'))

[<h3 class="tsg-rwd-eab-title-frame">Worldwide Caution</h3>, <h3 class="tsg-rwd-eab-title-frame">Information for U.S. Citizens in the Middle East</h3>, <h3 class="tsg-rwd-eab-title-frame">Canada - Level 1: Exercise Normal Precautions</h3>]


In [46]:
def get_eurostat_data(article_title, time=2020):
    article_title = article_title.upper()
    url = f'https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/{article_title}?format=JSON&lang=EN&time={time}'

    output = requests.get(url).text

    data = json.loads(output)

    return data

In [47]:
raw_data = get_eurostat_data('cult_emp_sex', time = 2019)
raw_data
happiness_idx

{'version': '2.0',
 'class': 'dataset',
 'label': 'Persons being happy in the last 4 weeks by sex, age, educational attainment and frequency',
 'source': 'ESTAT',
 'updated': '2024-04-22T23:00:00+0200',
 'value': {},
 'id': ['freq', 'unit', 'isced11', 'frequenc', 'sex', 'age', 'geo', 'time'],
 'size': [1, 1, 5, 6, 3, 7, 38, 0],
 'dimension': {'freq': {'label': 'Time frequency',
   'category': {'index': {'A': 0}, 'label': {'A': 'Annual'}}},
  'unit': {'label': 'Unit of measure',
   'category': {'index': {'PC': 0}, 'label': {'PC': 'Percentage'}}},
  'isced11': {'label': 'International Standard Classification of Education (ISCED 2011)',
   'category': {'index': {'TOTAL': 0,
     'ED0-2': 1,
     'ED3_4': 2,
     'ED5-8': 3,
     'ED5_6': 4},
    'label': {'TOTAL': 'All ISCED 2011 levels',
     'ED0-2': 'Less than primary, primary and lower secondary education (levels 0-2)',
     'ED3_4': 'Upper secondary and post-secondary non-tertiary education (levels 3 and 4)',
     'ED5-8': 'Tertiary 

In [48]:
happiness_idx = get_eurostat_data('ilc_pw08')
happy_series = pd.Series(happiness_idx)

# Convert nested dictionaries to DataFrame
df_dimension = pd.json_normalize(happy_series['dimension'])

# Convert the Series to DataFrame and transpose
df = happy_series.to_frame().T

# Combine DataFrames
df = pd.concat([df.drop('dimension', axis=1), df_dimension.add_prefix('dimension.')], axis=1)

df

Unnamed: 0,version,class,label,source,updated,value,id,size,extension,dimension.freq.label,...,dimension.geo.category.label.NO,dimension.geo.category.label.CH,dimension.geo.category.label.UK,dimension.geo.category.label.ME,dimension.geo.category.label.MK,dimension.geo.category.label.AL,dimension.geo.category.label.RS,dimension.geo.category.label.TR,dimension.geo.category.label.XK,dimension.time.label
0,2.0,dataset,Persons being happy in the last 4 weeks by sex...,ESTAT,2024-04-22T23:00:00+0200,{},"[freq, unit, isced11, frequenc, sex, age, geo,...","[1, 1, 5, 6, 3, 7, 38, 0]","{'lang': 'EN', 'id': 'ILC_PW08', 'agencyId': '...",Time frequency,...,Norway,Switzerland,United Kingdom,Montenegro,North Macedonia,Albania,Serbia,Türkiye,Kosovo*,Time


In [49]:
url = 'https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/ilc_pw08?format=JSON&time=2022&unit=PC&isced11=TOTAL&isced11=ED0-2&isced11=ED3_4&isced11=ED5-8&isced11=ED5_6&frequenc=ALW&frequenc=MOST&frequenc=SMT&frequenc=RAR&frequenc=NVR&frequenc=UNK&sex=T&sex=M&sex=F&age=Y16-24&age=Y_GE16&age=Y25-34&age=Y35-49&age=Y50-64&age=Y65-74&age=Y_GE75&lang=en'

out = requests.get(url).text
data = json.loads(out)
data

{'version': '2.0',
 'class': 'dataset',
 'label': 'Persons being happy in the last 4 weeks by sex, age, educational attainment and frequency',
 'source': 'ESTAT',
 'updated': '2024-04-22T23:00:00+0200',
 'value': {'5340': 9.5,
  '5321': 17.7,
  '5322': 4.6,
  '5350': 9.8,
  '5333': 24.3,
  '5323': 16.6,
  '5324': 7.9,
  '5326': 8.9,
  '5328': 25.4,
  '5329': 32.1,
  '5320': 19.1,
  '5346': 0.0,
  '5330': 19.5,
  '5331': 24.3,
  '5337': 22.3,
  '5332': 25.8,
  '5335': 13.9,
  '5336': 12.2,
  '5334': 23.4,
  '5352': 23.1,
  '5338': 1.2,
  '5339': 2.8,
  '5349': 8.1,
  '5341': 22.7,
  '5342': 39.4,
  '5343': 13.2,
  '5355': 27.5,
  '5347': 7.4,
  '5344': 1.7,
  '5345': 24.7,
  '5356': 15.7,
  '5416': 24.2,
  '5397': 9.2,
  '5398': 5.1,
  '5426': 26.5,
  '5409': 3.8,
  '5399': 3.9,
  '5400': 0.0,
  '5402': 4.5,
  '5404': 10.8,
  '5405': 28.3,
  '5406': 25.2,
  '5413': 19.1,
  '5408': 18.5,
  '5411': 2.9,
  '5410': 8.2,
  '5428': 6.6,
  '5414': 12.2,
  '5415': 1.5,
  '5425': 10.3,
  '5417':

In [50]:
df = pd.json_normalize(data)
df

Unnamed: 0,version,class,label,source,updated,id,size,value.5340,value.5321,value.5322,...,extension.status.label.u,extension.status.label.n,extension.positions-with-no-data.freq,extension.positions-with-no-data.unit,extension.positions-with-no-data.isced11,extension.positions-with-no-data.frequenc,extension.positions-with-no-data.sex,extension.positions-with-no-data.age,extension.positions-with-no-data.geo,extension.positions-with-no-data.time
0,2.0,dataset,Persons being happy in the last 4 weeks by sex...,ESTAT,2024-04-22T23:00:00+0200,"[freq, unit, isced11, frequenc, sex, age, geo,...","[1, 1, 5, 6, 3, 7, 38, 1]",9.5,17.7,4.6,...,low reliability,not significant,[],[],[4],[],[],[],"[28, 31, 33, 34, 37]",[]


In [51]:
print(df['id'])

0    [freq, unit, isced11, frequenc, sex, age, geo,...
Name: id, dtype: object


In [52]:
from pyjstat import pyjstat

collection = pyjstat.Dataset.read(url)

# write to dataframe
df = collection.write('dataframe')

# read from dataframe
dataset_from_df = pyjstat.Dataset.read(df)

df

Unnamed: 0,Time frequency,Unit of measure,International Standard Classification of Education (ISCED 2011),Frequency,Sex,Age class,Geopolitical entity (reporting),Time,value
0,Annual,Percentage,All ISCED 2011 levels,Always,Total,From 16 to 24 years,European Union - 27 countries (from 2020),2022,19.1
1,Annual,Percentage,All ISCED 2011 levels,Always,Total,From 16 to 24 years,Belgium,2022,19.6
2,Annual,Percentage,All ISCED 2011 levels,Always,Total,From 16 to 24 years,Bulgaria,2022,14.9
3,Annual,Percentage,All ISCED 2011 levels,Always,Total,From 16 to 24 years,Czechia,2022,8.6
4,Annual,Percentage,All ISCED 2011 levels,Always,Total,From 16 to 24 years,Denmark,2022,8.5
...,...,...,...,...,...,...,...,...,...
23935,Annual,Percentage,Short-cycle tertiary education and Bachelor's ...,Unknown,Females,75 years or over,North Macedonia,2022,
23936,Annual,Percentage,Short-cycle tertiary education and Bachelor's ...,Unknown,Females,75 years or over,Albania,2022,
23937,Annual,Percentage,Short-cycle tertiary education and Bachelor's ...,Unknown,Females,75 years or over,Serbia,2022,
23938,Annual,Percentage,Short-cycle tertiary education and Bachelor's ...,Unknown,Females,75 years or over,Türkiye,2022,


In [53]:
def get_eurostat_data(article_title, *args):
    '''
    To Do:
        - Write *args
        - Write _url_builder() helper func.
    '''
    
    article_title = article_title.upper()
    url = f'https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/{article_title}?format=JSON&lang=EN'

    output = pyjstat.Dataset.read(url)
    df = output.write('dataframe')

    return df

In [54]:
crime = get_eurostat_data('crim_off_cat')
crime

Unnamed: 0,Time frequency,International classification of crime for statistical purposes (ICCS),Unit of measure,Geopolitical entity (reporting),Time,value
0,Annual,Intentional homicide,Number,Belgium,2008,204.00
1,Annual,Intentional homicide,Number,Belgium,2009,189.00
2,Annual,Intentional homicide,Number,Belgium,2010,189.00
3,Annual,Intentional homicide,Number,Belgium,2011,214.00
4,Annual,Intentional homicide,Number,Belgium,2012,206.00
...,...,...,...,...,...,...
25825,Annual,Participation in an organized criminal group,Per hundred thousand inhabitants,Kosovo*,2018,0.11
25826,Annual,Participation in an organized criminal group,Per hundred thousand inhabitants,Kosovo*,2019,0.17
25827,Annual,Participation in an organized criminal group,Per hundred thousand inhabitants,Kosovo*,2020,0.28
25828,Annual,Participation in an organized criminal group,Per hundred thousand inhabitants,Kosovo*,2021,


In [55]:
happiness = get_eurostat_data('ilc_pw08')
happiness

Unnamed: 0,Time frequency,Unit of measure,International Standard Classification of Education (ISCED 2011),Frequency,Sex,Age class,Geopolitical entity (reporting),Time,value
0,Annual,Percentage,All ISCED 2011 levels,Always,Total,From 16 to 24 years,European Union - 27 countries (from 2020),2013,16.2
1,Annual,Percentage,All ISCED 2011 levels,Always,Total,From 16 to 24 years,European Union - 27 countries (from 2020),2018,19.5
2,Annual,Percentage,All ISCED 2011 levels,Always,Total,From 16 to 24 years,European Union - 27 countries (from 2020),2022,19.1
3,Annual,Percentage,All ISCED 2011 levels,Always,Total,From 16 to 24 years,Belgium,2013,16.7
4,Annual,Percentage,All ISCED 2011 levels,Always,Total,From 16 to 24 years,Belgium,2018,18.7
...,...,...,...,...,...,...,...,...,...
71815,Annual,Percentage,Short-cycle tertiary education and Bachelor's ...,Unknown,Females,75 years or over,Türkiye,2018,
71816,Annual,Percentage,Short-cycle tertiary education and Bachelor's ...,Unknown,Females,75 years or over,Türkiye,2022,
71817,Annual,Percentage,Short-cycle tertiary education and Bachelor's ...,Unknown,Females,75 years or over,Kosovo*,2013,
71818,Annual,Percentage,Short-cycle tertiary education and Bachelor's ...,Unknown,Females,75 years or over,Kosovo*,2018,


In [56]:
train_system = get_eurostat_data('ttr00015')
train_system

Unnamed: 0,Time frequency,Unit of measure,Transport coverage,Geopolitical entity (reporting),Time,value
0,Annual,Millions of passenger-kilometres,Total transport,European Union - 27 countries (from 2020),2011,
1,Annual,Millions of passenger-kilometres,Total transport,European Union - 27 countries (from 2020),2012,
2,Annual,Millions of passenger-kilometres,Total transport,European Union - 27 countries (from 2020),2013,
3,Annual,Millions of passenger-kilometres,Total transport,European Union - 27 countries (from 2020),2014,
4,Annual,Millions of passenger-kilometres,Total transport,European Union - 27 countries (from 2020),2015,375713.0
...,...,...,...,...,...,...
475,Annual,Millions of passenger-kilometres,Total transport,Türkiye,2018,4374.0
476,Annual,Millions of passenger-kilometres,Total transport,Türkiye,2019,4912.0
477,Annual,Millions of passenger-kilometres,Total transport,Türkiye,2020,1520.0
478,Annual,Millions of passenger-kilometres,Total transport,Türkiye,2021,2251.0


In [57]:
leisure = get_eurostat_data('ilc_scp01')
leisure

Unnamed: 0,Time frequency,Frequency,International Standard Classification of Education (ISCED 2011),Age class,Sex,Unit of measure,Geopolitical entity (reporting),Time,value
0,Annual,Not in the last 12 months,All ISCED 2011 levels,From 16 to 24 years,Total,Percentage,European Union - 27 countries (from 2020),2006,
1,Annual,Not in the last 12 months,All ISCED 2011 levels,From 16 to 24 years,Total,Percentage,European Union - 27 countries (from 2020),2015,12.4
2,Annual,Not in the last 12 months,All ISCED 2011 levels,From 16 to 24 years,Total,Percentage,European Union - 27 countries (from 2020),2022,
3,Annual,Not in the last 12 months,All ISCED 2011 levels,From 16 to 24 years,Total,Percentage,Belgium,2006,6.4
4,Annual,Not in the last 12 months,All ISCED 2011 levels,From 16 to 24 years,Total,Percentage,Belgium,2015,9.8
...,...,...,...,...,...,...,...,...,...
51835,Annual,At least once,Tertiary education (levels 5-8),75 years or over,Females,Percentage,Serbia,2015,30.6
51836,Annual,At least once,Tertiary education (levels 5-8),75 years or over,Females,Percentage,Serbia,2022,26.0
51837,Annual,At least once,Tertiary education (levels 5-8),75 years or over,Females,Percentage,Türkiye,2006,
51838,Annual,At least once,Tertiary education (levels 5-8),75 years or over,Females,Percentage,Türkiye,2015,


In [58]:
social_exclusion = get_eurostat_data('ilc_peps01n')
social_exclusion

Unnamed: 0,Time frequency,Unit of measure,Age class,Sex,Geopolitical entity (reporting),Time,value
0,Annual,Thousand persons,Total,Total,"European Union (EU6-1958, EU9-1973, EU10-1981,...",2014,
1,Annual,Thousand persons,Total,Total,"European Union (EU6-1958, EU9-1973, EU10-1981,...",2015,119632.0
2,Annual,Thousand persons,Total,Total,"European Union (EU6-1958, EU9-1973, EU10-1981,...",2016,117794.0
3,Annual,Thousand persons,Total,Total,"European Union (EU6-1958, EU9-1973, EU10-1981,...",2017,112399.0
4,Annual,Thousand persons,Total,Total,"European Union (EU6-1958, EU9-1973, EU10-1981,...",2018,109997.0
...,...,...,...,...,...,...,...
89755,Annual,Percentage,75 years or over,Females,Türkiye,2019,26.2
89756,Annual,Percentage,75 years or over,Females,Türkiye,2020,25.8
89757,Annual,Percentage,75 years or over,Females,Türkiye,2021,20.3
89758,Annual,Percentage,75 years or over,Females,Türkiye,2022,


In [59]:
pop_demo = get_eurostat_data('demo_pjangroup')

print(f"Count before dropping NaN values: {pop_demo.shape}")
print(f"Count after dropping NaN values: {pop_demo.dropna().shape}")

Count before dropping NaN values: (253440, 7)
Count after dropping NaN values: (162376, 7)
