# Calculate educational attainment growth of regions in Finland

Data source: Statistic Finland

Table: 12bq -- Population aged 15 or over by level of education, municipality, gender and age, 1970-2022

Link: https://pxdata.stat.fi/PxWeb/pxweb/en/StatFin/StatFin__vkour/statfin_vkour_pxt_12bq.px/

In [1]:
import pandas as pd
import numpy as np

In [9]:
# Create the base with region column DataFrame
base_df = pd.read_csv('../data/Indices/population_density_index.csv')
base_df = base_df[['Region code', 'Region name (en)', 'Region name (fi)']].copy()

base_df

Unnamed: 0,Region code,Region name (en),Region name (fi)
0,MK01,Uusimaa,Uusimaa
1,MK02,Southwest Finland,Varsinais-Suomi
2,MK04,Satakunta,Satakunta
3,MK05,Kanta-Häme,Kanta-Häme
4,MK06,Pirkanmaa,Pirkanmaa
5,MK07,Päijät-Häme,Päijät-Häme
6,MK08,Kymenlaakso,Kymenlaakso
7,MK09,South Karelia,Etelä-Karjala
8,MK10,South Savo,Etelä-Savo
9,MK11,North Savo,Pohjois-Savo


In [10]:
import requests
import json

# Define the URL and JSON query
url = "https://pxdata.stat.fi:443/PxWeb/api/v1/en/StatFin/vkour/statfin_vkour_pxt_12bq.px"
query = {
  "query": [
    {
      "code": "Vuosi",
      "selection": {
        "filter": "item",
        "values": [
          "2018",
          "2019",
          "2020",
          "2021",
          "2022"
        ]
      }
    },
    {
      "code": "Alue",
      "selection": {
        "filter": "agg:_Regions 2023.agg",
        "values": [
          "MK01",
          "MK02",
          "MK04",
          "MK05",
          "MK06",
          "MK07",
          "MK08",
          "MK09",
          "MK10",
          "MK11",
          "MK12",
          "MK13",
          "MK14",
          "MK15",
          "MK16",
          "MK17",
          "MK18",
          "MK19",
          "MK21"
        ]
      }
    },
    {
      "code": "Ikä",
      "selection": {
        "filter": "item",
        "values": [
          "25-29",
          "30-34",
          "35-39",
          "40-44",
          "45-49",
          "50-54",
          "55-59",
          "60-64",
          "65-69",
          "70-74",
          "75-79",
          "80-"
        ]
      }
    },
    {
      "code": "Sukupuoli",
      "selection": {
        "filter": "item",
        "values": [
          "SSS"
        ]
      }
    },
    {
      "code": "Koulutusaste",
      "selection": {
        "filter": "item",
        "values": [
          "SSS",
          "6",
          "7",
          "8"
        ]
      }
    }
  ],
  "response": {
    "format": "json-stat2"
  }
}

# Send the POST request
response = requests.post(url, json=query)

# Check the response status and content
if response.status_code == 200:
    json_data = response.json()
else:
    print("Request failed with status code:", response.status_code)

json_data

{'class': 'dataset',
 'label': 'Population aged 15 or over by level of education, municipality, gender and age by Year, Area, Age, Gender, Level of education and Information',
 'source': 'Statistics Finland, educational structure of population',
 'updated': '2023-10-03T05:00:00Z',
 'id': ['Vuosi', 'Alue', 'Ikä', 'Sukupuoli', 'Koulutusaste', 'Tiedot'],
 'size': [5, 19, 12, 1, 4, 1],
 'dimension': {'Vuosi': {'extension': {'show': 'value'},
   'label': 'Year',
   'category': {'index': {'2018': 0,
     '2019': 1,
     '2020': 2,
     '2021': 3,
     '2022': 4},
    'label': {'2018': '2018',
     '2019': '2019',
     '2020': '2020',
     '2021': '2021',
     '2022': '2022'}}},
  'Alue': {'extension': {'show': 'value'},
   'label': 'Area',
   'category': {'index': {'MK01': 0,
     'MK02': 1,
     'MK04': 2,
     'MK05': 3,
     'MK06': 4,
     'MK07': 5,
     'MK08': 6,
     'MK09': 7,
     'MK10': 8,
     'MK11': 9,
     'MK12': 10,
     'MK13': 11,
     'MK14': 12,
     'MK15': 13,
     'M

In [11]:
values = json_data['value']
shape = json_data['size']

json_year = json_data['dimension']['Vuosi']['category']['label'].values()
json_region = json_data['dimension']['Alue']['category']['label'].values()
json_age = json_data['dimension']['Ik\u00e4']['category']['label'].values()
json_education = json_data['dimension']['Koulutusaste']['category']['label'].values()
print(json_year)
print(json_region)
print(json_age)
print(json_education)
print(len(json_year))
print(len(json_region))
print(len(json_age))
print(len(json_education))
print(shape)

# Load values into a 1D NumPy array
values = np.array(values).reshape(shape).squeeze()
# reshape = np.reshape(values, (19,12,6))
# print(reshape.shape)
# reshape
print(values.shape)
values
# transformed = np.rollaxis(values, 0, -1).reshape(len(json_year), len(json_region), len(json_age), len(json_education))
# print(transformed.shape)
# transformed

dict_values(['2018', '2019', '2020', '2021', '2022'])
dict_values(['MK01 Uusimaa', 'MK02 Southwest Finland', 'MK04 Satakunta', 'MK05 Kanta-Häme', 'MK06 Pirkanmaa', 'MK07 Päijät-Häme', 'MK08 Kymenlaakso', 'MK09 South Karelia', 'MK10 South Savo', 'MK11 North Savo', 'MK12 North Karelia', 'MK13 Central Finland', 'MK14 South Ostrobothnia', 'MK15 Ostrobothnia', 'MK16 Central Ostrobothnia', 'MK17 North Ostrobothnia', 'MK18 Kainuu', 'MK19 Lapland', 'MK21 Åland'])
dict_values(['25 - 29', '30 - 34', '35 - 39', '40 - 44', '45 - 49', '50 - 54', '55 - 59', '60 - 64', '65 - 69', '70 - 74', '75 - 79', '80 -'])
dict_values(['Total', "6 Bachelor's or equivalent level", "7 Master's or equivalent level", '8 Doctoral or equivalent level'])
5
19
12
4
[5, 19, 12, 1, 4, 1]
(5, 19, 12, 4)


array([[[[126901,  31153,  15639,    187],
         [126772,  29890,  27680,   1466],
         [125614,  29043,  30176,   2720],
         ...,
         [ 84602,   8013,   9574,   1772],
         [ 51139,   4455,   5200,   1118],
         [ 65954,   4478,   4673,    887]],

        [[ 31301,   7629,   2466,     36],
         [ 29563,   7000,   4417,    282],
         [ 30169,   6989,   4995,    530],
         ...,
         [ 30626,   1933,   1838,    429],
         [ 18756,   1186,   1004,    236],
         [ 28313,   1281,    818,    189]],

        [[ 11409,   2232,    486,      4],
         [ 11781,   2697,    969,     19],
         [ 12466,   3012,   1362,     43],
         ...,
         [ 15663,    831,    516,     32],
         [ 10140,    490,    356,     41],
         [ 15344,    543,    242,     15]],

        ...,

        [[  3559,    818,    116,      1],
         [  3678,    876,    297,      6],
         [  3827,    943,    442,     15],
         ...,
         [  5119,    

In [20]:
# Create the nested index
tuples = []
for year in json_year:
    for region in json_region:
        for age in json_age:
            tuples.append((year, region, age))

index = pd.MultiIndex.from_tuples(tuples, names=["Year", "Region", "Age Group"])

# Create a NumPy array from the value list
value_array = np.array(json_data['value'])
shape = json_data['size']

transformed_array = np.array(value_array).reshape(shape).squeeze()
# reshape from (5, 19, 12, 4) to (5*19*12, 4)
transformed_array = transformed_array.reshape(-1, len(json_education))

# Create a DataFrame from the value list with the nested index
df = pd.DataFrame(data=transformed_array, index=index, columns=json_education)

# Print the DataFrame
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Total,6 Bachelor's or equivalent level,7 Master's or equivalent level,8 Doctoral or equivalent level
Year,Region,Age Group,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018,MK01 Uusimaa,25 - 29,126901,31153,15639,187
2018,MK01 Uusimaa,30 - 34,126772,29890,27680,1466
2018,MK01 Uusimaa,35 - 39,125614,29043,30176,2720
2018,MK01 Uusimaa,40 - 44,118076,23643,27936,3253
2018,MK01 Uusimaa,45 - 49,103968,13620,22388,2725
...,...,...,...,...,...,...
2022,MK21 Åland,60 - 64,1956,135,160,4
2022,MK21 Åland,65 - 69,1943,170,120,13
2022,MK21 Åland,70 - 74,1860,158,93,15
2022,MK21 Åland,75 - 79,1574,104,76,7


In [21]:
df = df.groupby(level=['Year', 'Region']).sum().copy()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Total,6 Bachelor's or equivalent level,7 Master's or equivalent level,8 Doctoral or equivalent level
Year,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018,MK01 Uusimaa,1206163,181144,206256,23612
2018,MK02 Southwest Finland,351801,44012,36427,5106
2018,MK04 Satakunta,164031,18119,10449,584
2018,MK05 Kanta-Häme,127293,13876,9668,752
2018,MK06 Pirkanmaa,374489,50631,43914,4621
...,...,...,...,...,...
2022,MK16 Central Ostrobothnia,48087,6317,3650,198
2022,MK17 North Ostrobothnia,287818,42391,32213,3907
2022,MK18 Kainuu,54338,6308,3781,181
2022,MK19 Lapland,132814,16657,10929,680


In [22]:
df.reset_index(inplace=True)
df['Region code'] = df['Region'].str[:4]

# Calculate ratio of Bachelor's, Master's and Doctoral degrees to total for each year in each region
df["Bachelor's Percentage"] = df["6 Bachelor\'s or equivalent level"] / df['Total'] * 100
df["Master's Percentage"] = df["7 Master\'s or equivalent level"] / df['Total'] * 100
df["Doctoral Percentage"] = df["8 Doctoral or equivalent level"] / df['Total'] * 100

# # Calculate education attainment absolute growth index
# education_attainment["Bachelor's Aboslute Growth 2018-2022 (persons)"] = education_attainment["2022 6 Bachelor\'s or equivalent level"] - education_attainment["2018 6 Bachelor\'s or equivalent level"]
# education_attainment["Master's Aboslute Growth 2018-2022 (persons)"] = education_attainment["2022 7 Master\'s or equivalent level"] - education_attainment["2018 7 Master\'s or equivalent level"]
# education_attainment["Doctoral Aboslute Growth 2018-2022 (persons)"] = education_attainment["2022 8 Doctoral or equivalent level"] - education_attainment["2018 8 Doctoral or equivalent level"]

# # Calculate education attainment relative growth index
# education_attainment["Bachelor's Relative Growth 2018-2022 (%)"] = education_attainment["Bachelor's Aboslute Growth 2018-2022 (persons)"] / education_attainment["2018 6 Bachelor\'s or equivalent level"] * 100
# education_attainment["Master's Relative Growth 2018-2022 (%)"] = education_attainment["Master's Aboslute Growth 2018-2022 (persons)"] / education_attainment["2018 7 Master\'s or equivalent level"] * 100
# education_attainment["Doctoral Relative Growth 2018-2022 (%)"] = education_attainment["Doctoral Aboslute Growth 2018-2022 (persons)"] / education_attainment["2018 8 Doctoral or equivalent level"] * 100

# # Calculate education attainment ratio growth index
# education_attainment["Bachelor's Ratio Growth 2018-2022 (%)"] = education_attainment["2022 Bachelor's Percentage"] - education_attainment["2018 Bachelor's Percentage"]
# education_attainment["Master's Ratio Growth 2018-2022 (%)"] = education_attainment["2022 Master's Percentage"] - education_attainment["2018 Master's Percentage"]
# education_attainment["Doctoral Ratio Growth 2018-2022 (%)"] = education_attainment["2022 Doctoral Percentage"] - education_attainment["2018 Doctoral Percentage"]

growth_columns = [
    "6 Bachelor's or equivalent level",
    "7 Master's or equivalent level",
    '8 Doctoral or equivalent level',
]

ratio_columns = [
    "Bachelor's Percentage",
    "Master's Percentage",
    'Doctoral Percentage',
]

for col in growth_columns:
    degree_name = col.split(' ')[1]
    # Calculate the difference between consecutive years for columns
    df[f'{degree_name} absolute growth (person)'] = df.groupby('Region')[col].diff()
    # Fill NaN values with 0 for the first year
    df[f'{degree_name} absolute growth (person)'].fillna(0, inplace=True)

    # Calculate the relative growth for columns
    df[f'{degree_name} relative growth (%)'] = df[f'{degree_name} absolute growth (person)'] / df[col] * 100
    # Fill NaN values with 0 for the first year
    df[f'{degree_name} relative growth (%)'].fillna(0, inplace=True)

for col in ratio_columns:
    degree_name = col.split(' ')[0]
    # Calculate the difference between consecutive years for columns
    df[f'{degree_name} ratio growth (%)'] = df.groupby('Region')[col].diff()
    # Fill NaN values with 0 for the first year
    df[f'{degree_name} ratio growth (%)'].fillna(0, inplace=True)

df

Unnamed: 0,Year,Region,Total,6 Bachelor's or equivalent level,7 Master's or equivalent level,8 Doctoral or equivalent level,Region code,Bachelor's Percentage,Master's Percentage,Doctoral Percentage,Bachelor's absolute growth (person),Bachelor's relative growth (%),Master's absolute growth (person),Master's relative growth (%),Doctoral absolute growth (person),Doctoral relative growth (%),Bachelor's ratio growth (%),Master's ratio growth (%),Doctoral ratio growth (%)
0,2018,MK01 Uusimaa,1206163,181144,206256,23612,MK01,15.018202,17.100176,1.957613,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,2018,MK02 Southwest Finland,351801,44012,36427,5106,MK02,12.510482,10.354433,1.451389,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
2,2018,MK04 Satakunta,164031,18119,10449,584,MK04,11.046083,6.370137,0.356030,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,2018,MK05 Kanta-Häme,127293,13876,9668,752,MK05,10.900835,7.595076,0.590763,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
4,2018,MK06 Pirkanmaa,374489,50631,43914,4621,MK06,13.520023,11.726379,1.233948,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,2022,MK16 Central Ostrobothnia,48087,6317,3650,198,MK16,13.136607,7.590409,0.411754,155.0,2.453696,104.0,2.849315,8.0,4.040404,0.302048,0.204602,0.016011
91,2022,MK17 North Ostrobothnia,287818,42391,32213,3907,MK17,14.728405,11.192142,1.357455,999.0,2.356632,1118.0,3.470648,104.0,2.661889,0.250058,0.315543,0.027218
92,2022,MK18 Kainuu,54338,6308,3781,181,MK18,11.608819,6.958298,0.333100,58.0,0.919467,107.0,2.829939,-5.0,-2.762431,0.202252,0.253062,-0.006359
93,2022,MK19 Lapland,132814,16657,10929,680,MK19,12.541600,8.228801,0.511994,428.0,2.569490,330.0,3.019489,15.0,2.205882,0.336132,0.257531,0.011863


In [23]:
df.columns.tolist()

['Year',
 'Region',
 'Total',
 "6 Bachelor's or equivalent level",
 "7 Master's or equivalent level",
 '8 Doctoral or equivalent level',
 'Region code',
 "Bachelor's Percentage",
 "Master's Percentage",
 'Doctoral Percentage',
 "Bachelor's absolute growth (person)",
 "Bachelor's relative growth (%)",
 "Master's absolute growth (person)",
 "Master's relative growth (%)",
 'Doctoral absolute growth (person)',
 'Doctoral relative growth (%)',
 "Bachelor's ratio growth (%)",
 "Master's ratio growth (%)",
 'Doctoral ratio growth (%)']

In [26]:
columns = [
    'Year',
    'Region code',
    'Total',
    "6 Bachelor's or equivalent level",
    "7 Master's or equivalent level",
    '8 Doctoral or equivalent level',
    "Bachelor's Percentage",
    "Master's Percentage",
    'Doctoral Percentage',
    "Bachelor's absolute growth (person)",
    "Bachelor's relative growth (%)",
    "Master's absolute growth (person)",
    "Master's relative growth (%)",
    'Doctoral absolute growth (person)',
    'Doctoral relative growth (%)',
    "Bachelor's ratio growth (%)",
    "Master's ratio growth (%)",
    'Doctoral ratio growth (%)'
]
education_attainment = df[columns].copy()
education_attainment

Unnamed: 0,Year,Region code,Total,6 Bachelor's or equivalent level,7 Master's or equivalent level,8 Doctoral or equivalent level,Bachelor's Percentage,Master's Percentage,Doctoral Percentage,Bachelor's absolute growth (person),Bachelor's relative growth (%),Master's absolute growth (person),Master's relative growth (%),Doctoral absolute growth (person),Doctoral relative growth (%),Bachelor's ratio growth (%),Master's ratio growth (%),Doctoral ratio growth (%)
0,2018,MK01,1206163,181144,206256,23612,15.018202,17.100176,1.957613,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,2018,MK02,351801,44012,36427,5106,12.510482,10.354433,1.451389,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
2,2018,MK04,164031,18119,10449,584,11.046083,6.370137,0.356030,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,2018,MK05,127293,13876,9668,752,10.900835,7.595076,0.590763,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
4,2018,MK06,374489,50631,43914,4621,13.520023,11.726379,1.233948,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,2022,MK16,48087,6317,3650,198,13.136607,7.590409,0.411754,155.0,2.453696,104.0,2.849315,8.0,4.040404,0.302048,0.204602,0.016011
91,2022,MK17,287818,42391,32213,3907,14.728405,11.192142,1.357455,999.0,2.356632,1118.0,3.470648,104.0,2.661889,0.250058,0.315543,0.027218
92,2022,MK18,54338,6308,3781,181,11.608819,6.958298,0.333100,58.0,0.919467,107.0,2.829939,-5.0,-2.762431,0.202252,0.253062,-0.006359
93,2022,MK19,132814,16657,10929,680,12.541600,8.228801,0.511994,428.0,2.569490,330.0,3.019489,15.0,2.205882,0.336132,0.257531,0.011863


In [27]:
final_df = pd.merge(base_df, education_attainment, on='Region code', how='inner')
final_df

Unnamed: 0,Region code,Region name (en),Region name (fi),Year,Total,6 Bachelor's or equivalent level,7 Master's or equivalent level,8 Doctoral or equivalent level,Bachelor's Percentage,Master's Percentage,Doctoral Percentage,Bachelor's absolute growth (person),Bachelor's relative growth (%),Master's absolute growth (person),Master's relative growth (%),Doctoral absolute growth (person),Doctoral relative growth (%),Bachelor's ratio growth (%),Master's ratio growth (%),Doctoral ratio growth (%)
0,MK01,Uusimaa,Uusimaa,2018,1206163,181144,206256,23612,15.018202,17.100176,1.957613,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,MK01,Uusimaa,Uusimaa,2019,1225411,187318,213765,24287,15.286137,17.444351,1.981947,6174.0,3.295999,7509.0,3.512736,675.0,2.779265,0.267934,0.344175,0.024335
2,MK01,Uusimaa,Uusimaa,2020,1239634,192142,221822,24869,15.499898,17.894153,2.006157,4824.0,2.510643,8057.0,3.632192,582.0,2.340263,0.213761,0.449801,0.024209
3,MK01,Uusimaa,Uusimaa,2021,1251592,196265,227784,25288,15.681228,18.199541,2.020467,4123.0,2.100731,5962.0,2.617392,419.0,1.656912,0.181331,0.305388,0.014310
4,MK01,Uusimaa,Uusimaa,2022,1268209,200152,233489,25668,15.782257,18.410924,2.023957,3887.0,1.942024,5705.0,2.443370,380.0,1.480443,0.101028,0.211383,0.003490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,MK21,Åland,Ahvenanmaa,2018,21990,2735,1762,107,12.437472,8.012733,0.486585,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
91,MK21,Åland,Ahvenanmaa,2019,22081,2814,1845,105,12.743988,8.355600,0.475522,79.0,2.807392,83.0,4.498645,-2.0,-1.904762,0.306516,0.342867,-0.011063
92,MK21,Åland,Ahvenanmaa,2020,22348,2862,1875,106,12.806515,8.390013,0.474315,48.0,1.677149,30.0,1.600000,1.0,0.943396,0.062527,0.034413,-0.001207
93,MK21,Åland,Ahvenanmaa,2021,22558,2968,1942,109,13.157195,8.608919,0.483199,106.0,3.571429,67.0,3.450051,3.0,2.752294,0.350680,0.218907,0.008883


In [83]:
final_df.to_csv('../data/Indices/education_attainment_index.csv', index=False)