In [1]:
# One variable, multiple territorial units: downloading data from the BDL API (bdl.stat.gov.pl)

In [None]:
import os

import pandas as pd
import requests

## Single request example

In [None]:
# Variable 76037, years 2016 and 2017, unit level 3, first page with up to 100 records.
example_url='https://bdl.stat.gov.pl/api/v1/data/by-variable/76037?year=2017&year=2016&unit-level=3&page=0&page-size=100'
response = requests.get(example_url, timeout=7)
# Stop here on an HTTP error instead of parsing an error body and failing later on a missing key.
response.raise_for_status()
example_data = response.json()  # parsed JSON body (a dictionary)

### Examining data structure



In [None]:
# top-level fields of the response dictionary
example_data.keys()

dict_keys(['totalRecords', 'variableId', 'measureUnitId', 'aggregateId', 'lastUpdate', 'results'])

In [None]:
# response metadata: the total record count and the variable id reported by the API
print(f"Total records: {example_data['totalRecords']}")
print(f"Variable Id: {example_data['variableId']}")

Total records: 17
Variable Id: 76037


In [None]:
# first two results: each entry is a dictionary with the unit's 'id', 'name' and a list of per-year 'values'
example_data['results'][:2]

[{'id': '011210000000',
  'name': 'REGION MAŁOPOLSKIE',
  'values': [{'attrId': 1, 'val': 15068865100.02, 'year': '2016'},
   {'attrId': 1, 'val': 16279000320.3, 'year': '2017'}]},
 {'id': '012410000000',
  'name': 'REGION ŚLĄSKIE',
  'values': [{'attrId': 1, 'val': 20512387686.29, 'year': '2016'},
   {'attrId': 1, 'val': 21736363459.19, 'year': '2017'}]}]

In [None]:
# name of the territorial unit in the first record
example_data['results'][0]['name']

'REGION MAŁOPOLSKIE'

In [None]:
# value for the first unit in the first listed year (each unit carries one 'values' entry per year)
example_data['results'][0]['values'][0]['val']

15068865100.02

## Defining a function for downloading data

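`load_data` downloads the values of a single variable for every territorial unit at the chosen unit level and for every year in the requested range. It requests the data page by page (up to 100 records per page) and returns one flat list of `(unit name, year, value)` tuples.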


In [None]:
def load_data(var_num, unit_level=2, first_year=2015, last_year=2020):
    """Download one variable for all territorial units at a given level.

    All requested years are sent in a single query, so each result page costs
    exactly one HTTP request. The number of pages is derived from the
    ``totalRecords`` field of the first page.

    Parameters
    ----------
    var_num : int or str
        Id of the variable to download.
    unit_level : int, default 2
        Territorial unit level: 0 - Poland, 1 - macroregions, 2 - voivodships,
        3 - statistical regions, 4 - subregions, 5 - powiats, 6 - gminas.
    first_year, last_year : int
        Inclusive range of years to request. If ``first_year > last_year`` no
        ``year`` filter is sent at all.

    Returns
    -------
    list of tuple or None
        One ``(unit name, year, value)`` tuple per unit and year, in the order
        returned by the API. ``None`` is returned (after printing the failing
        URL and the reason) when any request fails, returns an HTTP error
        status, or does not contain valid JSON.
    """
    page_size = 100  # 100 (records) is the maximum page size

    # Base url: every year of the range as a repeated `year` parameter.
    year_params = ''.join(f'year={year}&' for year in range(first_year, last_year + 1))
    base_url = (
        f'https://bdl.stat.gov.pl/api/v1/data/by-variable/{var_num}?'
        f'{year_params}unit-level={unit_level}&page-size={page_size}'
    )

    list_of_records = []  # [(unit, year, value)]
    page_num = 0
    page_count = 1  # provisional; corrected once page 0 reports totalRecords
    while page_num < page_count:
        url = f'{base_url}&page={page_num}'
        try:
            response = requests.get(url, timeout=7)
            response.raise_for_status()  # treat HTTP error statuses as failures too
            data = response.json()
        except (requests.RequestException, ValueError) as err:
            # ValueError covers a body that is not valid JSON.
            print(f"error: could not download {url}: {err}")
            return None

        if page_num == 0:
            # Ceiling division; there is always at least the page just fetched.
            page_count = max(1, -(-data['totalRecords'] // page_size))

        # One tuple per (unit, year) pair found on this page.
        list_of_records.extend(
            (unit['name'], entry['year'], entry['val'])
            for unit in data['results']
            for entry in unit['values']
        )
        page_num += 1

    return list_of_records

## Data download example

In [None]:
# Variable:
# Total revenue of voivodship budgets
# Data accessibility level -> 3 (Region (NUTS 2))
# If the specified unit level is lower, the values are aggregated
#
# Polish name: Dochody budżetów województw
# Ogółem (total)
# id: 6454

In [None]:
# variable 6454 at voivodship level (2) for the years 2015-2018
dochody_wojewodztw = load_data(6454, unit_level=2, first_year=2015, last_year=2018)

In [None]:
# first five (unit name, year, value) records
dochody_wojewodztw[:5]

[('MAŁOPOLSKIE', '2015', 1137039132.7),
 ('MAŁOPOLSKIE', '2016', 1068354267.52),
 ('MAŁOPOLSKIE', '2017', 1337669040.22),
 ('MAŁOPOLSKIE', '2018', 1366539355.62),
 ('ŚLĄSKIE', '2015', 1763365707.84)]

In [None]:
# one row per record; column labels are Polish for voivodship, year and revenue
dochody_df = pd.DataFrame(
    data=dochody_wojewodztw,
    columns=['województwo', 'rok', 'dochód'],
)

In [None]:
# Optional: uncomment the next line to display floats with two decimal places
#pd.options.display.float_format = '{:.2f}'.format
dochody_df

Unnamed: 0,województwo,rok,dochód
0,MAŁOPOLSKIE,2015,1137039132.70
1,MAŁOPOLSKIE,2016,1068354267.52
2,MAŁOPOLSKIE,2017,1337669040.22
3,MAŁOPOLSKIE,2018,1366539355.62
4,ŚLĄSKIE,2015,1763365707.84
...,...,...,...
59,PODLASKIE,2018,707145494.13
60,MAZOWIECKIE,2015,2768680366.53
61,MAZOWIECKIE,2016,2302606116.47
62,MAZOWIECKIE,2017,2550122963.49


## Saving data

In [None]:
path='' # output location for the saved file; the empty string gives a path relative to the current working directory
# example of a custom location:
# path='drive/MyDrive/'

In [None]:
# saving data to a csv file
dochody_df.to_csv(path+"dochody_budżetów_województw.csv", index=False)
#dochody_df.to_csv(path+"dochody_budżetów_województw.csv", index=False, encoding='utf-8')