## Importing libraries

In [30]:
import cbsodata
import re
import json
import requests
import concurrent.futures

## Setting criteria

In [62]:
# Read the stored SBI lookup from disk. The file is the one written by the SBI
# extraction step (Appendix 2): a mapping from table identifier to either a list
# of SBI entries or a text message when no SBI data could be retrieved.
with open('data/sbi_data.json', mode='r', encoding='utf-8') as sbi_file:
    sbi_data = json.loads(sbi_file.read())
    
# SBI titles of interest; a table is selected only when it offers every one of them
desired_sbi_title = [
    "Q Gezondheids- en welzijnszorg",
    "C Industrie",
    "O Openbaar bestuur en overheidsdiensten",
    "P Onderwijs",
]

# Publication frequencies to keep, listed in their English and Dutch variants.
# To switch this filter off, use instead: desired_frequencies = None
desired_frequencies = [
    # Quarterly
    "Fourtimesayear",
    "Viermaalperjaar",
    "Quarterly",
    "Perkwartaal",
    "Threemonthly",
    "Perdriemaanden",
    # Monthly
    "Monthly",
    "Permaand",
    # Weekly
    "Perweek",
    "Weekly",
    # Other
    "Stopgezet",
    "Discontinued",
]

# Table identifiers to keep; None switches this filter off.
# Example: desired_identifiers = ['85917NED', '83156NED', '80072ned', '81628NED']
desired_identifiers = None

# Tables whose short title contains any of these substrings are dropped
# (this excludes data for the Caribbean islands)
undesired_short_titles = ['Caribisch', 'Bonaire']

# Language of the tables to keep; data is available in English and Dutch
desired_language = 'nl'

# Year range that a table's period has to cover
start_year_threshold = 2016
end_year_threshold = 2022


## Extracting tables

In [33]:
# Fetch the list of tables through the cbsodata client. The cells below treat each
# element as a dict of table metadata and read its 'Identifier', 'ShortTitle',
# 'Language', 'Period' and 'Frequency' keys.
tables = cbsodata.get_table_list()

In [None]:
# Matches a standalone four-digit number (such as a year). Compiled once because
# extract_years is called for every table that passes the other filters.
_FOUR_DIGIT_NUMBER = re.compile(r'\b\d{4}\b')


def extract_years(period):
    """Return the first and last four-digit numbers found in a period string.

    Args:
        period: Period description, e.g. "2016-2022". A value that is not a
            string (for example None) is treated as containing no years
            instead of raising a TypeError.

    Returns:
        A (first, last) tuple of ints. Both elements are equal when the string
        holds a single four-digit number; (None, None) when it holds none.
    """
    if not isinstance(period, str):
        return None, None

    years = _FOUR_DIGIT_NUMBER.findall(period)
    if not years:
        return None, None

    # With a single match, years[0] and years[-1] are the same element
    return int(years[0]), int(years[-1])

# Collects one summary dict per table that survives every filter below
filtered_data = []

# Walk the table list and keep the tables that match all configured criteria
for table in tables:
    # Metadata fields used for filtering and for the stored summary
    identifier = table.get('Identifier', 'N/A')
    short_title = table.get('ShortTitle', 'N/A')
    language = table.get('Language', 'N/A')
    period = table.get('Period', 'N/A')
    frequency = table.get('Frequency', 'N/A')

    # A criterion set to None is switched off and therefore always passes
    identifier_ok = desired_identifiers is None or identifier in desired_identifiers
    frequency_ok = desired_frequencies is None or frequency in desired_frequencies

    # SBI check: the table must offer every desired SBI title. Anything that is
    # not a list (no entry for this identifier, or a non-list value) fails.
    sbi_info = sbi_data.get(identifier)
    sbi_ok = False
    sbi_codes_and_titles = []
    if isinstance(sbi_info, list):
        available_titles = {sbi_entry.get('Title', '') for sbi_entry in sbi_info}
        sbi_ok = all(title in available_titles for title in desired_sbi_title)
        if sbi_ok:
            # Record every SBI code/title of the table, not only the desired ones
            sbi_codes_and_titles = [
                f"{sbi_entry.get('Key', 'Unknown SBI Code')}: {sbi_entry.get('Title', 'Unknown SBI Title')}"
                for sbi_entry in sbi_info
            ]

    # Skip the table as soon as one criterion fails
    if not (identifier_ok and frequency_ok and sbi_ok):
        continue
    if language != desired_language:
        continue
    if any(substring in short_title for substring in undesired_short_titles):
        continue

    # Tables without a recognisable year in their period are skipped
    start_year, end_year = extract_years(period)
    if start_year is None or end_year is None:
        continue

    # The table's period has to cover the whole desired range: it must start no
    # later than the start threshold and end no earlier than the end threshold
    if start_year > start_year_threshold or end_year < end_year_threshold:
        continue

    filtered_data.append({
        'Identifier': identifier,
        'ShortTitle': short_title,
        'Language': language,
        'Period': period,
        'Frequency': frequency,
        'SBI_Codes_and_Titles': sbi_codes_and_titles
    })

# Write the selection to disk as UTF-8 JSON, leaving non-ASCII characters unescaped
with open('data/table_selection.json', mode='w', encoding='utf-8') as json_file:
    json.dump(filtered_data, json_file, ensure_ascii=False, indent=4)

print("Filtered data has been saved to 'table_selection.json'.")

# Show a one-line summary for every selected table
print("Filtered identifiers, short titles, periods, and frequencies:")
for entry in filtered_data:
    print(
        f"Identifier: {entry['Identifier']}, "
        f"ShortTitle: {entry['ShortTitle']}, "
        f"Period: {entry['Period']}, "
        f"Frequency: {entry['Frequency']}"
    )

## Appendix 1 - overview of all frequency, language, period options

In [None]:
# Gather the distinct frequency, period and language values across all tables
unique_frequencies, unique_periods, unique_languages = set(), set(), set()
for table in tables:
    unique_frequencies.add(table.get('Frequency', 'N/A'))
    unique_periods.add(table.get('Period', 'N/A'))
    unique_languages.add(table.get('Language', 'N/A'))

# Sets cannot be serialised to JSON, so turn each one into a list
unique_frequencies = [*unique_frequencies]
unique_periods = [*unique_periods]
unique_languages = [*unique_languages]

# Bundle the three overviews under descriptive keys
unique_data = dict(
    frequencies=unique_frequencies,
    periods=unique_periods,
    languages=unique_languages,
)

# Store the overview as JSON in the 'data' folder
with open('data/unique_data.json', mode='w') as json_file:
    json.dump(unique_data, json_file, indent=4)

print("Unique frequencies, periods, and languages have been saved to 'data/unique_data.json'.")

## Appendix 2 - extracting SBI data

In [None]:
# Collect the identifier of every selected table. Each filtered_data entry is a
# dict keyed by field name, so the identifier is read through its 'Identifier'
# key (positional indexing with entry[0] raises KeyError on a dict).
identifiers = [entry['Identifier'] for entry in filtered_data]

def fetch_sbi_code(identifier, timeout=30):
    """Fetch the SBI 2008 branch information of a single table.

    The function never raises: every failure is reported as a text message in
    the returned tuple, so one failing table cannot abort a batch of requests.

    Args:
        identifier: Table identifier inserted into the request URL.
        timeout: Seconds to wait for the server before giving up; passed on to
            requests.get. Without it a stalled connection would block forever.

    Returns:
        A (identifier, result) tuple. On success, result is the non-empty
        'value' entry of the JSON response; otherwise it is a str explaining
        that no SBI information is available, that the request returned a
        non-200 status code, or which error occurred.
    """
    url = f"https://opendata.cbs.nl/ODataApi/OData/{identifier}/BedrijfstakkenBranchesSBI2008"

    try:
        # Send a GET request to the constructed URL, bounded by the timeout
        response = requests.get(url, timeout=timeout)

        # Anything other than HTTP 200 is reported as a failed request
        if response.status_code != 200:
            return (identifier, f"Request failed with status code {response.status_code}")

        # Convert the response to JSON format
        data = response.json()

        # Only a present, non-empty 'value' entry counts as SBI data
        if 'value' in data and data['value']:
            return (identifier, data['value'])
        return (identifier, 'No SBI information available')

    except Exception as e:
        # Deliberately broad: connection errors, timeouts and malformed
        # responses are all turned into a result message instead of raising.
        return (identifier, f"Error occurred: {str(e)}")

def extract_sbi_codes_concurrent(identifiers, max_workers=10):
    """Fetch SBI information for many tables using a pool of worker threads.

    Args:
        identifiers: Iterable of table identifiers; fetch_sbi_code is called
            once for each of them.
        max_workers: Maximum number of requests running at the same time.

    Returns:
        A dict mapping each identifier to the result reported by
        fetch_sbi_code. Results are stored in completion order, which may
        differ from the order of the input.
    """
    sbi_data_dict = {}

    # Threads suit this work because each task mostly waits on a network reply
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit one fetch task per identifier
        futures = [executor.submit(fetch_sbi_code, identifier) for identifier in identifiers]

        # Store each result as soon as its request finishes
        for future in concurrent.futures.as_completed(futures):
            identifier, result = future.result()
            sbi_data_dict[identifier] = result

    return sbi_data_dict

# Retrieve the SBI information for all collected identifiers in parallel
sbi_data = extract_sbi_codes_concurrent(identifiers)

# Write the result to disk as indented JSON.
# NOTE(review): this overwrites the same 'data/sbi_data.json' that the table filter
# loads at the start of the notebook - confirm that replacing it is intended.
with open('data/sbi_data.json', mode='w') as f:
    f.write(json.dumps(sbi_data, indent=4))

print("SBI data has been saved to 'data/sbi_data.json'.")