## Importing libraries

In [1]:
import pandas as pd
import cbsodata
import json
import re
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Folder that holds 'table_selection.json' and receives the downloaded and
# merged CSV files written by the cells below.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DATA_DIR = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data"

def ensure_directory_exists(directory):
    """Create ``directory`` (and any missing parents) if it is not there yet.

    Calling this for a directory that already exists is a no-op.

    Args:
        directory: Path of the directory that must exist afterwards.

    Raises:
        FileExistsError: If the path exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: another process creating the directory in
    # between no longer makes makedirs() fail.
    os.makedirs(directory, exist_ok=True)

In [3]:
def load_json(file_path):
    """Read and decode the JSON document stored at ``file_path``.

    Returns the decoded object. When the file does not exist or does not
    contain valid JSON, a message is printed and ``None`` is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
        return None
    except json.JSONDecodeError as err:
        print(f"Error decoding JSON file {file_path}: {err}")
        return None
    return parsed

In [4]:
def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as UTF-8 JSON indented by four spaces.

    Any failure (unwritable path, non-serialisable data, ...) is reported
    with a printed message instead of being raised; success is printed too.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as out:
            json.dump(data, out, indent=4)
    except Exception as err:
        print(f"Error saving data to {file_path}: {err}")
    else:
        print(f"Data successfully saved to {file_path}")

In [5]:
def process_period_column(df, perioden_column='Perioden', year_column='Year', quarter_column='Quarter'):
    """
    Add year and quarter columns derived from the period column.

    Every value of ``perioden_column`` is parsed with
    ``extract_latest_year_and_quarter``; the two results are stored in
    ``year_column`` and ``quarter_column``. The frame is modified in place
    and also returned.
    """
    def _as_row(period_text):
        # One (year, quarter) pair becomes one row of the expanded result.
        return pd.Series(extract_latest_year_and_quarter(period_text))

    parsed = df[perioden_column].apply(_as_row)
    df[[year_column, quarter_column]] = parsed
    return df

In [6]:
def extract_latest_year_and_quarter(period_text):
    """
    Extract the year and quarter of the last period mentioned in a string.

    The text is scanned for a four-digit year followed (after optional
    whitespace) by a word; the last such pair is used. The word, e.g. '1e'
    from '2020 1e kwartaal' or 'januari' from '2020 januari', is passed to
    ``extract_quarter``.

    Args:
        period_text: Period label. Non-string values (such as the NaN that
            pandas produces for an empty CSV cell) are accepted.

    Returns:
        ``(year, quarter)`` where ``year`` is an int and ``quarter`` is
        whatever ``extract_quarter`` returns for the word after the year.
        ``(None, None)`` when the input is not a string or holds no
        year-plus-word pair; a bare year such as '2020' has no following
        word and therefore also yields ``(None, None)``.
    """
    # Missing cells arrive as NaN/None; re.findall would raise TypeError on them.
    if not isinstance(period_text, str):
        return None, None

    matches = re.findall(r'(\d{4})\s*(\w+)', period_text)
    if not matches:
        return None, None

    year_text, period_part = matches[-1]
    return int(year_text), extract_quarter(period_part)

In [7]:
def extract_quarter(period_part):
    """
    Determine the quarter from a Dutch month name or an ordinal like '1e'.

    Matching is by substring and ignores case, in line with
    ``extract_month``. Month names are checked before the ordinals
    '1e'..'4e', and lower quarters are checked before higher ones.

    Args:
        period_part: Text such as 'januari', '2e' or '3e kwartaal'.

    Returns:
        The quarter number 1-4, or ``None`` when nothing matches or the
        input is not a string.
    """
    if not isinstance(period_part, str):
        return None

    text = period_part.lower()

    months_per_quarter = (
        ('januari', 'februari', 'maart'),
        ('april', 'mei', 'juni'),
        ('juli', 'augustus', 'september'),
        ('oktober', 'november', 'december'),
    )
    for quarter, month_names in enumerate(months_per_quarter, start=1):
        if any(name in text for name in month_names):
            return quarter

    # Fall back to ordinal notation ('1e kwartaal', '2e kwartaal', ...).
    for quarter in range(1, 5):
        if f'{quarter}e' in text:
            return quarter
    return None

In [8]:
def monthly_to_quarterly(df, period_column='Perioden', value_columns=None):
    """
    Convert monthly data to quarterly by summing numeric columns.

    The year is the first four-digit number in the period text and the month
    comes from ``extract_month``. Rows for which either cannot be determined
    (missing cells, or labels without a month name such as yearly or
    quarterly rows) are left out instead of raising. The input frame is not
    modified.

    Args:
        df: Frame with one row per period.
        period_column: Column holding the period text.
        value_columns: Columns to sum. Defaults to the float and int columns
            of ``df``.

    Returns:
        A new frame with the columns 'Year', 'Quarter' and ``value_columns``,
        one row per (year, quarter). Rows are grouped on year and quarter
        only, so other columns of ``df`` are not carried over.
    """
    if value_columns is None:
        value_columns = df.select_dtypes(include=[float, int]).columns.tolist()

    # Work on text so that missing cells become unparseable strings rather
    # than raising inside the parsers.
    period_text = df[period_column].astype(str)
    years = pd.to_numeric(period_text.str.extract(r'(\d{4})', expand=False), errors='coerce')
    months = pd.to_numeric(period_text.map(extract_month), errors='coerce')

    usable = years.notna() & months.notna()

    # Build the result on a copy so the caller's frame gains no helper columns.
    monthly = df.loc[usable, value_columns].copy()
    monthly['Year'] = years[usable].astype('int64')
    monthly['Quarter'] = (months[usable].astype('int64') - 1) // 3 + 1

    quarterly_df = monthly.groupby(['Year', 'Quarter'])[value_columns].sum().reset_index()
    return quarterly_df

In [9]:
def extract_month(period_text):
    """
    Return the number (1-12) of the first Dutch month name found in the text.

    The search is a case-insensitive substring match that tries the months
    in calendar order; ``None`` is returned when no month name occurs.
    """
    month_names = (
        'januari', 'februari', 'maart',
        'april', 'mei', 'juni',
        'juli', 'augustus', 'september',
        'oktober', 'november', 'december',
    )
    lowered = period_text.lower()
    for number, name in enumerate(month_names, start=1):
        if name in lowered:
            return number
    return None

In [10]:
def rename_sbi_column(df, primary_column='BedrijfstakkenBranchesSBI2008', backup_column='BedrijfskenmerkenSBI2008'):
    """
    Make sure the industry column carries the primary name.

    When ``primary_column`` is absent but ``backup_column`` is present, a
    renamed copy of the frame is returned; in every other case the frame is
    returned as it was passed in.
    """
    columns = df.columns
    if primary_column in columns or backup_column not in columns:
        return df
    return df.rename(columns={backup_column: primary_column})

In [11]:
def filter_by_industry(df, industry_column='BedrijfstakkenBranchesSBI2008', valid_industries=None):
    """
    Keep only the rows whose industry is in the allowed list.

    When ``valid_industries`` is ``None`` a built-in selection of six
    industries is used.
    """
    default_industries = [
        "Q Gezondheids- en welzijnszorg", "G Handel", "C Industrie",
        "M Specialistische zakelijke diensten", "N Verhuur en overige zakelijke diensten",
        "O Openbaar bestuur en overheidsdiensten"
    ]
    wanted = default_industries if valid_industries is None else valid_industries
    keep = df[industry_column].isin(wanted)
    return df[keep]

In [12]:
def filter_by_year(df, year_column='Year', start_year=2008, end_year=2022):
    """
    Keep only the rows whose year lies in ``[start_year, end_year]``.

    Both bounds are inclusive.
    """
    in_range = df[year_column].between(start_year, end_year)
    return df[in_range]

In [13]:
def group_and_sum(df, group_by_columns):
    """
    Sum the numeric columns per combination of the grouping columns.

    The grouping columns are returned as ordinary columns of the result.
    """
    grouped = df.groupby(group_by_columns)
    totals = grouped.sum(numeric_only=True)
    return totals.reset_index()

In [14]:
def reorder_columns(df, preferred_columns):
    """
    Reorder DataFrame columns to place preferred ones at the start.

    Args:
        df: Frame whose columns are reordered.
        preferred_columns: Column names to put first, in this order. Any
            iterable of names is accepted (list, tuple, ...).

    Returns:
        The frame with ``preferred_columns`` first, followed by the other
        columns in their existing order.

    Raises:
        KeyError: If a preferred column is not present in ``df``.
    """
    # Materialise once: concatenating with a list below would otherwise fail
    # for tuples and other non-list iterables.
    preferred = list(preferred_columns)
    preferred_lookup = set(preferred)
    remaining_columns = [col for col in df.columns if col not in preferred_lookup]
    return df[preferred + remaining_columns]

In [17]:
# Load the table selection JSON. The code below reads it as a sequence of
# entries that each have an 'Identifier' and a 'Frequency' key.
table_selection_path = os.path.join(DATA_DIR, 'table_selection.json')
table_data = load_json(table_selection_path)

# load_json returns None when the file is missing or invalid; an empty
# selection is treated the same way and stops the run.
if not table_data:
    print("Table selection data is missing or invalid. Exiting.")
    exit()

# Table identifiers that are left out of the run.
excluded_identifiers = ['85663NED']
# Maps every remaining table identifier to its 'Frequency' value.
identifier_freq_dict = {
    entry['Identifier']: entry['Frequency']
    for entry in table_data if entry['Identifier'] not in excluded_identifiers
}

In [18]:
def fetch_and_save_tables(identifiers, output_folder=DATA_DIR):
    """Download each table via cbsodata and store it as ``<identifier>.csv``.

    The output folder is created when needed. Tables whose CSV file is
    already present are skipped; a failure for one table is printed and the
    loop carries on with the next identifier.
    """
    ensure_directory_exists(output_folder)
    for table_id in identifiers:
        csv_path = os.path.join(output_folder, f"{table_id}.csv")
        if not os.path.exists(csv_path):
            try:
                print(f"Fetching data for {table_id}...")
                frame = pd.DataFrame(cbsodata.get_data(table_id))
                frame.to_csv(csv_path, index=False)
            except Exception as err:
                print(f"Error fetching data for {table_id}: {err}")
        else:
            print(f"File already exists for {table_id}. Skipping.")

# Download every selected table that is not yet stored as a CSV in DATA_DIR.
fetch_and_save_tables(identifier_freq_dict.keys())

File already exists for 83451NED. Skipping.
File already exists for 85928NED. Skipping.
File already exists for 80072ned. Skipping.
File already exists for 81589NED. Skipping.
File already exists for 83147NED. Skipping.
File already exists for 81588NED. Skipping.
File already exists for 83149NED. Skipping.
File already exists for 83148NED. Skipping.


In [19]:
def _monthly_table_to_quarterly(table, sbi_column):
    """Aggregate a monthly table to quarters separately for each industry."""
    table = rename_sbi_column(table)
    if sbi_column not in table.columns:
        # No industry column to preserve; aggregate the table as a whole.
        return monthly_to_quarterly(table.copy())

    pieces = []
    for industry, rows in table.groupby(sbi_column):
        quarterly = monthly_to_quarterly(rows.copy())
        # monthly_to_quarterly returns only Year, Quarter and value columns,
        # so the industry label has to be put back.
        quarterly[sbi_column] = industry
        pieces.append(quarterly)

    if not pieces:
        # Nothing to group (empty table or no industry labels): return an
        # empty frame that still has the expected columns.
        empty = monthly_to_quarterly(table.iloc[0:0].copy())
        empty[sbi_column] = None
        return empty
    return pd.concat(pieces, ignore_index=True)


def process_tables(tables, identifier_dict):
    """Process and combine tables into one frame per year, quarter and industry.

    Each table is paired by position with an identifier of
    ``identifier_dict`` to look up its frequency. Tables with frequency
    'Maandelijks' are aggregated to quarters per industry; all other tables
    get their 'Year' and 'Quarter' from the period column. Every table is
    then filtered by industry and year, and the results are concatenated and
    summed.

    Args:
        tables: List of DataFrames, in the same order as the keys of
            ``identifier_dict``.
        identifier_dict: Mapping of table identifier to frequency.

    Returns:
        A DataFrame with the numeric columns summed per 'Year', 'Quarter'
        and 'BedrijfstakkenBranchesSBI2008'.

    Raises:
        IndexError: If there are more tables than identifiers.
        ValueError: If ``tables`` is empty (nothing to concatenate).
    """
    sbi_column = 'BedrijfstakkenBranchesSBI2008'

    # Build the identifier list once instead of on every iteration.
    identifiers = list(identifier_dict)
    if len(tables) != len(identifiers):
        # Pairing is positional, so a skipped table shifts every later pairing.
        print(
            f"Warning: {len(tables)} tables for {len(identifiers)} identifiers; "
            "frequencies are matched by position and may be misaligned."
        )

    processed_tables = []
    for i, table in enumerate(tables):
        frequency = identifier_dict[identifiers[i]]
        if frequency == 'Maandelijks':
            # The quarterly result already has Year and Quarter and no longer
            # has a period column, so the period parsing step is skipped.
            table = _monthly_table_to_quarterly(table, sbi_column)
        else:
            table = process_period_column(table)
            table = rename_sbi_column(table)
        table = filter_by_industry(table)
        table = filter_by_year(table)
        processed_tables.append(table)

    combined_df = pd.concat(processed_tables, ignore_index=True)
    grouped_df = group_and_sum(combined_df, ['Year', 'Quarter', sbi_column])
    return grouped_df

In [21]:
# Expected CSV location for each selected table: <DATA_DIR>/<identifier>.csv,
# in the same order as the keys of identifier_freq_dict.
file_paths = [os.path.join(DATA_DIR, f"{identifier}.csv") for identifier in identifier_freq_dict.keys()]

def load_csv_files(file_paths):
    """
    Read every CSV path into a DataFrame and return the frames as a list.

    Paths that are missing or cannot be parsed are reported with a printed
    message and skipped, so the result can be shorter than ``file_paths``.
    """
    loaded = []
    for csv_path in file_paths:
        try:
            print(f"Loading file: {csv_path}")
            frame = pd.read_csv(csv_path)
        except FileNotFoundError:
            print(f"File not found: {csv_path}")
        except Exception as err:
            print(f"Error loading {csv_path}: {err}")
        else:
            loaded.append(frame)
    return loaded

# Load tables from CSV files. load_csv_files skips files it cannot read, so
# `tables` can be shorter than identifier_freq_dict.
tables = load_csv_files(file_paths)

# Process the loaded tables
if tables:  # Check if there are any tables loaded
    # process_tables pairs each table with an identifier by position.
    final_df = process_tables(tables, identifier_freq_dict)

    # Save the final combined DataFrame
    final_output_path = os.path.join(DATA_DIR, 'merged_tables.csv')
    final_df.to_csv(final_output_path, index=False)
    print(f"Final DataFrame saved to {final_output_path}.")
else:
    print("No tables were loaded. Cannot proceed with processing.")


Loading file: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\83451NED.csv
Loading file: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\85928NED.csv
Loading file: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\80072ned.csv
Loading file: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\81589NED.csv
Loading file: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\83147NED.csv
Loading file: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\81588NED.csv
Loading file: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\83149NED.csv
Loading file: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\83148NED.csv
Final DataFrame saved to C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\mer