## Importing libraries

In [35]:
import pandas as pd
import requests 
import cbsodata
import re
# from sklearn.linear_model import LogisticRegression

## Merging tables

In [54]:
def split_period_column(df, perioden_column_name='Perioden', year_column_name='Jaar', period_column_name='Periode', quarter_column_name='Kwartaal'):
    """
    Derives year, period-remainder and quarter columns from a period column.

    The period text is searched for four consecutive digits (the year); everything
    after them, past one optional whitespace character, becomes the period remainder.
    The quarter is then computed from that remainder by `extract_quarter`.

    Parameters:
        df (DataFrame): The DataFrame containing the period column.
        perioden_column_name (str): Column holding the raw period text. Default is 'Perioden'.
        year_column_name (str): Column that receives the extracted year. Default is 'Jaar'.
        period_column_name (str): Column that receives the text following the year. Default is 'Periode'.
        quarter_column_name (str): Column that receives the derived quarter. Default is 'Kwartaal'.

    Returns:
        DataFrame: A copy of `df` with the three derived columns set; `df` itself is left untouched.
    """
    # Capture group 1 is the year, group 2 is whatever trails it
    year_and_remainder = df[perioden_column_name].str.extract(r'(\d{4})\s?(.*)')

    # Work on a copy so the caller's frame is not modified
    result = df.copy()
    result[[year_column_name, period_column_name]] = year_and_remainder

    # Translate the remainder (month name or quarter label) into a quarter
    result[quarter_column_name] = result[period_column_name].apply(extract_quarter)

    return result

def extract_quarter(period_part):
    """
    Extracts the quarter information from the remaining part of the period.

    Dutch month names are checked first (by substring match); if none is found,
    the first digit in the text is returned as the quarter.

    Parameters:
        period_part (str): The text that follows the year in a period label.
            Non-string values (e.g. the NaN that `str.extract` produces for rows
            that did not match, or None) are accepted and yield None.

    Returns:
        int or None: The quarter for a recognised month name, otherwise the first
        digit found in the text, otherwise None.
    """
    # Missing values are not iterable/searchable; treat them as "no quarter"
    # instead of raising TypeError on the `in` checks below.
    if not isinstance(period_part, str):
        return None

    months_by_quarter = (
        (1, ('januari', 'februari', 'maart')),
        (2, ('april', 'mei', 'juni')),
        (3, ('juli', 'augustus', 'september')),
        (4, ('oktober', 'november', 'december')),
    )
    for quarter, months in months_by_quarter:
        if any(month in period_part for month in months):
            return quarter

    # No month name: fall back to the first digit in the text
    for char in period_part:
        if char.isdigit():
            return int(char)
    return None

def convert_monthly_to_quarterly(df, category_column='BedrijfstakkenBranchesSBI2008', year_column_name='Jaar', quarter_column_name='Kwartaal', value_column='ProducentenprijsindexPPI_1'):
    """
    Converts monthly data to quarterly data by averaging the values for each quarter.

    The input DataFrame is not modified; the numeric conversion of the quarter
    column is applied to an internal copy.

    Parameters:
        df (DataFrame): The DataFrame containing the data.
        category_column (str): The name of the column containing the category data. Default is 'BedrijfstakkenBranchesSBI2008'.
        year_column_name (str): The name of the column containing the year data. Default is 'Jaar'.
        quarter_column_name (str): The name of the column containing the quarter data. Default is 'Kwartaal'.
        value_column (str): The name of the column containing the values to be averaged. Default is 'ProducentenprijsindexPPI_1'.

    Returns:
        DataFrame: One row per (category, year, quarter) combination with the mean of `value_column`.
    """
    # Work on a copy so the caller's quarter column keeps its original values/dtype
    working_df = df.copy()

    # Ensure the quarter column is numeric; unparseable entries become NaN
    working_df[quarter_column_name] = pd.to_numeric(working_df[quarter_column_name], errors='coerce')

    # Group by category, year, and quarter, then average the values
    group_keys = [category_column, year_column_name, quarter_column_name]
    quarterly_df = working_df.groupby(group_keys)[value_column].mean().reset_index()

    return quarterly_df

def filter_columns(df):
    """
    Reorders the columns of a DataFrame and removes period/ID helper columns.

    When a 'Marges' column is present, only rows whose 'Marges' value equals
    'Waarde' are kept (the 'Marges' column itself stays in the result).

    Parameters:
        df (DataFrame): The DataFrame to be filtered.

    Returns:
        DataFrame: The filtered DataFrame with 'Jaar', 'Kwartaal' and 'BedrijfskenmerkenSBI2008'
                   first (each only if present), followed by the remaining columns in their
                   original order, excluding 'Period', 'Perioden', 'ID' and 'Periode'.
    """
    leading_columns = ('Jaar', 'Kwartaal', 'BedrijfskenmerkenSBI2008')
    dropped_columns = ('Period', 'Perioden', 'ID', 'Periode')

    # Keep only the 'Waarde' rows when the table carries a 'Marges' column
    if 'Marges' in df.columns:
        df = df[df['Marges'] == 'Waarde']

    # Start with the leading columns that actually exist, in their fixed order
    ordered_columns = [name for name in leading_columns if name in df.columns]

    # Then add everything else in original order, skipping dropped and already-placed columns
    for name in df.columns:
        if name in ordered_columns or name in dropped_columns:
            continue
        ordered_columns.append(name)

    return df[ordered_columns]

def filter_industries(df):
    """
    Keeps only the rows whose 'BedrijfskenmerkenSBI2008' value is one of the listed industries.

    A column named 'BedrijfstakkenBranchesSBI2008' is first renamed to
    'BedrijfskenmerkenSBI2008' so both table layouts are handled the same way.

    Parameters:
        df (DataFrame): The DataFrame to be filtered.

    Returns:
        DataFrame: The rows of `df` (after the optional rename) that belong to the listed industries.
    """
    # Industries that are retained
    selected_industries = [
        'A Landbouw, bosbouw en visserij',
        'B Delfstoffenwinning',
        'C Industrie',
        'D Energievoorziening',
        'E Waterbedrijven en afvalbeheer',
        'F Bouwnijverheid',
        'G Handel',
        'H Vervoer en opslag',
        'I Horeca',
        'J Informatie en communicatie',
        'K Financiële dienstverlening',
        'L Verhuur en handel van onroerend goed',
        'M Specialistische zakelijke diensten',
        'N Verhuur en overige zakelijke diensten',
        'O Openbaar bestuur en overheidsdiensten',
        'P Onderwijs',
        'Q Gezondheids- en welzijnszorg',
        'R Cultuur, sport en recreatie',
        'S Overige dienstverlening'
    ]

    # Normalise the industry column name (rename returns a new frame)
    if 'BedrijfstakkenBranchesSBI2008' in df.columns:
        df = df.rename(columns={'BedrijfstakkenBranchesSBI2008': 'BedrijfskenmerkenSBI2008'})

    # Boolean mask of the rows to retain
    is_selected = df['BedrijfskenmerkenSBI2008'].isin(selected_industries)

    return df[is_selected]


def filter_period(df, start_year=2016, end_year=2021):
    """
    Drops the rows that have a NaN or NULL value in the 'Kwartaal' column and keeps only
    rows whose year lies between `start_year` and `end_year` (both inclusive).

    The defaults (2016-2021) are the bounds this function has always applied; the recorded
    notebook outputs (456 rows = 19 industries x 4 quarters x 6 years) correspond to them.

    Parameters:
        df (DataFrame): The DataFrame to be filtered. Must contain 'Kwartaal' and 'Jaar' columns.
        start_year (int): First year to keep. Default is 2016.
        end_year (int): Last year to keep. Default is 2021.

    Returns:
        DataFrame: A new DataFrame without rows that lack a quarter or fall outside the year range.
                   'Jaar' is numeric in the result; values that cannot be parsed are treated as
                   missing and therefore dropped by the range filter.
    """
    # Drop rows with NaN or NULL values in the 'Kwartaal' column (copy: the input stays untouched)
    filtered_df = df.dropna(subset=['Kwartaal']).copy()

    # Replace the column outright so its dtype really becomes numeric
    # (an in-place `.loc[:, 'Jaar'] = ...` write can leave an object-dtype column behind).
    filtered_df['Jaar'] = pd.to_numeric(filtered_df['Jaar'], errors='coerce')

    # Keep only rows inside the inclusive year range; NaN years compare as False
    in_range = filtered_df['Jaar'].between(start_year, end_year)

    return filtered_df[in_range]

def convert_yearly_to_quarterly(df):
    """
    Converts yearly data to quarterly data by duplicating each row four times
    and assigning quarters 1 to 4 to the 'Kwartaal' column for each duplicated row.

    Rows are repeated by position, so column dtypes are preserved and an empty input
    yields an empty frame that still has all columns (plus 'Kwartaal').

    Parameters:
        df (DataFrame): The DataFrame containing yearly data.

    Returns:
        DataFrame: A new DataFrame with four consecutive rows (quarters 1, 2, 3, 4) per input row.
                   Each copy keeps the index label of the row it was duplicated from. An existing
                   'Kwartaal' column is overwritten; otherwise it is added as the last column.
    """
    quarters = [1, 2, 3, 4]

    # Positional indices 0,0,0,0,1,1,1,1,... so each row appears once per quarter
    repeated_positions = [position for position in range(len(df)) for _ in quarters]

    # iloc avoids problems with duplicate index labels; copy so the input is not modified
    quarterly_df = df.iloc[repeated_positions].copy()

    # Quarter pattern 1,2,3,4 repeated for every original row
    quarterly_df['Kwartaal'] = quarters * len(df)

    return quarterly_df

def add_industry(df):
    """
    Adds industry names to a dataframe by duplicating each row for each industry name.

    Rows are repeated by position, so column dtypes are preserved and an empty input
    yields an empty frame that still has all columns (plus 'BedrijfskenmerkenSBI2008').

    Parameters:
        df (DataFrame): The DataFrame to which industry names will be added.

    Returns:
        DataFrame: A new DataFrame with one consecutive row per industry name for every input row.
                   Each copy keeps the index label of the row it was duplicated from. An existing
                   'BedrijfskenmerkenSBI2008' column is overwritten; otherwise it is added as the
                   last column.
    """
    # List of industry names
    industry_names = [
        'A Landbouw, bosbouw en visserij',
        'B Delfstoffenwinning',
        'C Industrie',
        'D Energievoorziening',
        'E Waterbedrijven en afvalbeheer',
        'F Bouwnijverheid',
        'G Handel',
        'H Vervoer en opslag',
        'I Horeca',
        'J Informatie en communicatie',
        'K Financiële dienstverlening',
        'L Verhuur en handel van onroerend goed',
        'M Specialistische zakelijke diensten',
        'N Verhuur en overige zakelijke diensten',
        'O Openbaar bestuur en overheidsdiensten',
        'P Onderwijs',
        'Q Gezondheids- en welzijnszorg',
        'R Cultuur, sport en recreatie',
        'S Overige dienstverlening'
    ]

    # Positional indices 0,0,...,1,1,... so each row appears once per industry
    repeated_positions = [position for position in range(len(df)) for _ in industry_names]

    # iloc avoids problems with duplicate index labels; copy so the input is not modified
    expanded_df = df.iloc[repeated_positions].copy()

    # Full industry list repeated for every original row
    expanded_df['BedrijfskenmerkenSBI2008'] = industry_names * len(df)

    return expanded_df

def append_dataframes(dataframes):
    """
    Stacks several DataFrames and collapses them to one row per Jaar / Kwartaal / BedrijfskenmerkenSBI2008.

    The frames are concatenated vertically; rows that share the same key are then merged
    by taking, for every other column, the first non-null value within the group. A key
    that occurs in only one of the inputs is therefore kept, with the columns of the
    other inputs left empty for it.

    Parameters:
        dataframes (list of DataFrames): The DataFrames to combine. Each needs the three key columns.

    Returns:
        DataFrame: One row per key combination, sorted by the key columns, with the key
                   columns first.
    """
    key_columns = ['Jaar', 'Kwartaal', 'BedrijfskenmerkenSBI2008']

    # Stack all inputs; columns missing from a frame are filled with NaN
    stacked = pd.concat(dataframes, ignore_index=True, sort=False)

    # Bring the numeric keys to a common numeric type so equal keys group together
    for numeric_key in ('Jaar', 'Kwartaal'):
        stacked[numeric_key] = pd.to_numeric(stacked[numeric_key], errors='coerce')

    # Rows with an incomplete key cannot be grouped
    stacked = stacked.dropna(subset=key_columns)

    # Coalesce rows that share a key: first non-null value per column
    combined = stacked.groupby(key_columns).first()

    return combined.reset_index()


In [56]:
# Fetch table 83156NED through cbsodata and run it through the preparation helpers:
# split the period, expand to four quarters per row, tidy columns, keep the listed
# industries, then restrict the period.
sustainable_employability_data = (
    pd.DataFrame(cbsodata.get_data('83156NED'))
    .pipe(split_period_column)
    .pipe(convert_yearly_to_quarterly)
    .pipe(filter_columns)
    .pipe(filter_industries)
    .pipe(filter_period)
)
print(sustainable_employability_data.shape)
sustainable_employability_data.head()

(456, 12)


Unnamed: 0,Jaar,Kwartaal,BedrijfskenmerkenSBI2008,Marges,TevredenMetArbeidsomstandigheden_1,TevredenMetWerk_2,LeeftijdWillenDoorwerken_3,LeeftijdInStaatDoorwerken_4,VoldoenAanFysiekeEisenWerk_5,VoldoenAanPsychischeEisenWerk_6,MakkelijkNieuweFunctieEigenWerkgever_7,MakkelijkNieuweBaanAndereWerkgever_8
32,2016,1,"A Landbouw, bosbouw en visserij",Waarde,72.8,75.9,61.7,60.3,90.0,91.6,47.9,63.2
32,2016,2,"A Landbouw, bosbouw en visserij",Waarde,72.8,75.9,61.7,60.3,90.0,91.6,47.9,63.2
32,2016,3,"A Landbouw, bosbouw en visserij",Waarde,72.8,75.9,61.7,60.3,90.0,91.6,47.9,63.2
32,2016,4,"A Landbouw, bosbouw en visserij",Waarde,72.8,75.9,61.7,60.3,90.0,91.6,47.9,63.2
33,2017,1,"A Landbouw, bosbouw en visserij",Waarde,73.9,76.9,61.6,61.8,87.3,91.4,50.8,64.4


In [57]:
# Fetch table 80072ned through cbsodata; no yearly-to-quarterly expansion is applied
# here, the quarter comes from the period split itself.
ziekteverzuim_data = (
    pd.DataFrame(cbsodata.get_data('80072ned'))
    .pipe(split_period_column)
    .pipe(filter_columns)
    .pipe(filter_industries)
    .pipe(filter_period)
)
print(ziekteverzuim_data.shape)
ziekteverzuim_data.head()

(456, 4)


Unnamed: 0,Jaar,Kwartaal,BedrijfskenmerkenSBI2008,Ziekteverzuimpercentage_1
241,2016,1.0,"A Landbouw, bosbouw en visserij",2.7
242,2016,2.0,"A Landbouw, bosbouw en visserij",2.5
243,2016,3.0,"A Landbouw, bosbouw en visserij",2.4
244,2016,4.0,"A Landbouw, bosbouw en visserij",2.5
246,2017,1.0,"A Landbouw, bosbouw en visserij",2.6


In [58]:
# Fetch table 81628NED through cbsodata; every row is expanded to four quarters and
# then repeated for each industry name before columns and period are filtered.
gezondheids_data = (
    pd.DataFrame(cbsodata.get_data('81628NED'))
    .pipe(split_period_column)
    .pipe(convert_yearly_to_quarterly)
    .pipe(add_industry)
    .pipe(filter_columns)
    .pipe(filter_period)
)
print(gezondheids_data.shape)
gezondheids_data.head()

(456, 109)


Unnamed: 0,Jaar,Kwartaal,BedrijfskenmerkenSBI2008,LevendgeborenenRelatief_1,LevendgeborenenUitTienermoeders_2,LevendgeborenenUit35Moeders_3,LevendgeborenenUit40Moeders_4,TotaalDoodsoorzaken_5,KwaadaardigeNieuwvormingen_6,ZiektenVanHartEnVaatstelsel_7,...,ZiekenhuisInclBuitenpolikliniek_97,Fysiotherapeut_98,Ziekenhuizen_99,k_86104GGZMetOvernachting_100,k_872087301Gehandicaptenzorg_101,VerpleegVerzorgingshuizenThuiszorg_102,Ziekenhuizen_103,k_86104GGZMetOvernachting_104,k_872087301Gehandicaptenzorg_105,VerpleegVerzorgingshuizenThuiszorg_106
15,2016,1,"A Landbouw, bosbouw en visserij",10.1,1492.0,37126.0,5860.0,87.5,26.6,22.7,...,4.7,,11.0,25.0,17.0,40.0,130.1,86.6,76.7,68.6
15,2016,1,B Delfstoffenwinning,10.1,1492.0,37126.0,5860.0,87.5,26.6,22.7,...,4.7,,11.0,25.0,17.0,40.0,130.1,86.6,76.7,68.6
15,2016,1,C Industrie,10.1,1492.0,37126.0,5860.0,87.5,26.6,22.7,...,4.7,,11.0,25.0,17.0,40.0,130.1,86.6,76.7,68.6
15,2016,1,D Energievoorziening,10.1,1492.0,37126.0,5860.0,87.5,26.6,22.7,...,4.7,,11.0,25.0,17.0,40.0,130.1,86.6,76.7,68.6
15,2016,1,E Waterbedrijven en afvalbeheer,10.1,1492.0,37126.0,5860.0,87.5,26.6,22.7,...,4.7,,11.0,25.0,17.0,40.0,130.1,86.6,76.7,68.6


In [61]:
# Combine the two prepared frames into one row per Jaar / Kwartaal / BedrijfskenmerkenSBI2008.
# NOTE(review): gezondheids_data is prepared in an earlier cell but is not part of this
# list — confirm that leaving it out is intended.
dataframes = [ziekteverzuim_data, sustainable_employability_data]
concatenated_data = append_dataframes(dataframes)
print(concatenated_data.shape)
concatenated_data.head()

(456, 13)


Unnamed: 0,Jaar,Kwartaal,BedrijfskenmerkenSBI2008,Ziekteverzuimpercentage_1,Marges,TevredenMetArbeidsomstandigheden_1,TevredenMetWerk_2,LeeftijdWillenDoorwerken_3,LeeftijdInStaatDoorwerken_4,VoldoenAanFysiekeEisenWerk_5,VoldoenAanPsychischeEisenWerk_6,MakkelijkNieuweFunctieEigenWerkgever_7,MakkelijkNieuweBaanAndereWerkgever_8
0,2016,1.0,"A Landbouw, bosbouw en visserij",2.7,Waarde,72.8,75.9,61.7,60.3,90.0,91.6,47.9,63.2
1,2016,1.0,B Delfstoffenwinning,3.6,Waarde,84.6,84.3,63.0,64.2,94.1,95.2,57.9,68.8
2,2016,1.0,C Industrie,5.5,Waarde,71.0,73.7,63.3,62.8,88.0,90.1,48.1,53.7
3,2016,1.0,D Energievoorziening,4.9,Waarde,79.4,75.9,62.5,64.0,93.1,94.4,59.0,59.6
4,2016,1.0,E Waterbedrijven en afvalbeheer,5.5,Waarde,73.1,74.2,63.9,62.7,89.6,90.3,47.2,52.8


In [62]:
# Persist the combined frame as a pickle file; the relative path resolves against the
# current working directory.
concatenated_data.to_pickle('concatenated_data.pkl')