## Importing libraries

In [1]:
import pandas as pd
import requests 
import cbsodata
import re
# from sklearn.linear_model import LogisticRegression


## CBS table selection

In [3]:
# Define the desired frequencies
desired_frequencies = [
    "Fourtimesayear", "Monthly", "Threemonthly", "Quarterly", "Viermaalperjaar",
    "Perkwartaal", "Perweek", "Stopgezet", "Permaand", "Discontinued",
    "Perdriemaanden", "Weekly"
]

# Fetch the list of tables
tables = cbsodata.get_table_list()

# Function to extract the first and last four-digit numbers from a period string
def extract_years(period):
    # Find all sequences of four digits
    years = re.findall(r'\b\d{4}\b', period)
    if len(years) >= 2:
        return int(years[0]), int(years[-1])
    elif len(years) == 1:
        return int(years[0]), int(years[0])
    else:
        return None, None

# List to store the identifiers, short titles, and languages
filtered_data = []

# Filter tables and extract identifiers, short titles, and languages
for table in tables:
    if table.get('Frequency') in desired_frequencies:
        period = table.get('Period', 'N/A')
        start_year, end_year = extract_years(period)
        if start_year is not None and end_year is not None:
            if start_year <= 2012 and end_year >= 2022:
                short_title = table.get('ShortTitle', 'N/A')
                # Filter out rows containing 'Caribisch' or 'Bonaire' in ShortTitle
                if 'Caribisch' not in short_title and 'Bonaire' not in short_title:
                    identifier = table.get('Identifier', 'N/A')
                    language = table.get('Language', 'N/A')
                    if language == 'nl':  # Only include rows where Language is 'nl'
                        filtered_data.append((identifier, short_title, language))

# Print the filtered data
print("Filtered identifiers, short titles, and languages:")
for identifier, short_title, language in filtered_data:
    print(f"Identifier: {identifier}, ShortTitle: {short_title}, Language: {language}")


Filtered identifiers, short titles, and languages:
Identifier: 83599NED, ShortTitle: Openstaande vacatures; SBI 2008, regio, Language: nl
Identifier: 80472ned, ShortTitle: Vacatures; SBI 2008; totaal, Language: nl
Identifier: 80473ned, ShortTitle: Vacatures;SBI2008;particuliere bedrijven, Language: nl
Identifier: 80474ned, ShortTitle: Vacatures; seizoengecorrigeerd, SBI2008, Language: nl
Identifier: 84545NED, ShortTitle: Vacatures; stromen, seizoengecorrigeerd, Language: nl
Identifier: 80567ned, ShortTitle: Vacatures; vacaturegraad naar SBI 2008, Language: nl
Identifier: 80857ned, ShortTitle: Vacatures; overheid en onderwijs, Language: nl
Identifier: 84166NED, ShortTitle: Arbeid; kwartalen, nationale rekeningen, Language: nl
Identifier: 84310NED, ShortTitle: Arbeidsvolume; kwartalen, geslacht, nr, Language: nl
Identifier: 83451NED, ShortTitle: Werkgelegenheid; banen, lonen per maand, Language: nl
Identifier: 82848NED, ShortTitle: Arbeidsdeelname; wisselingen per maand, Language: nl
Ide

## Merging tables

In [62]:
import pandas as pd

def split_period_column(df, perioden_column_name='Perioden', year_column_name='Jaar', period_column_name='Periode', quarter_column_name='Kwartaal'):
    """
    Splits the 'Perioden' column of a DataFrame into 'Jaar' (year), 'Periode' (remaining part), and 'Kwartaal' (quarter).

    Parameters:
        df (DataFrame): The DataFrame containing the 'Perioden' column.
        perioden_column_name (str): The name of the column containing the period data. Default is 'Perioden'.
        year_column_name (str): The name of the column to store the extracted years. Default is 'Jaar'.
        period_column_name (str): The name of the column to store the remaining part of the period. Default is 'Periode'.
        quarter_column_name (str): The name of the column to store the extracted quarter. Default is 'Kwartaal'.

    Returns:
        DataFrame: The DataFrame with the 'Perioden' column split into 'Jaar', 'Periode', and 'Kwartaal'.
    """
    # Copy the original DataFrame to avoid modifying the original
    df_copy = df.copy()

    # Split the 'Perioden' column
    df_copy[[year_column_name, period_column_name]] = df_copy[perioden_column_name].str.extract(r'(\d{4})\s?(.*)')

    # Extract the quarter information
    df_copy[quarter_column_name] = df_copy[period_column_name].apply(lambda x: extract_quarter(x))

    return df_copy

def extract_quarter(period_part):
    """
    Extracts the quarter information from the remaining part of the period.
    """
    if any(month in period_part for month in ['januari', 'februari', 'maart']):
        return 1
    elif any(month in period_part for month in ['april', 'mei', 'juni']):
        return 2
    elif any(month in period_part for month in ['juli', 'augustus', 'september']):
        return 3
    elif any(month in period_part for month in ['oktober', 'november', 'december']):
        return 4
    else:
        for char in period_part:
            if char.isdigit():
                return int(char)
        return None

def convert_monthly_to_quarterly(df, category_column='BedrijfstakkenBranchesSBI2008', year_column_name='Jaar', quarter_column_name='Kwartaal', value_column='ProducentenprijsindexPPI_1'):
    """
    Converts monthly data to quarterly data by averaging the values for each quarter.

    Parameters:
        df (DataFrame): The DataFrame containing the data.
        category_column (str): The name of the column containing the category data. Default is 'BedrijfstakkenBranchesSBI2008'.
        year_column_name (str): The name of the column containing the year data. Default is 'Jaar'.
        quarter_column_name (str): The name of the column containing the quarter data. Default is 'Kwartaal'.
        value_column (str): The name of the column containing the values to be averaged. Default is 'ProducentenprijsindexPPI_1'.

    Returns:
        DataFrame: The DataFrame with monthly data converted to quarterly data.
    """
    # Ensure the quarter column is numeric
    df[quarter_column_name] = pd.to_numeric(df[quarter_column_name], errors='coerce')

    # Group by category, year, and quarter, then average the values
    quarterly_df = df.groupby([category_column, year_column_name, quarter_column_name])[value_column].mean().reset_index()
    
    return quarterly_df


In [36]:
ziekteverzuim_data = pd.DataFrame(cbsodata.get_data('80072ned'))
ziekteverzuim_data = split_period_column(ziekteverzuim_data)
ziekteverzuim_data.head()

Unnamed: 0,ID,BedrijfskenmerkenSBI2008,Perioden,Ziekteverzuimpercentage_1,Jaar,Periode,Kwartaal
0,0,A-U Alle economische activiteiten,1996 1e kwartaal,5.5,1996,1e kwartaal,1.0
1,1,A-U Alle economische activiteiten,1996 2e kwartaal,4.6,1996,2e kwartaal,2.0
2,2,A-U Alle economische activiteiten,1996 3e kwartaal,4.0,1996,3e kwartaal,3.0
3,3,A-U Alle economische activiteiten,1996 4e kwartaal,4.7,1996,4e kwartaal,4.0
4,4,A-U Alle economische activiteiten,1996,4.7,1996,,


In [56]:
ziekteverzuim_data_kwartaal = ziekteverzuim_data[['BedrijfskenmerkenSBI2008', 'Jaar', 'Kwartaal', 'Ziekteverzuimpercentage_1']]
ziekteverzuim_data_kwartaal.head()

Unnamed: 0,BedrijfskenmerkenSBI2008,Jaar,Kwartaal,Ziekteverzuimpercentage_1
0,A-U Alle economische activiteiten,1996,1.0,5.5
1,A-U Alle economische activiteiten,1996,2.0,4.6
2,A-U Alle economische activiteiten,1996,3.0,4.0
3,A-U Alle economische activiteiten,1996,4.0,4.7
4,A-U Alle economische activiteiten,1996,,4.7


In [63]:
# inflatie_data = pd.DataFrame(cbsodata.get_data('83131NED'))
inflatie_data = pd.DataFrame(cbsodata.get_data('83936NED'))
inflatie_data = split_period_column(inflatie_data)
inflatie_data.head()

Unnamed: 0,ID,Afzet,BedrijfstakkenBranchesSBI2008,Perioden,ProducentenprijsindexPPI_1,MaandmutatiePPI_2,JaarmutatiePPI_3,Wegingscoefficient_4,Jaar,Periode,Kwartaal
0,0,Totaal afzetprijzen,B Delfstoffenwinning,2012 januari,121.2,,,16741.0,2012,januari,1.0
1,1,Totaal afzetprijzen,B Delfstoffenwinning,2012 februari,121.7,0.4,,16741.0,2012,februari,1.0
2,2,Totaal afzetprijzen,B Delfstoffenwinning,2012 maart,122.5,0.6,,16741.0,2012,maart,1.0
3,3,Totaal afzetprijzen,B Delfstoffenwinning,2012 april,125.4,2.4,,16741.0,2012,april,2.0
4,4,Totaal afzetprijzen,B Delfstoffenwinning,2012 mei,126.8,1.1,,16741.0,2012,mei,2.0


In [64]:
inflatie_data_kwartaal = convert_monthly_to_quarterly(inflatie_data)
inflatie_data_kwartaal.head()

Unnamed: 0,BedrijfstakkenBranchesSBI2008,Jaar,Kwartaal,ProducentenprijsindexPPI_1
0,06 Winning van aardolie en aardgas,2012,1.0,130.8
1,06 Winning van aardolie en aardgas,2012,2.0,132.2
2,06 Winning van aardolie en aardgas,2012,3.0,130.966667
3,06 Winning van aardolie en aardgas,2012,4.0,139.366667
4,06 Winning van aardolie en aardgas,2013,1.0,132.833333


In [67]:
# unique_values_ziekteverzuim = ziekteverzuim_data_kwartaal['BedrijfskenmerkenSBI2008'].unique()
unique_values_inflatie = inflatie_data_kwartaal['BedrijfstakkenBranchesSBI2008'].unique()

# print(unique_values_ziekteverzuim)
print(unique_values_inflatie)

['06 Winning van aardolie en aardgas'
 '08 Delfstoffenwinning (geen olie en gas)' '10 Voedingsmiddelenindustrie'
 '10-12 Voedings-, genotmiddelenindustrie'
 '101 Slachterijen en vleeswarenindustrie'
 '1011 Slachterijen (geen pluimvee)' '1012 Pluimveeslachterijen'
 '1013 Vleesverwerkende industrie' '102 Visverwerkende industrie'
 '103 Groente-, fruitverwerkende industrie'
 '1031 Aardappelproductenindustrie'
 '1032 Fruit- en groentesappenindustrie'
 '1039 Groente-,fruitverwerkende industrie'
 '104 Spijsoliën- en -vettenindustrie e.d.'
 '1041 Spijsoliën- en -vettenindustrie' '105 Zuivelindustrie'
 '1051 Zuivelindustrie (geen ijs)' '106 Meelindustrie'
 '1061 Meelindustrie (geen zetmeel)' '107 Brood- en deegwarenindustrie'
 '1071 Brood- en banketbakkerijen' '108 Overige voedingsmiddelenindustrie'
 '1081 Suikerindustrie' '1082 Cacao- en chocoladewerkindustrie'
 '109 Diervoederindustrie' '11 Drankenindustrie'
 '1102 Wijnindustrie (uit druiven)' '1107 Frisdrankindustrie'
 '12 Tabaksindustrie' 

## Appendices

### Old ingestion method

In [29]:
# Define the API endpoint
api_url = "https://opendata.cbs.nl/ODataApi/odata/80072ned/TypedDataSet" # Sick leave data
# Function to get data from the API with pagination handling
def get_data(url):
    data = []
    while url:
        response = requests.get(url)
        if response.status_code == 200:
            json_response = response.json()
            data.extend(json_response['value'])
            url = json_response.get('@odata.nextLink', None)
        else:
            print(f"Failed to fetch data: {response.status_code}")
            break
    return data

# Fetch the dataset
data = get_data(api_url)

# Convert the data to a pandas DataFrame
if data:
    df = pd.DataFrame(data)
    print("Data fetched and converted to DataFrame successfully")
else:
    print("Failed to fetch or convert data")

# Save the DataFrame to a CSV file
df.to_csv('sick_leave_data.csv', index=False)
print("Data saved to sick_leave_data.csv")

Data fetched and converted to DataFrame successfully
Data saved to sick_leave_data.csv


In [30]:
# Explore the DataFrame
# View the first few rows
print(df.head())


# Get summary information
print(df.info())

# Get a statistical summary
print(df.describe())

# View column names
print(df.columns)

# Check for missing values
print(df.isnull().sum())

# Check unique object types
print(df.select_dtypes(include='object').nunique())


   ID BedrijfskenmerkenSBI2008  Perioden  Ziekteverzuimpercentage_1
0   0                  T001081  1996KW01                        5.5
1   1                  T001081  1996KW02                        4.6
2   2                  T001081  1996KW03                        4.0
3   3                  T001081  1996KW04                        4.7
4   4                  T001081  1996JJ00                        4.7
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5499 entries, 0 to 5498
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         5499 non-null   int64  
 1   BedrijfskenmerkenSBI2008   5499 non-null   object 
 2   Perioden                   5499 non-null   object 
 3   Ziekteverzuimpercentage_1  5189 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 172.0+ KB
None
                ID  Ziekteverzuimpercentage_1
count  5499.000000                5

### Inspect columns of the list of tables that is retrieved via the obsodata package

In [10]:
# Fetch the list of tables
tables = cbsodata.get_table_list()

# Initialize sets for unique frequencies and unique periods
unique_frequencies = set()
unique_periods = set()
unique_languages = set()

# Loop through the tables and collect unique frequencies and periods
for table in tables:
    frequency = table.get('Frequency', 'N/A')
    period = table.get('Period', 'N/A')
    language = table.get('Language','N/A')
    unique_frequencies.add(frequency)
    unique_periods.add(period)
    unique_languages.add(language)

# Convert sets to lists for easier manipulation (optional)
unique_frequencies = list(unique_frequencies)
unique_periods = list(unique_periods)
unique_languages = list(unique_languages)

# # Print unique frequencies
print("Unique frequencies:")
for frequency in unique_frequencies:
    print(frequency)

# Print unique periods
print("\nUnique periods:")
for period in unique_periods:
    print(period)

# Print unique periods
print("\nUnique languages:")
for language in unique_languages:
    print(language)    

Unique frequencies:
Perjaar
Quarterly
Monthly
Viermaalperjaar
Discontinued
Pertweejaar
Tweemaalperjaar
Yearly
Sixmonthly
Fouryearly
Perkwartaal
Fourtimesayear
Perdriejaar
Permaand
Weekly
Twoyearly
Twiceyearly
Pervierjaar
Threetimesayear
Threeyearly
Perdriemaanden
Pervijfjaar
Threemonthly
Perhalfjaar
Irregularly
Perweek
Driemaalperjaar
Fiveyearly
Stopgezet
Eenmalig
Onregelmatig
Onceonly

Unique periods:
1981, 1985, 1990-2022
2009-2016
2012-2017
1 januari 2008
1995, 1998, 2000, 2004
January 2000 - December 2012  2000-2012
2005 - 2015
2006-2008
2008-2012
januari 1999-juni 2023
2001-2015
1995-2011
2016 - 2022
2011 - 2018
1995-2010
1996 1e kwartaal - 2023 4e kwartaal
2009 - 2023
1992, 1995 en 1998
2011 3e kwartaal - 2022 3e kwartaal.
2013-2020
1993 januari - 2012 december
1918 - 2006
2000 - 2004, 2006 - 2008
2002 - 2012
2013 kwartaal 1 - 2024 kwartaal 1
2014 - 2022
2007 - 2024; 2007 1e kwartaal - 2024 1e kwartaal
1998 - 2007, 1st quarter 1998 - 4th quarter 2007
2004/'05 - 2008/'09
1994-2010