This notebook creates dataframes from the CSV file `belastingen.csv`, which can be downloaded from https://doi.org/10.6084/m9.figshare.27221169.v1 <br>
The column selections and column renamings are read from JSON files that the notebook fetches from the STRO 2.0 GitHub repository. <br>

In [None]:
import requests
import pandas as pd
import json

In [None]:
# Path to the local, downloaded copy of belastingen.csv. It is read twice by
# pd.read_csv further down, once per dataframe.
# Update this path if the file lives somewhere else on your machine.
# NOTE(review): the path has a folder that is itself named 'belastingen.csv'
# - presumably the extracted download folder; confirm.
belastingen = r"C:\STRO10\belastingen.csv\belastingen.csv"

# 1. Preparation

In [None]:
# Coordinates of the STRO 2.0 GitHub repository and of the two folders in it
# that hold the helper files: one with the column selections, one with the
# column renamings.
owner = 'dhofu'
repo = 'stro20'
column_selection = 'STRO_20_column_selection'
rename_columns = 'STRO_20_column_names'

# Both folders are listed through the same GitHub API "contents" route, so
# the two URLs are built from one shared template.
contents_endpoint = "https://api.github.com/repos/{}/{}/contents/{}"
url_selection = contents_endpoint.format(owner, repo, column_selection)
url_rename = contents_endpoint.format(owner, repo, rename_columns)

In [None]:
# Request the GitHub listings of the two helper folders.
# The timeout keeps the notebook from hanging forever on a stalled
# connection, and raise_for_status() reports HTTP errors (for example rate
# limiting or a wrong folder name) here, instead of as confusing failures in
# the cells that parse the listings.
response_column_selection = requests.get(url_selection, timeout=30)
response_column_selection.raise_for_status()

response_rename_columns = requests.get(url_rename, timeout=30)
response_rename_columns.raise_for_status()

In [None]:
# Fetch the files that list which columns to read from belastingen.csv.
# Files in the folder listing are matched by name: 'belastingen_<key>.<ext>',
# where <key> must be one of the keys of selection_mappings below.
selection_mappings = {
    'usecols_taxes': None,
    'usecols_taxes_remarks': None
}

# Fail early on an HTTP error. On errors the GitHub API answers with a JSON
# object instead of a list, which would otherwise surface as an opaque
# TypeError inside the loop.
response_column_selection.raise_for_status()
selection_listing = response_column_selection.json()
if not isinstance(selection_listing, list):
    raise ValueError(
        f"Expected a folder listing from {response_column_selection.url}, "
        f"got: {selection_listing!r}"
    )

for entry in selection_listing:
    name = entry['name']
    if not name.startswith('belastingen_'):
        continue
    # 'belastingen_usecols_taxes.json' -> 'usecols_taxes'
    key = name.split('_', 1)[1].split('.', 1)[0]
    if key in selection_mappings:
        download = requests.get(entry['download_url'], timeout=30)
        download.raise_for_status()
        selection_mappings[key] = download.json()

# A mapping that is still None would be passed to read_csv as usecols=None,
# which silently loads ALL columns; stop here with a clear message instead.
missing_selections = [k for k, v in selection_mappings.items() if v is None]
if missing_selections:
    raise ValueError(
        f"Column selection file(s) not found in the repository: {missing_selections}"
    )

usecols_taxes = selection_mappings['usecols_taxes']
usecols_taxes_remarks = selection_mappings['usecols_taxes_remarks']

In [None]:
# Fetch the files that map original column names to their new names.
# Files in the folder listing are matched by name: 'belastingen_<key>.<ext>',
# where <key> must be one of the keys of rename_mappings below.
rename_mappings = {
    'rename_taxes': None,
    'rename_remarks_taxes': None
}

# Fail early on an HTTP error. On errors the GitHub API answers with a JSON
# object instead of a list, which would otherwise surface as an opaque
# TypeError inside the loop.
response_rename_columns.raise_for_status()
rename_listing = response_rename_columns.json()
if not isinstance(rename_listing, list):
    raise ValueError(
        f"Expected a folder listing from {response_rename_columns.url}, "
        f"got: {rename_listing!r}"
    )

for entry in rename_listing:
    name = entry['name']
    if not name.startswith('belastingen_'):
        continue
    # 'belastingen_rename_taxes.json' -> 'rename_taxes'
    key = name.split('_', 1)[1].split('.', 1)[0]
    if key in rename_mappings:
        download = requests.get(entry['download_url'], timeout=30)
        download.raise_for_status()
        rename_mappings[key] = download.json()

# A mapping that is still None would only fail later, inside
# DataFrame.rename, with an unrelated-looking error; report it here instead.
missing_renames = [k for k, v in rename_mappings.items() if v is None]
if missing_renames:
    raise ValueError(
        f"Column rename file(s) not found in the repository: {missing_renames}"
    )

rename_taxes = rename_mappings['rename_taxes']
rename_remarks_taxes = rename_mappings['rename_remarks_taxes']

# 2. Create Entities

## 2.1. Create dataframes

In [None]:
# Read only the columns listed in usecols_taxes from belastingen.csv and
# rename them according to rename_taxes.
df_taxes = (
    pd.read_csv(
        belastingen,
        sep=",",
        quotechar='"',
        usecols=usecols_taxes,
        encoding="utf-8",
        low_memory=False,
    )
    .rename(columns=rename_taxes)
    .reset_index(drop=True)
)

In [None]:
# Read only the columns listed in usecols_taxes_remarks from belastingen.csv
# and rename them according to rename_remarks_taxes.
df_remarks_taxes = (
    pd.read_csv(
        belastingen,
        sep=",",
        quotechar='"',
        usecols=usecols_taxes_remarks,
        encoding="utf-8",
        low_memory=False,
    )
    .rename(columns=rename_remarks_taxes)
    .reset_index(drop=True)
)

In [None]:
# Remove, in place, every row that has a missing value in any column.
# The index is not reset, so the surviving rows keep their original labels.
# NOTE(review): a row is dropped when ANY selected column is empty, not only
# the remark text itself - confirm this is the intended filter.
df_remarks_taxes.dropna(axis=0, how='any', inplace=True)

## 2.2. Refactor dataframes

In [None]:
# Reshape the three currency columns from wide to long format: each source
# row yields three rows, with the original column name in 'variable' and the
# cell content in 'value'. The identifying columns are repeated on every row.
df_taxes_cry = pd.melt(
    df_taxes,
    id_vars=['ce_id', 'row', 'taxAnnotation', 'discount'],
    value_vars=['currency1', 'currency2', 'currency3'],
)

In [None]:
# Reshape the three quantity columns from wide to long format: each source
# row yields three rows, with the original column name in 'variable' and the
# cell content in 'value'. The identifying columns are repeated on every row.
df_taxes_qty = pd.melt(
    df_taxes,
    id_vars=['ce_id', 'row', 'taxAnnotation', 'discount'],
    value_vars=['quantity1', 'quantity2', 'quantity3'],
)

In [None]:
# Put the melted currency and quantity frames side by side, matching rows on
# their index. Both frames were melted from df_taxes in the same way, so row
# i of each refers to the same source row and the same slot (1, 2 or 3).
# Column names present in both frames get the suffixes '_x' (currency side)
# and '_y' (quantity side).
melted_df_taxes = pd.merge(
    df_taxes_cry,
    df_taxes_qty,
    how='left',
    left_index=True,
    right_index=True,
    suffixes=('_x', '_y'),
)

In [None]:
# Discard, in place, the identifying columns duplicated by the merge (the
# '_y' copies) and the two 'variable' columns that only hold the original
# column names.
melted_df_taxes.drop(
    labels=[
        'ce_id_y',
        'row_y',
        'taxAnnotation_y',
        'discount_y',
        'variable_x',
        'variable_y',
    ],
    axis=1,
    inplace=True,
)

In [None]:
# Strip the merge suffixes from the identifying columns and give the two
# melted value columns meaningful names; done in place.
melted_df_taxes.rename(
    columns={
        'ce_id_x': 'ce_id',
        'row_x': 'row',
        'taxAnnotation_x': 'taxAnnotation',
        'discount_x': 'discount',
        'value_x': 'currency',
        'value_y': 'quantity',
    },
    inplace=True,
)

In [None]:
# melted_df_taxes

In [None]:
# check to see if anything could be dropped. This is not the case.
# melted_df_taxes.dropna(how='all')

# 3. Save dataframes to CSV

In [None]:
# Write the melted tax entries to a semicolon-separated, UTF-8 encoded CSV.
# The dataframe index is written as the first column, labelled 'index'.
melted_df_taxes.to_csv(
    r"C:\STRO_HUYGENS\STRO20\taxes_entry.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)

In [None]:
# Write the tax remarks to a semicolon-separated, UTF-8 encoded CSV.
# The dataframe index is written as the first column, labelled 'index'.
df_remarks_taxes.to_csv(
    r"C:\STRO_HUYGENS\STRO20\remarks_taxes.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)