This notebook creates dataframes based on the CSV file ladingen.csv downloaded from www.soundtoll.nl <br>
It fetches JSON files containing details about column selection and renaming from the STRO 2.0 GitHub repository. <br>

In [None]:
import requests
import pandas as pd
import numpy as np
import json

In [None]:
# Path to the file ladingen.csv from the legacy data downloaded previously
# (see Jupyter Notebook STRO20_0_sparseDensity_STRO10.ipynb).
# If you have not downloaded the file before, you can find it here: https://doi.org/10.6084/m9.figshare.27221169.v2
# Replace the placeholder with your local file path before running the notebook;
# every pd.read_csv call further down reads from this path.
ladingen = r"INSERT_FILEPATH"

# 1. Preparation

In [None]:
# Location of the helper files in the STRO 2.0 GitHub repository.
# Both folders are addressed through the GitHub "contents" API of the same repository.
owner, repo = 'dhofu', 'stro20'
column_selection = 'STRO_20_column_selection'
rename_columns = 'STRO_20_column_names'

contents_api = f"https://api.github.com/repos/{owner}/{repo}/contents"
url_selection = f"{contents_api}/{column_selection}"
url_rename = f"{contents_api}/{rename_columns}"

In [None]:
# Request the directory listings of both repository folders.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() turns an HTTP error (wrong path, exceeded API rate limit, ...) into an
# immediate exception instead of passing an error payload on to the cells below.
response_column_selection = requests.get(url_selection, timeout=30)
response_column_selection.raise_for_status()
response_rename_columns = requests.get(url_rename, timeout=30)
response_rename_columns.raise_for_status()

In [None]:
# fetch the files for selecting columns from the 'ladingen' dataframe
# Each key is an expected mapping name; its value is filled with the parsed JSON content of
# the repository file whose name is 'ladingen_<key>' followed by a file extension.
selection_mappings = {
    'usecols_cargo': None,
    'usecols_departure': None,
    'usecols_destination': None,
    'usecols_remarks_cargo': None
}

selection_listing = response_column_selection.json()
# A directory listing is a JSON array; anything else (e.g. an error object) cannot be iterated
# as file entries and would otherwise fail below with an unhelpful TypeError.
if not isinstance(selection_listing, list):
    raise RuntimeError(f"Unexpected response for the column selection listing: {selection_listing}")

for file in selection_listing:
    if file['name'].startswith('ladingen_'):
        # strip the 'ladingen_' prefix and the file extension to obtain the mapping name
        key = file['name'].split('_', 1)[1].split('.', 1)[0]
        if key in selection_mappings:
            download = requests.get(file['download_url'], timeout=30)
            download.raise_for_status()
            selection_mappings[key] = download.json()

# A mapping left at None would make pd.read_csv(usecols=None) silently load every column.
missing_selection = [name for name, value in selection_mappings.items() if value is None]
if missing_selection:
    raise RuntimeError(f"Column selection file(s) not found in the repository: {missing_selection}")

usecols_cargo = selection_mappings['usecols_cargo']
usecols_departure = selection_mappings['usecols_departure']
usecols_destination = selection_mappings['usecols_destination']
usecols_remarks_cargo = selection_mappings['usecols_remarks_cargo']

In [None]:
# fetch the files for renaming columns from the 'ladingen' dataframe
# Each key is an expected mapping name; its value is filled with the parsed JSON content of
# the repository file whose name is 'ladingen_<key>' followed by a file extension.
rename_mappings = {
    'rename_cargo': None,
    'rename_departure': None,
    'rename_destination': None,
    'rename_remarks_cargo': None
}

rename_listing = response_rename_columns.json()
# A directory listing is a JSON array; anything else (e.g. an error object) cannot be iterated
# as file entries and would otherwise fail below with an unhelpful TypeError.
if not isinstance(rename_listing, list):
    raise RuntimeError(f"Unexpected response for the column names listing: {rename_listing}")

for file in rename_listing:
    if file['name'].startswith('ladingen_'):
        # strip the 'ladingen_' prefix and the file extension to obtain the mapping name
        key = file['name'].split('_', 1)[1].split('.', 1)[0]
        if key in rename_mappings:
            download = requests.get(file['download_url'], timeout=30)
            download.raise_for_status()
            rename_mappings[key] = download.json()

# Fail here with a clear message rather than later inside DataFrame.rename(columns=None).
missing_rename = [name for name, value in rename_mappings.items() if value is None]
if missing_rename:
    raise RuntimeError(f"Column renaming file(s) not found in the repository: {missing_rename}")

rename_cargo = rename_mappings['rename_cargo']
rename_departure = rename_mappings['rename_departure']
rename_destination = rename_mappings['rename_destination']
rename_remarks_cargo = rename_mappings['rename_remarks_cargo']

# 2. Create entities

## 2.1. Create dataframes

The process consists of two steps. In the first step, dataframes are created for cargoes, departure, destination and remarks about the cargoes. <br>
The cargoes dataframe will be elaborated further in the second step. <br>

In [None]:
# Temporary cargoes dataframe; it is elaborated in the following section.
# The row position of each record is kept as the column 'cargo_id'.
df_cargoes = (
    pd.read_csv(
        ladingen,
        sep=",",
        quotechar='"',
        usecols=usecols_cargo,
        encoding="utf-8",
        low_memory=False,
    )
    .rename(columns=rename_cargo)
    .reset_index()
    .rename(columns={'index': 'cargo_id'})
)

In [None]:
# Departure columns of 'ladingen', renamed according to the fetched mapping.
df_departure = (
    pd.read_csv(
        ladingen,
        sep=",",
        quotechar='"',
        usecols=usecols_departure,
        encoding="utf-8",
        low_memory=False,
    )
    .rename(columns=rename_departure)
    .reset_index(drop=True)
)

In [None]:
# Destination columns of 'ladingen', renamed according to the fetched mapping.
df_destination = (
    pd.read_csv(
        ladingen,
        sep=",",
        quotechar='"',
        usecols=usecols_destination,
        encoding="utf-8",
        low_memory=False,
    )
    .rename(columns=rename_destination)
    .reset_index(drop=True)
)

In [None]:
# Cargo remark columns of 'ladingen', renamed according to the fetched mapping.
df_remarks_cargo = (
    pd.read_csv(
        ladingen,
        sep=",",
        quotechar='"',
        usecols=usecols_remarks_cargo,
        encoding="utf-8",
        low_memory=False,
    )
    .rename(columns=rename_remarks_cargo)
    .reset_index(drop=True)
)

## 2.2. Elaborate cargoes dataframe

A new ID is added to the cargoes dataframe to facilitate processing data related to registered cargoes.<br>
The cargoes dataframe is split into dataframes that contain details about the registration, taxation and measurement of each cargo item. <br>
The measurement includes an elaboration of alternative units of measure, if they are available in the data.<br>

In [None]:
# df_cargoes

### 2.2.1. Refactor Measurement of Cargoes

In [None]:
# Columns describing the measurement of each cargo item.
# .copy() makes this an independent dataframe, so the 'alternative' column added afterwards
# is not written into a slice of df_cargoes (which triggers pandas' SettingWithCopyWarning).
df_measurement = df_cargoes[
    ['cargo_id', 'ce_id', 'unitOfMeasure', 'quantity', 'alternativeUnitOfMeasure', 'alternativeQuantity']
].copy()

In [None]:
# Add a boolean column that is True when an alternative unit of measure is available.
# notnull() already yields booleans, so no np.where wrapper is needed. assign() returns a new
# dataframe instead of writing into df_measurement in place, which would raise a
# SettingWithCopyWarning when df_measurement is a slice of df_cargoes.
df_measurement = df_measurement.assign(
    alternative=df_measurement['alternativeUnitOfMeasure'].notnull()
)

In [None]:
# Long format of the units: the regular and the alternative unit become separate rows.
df_measurement_msr = df_measurement.melt(
    id_vars=['cargo_id', 'ce_id', 'alternative'],
    value_vars=['unitOfMeasure', 'alternativeUnitOfMeasure'],
)

In [None]:
# Long format of the quantities: the regular and the alternative quantity become separate rows.
df_measurement_qty = df_measurement.melt(
    id_vars=['cargo_id', 'ce_id', 'alternative'],
    value_vars=['quantity', 'alternativeQuantity'],
)

In [None]:
# Pair every unit with its quantity by row index. Both frames were melted from df_measurement
# with the same id_vars, so rows with the same index belong to the same cargo item.
melted_df_measurement = pd.merge(
    df_measurement_msr,
    df_measurement_qty,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Remove the melt 'variable' columns and the id columns duplicated by the merge.
# Reassignment is used instead of inplace=True (recommended pandas idiom).
melted_df_measurement = melted_df_measurement.drop(
    columns=['variable_x', 'ce_id_y', 'cargo_id_y', 'alternative_y', 'variable_y']
)

In [None]:
# Strip the merge suffixes and give the two value columns meaningful names.
# Reassignment is used instead of inplace=True (recommended pandas idiom).
melted_df_measurement = melted_df_measurement.rename(
    columns={
        'cargo_id_x': 'cargo_id',
        'ce_id_x': 'ce_id',
        'alternative_x': 'alternative',
        'value_x': 'measure',
        'value_y': 'quantity',
    }
)

In [None]:
# Remove every row that has a missing value in any column, e.g. the rows created by the melt
# for cargo items without an alternative measurement.
# Reassignment is used instead of inplace=True (recommended pandas idiom).
melted_df_measurement = melted_df_measurement.dropna()

In [None]:
# Spot check of the melted dataframe: show all measurement rows of one cargo item.
is_checked_cargo = melted_df_measurement['cargo_id'].eq(5568718)
melted_df_measurement.loc[is_checked_cargo]

### 2.2.2. Refactor Taxation of Cargoes

In [None]:
# Columns describing the taxation of each cargo item (up to three currency/quantity pairs).
tax_columns = [
    'cargo_id', 'ce_id',
    'currency1', 'quantity1',
    'currency2', 'quantity2',
    'currency3', 'quantity3',
]
df_taxes_cargoes = df_cargoes[tax_columns]

In [None]:
# Long format of the currencies: each of the three currency columns becomes separate rows.
df_taxes_cargoes_cry = df_taxes_cargoes.melt(
    id_vars=['cargo_id', 'ce_id'],
    value_vars=['currency1', 'currency2', 'currency3'],
)

In [None]:
# Long format of the tax quantities: each of the three quantity columns becomes separate rows.
df_taxes_cargoes_qty = df_taxes_cargoes.melt(
    id_vars=['cargo_id', 'ce_id'],
    value_vars=['quantity1', 'quantity2', 'quantity3'],
)

In [None]:
# Pair every currency with its quantity by row index. Both frames were melted from
# df_taxes_cargoes with the same id_vars, so rows with the same index belong together.
melted_df_taxes_cargoes = pd.merge(
    df_taxes_cargoes_cry,
    df_taxes_cargoes_qty,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Remove the melt 'variable' columns and the id columns duplicated by the merge.
# Reassignment is used instead of inplace=True (recommended pandas idiom).
melted_df_taxes_cargoes = melted_df_taxes_cargoes.drop(
    columns=['variable_x', 'ce_id_y', 'cargo_id_y', 'variable_y']
)

In [None]:
# Strip the merge suffixes and give the two value columns meaningful names.
# Reassignment is used instead of inplace=True (recommended pandas idiom).
melted_df_taxes_cargoes = melted_df_taxes_cargoes.rename(
    columns={
        'cargo_id_x': 'cargo_id',
        'ce_id_x': 'ce_id',
        'value_x': 'currency',
        'value_y': 'quantity',
    }
)

In [None]:
# Remove every row that has a missing value in any column, e.g. the rows created by the melt
# for currency/quantity pairs that were not filled in.
# Reassignment is used instead of inplace=True (recommended pandas idiom).
melted_df_taxes_cargoes = melted_df_taxes_cargoes.dropna()

### 2.2.3. Registration of Cargoes

In [None]:
# Columns describing the registration of each cargo item.
registration_columns = ['cargo_id', 'ce_id', 'row', 'commodity']
df_cargo_regs = df_cargoes[registration_columns]

In [None]:
# display the registration dataframe for a visual check
df_cargo_regs

## 2.3. Cleaning dataframes

In [None]:
# Keep only the first occurrence of each identical departure record; the remaining rows keep
# their original index labels.
# Reassignment is used instead of inplace=True (recommended pandas idiom).
df_departure = df_departure.drop_duplicates()

In [None]:
# Move the current index into a regular column and name that column 'cargo_id'.
df_departure = (
    df_departure
    .reset_index()
    .rename(columns={'index': 'cargo_id'})
)

In [None]:
# check records in the dataframe
# df_departure.loc[df_departure['ce_id'] == 88094]

In [None]:
# Keep only the first occurrence of each identical destination record; the remaining rows keep
# their original index labels.
# Reassignment is used instead of inplace=True (recommended pandas idiom).
df_destination = df_destination.drop_duplicates()

In [None]:
# Move the current index into a regular column and name that column 'cargo_id'.
df_destination = (
    df_destination
    .reset_index()
    .rename(columns={'index': 'cargo_id'})
)

In [None]:
# Remove every row that has a missing value in any column; the remaining rows keep their
# original index labels.
# Reassignment is used instead of inplace=True (recommended pandas idiom).
df_remarks_cargo = df_remarks_cargo.dropna()

In [None]:
# Move the current index into a regular column and name that column 'cargo_id'.
df_remarks_cargo = (
    df_remarks_cargo
    .reset_index()
    .rename(columns={'index': 'cargo_id'})
)

In [None]:
# df_remarks_cargo

# 3. Save dataframes as CSV

In [None]:
# Write the measurement records as a semicolon-separated CSV file.
# NOTE(review): hard-coded absolute Windows path - adjust it to your environment before running.
melted_df_measurement.to_csv(
    r"C:\STRO20\cargoes_measurement.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)

In [None]:
# Write the taxation records as a semicolon-separated CSV file.
# NOTE(review): hard-coded absolute Windows path - adjust it to your environment before running.
melted_df_taxes_cargoes.to_csv(
    r"C:\STRO20\taxes_cargoes.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)

In [None]:
# Write the departure records as a semicolon-separated CSV file.
# NOTE(review): hard-coded absolute Windows path - adjust it to your environment before running.
df_departure.to_csv(
    r"C:\STRO20\departure.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)

In [None]:
# Write the destination records as a semicolon-separated CSV file.
# NOTE(review): hard-coded absolute Windows path - adjust it to your environment before running.
df_destination.to_csv(
    r"C:\STRO20\destination.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)

In [None]:
# Write the cargo remarks as a semicolon-separated CSV file.
# NOTE(review): hard-coded absolute Windows path - adjust it to your environment before running.
df_remarks_cargo.to_csv(
    r"C:\STRO20\remarks_cargoes.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)

In [None]:
# Write the cargo registration records as a semicolon-separated CSV file.
# NOTE(review): hard-coded absolute Windows path - adjust it to your environment before running.
df_cargo_regs.to_csv(
    r"C:\STRO20\cargoes_regs.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)