# In [1]:
import pandas as pd
import numpy as np
import json
import re, pytz, os, requests, sys
from pathlib import Path
from datetime import datetime
import sys
sys.path.append("/workspaces/service-data")

from src.clean import clean_percentage, clean_fiscal_yr, normalize_string, standardize_column_names
from src.load import load_csv
from src.export import export_to_csv
from src.merge import merge_si, merge_ss
from src.utils import dept_list, program_list
from main import get_config

import pandas as pd
import numpy as np
import pytz
from pathlib import Path



# Directory the session is running from, and the directory one level above it
base_dir = Path.cwd()
parent_dir = base_dir.parent
# Project configuration as returned by main.get_config(); it is indexed by
# string key (e.g. 'input_dir') further down in this file
config = get_config()


# In [5]:
"""Builds a structured data dictionary from a JSON file, processes nested data, 
renames columns, standardizes names, and exports to CSV."""

# Resolve the JSON data dictionary inside the configured input directory
INPUT_DIR = config['input_dir']
file_path = INPUT_DIR / 'service_data_dict.json'

# Read the file as UTF-8 text and parse it into plain Python objects
with open(file_path, encoding="utf-8") as file:
    data = json.loads(file.read())

# Flatten the top level of the JSON document, then give every element of the
# 'resources' column its own row (index renumbered from 0)
data_dict = pd.json_normalize(data).explode('resources').reset_index(drop=True)

# Expand each resource object into columns, then give every element of the
# 'fields' column its own row (index renumbered from 0 again)
data_dict = (
    pd.json_normalize(data_dict['resources'])
    .explode('fields')
    .reset_index(drop=True)
)

# Expand each field object into columns and attach them, row by row, to the
# resource-level columns (both frames share the same 0..n-1 index)
data_dict_fields = pd.json_normalize(data_dict['fields'])
data_dict = pd.merge(data_dict, data_dict_fields, left_index=True, right_index=True)

# Column mask: True for the flattened 'choices.*' columns
choice_col_mask = data_dict.columns.str.startswith('choices.')

# Field names with their non-choice attributes; the raw 'fields' objects are dropped
dd_field_names = data_dict.loc[:, ~choice_col_mask].drop(columns=['fields'])

# Long-format table of choice values: one row per (field row, choice column)
dd_choices = pd.melt(
    data_dict,
    id_vars=['resource_name', 'title.en', 'title.fr', 'id', 'label.en', 'label.fr'],
    value_vars=list(data_dict.columns[choice_col_mask]),
)

# A choice column that has no value for a given field is NaN; discard those rows
dd_choices.dropna(subset=['value'], inplace=True)

# Split the dotted column name once and keep its second and third components
variable_parts = dd_choices['variable'].str.split('.')
dd_choices['code'] = variable_parts.str[1]
dd_choices['en_fr'] = variable_parts.str[2]

# Keep only the 'en' / 'fr' entries. isin() is False for missing values, so
# rows whose column name has no third component are removed here as well.
dd_choices = dd_choices.loc[dd_choices['en_fr'].isin(['en', 'fr'])]

# One row per (resource, field id, code) with the 'en' and 'fr' values side by side
dd_choices = (
    dd_choices
    .pivot(index=['resource_name', 'id', 'code'], columns='en_fr', values='value')
    .reset_index()
)

# Move the 'program_id' choices into their own table (dd_program) so that
# dd_choices only holds the remaining fields
program_row_mask = dd_choices['id'] == 'program_id'
dd_program = dd_choices.loc[program_row_mask]
dd_choices = dd_choices.loc[~program_row_mask]

# Pass every table through the project's column-name standardizer
# (same call order as before: field names, program, choices)
dd_field_names, dd_program, dd_choices = (
    standardize_column_names(frame)
    for frame in (dd_field_names, dd_program, dd_choices)
)

# Tables collected under the name each one is known by
data_dictionary_file_dict = dict(
    dd_field_names=dd_field_names,
    dd_program=dd_program,
    dd_choices=dd_choices,
)
