In [0]:
import dlt
from pyspark.sql.functions import substring, col, lit, when, initcap, element_at, split, upper, trim, lower, regexp_replace, regexp_extract, substring, expr, concat, coalesce

# Root path of the volume that holds both the raw inputs and the workspace.
TOP_DIR = "/Volumes/prd_mega/sboost4/vboost4"
# Per-country input documents.
INPUT_DIR = TOP_DIR + "/Documents/input/Countries"
# Working area for intermediate artefacts.
WORKSPACE_DIR = TOP_DIR + "/Workspace"
# Directory of Ghana microdata CSV files loaded by the bronze table.
COUNTRY_MICRODATA_DIR = WORKSPACE_DIR + "/microdata_csv/Ghana"

# Options passed to the Spark CSV reader. The quote and escape characters are
# both the double quote, and records are allowed to span multiple lines.
CSV_READ_OPTIONS = dict(
    header="true",
    multiline="true",
    quote='"',
    escape='"',
)

@dlt.expect_or_drop("year_not_null", "YEAR IS NOT NULL")
@dlt.table(name='gha_boost_bronze')
def boost_bronze():
    """Load the Ghana microdata CSV files as the bronze table.

    Reads everything under COUNTRY_MICRODATA_DIR with the shared
    CSV_READ_OPTIONS plus schema inference. The ``year_not_null``
    expectation drops rows whose YEAR is null.
    """
    csv_reader = spark.read.format("csv").options(**CSV_READ_OPTIONS)
    # Column types are inferred from the data rather than declared.
    csv_reader = csv_reader.option("inferSchema", "true")
    return csv_reader.load(COUNTRY_MICRODATA_DIR)

@dlt.table(name=f'gha_boost_silver')
def boost_silver():
    """Build the silver table by tagging bronze rows with harmonised labels.

    Starting from ``gha_boost_bronze`` this:

    * replaces nulls in ECON1, FUnc1, Func2 and Foreign with ``''``;
    * drops rows whose ECON1 starts with ``32`` or ``33``;
    * adds ``is_foreign``, ``admin0``, ``admin1``, ``admin2``,
      ``region_code``, ``geo1``, ``func_sub``, ``func``, ``econ_sub`` and
      ``econ``.

    ``geo1``, ``func_sub`` and ``econ_sub`` have no fallback branch, so rows
    matching none of their conditions get NULL. ``func`` falls back to
    'General public services' and ``econ`` to 'Other expenses'.

    Returns:
        The transformed DataFrame.
    """
    return (dlt.read(f'gha_boost_bronze')
            # Null-safe the code columns so the startswith()/!= tests below
            # evaluate to True/False instead of NULL.
            .withColumn('ECON1',coalesce(col('ECON1'), lit('')))
            .withColumn('FUnc1',coalesce(col('FUnc1'), lit('')))
            .withColumn('Func2',coalesce(col('Func2'), lit('')))
            .withColumn('Foreign',coalesce(col('Foreign'), lit('')))
            .filter(~(col('ECON1').startswith('32') | col('ECON1').startswith('33')))
            # NOTE(review): a missing Foreign value was coalesced to '' above
            # and is therefore flagged is_foreign=True -- confirm intended.
            .withColumn('is_foreign', col('Foreign')!='n')
            .withColumn('admin0', lit('Central'))
            # admin2 must be read from the source ADMIN1 column BEFORE the
            # literal admin1 is written: Spark resolves column names
            # case-insensitively by default, so withColumn('admin1', ...)
            # replaces ADMIN1 and a later col('ADMIN1') would only see the
            # constant 'Central Scope'.
            .withColumn('admin2', col('ADMIN1'))
            .withColumn('admin1', lit('Central Scope'))
            # First two characters of GEO select the region label.
            .withColumn('region_code', col('GEO').substr(1, 2))
            .withColumn('geo1', 
                when(col('region_code') == '00', 'Other')
                .when(col('region_code') == '01', 'Western')
                .when(col('region_code') == '02', 'Central')
                .when(col('region_code') == '03', 'Greater Accra')
                .when(col('region_code') == '04', 'Volta')
                .when(col('region_code') == '05', 'Eastern')
                .when(col('region_code') == '06', 'Ashanti')
                # Code 07 is labelled differently before and from 2019.
                .when((col('region_code') == '07') & (col('YEAR')<2019), 'Brong Ahafo')
                .when((col('region_code') == '07') & (col('YEAR')>=2019), 'Bono')
                .when(col('region_code') == '08', 'Northern')
                .when(col('region_code') == '09', 'Upper East')
                .when(col('region_code') == '10', 'Upper West')
                .when(col('region_code') == '11', 'Oti')
                .when(col('region_code') == '12', 'Bono East')
                .when(col('region_code') == '13', 'Ahafo')
                .when(col('region_code') == '14', 'Savannah')
                .when(col('region_code') == '15', 'North East')
                .when(col('region_code') == '16', 'Western North')
                .when(col('region_code') == '99', 'Central Scope')
            )
            .withColumn('func_sub',
                when(col('Func2').startswith('70330'), 'judiciary')
                .when((col('FUnc1').startswith('703')) & (~col('Func2').startswith('70330')), 'public safety')
                .when(col('Func2').startswith('70435') | col('Func2').startswith('70432'), 'energy')
                # No breakdown of health spending into primary, secondary, tertiary etc
                # No breakdown of education spending into primary, secondary, tertiary etc
            )
            # Functional label from the FUnc1 prefix; anything outside
            # 702-710 falls back to 'General public services'.
            .withColumn('func',
                when(col('FUnc1').startswith('702'), 'Defence')
                .when(col('FUnc1').startswith('703'), 'Public order and safety')
                .when(col('FUnc1').startswith('704'), 'Economic affairs')
                .when(col('FUnc1').startswith('705'), 'Environmental protection')
                .when(col('FUnc1').startswith('706'), 'Housing and community amenities')
                .when(col('FUnc1').startswith('707'), 'Health')
                .when(col('FUnc1').startswith('709'), 'Education')
                .when(col('FUnc1').startswith('708'), 'Recreation, culture and religion')
                .when(col('FUnc1').startswith('710'), 'Social protection')
                .otherwise('General public services')
            )
            .withColumn('econ_sub',
                when(col('ECON1').startswith('25'), 'subsidies to production')                        
            )
            # Economic label from the ECON1 prefix; unmatched codes become
            # 'Other expenses'.
            .withColumn('econ',
                when(col('ECON1').startswith('21'), 'Wage bill')
                .when(col('ECON1').startswith('23') | col('ECON1').startswith('31'), 'Capital expenditures')
                .when(col('ECON1').startswith('22'), 'Goods and services')
                .when(col('ECON1').startswith('25'), 'Subsidies')
                # missing information on Social benefits
                .when(col('ECON1').startswith('26'), 'Other grants and transfers')
                .when(col('ECON1').startswith('24'), 'Interest on debt')
                .otherwise('Other expenses')
            )
        )
    
@dlt.table(name='gha_boost_gold')
def boost_gold():
    """Project the silver table onto the gold output columns.

    Adds a constant ``country_name`` of 'Ghana', exposes EXECUTED as
    ``executed`` and keeps only the columns listed below, in that order.
    """
    silver_df = dlt.read('gha_boost_silver')
    with_country = silver_df.withColumn('country_name', lit('Ghana'))

    gold_columns = [
        'country_name',
        'year',
        'approved',
        col('EXECUTED').alias('executed'),
        'admin0',
        'admin1',
        'admin2',
        'geo1',
        'is_foreign',
        'func',
        'func_sub',
        'econ',
        'econ_sub',
    ]
    return with_country.select(*gold_columns)
