In [None]:
import os       as os
import numpy    as np
import datetime as dt
import sys      as sys

##
## Graphical libs and setup
import matplotlib.pyplot as plt
# Switch matplotlib's interactive mode off for this session.
plt.ioff()

# Available styles can be listed with: print(plt.style.available)
plt.style.use(['ggplot', 'fast'])
# Default image colormap. See https://www.mrao.cam.ac.uk/~dag/CUBEHELIX/
plt.rcParams['image.cmap'] = 'cubehelix'
 
import seaborn  as sns
import pandas   as pd
# Widen pandas' text output to 250 characters.
pd.set_option('display.width', 250)
 
from bnzds.utilities import CommonNotebook as cn
from bnzds.utilities import CommonIO       as cio
from bnzds.utilities import CommonGraphs   as cg
from bnzds.utilities import CommonSpark    as cs
 
# Not yet guaranteed to work under python2
# ML libs should be on python3 anyway
#if sys.version_info[0] >= 3:
#    from bnzds.utilities import CommonML       as cml
#    from bnzds.utilities import CommonTA       as cta
 
## Set OUTPUT paths
# Root output directory: $DSOUTPUT when it is set to a non-empty value,
# otherwise ~/output/. An empty DSOUTPUT is treated as unset; previously it
# made the sub-directories resolve to '/data/', '/graphs/' and '/model/' at
# the filesystem root.
mainpath = os.environ.get('DSOUTPUT') or os.path.join(os.path.expanduser('~'), 'output', '')

# os.path.join copes with mainpath having (or lacking) a trailing separator, so
# no doubled separators are produced. The trailing '' keeps a final separator
# on each path so that `dpath + filename` style concatenation still works.
dpath = os.path.join(mainpath, 'data', '')
gpath = os.path.join(mainpath, 'graphs', '')
mpath = os.path.join(mainpath, 'model', '')
                
## Platform Init
# Hand the notebook and IO helper modules, plus the graphs directory, to the
# bnzds platform initialiser.
# NOTE(review): what platformInit does with them is not visible here - confirm in bnzds.
platform_libs = [cn, cio]
cn.platformInit(gpath=gpath, libs=platform_libs)
#print = cn.aprint

In [None]:
pip install duckdb

In [None]:
import getpass

import duckdb

# In-memory DuckDB database; the config allows loading unsigned extensions.
# NOTE(review): the settings below apply to `con1` only. Calls made through the
# module-level duckdb.sql()/duckdb.register() use DuckDB's default connection.
con1 = duckdb.connect(':memory:', config={'allow_unsigned_extensions' : 'true'})

# Edge node has 504G
con1.sql("SET memory_limit = '100GB'")

# Per-user temp directory for DuckDB.
# See https://stackoverflow.com/questions/71952623/reading-partitioned-parquet-files-in-duckdb
# USER is not guaranteed to be set; concatenating None into the SQL string
# raised TypeError, so fall back to getpass.getuser().
duckdb_user = os.environ.get('USER') or getpass.getuser()
duckdb_tmp_dir = '/data/disk1/tmp/duckdbcaches/' + duckdb_user
con1.sql("SET temp_directory = '" + duckdb_tmp_dir + "'")

# Use progress bar (if possible)
print(con1.sql("PRAGMA enable_print_progress_bar"))


In [None]:
# Load the business financial data CSV from the working directory.
df_fin = pd.read_csv('business-financial-data-march-2024-csv.csv')

# Normalise the headers: lower-case, spaces replaced by underscores.
df_fin.columns = [str(col).lower().replace(' ', '_') for col in df_fin.columns]

# `year` is the first four characters of the textual form of `period`.
df_fin['year'] = df_fin['period'].astype(str).str.slice(0, 4).astype(int)

# `inplace` expects a real bool; the string 'True' was passed before. A plain
# reassignment avoids the flag altogether and keeps the cell safe to re-run.
# NOTE(review): presumably `group` is renamed because GROUP is an SQL keyword - confirm.
df_fin = df_fin.rename(columns={'group': 'ind_group'})
df_fin.head()

In [None]:
# Read the business employment CSV from the working directory.
df_emp = pd.read_csv('machine-readable-business-employment-data-mar-2024-quarter.csv')

# Headers become lower-case with underscores in place of spaces.
df_emp.columns = list(
    map(lambda header: str(header).lower().replace(' ', '_'), df_emp.columns)
)

# Preview the first rows.
df_emp.head()

In [None]:
# Make both pandas frames queryable from SQL, under the table names the
# query cells refer to (`fin` and `emp`).
fin_view, emp_view = 'fin', 'emp'
duckdb.register(fin_view, df_fin)
duckdb.register(emp_view, df_emp)

In [None]:
# Show the dtype pandas assigned to each column of the financial frame.
fin_column_types = df_fin.dtypes
print(fin_column_types)

In [None]:
# Q1: among the industries (series_title_2) whose earliest year of
# "salaries and wages" rows in `fin` is later than 2016, return the single
# industry with the highest average 'Filled jobs' data_value in `emp`.
#
# Steps inside the SQL:
#   sal_wgs    - rows of `fin` whose series_title_1 matches '%salaries and wages%'
#                (ilike, so the match is case-insensitive).
#   first_year - per series_title_2, the minimum year, kept only when > 2016.
#   final      - `emp` rows with series_title_1 = 'Filled jobs', inner-joined to
#                first_year on series_title_2, averaged per industry, sorted
#                descending and limited to one row.
#
# `fin` and `emp` are table names the query expects to be available to
# duckdb.sql(), i.e. already registered with DuckDB.
Q1_query = """
with sal_wgs as
(--Get all data where series_title_1 = salaries and wages
select 
    *
from fin 
where series_title_1 ilike '%salaries and wages%'
      
)

, first_year as
(--All industries where first year for salaries and wages after 2016
select
    series_title_2
    , min(year) as first_year
from sal_wgs
group by series_title_2
having min(year) > 2016
)

select
    series_title_2 as industry
    ,avg_filled_jobs as overall_filled_jobs
from 
(
    select
        emp.series_title_2
        ,avg(emp.data_value) as avg_filled_jobs
        ,emp.series_title_1
    from emp
    inner join first_year fst on fst.series_title_2 = emp.series_title_2
    where emp.series_title_1 = 'Filled jobs'
    group by emp.series_title_2
            , emp.series_title_1
) mx
order by avg_filled_jobs desc
limit 1
;
"""

# Run the query and convert the result to a pandas DataFrame; the bare
# `result` on the last line displays it as the cell output.
result = duckdb.sql(Q1_query).df()
result

In [None]:
# Q2: for every period, the NZSIOC Level 2 industry (or industries, on ties)
# with the second-highest 'Sales (operating income)' value in `fin`.
#
# Fix: dense_rank is a window *function* and must be called as dense_rank();
# without the parentheses the statement does not parse.
# An explicit ORDER BY is added so the displayed rows come back in a stable order.
Q2_query = """
with base as 
(--Get all data for business industry = Level 2 and series_title_1 = operating income
select 
    *
from fin
where series_title_1 = 'Sales (operating income)'
      and ind_group = 'Industry by financial variable (NZSIOC Level 2)'
)

select 
    period
    ,series_title_2 as industry
    ,data_value as operating_income
from
    (
    select
        period
        ,series_title_2
        ,data_value
        ,dense_rank() over (partition by period order by data_value desc) as operating_inc_rnk
    from base
    ) rnk
where operating_inc_rnk = 2
order by period, industry
"""

# Run the query and convert the result to a pandas DataFrame; the bare
# `result` on the last line displays it as the cell output.
result = duckdb.sql(Q2_query).df()
result

In [None]:
# Query the employment CSV directly by file name through the configured
# connection `con1`, then print the resulting relation.
emp_csv_query = """
                    SELECT * FROM 'machine-readable-business-employment-data-mar-2024-quarter.csv'
                    """
emp_data = con1.sql(emp_csv_query)
print(emp_data)