In [0]:
from datetime import datetime
from datetime import timedelta
import sys
sys.path.append('../libs')

import utils

# Job parameters are supplied through notebook widgets.
# Source side: the table that is read.
src_catalog, src_schema, src_table, src_partition = (
    dbutils.widgets.get(widget_name)
    for widget_name in ('src_catalog', 'src_schema', 'src_table', 'src_partition')
)

# Target side: the table that is written.
tgt_catalog, tgt_schema, tgt_table, tgt_partition = (
    dbutils.widgets.get(widget_name)
    for widget_name in ('tgt_catalog', 'tgt_schema', 'tgt_table', 'tgt_partition')
)

In [0]:
# Work out the [start_date, end_date] window that has to be (re)computed.
#
# * Target table exists  -> incremental run: rebuild from the first source date
#   found in the ISO week of the latest source partition (the week currently
#   in progress), or stop early when source and target share the same last
#   partition.
# * Target table missing -> initial run: use the full range of source partitions.
if utils.table_exists(spark, tgt_catalog, tgt_schema, tgt_table):

    last_updated = utils.get_last_partition(spark, tgt_catalog, tgt_schema, tgt_table, tgt_partition)

    end_date = utils.get_last_partition(spark, src_catalog, src_schema, src_table, src_partition)

    if last_updated == end_date:
        print('Table is already up to date')
        # exit() takes the notebook's return value as an argument; pass the
        # status message so callers of this notebook can see why it stopped.
        dbutils.notebook.exit('Table is already up to date')

    # Monday of the ISO week containing end_date. Bounding by an actual date
    # (instead of comparing week numbers) keeps the same week number from
    # earlier years out of the lookup.
    # NOTE(review): assumes end_date is a 'YYYY-MM-DD' string, as the original
    # strptime call did - confirm against utils.get_last_partition.
    end_day = datetime.strptime(end_date, "%Y-%m-%d").date()
    iso_week_start = end_day - timedelta(days=end_day.weekday())

    # First date actually present in the source table for that week; read from
    # the configured source table rather than a hard-coded one.
    start_date = spark.sql(f'''
        SELECT
            MIN(ref_date)
        FROM {src_catalog}.{src_schema}.{src_table}
        WHERE ref_date >= '{iso_week_start}'
    ''').collect()[0][0]

    print(f'Updating table with data from {start_date} to {end_date}')
else:
    start_date = utils.get_first_partition(spark, src_catalog, src_schema, src_table, src_partition)
    end_date = utils.get_last_partition(spark, src_catalog, src_schema, src_table, src_partition)
    print(f'Creating table with data from {start_date} to {end_date}')


In [0]:
# Weekly count of person mentions, starting at start_date.
#
# persons     : one row per (ref_date, person) obtained by exploding ds_persons.
# week_period : first and last ref_date observed in each week.
# final select: mentions per person per week; the week's last observed date is
#               also exposed as ref_date.
#
# Weeks are keyed by date_trunc('week', ref_date) (the Monday of the week)
# rather than by week number alone, so the same week number from different
# years is never merged when the load spans more than one year.
#
# The string is interpolated once, by the f-string; no further .format() pass
# is applied. Backslashes in the regex are doubled because this is a non-raw
# Python string, so Spark receives a single backslash inside its r'...' literal.
#
# The regexp_replace strips text matching: whitespace, "(", four digits, an
# optional "-" and up to four more characters. split_part then swaps the two
# comma-separated pieces ("part 2" + " " + "part 1").
# NOTE(review): presumably this turns "Last, First (1950- )" into "First Last";
# confirm against real ds_persons values.
df_upload = spark.sql(f'''
  WITH persons AS (
    SELECT
      ref_date,
      date_trunc('week', ref_date) AS week_key,
      weekofyear(ref_date) AS week_number,
      explode(ds_persons) AS person
    FROM {src_catalog}.{src_schema}.{src_table}
    WHERE ref_date >= '{start_date}'
  ),

  week_period AS (
    SELECT
      week_key,
      MIN(ref_date) AS week_start,
      MAX(ref_date) AS week_end
    FROM persons
    GROUP BY week_key
  )

  SELECT
    p.week_number,
    week_start,
    week_end,
    concat(
      split_part(regexp_replace(person, r'\\s\\(\\d\\d\\d\\d\\-?.?.?.?.', ''), ',', '2'),
      ' ',
      split_part(regexp_replace(person, r'\\s\\(\\d\\d\\d\\d\\-?.?.?.?.', ''), ',', '1')
    ) AS person,
    COUNT(*) AS mentions,
    week_end AS ref_date
  FROM persons p
  INNER JOIN week_period ON p.week_key = week_period.week_key
  GROUP BY p.week_number, week_start, week_end, person
  ORDER BY week_end DESC, mentions DESC
''')

In [0]:
# Persist the aggregation as a Delta table partitioned by the target partition
# column. mode('overwrite') together with replaceWhere replaces only the
# partitions at or after start_date; earlier partitions are left untouched.
(df_upload.write
    # Pass the column name held in tgt_partition; the literal text
    # '{tgt_partition}' is not a column of df_upload.
    .partitionBy(tgt_partition)
    .format('delta')
    .mode('overwrite')
    # Single quotes make start_date a SQL string literal regardless of the
    # session's double-quoted-identifier setting.
    .option('replaceWhere', f"{tgt_partition} >= '{start_date}'")
    .saveAsTable(f'{tgt_catalog}.{tgt_schema}.{tgt_table}')
)