# Setup Code

In [37]:
# Importing all packages used
import pandas as pd

## Importing delivery_radius_log

In [38]:
# Importing the CSV file delivery_radius_log as a dataframe first
df_delivery_radius_log = pd.read_csv('data/delivery_radius_log.csv')
df_delivery_radius_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1316 entries, 0 to 1315
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   DELIVERY_AREA_ID         1316 non-null   object
 1   DELIVERY_RADIUS_METERS   1316 non-null   int64 
 2   EVENT_STARTED_TIMESTAMP  1316 non-null   object
dtypes: int64(1), object(2)
memory usage: 31.0+ KB


In [47]:
df_delivery_radius_log.head()

Unnamed: 0,DELIVERY_AREA_ID,DELIVERY_RADIUS_METERS,EVENT_STARTED_TIMESTAMP
0,5db02e5d401d690c836b9ead,3000,2022-06-14T08:26:20.923854Z
1,5db02e5d401d690c836b9ead,7000,2022-06-14T08:49:01.186365Z
2,5db02e5d401d690c836b9ead,3000,2022-06-18T07:43:57.662294Z
3,5db02e5d401d690c836b9ead,7000,2022-06-18T08:00:45.227506Z
4,5d78a7e552dfabd5251dab7b,4000,2022-06-18T08:05:29.093983Z


## Importing purchases 

In [40]:
# Importing the CSV file delivery_radius_log as a dataframe first
df_purchases = pd.read_csv('data/purchases.csv')
df_purchases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177895 entries, 0 to 177894
Data columns (total 6 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   PURCHASE_ID                            177895 non-null  object 
 1   TIME_RECEIVED                          177895 non-null  object 
 2   TIME_DELIVERED                         177895 non-null  object 
 3   END_AMOUNT_WITH_VAT_EUR                177895 non-null  float64
 4   DROPOFF_DISTANCE_STRAIGHT_LINE_METRES  177895 non-null  int64  
 5   DELIVERY_AREA_ID                       177895 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 8.1+ MB


In [41]:
df_purchases.head()

Unnamed: 0,PURCHASE_ID,TIME_RECEIVED,TIME_DELIVERED,END_AMOUNT_WITH_VAT_EUR,DROPOFF_DISTANCE_STRAIGHT_LINE_METRES,DELIVERY_AREA_ID
0,5f85beff7762a1539ad6faf1,2022-10-13T14:51:43.048Z,2022-10-13T15:18:35.265Z,17.87,735,5d78a7e552dfabd5251dab7b
1,5f85c08dddf0c9826389f3cd,2022-10-13T14:58:21.078Z,2022-10-13T15:28:09.194Z,17.75,436,5cc1b60b034adf90cd8f14dd
2,5f85bc2cf49ddea98955ce5f,2022-10-13T14:39:40.153Z,2022-10-13T15:05:15.058Z,25.8,867,5cc1b60b034adf90cd8f14dd
3,5f855dbf5a93deaf2be5b872,2022-10-13T07:56:47.003Z,2022-10-13T09:05:14.37Z,15.7,252,5db02e5d401d690c836b9ead
4,5f85be8a8876393ee141ed82,2022-10-13T14:49:46.693Z,2022-10-13T15:14:31.299Z,18.8,857,5db02e5d401d690c836b9ead


# Setting up a PostgreSQL DB  in localhost

In [42]:
import psycopg2 
from sqlalchemy import create_engine 


In [43]:
# Generating a connection STRING for using a test service account `dbuser`
from sqlalchemy import URL

url_object = URL.create(
    "postgresql+psycopg2",
    username="dbuser",
    password="1",  # plain (unescaped) text
    host="localhost",
    database="postgres",
)

url_object

postgresql+psycopg2://dbuser:***@localhost/postgres

In [44]:
db = create_engine(url_object) 
conn = db.connect() 
conn1 = psycopg2.connect( 
  database="postgres", 
  user='dbuser',  
  password='1',  
  host='localhost',  
  port= '5432'
) 
  
conn1.autocommit = True
cursor = conn1.cursor() 
  
# drop table if it already exists 
cursor.execute('drop table if exists ods.test_table') 
  
sql = '''
CREATE TABLE ods.test_table AS 
SELECT 1 AS test_column_name
;
'''
  
cursor.execute(sql) 

In [45]:
sql_test = '''
SELECT *
FROM ods.test_table
;
'''

print('running test SELECT statement...\n')
cursor.execute(sql_test) 
results = cursor.fetchall() 
print('result:\n')
print(results)

running test SELECT statement...

result:

[(1,)]


In [46]:
df_check = pd.read_sql_query(sql_test, conn1)
df_check.head()

  df_check = pd.read_sql_query(sql_test, conn1)


Unnamed: 0,test_column_name
0,1


# Creating ODS tables in the local PostgreSQL server

In [60]:
df_delivery_radius_log.to_sql(
    name = 'delivery_radius_log',
    schema = 'ods',
    con=db,
    if_exists = 'replace'
)

# asserting if the ingested table in there
sql_delivery_radius_log = '''
SELECT *
FROM ods.delivery_radius_log
LIMIT 5
;
'''

cursor.execute(sql_delivery_radius_log) 
cursor.fetchall() 

[(0, '5db02e5d401d690c836b9ead', 3000, '2022-06-14T08:26:20.923854Z'),
 (1, '5db02e5d401d690c836b9ead', 7000, '2022-06-14T08:49:01.186365Z'),
 (2, '5db02e5d401d690c836b9ead', 3000, '2022-06-18T07:43:57.662294Z'),
 (3, '5db02e5d401d690c836b9ead', 7000, '2022-06-18T08:00:45.227506Z'),
 (4, '5d78a7e552dfabd5251dab7b', 4000, '2022-06-18T08:05:29.093983Z')]

In [240]:
df_purchases.to_sql(
    name = 'purchases',
    schema = 'ods',
    con=db,
    if_exists = 'replace'
)

# asserting if the ingested table in there
sql_purchases = '''
SELECT *
FROM ods.purchases
LIMIT 5
;
'''

cursor.execute(sql_purchases) 
cursor.fetchall() 

[(0,
  0,
  '5f85beff7762a1539ad6faf1',
  '2022-10-13T14:51:43.048Z',
  '2022-10-13T15:18:35.265Z',
  17.87,
  735,
  '5d78a7e552dfabd5251dab7b'),
 (1,
  1,
  '5f85c08dddf0c9826389f3cd',
  '2022-10-13T14:58:21.078Z',
  '2022-10-13T15:28:09.194Z',
  17.75,
  436,
  '5cc1b60b034adf90cd8f14dd'),
 (2,
  2,
  '5f85bc2cf49ddea98955ce5f',
  '2022-10-13T14:39:40.153Z',
  '2022-10-13T15:05:15.058Z',
  25.8,
  867,
  '5cc1b60b034adf90cd8f14dd'),
 (3,
  3,
  '5f855dbf5a93deaf2be5b872',
  '2022-10-13T07:56:47.003Z',
  '2022-10-13T09:05:14.37Z',
  15.7,
  252,
  '5db02e5d401d690c836b9ead'),
 (4,
  4,
  '5f85be8a8876393ee141ed82',
  '2022-10-13T14:49:46.693Z',
  '2022-10-13T15:14:31.299Z',
  18.8,
  857,
  '5db02e5d401d690c836b9ead')]

In [64]:
sql = '''
SELECT *
FROM ods.purchases
'''

df_purchases = pd.read_sql(sql, con = db)
df_purchases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177895 entries, 0 to 177894
Data columns (total 7 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   index                                  177895 non-null  int64  
 1   PURCHASE_ID                            177895 non-null  object 
 2   TIME_RECEIVED                          177895 non-null  object 
 3   TIME_DELIVERED                         177895 non-null  object 
 4   END_AMOUNT_WITH_VAT_EUR                177895 non-null  float64
 5   DROPOFF_DISTANCE_STRAIGHT_LINE_METRES  177895 non-null  int64  
 6   DELIVERY_AREA_ID                       177895 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 9.5+ MB


In [65]:
df_purchases.head()

Unnamed: 0,index,PURCHASE_ID,TIME_RECEIVED,TIME_DELIVERED,END_AMOUNT_WITH_VAT_EUR,DROPOFF_DISTANCE_STRAIGHT_LINE_METRES,DELIVERY_AREA_ID
0,0,5f85beff7762a1539ad6faf1,2022-10-13T14:51:43.048Z,2022-10-13T15:18:35.265Z,17.87,735,5d78a7e552dfabd5251dab7b
1,1,5f85c08dddf0c9826389f3cd,2022-10-13T14:58:21.078Z,2022-10-13T15:28:09.194Z,17.75,436,5cc1b60b034adf90cd8f14dd
2,2,5f85bc2cf49ddea98955ce5f,2022-10-13T14:39:40.153Z,2022-10-13T15:05:15.058Z,25.8,867,5cc1b60b034adf90cd8f14dd
3,3,5f855dbf5a93deaf2be5b872,2022-10-13T07:56:47.003Z,2022-10-13T09:05:14.37Z,15.7,252,5db02e5d401d690c836b9ead
4,4,5f85be8a8876393ee141ed82,2022-10-13T14:49:46.693Z,2022-10-13T15:14:31.299Z,18.8,857,5db02e5d401d690c836b9ead


# Task 1
In the first task you’ll work with the delivery radius log dataset. Given this delivery radius change log, we would like you to detect at any given time what is a temporary reduction (or increase) of the delivery radius and what is the "default" (more permanent) delivery radius. For this exercise, you can assume that the default radius at any given time is a radius that has lasted for at least 24 hours uninterrupted.

We would like you to produce a dataset(s) and answer the following:
* What are all the default delivery radiuses for the delivery areas during the timeframe
provided? Keep in mind that each area can have multiple default radiuses in the given
dataset.
* How many hours of radius reductions with respect to the the default radiuses have we
had during the timeframe provided for each delivery area?

Please give answers in numerical values to the above questions.

In [134]:
df_delivery_radius_log.head()

Unnamed: 0,index,DELIVERY_AREA_ID,DELIVERY_RADIUS_METERS,EVENT_STARTED_TIMESTAMP
0,0,5db02e5d401d690c836b9ead,3000,2022-06-14T08:26:20.923854Z
1,1,5db02e5d401d690c836b9ead,7000,2022-06-14T08:49:01.186365Z
2,2,5db02e5d401d690c836b9ead,3000,2022-06-18T07:43:57.662294Z
3,3,5db02e5d401d690c836b9ead,7000,2022-06-18T08:00:45.227506Z
4,4,5d78a7e552dfabd5251dab7b,4000,2022-06-18T08:05:29.093983Z


In [143]:
sql_fct_delivery_radius_log = '''
SELECT *
FROM ods.delivery_radius_log
LIMIT 5
;
'''

df_fct_delivery_radius_log = pd.read_sql(sql_fct_delivery_radius_log, con = db)

df_fct_delivery_radius_log.head()

Unnamed: 0,index,DELIVERY_AREA_ID,DELIVERY_RADIUS_METERS,EVENT_STARTED_TIMESTAMP
0,0,5db02e5d401d690c836b9ead,3000,2022-06-14T08:26:20.923854Z
1,1,5db02e5d401d690c836b9ead,7000,2022-06-14T08:49:01.186365Z
2,2,5db02e5d401d690c836b9ead,3000,2022-06-18T07:43:57.662294Z
3,3,5db02e5d401d690c836b9ead,7000,2022-06-18T08:00:45.227506Z
4,4,5d78a7e552dfabd5251dab7b,4000,2022-06-18T08:05:29.093983Z


In [261]:
# asserting if the ingested table in there
sql_fct_delivery_radius_log = '''
WITH delivery_radius_log AS (
    SELECT *
        , LAG("DELIVERY_RADIUS_METERS") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_delivery_radius_meters
        , LAG("EVENT_STARTED_TIMESTAMP") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_event_started_timestamp 
    FROM ods.delivery_radius_log
    -- WHERE "DELIVERY_AREA_ID" = '5cc1b60b034adf90cd8f14dd'
), fct_delivery_radius_log AS (
    SELECT *
      , "DELIVERY_RADIUS_METERS" < previous_delivery_radius_meters AS is_reduction
      , EXTRACT('epoch' FROM ("EVENT_STARTED_TIMESTAMP"::TIMESTAMP - previous_event_started_timestamp::TIMESTAMP))/3600 AS delta_hours
    FROM delivery_radius_log
)
SELECT *
FROM fct_delivery_radius_log
WHERE "DELIVERY_AREA_ID" = '5cc1b60b034adf90cd8f14dd'
ORDER BY "EVENT_STARTED_TIMESTAMP"
;
'''

df_fct_delivery_radius_log = pd.read_sql(sql_fct_delivery_radius_log,
                                         con = db)

df_fct_delivery_radius_log.head(55)

Unnamed: 0,index,DELIVERY_AREA_ID,DELIVERY_RADIUS_METERS,EVENT_STARTED_TIMESTAMP,previous_delivery_radius_meters,previous_event_started_timestamp,is_reduction,delta_hours
0,694,5cc1b60b034adf90cd8f14dd,3500,2021-12-01T12:12:41.947087Z,,,,
1,695,5cc1b60b034adf90cd8f14dd,6500,2021-12-01T12:30:09.40586Z,3500.0,2021-12-01T12:12:41.947087Z,False,0.290961
2,696,5cc1b60b034adf90cd8f14dd,3500,2021-12-02T13:16:21.329693Z,6500.0,2021-12-01T12:30:09.40586Z,True,24.769979
3,697,5cc1b60b034adf90cd8f14dd,6500,2021-12-02T13:27:00.815321Z,3500.0,2021-12-02T13:16:21.329693Z,False,0.177635
4,707,5cc1b60b034adf90cd8f14dd,3500,2021-12-05T15:52:54.673552Z,6500.0,2021-12-02T13:27:00.815321Z,True,74.431627
5,708,5cc1b60b034adf90cd8f14dd,6500,2021-12-05T16:11:52.970808Z,3500.0,2021-12-05T15:52:54.673552Z,False,0.316194
6,709,5cc1b60b034adf90cd8f14dd,3500,2021-12-08T13:57:25.951153Z,6500.0,2021-12-05T16:11:52.970808Z,True,69.759161
7,710,5cc1b60b034adf90cd8f14dd,6500,2021-12-08T14:07:17.814469Z,3500.0,2021-12-08T13:57:25.951153Z,False,0.164406
8,713,5cc1b60b034adf90cd8f14dd,3500,2021-12-10T13:19:16.103106Z,6500.0,2021-12-08T14:07:17.814469Z,True,47.199525
9,714,5cc1b60b034adf90cd8f14dd,6500,2021-12-10T13:28:13.172008Z,3500.0,2021-12-10T13:19:16.103106Z,False,0.149186


# Default Delivery Radius for all delivery area

In [148]:
# asserting if the ingested table in there
sql_fct_delivery_radius_log = '''
WITH delivery_radius_log AS (
    SELECT *
        , LAG("DELIVERY_RADIUS_METERS") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_delivery_radius_meters
        , LAG("EVENT_STARTED_TIMESTAMP") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_event_started_timestamp 
    FROM ods.delivery_radius_log
    -- WHERE "DELIVERY_AREA_ID" = '5cc1b60b034adf90cd8f14dd'
), fct_delivery_radius_log AS (
    SELECT *
      , "DELIVERY_RADIUS_METERS" < previous_delivery_radius_meters AS is_reduction
      , EXTRACT('HOUR' FROM ("EVENT_STARTED_TIMESTAMP"::TIMESTAMP - previous_event_started_timestamp::TIMESTAMP)) AS delta_hours
    FROM delivery_radius_log
)
SELECT "DELIVERY_AREA_ID" AS delivery_area_id
    , "DELIVERY_RADIUS_METERS" AS default_delivery_radius_meters
    , "EVENT_STARTED_TIMESTAMP" AS latest_event_started_timestamp
FROM (
    SELECT *
        , ROW_NUMBER() OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY delta_hours DESC) AS _row_number
    FROM fct_delivery_radius_log
)
WHERE _row_number = 1
;
'''

df_fct_delivery_radius_log = pd.read_sql(sql_fct_delivery_radius_log,
                                         con = db)

df_fct_delivery_radius_log.head()

Unnamed: 0,delivery_area_id,default_delivery_radius_meters,latest_event_started_timestamp
0,5cc1b60b034adf90cd8f14dd,3500,2021-12-01T12:12:41.947087Z
1,5d78a7e552dfabd5251dab7b,4000,2021-12-02T14:23:43.714277Z
2,5db02e5d401d690c836b9ead,4000,2021-12-03T17:25:04.855491Z


In [394]:
# Save the dimension table in the DB for re-using
sql_delivery_radius_log = '''
WITH delivery_radius_log AS (
    SELECT *
        , LAG("DELIVERY_RADIUS_METERS") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_delivery_radius_meters
        , LAG("EVENT_STARTED_TIMESTAMP") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_event_started_timestamp 
    FROM ods.delivery_radius_log
    -- WHERE "DELIVERY_AREA_ID" = '5cc1b60b034adf90cd8f14dd'
), fct_delivery_radius_log AS (
    SELECT *
      , "DELIVERY_RADIUS_METERS" < previous_delivery_radius_meters AS is_reduction
      , EXTRACT('epoch' FROM ("EVENT_STARTED_TIMESTAMP"::TIMESTAMP - previous_event_started_timestamp::TIMESTAMP))/3600 AS delta_hours
    FROM delivery_radius_log
)
SELECT "DELIVERY_AREA_ID" AS delivery_area_id
    , "DELIVERY_RADIUS_METERS" AS default_delivery_radius_meters
    , "EVENT_STARTED_TIMESTAMP" AS latest_event_started_timestamp
FROM (
    SELECT *
        , ROW_NUMBER() OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY delta_hours DESC) AS _row_number
    FROM fct_delivery_radius_log
)
WHERE _row_number = 1
;
'''

df_delivery_radius_log = pd.read_sql(sql_delivery_radius_log, con = db)

df_delivery_radius_log.head()


df_delivery_radius_log.to_sql(
    name = 'delivery_areas',
    schema = 'dim',
    con=db,
    if_exists = 'replace',
    index = False
)


cursor.execute(sql_delivery_radius_log) 
cursor.fetchall() 

# Assert if the table creation is successful
sql_fct_delivery_radius_log = '''
SELECT *
FROM dim.delivery_areas
;
'''

df_fct_delivery_radius_log = pd.read_sql(sql_fct_delivery_radius_log, con = db)

df_fct_delivery_radius_log.head()


Unnamed: 0,delivery_area_id,default_delivery_radius_meters,latest_event_started_timestamp
0,5cc1b60b034adf90cd8f14dd,3500,2021-12-01T12:12:41.947087Z
1,5d78a7e552dfabd5251dab7b,4000,2021-12-02T14:23:43.714277Z
2,5db02e5d401d690c836b9ead,4000,2021-12-03T17:25:04.855491Z


## Hours of radius reductions with respect to the the default radiuses per each delivery area without the given timeframe

In [327]:
# asserting if the ingested table in there
sql_sum_delivery_area_radius_reduction_durations = '''
WITH delivery_radius_log AS (
    SELECT logs.*
        , areas."default_delivery_radius_meters"
        , LAG(logs."EVENT_STARTED_TIMESTAMP") OVER (PARTITION BY logs."DELIVERY_AREA_ID" ORDER BY logs."EVENT_STARTED_TIMESTAMP") AS previous_event_started_timestamp 
        , logs."DELIVERY_RADIUS_METERS" < areas."default_delivery_radius_meters" AS is_reduction
    FROM ods.delivery_radius_log AS logs
    LEFT JOIN dim.delivery_areas areas ON logs."DELIVERY_AREA_ID" = areas."delivery_area_id"
), fct_delivery_radius_log AS (
    SELECT *
        , EXTRACT('epoch' FROM ("EVENT_STARTED_TIMESTAMP"::TIMESTAMP - previous_event_started_timestamp::TIMESTAMP))/3600 AS delta_hours
    FROM delivery_radius_log
)
SELECT "DELIVERY_AREA_ID"
    , "default_delivery_radius_meters"
    , SUM(CASE WHEN is_reduction THEN delta_hours END) AS reduction_hours
FROM fct_delivery_radius_log
GROUP BY 1,2
;
'''

df_sum_delivery_area_radius_reduction_durations = pd.read_sql(sql_sum_delivery_area_radius_reduction_durations,
                                         con = db)

df_sum_delivery_area_radius_reduction_durations.head(500)

Unnamed: 0,DELIVERY_AREA_ID,default_delivery_radius_meters,reduction_hours
0,5d78a7e552dfabd5251dab7b,4000,1721.26467
1,5db02e5d401d690c836b9ead,4000,9176.436851
2,5cc1b60b034adf90cd8f14dd,3500,617.643406


In [233]:
# Create the summary table in DB
df_sum_delivery_area_radius_reduction_durations.to_sql(
    name = 'delivery_area_radius_reduction_durations',
    schema = 'sum',
    con=db,
    if_exists = 'replace'
)

cursor.execute(sql_sum_delivery_area_radius_reduction_durations) 
cursor.fetchall() 

# Assert if the table creation is successful
sql_sum_delivery_area_radius_reduction_durations = '''
SELECT *
FROM sum.delivery_area_radius_reduction_durations
;
'''

df_sum_delivery_area_radius_reduction_durations = pd.read_sql(sql_sum_delivery_area_radius_reduction_durations, con = db)

df_sum_delivery_area_radius_reduction_durations.head()


DuplicateColumnError: A column with name 'level_0' is already present in table 'delivery_area_radius_reduction_durations'.

# Task 2

Now that we know the default delivery radiuses and times when the delivery radius was reduced, we would like you to create a derived dataset aggregated to hourly level that can be used to analyze delivery radius reductions and purchases in the areas for any hour in 2022. Build the dataset so that anyone could query the data without writing further joins or calculations and would be able to answer the following questions with a simple SELECT statement:
* How many purchases and how much revenue (End Amount With VAT Eur) do we produce during the hour?
* How long do the deviations (reductions) from default radius last during the hour? How many times have we modified the radius during the hour?
* How do these hourly values compare to the previous week for each area? This is just a simple week-over-week percentage difference for each of the above-mentioned four measures.

We want to emphasize that all three questions should be answered with the same aggregated dataset, meaning for instance that even the week-over-week differences are pre-calculated. Please note that for this task it is enough to only create the dataset and you are not expected to answer these questions. We only wish to see the code which creates this table and a sample of a few rows from the resulting dataset.

In [279]:
# Create a dummy Week table first
sql_base_weeks = '''
SELECT generate_series(date '2022-01-01', date '2022-12-31', '1 week')::DATE AS base_week
;
'''

df_base_weeks = pd.read_sql(sql_base_weeks, con = db)

df_base_weeks.head(10)

Unnamed: 0,base_week
0,2022-01-01
1,2022-01-08
2,2022-01-15
3,2022-01-22
4,2022-01-29
5,2022-02-05
6,2022-02-12
7,2022-02-19
8,2022-02-26
9,2022-03-05


In [257]:
# Develop the Summary of Sales
sql_sum_sales = '''
SELECT DATE_TRUNC('WEEK', "TIME_RECEIVED"::DATE) AS week_reporting
    , "DELIVERY_AREA_ID"
    , COUNT(*) AS nb_purcahses
    , SUM("END_AMOUNT_WITH_VAT_EUR") AS end_amount_with_vat_eur
FROM ods.purchases
GROUP BY 1,2
;
'''

df_sum_sales = pd.read_sql(sql_sum_sales, con = db)

df_sum_sales.head()

Unnamed: 0,week_reporting,DELIVERY_AREA_ID,nb_purcahses,end_amount_with_vat_eur
0,2022-12-05 00:00:00+01:00,5db02e5d401d690c836b9ead,1409,30937.6
1,2022-11-07 00:00:00+01:00,5cc1b60b034adf90cd8f14dd,1519,35271.84
2,2022-05-02 00:00:00+02:00,5d78a7e552dfabd5251dab7b,1206,29698.55
3,2022-03-07 00:00:00+01:00,5d78a7e552dfabd5251dab7b,861,20872.1
4,2022-04-18 00:00:00+02:00,5cc1b60b034adf90cd8f14dd,1183,30715.12


## # Generate a Fact table of delivery 

In [385]:
# Develop the Summary of Sales
sql_fct_delivery_raiud_events = '''
SELECT *
FROM fct.delivery_radius_log
;
'''

df_fct_delivery_raiud_events = pd.read_sql(sql_fct_delivery_raiud_events, con = db)

df_fct_delivery_raiud_events.head()

Unnamed: 0,index,DELIVERY_AREA_ID,DELIVERY_RADIUS_METERS,EVENT_STARTED_TIMESTAMP
0,0,5db02e5d401d690c836b9ead,3000,2022-06-14T08:26:20.923854Z
1,1,5db02e5d401d690c836b9ead,7000,2022-06-14T08:49:01.186365Z
2,2,5db02e5d401d690c836b9ead,3000,2022-06-18T07:43:57.662294Z
3,3,5db02e5d401d690c836b9ead,7000,2022-06-18T08:00:45.227506Z
4,4,5d78a7e552dfabd5251dab7b,4000,2022-06-18T08:05:29.093983Z


In [386]:
sql_fct_delivery_radius_events = '''
WITH delivery_radius_log AS (
    SELECT "DELIVERY_AREA_ID" AS delivery_area_id
        , "EVENT_STARTED_TIMESTAMP"::TIMESTAMP AS event_started_timestamp
        , "DELIVERY_RADIUS_METERS" AS delivery_radius_meters
    FROM ods.delivery_radius_log 
), fct_delivery_radius_events AS (
    SELECT logs.delivery_area_id
        , logs.event_started_timestamp
        , LAG(logs.event_started_timestamp) OVER (PARTITION BY logs.delivery_area_id ORDER BY logs.event_started_timestamp) AS previous_event_started_timestamp 
        , LEAD(logs.event_started_timestamp) OVER (PARTITION BY logs.delivery_area_id ORDER BY logs.event_started_timestamp) AS next_event_started_timestamp 
        , logs.delivery_radius_meters
        , areas."default_delivery_radius_meters"
        , logs.delivery_radius_meters < areas."default_delivery_radius_meters" AS is_reduction
    FROM delivery_radius_log logs
    LEFT JOIN dim.delivery_areas areas ON logs.delivery_area_id = areas."delivery_area_id"
    ORDER BY logs.delivery_area_id, logs.event_started_timestamp
)
SELECT delivery_area_id
    , previous_event_started_timestamp
    , event_started_timestamp
    , next_event_started_timestamp 
    , EXTRACT('epoch' FROM (next_event_started_timestamp - event_started_timestamp))/3600 AS delta_hours
    , delivery_radius_meters
    , default_delivery_radius_meters
    , is_reduction
FROM fct_delivery_radius_events

'''

df_fct_delivery_radius_events = pd.read_sql(sql_fct_delivery_radius_events, con = db)

df_fct_delivery_radius_events.head()

Unnamed: 0,delivery_area_id,previous_event_started_timestamp,event_started_timestamp,next_event_started_timestamp,delta_hours,delivery_radius_meters,default_delivery_radius_meters,is_reduction
0,5cc1b60b034adf90cd8f14dd,NaT,2021-12-01 12:12:41.947087,2021-12-01 12:30:09.405860,0.290961,3500,3500,False
1,5cc1b60b034adf90cd8f14dd,2021-12-01 12:12:41.947087,2021-12-01 12:30:09.405860,2021-12-02 13:16:21.329693,24.769979,6500,3500,False
2,5cc1b60b034adf90cd8f14dd,2021-12-01 12:30:09.405860,2021-12-02 13:16:21.329693,2021-12-02 13:27:00.815321,0.177635,3500,3500,False
3,5cc1b60b034adf90cd8f14dd,2021-12-02 13:16:21.329693,2021-12-02 13:27:00.815321,2021-12-05 15:52:54.673552,74.431627,6500,3500,False
4,5cc1b60b034adf90cd8f14dd,2021-12-02 13:27:00.815321,2021-12-05 15:52:54.673552,2021-12-05 16:11:52.970808,0.316194,3500,3500,False


In [390]:
df_fct_delivery_radius_events.to_sql(
    name = 'delivery_radius_events',
    schema = 'fct',
    con=db,
    if_exists = 'replace',
    index = False
)

# asserting if the ingested table in there
sql_fct_delivery_radius_events = '''
WITH delivery_radius_log AS (
    SELECT "DELIVERY_AREA_ID" AS delivery_area_id
        , "EVENT_STARTED_TIMESTAMP"::TIMESTAMP AS event_started_timestamp
        , "DELIVERY_RADIUS_METERS" AS delivery_radius_meters
    FROM ods.delivery_radius_log 
), fct_delivery_radius_events AS (
    SELECT logs.delivery_area_id
        , logs.event_started_timestamp
        , LAG(logs.event_started_timestamp) OVER (PARTITION BY logs.delivery_area_id ORDER BY logs.event_started_timestamp) AS previous_event_started_timestamp 
        , LEAD(logs.event_started_timestamp) OVER (PARTITION BY logs.delivery_area_id ORDER BY logs.event_started_timestamp) AS next_event_started_timestamp 
        , logs.delivery_radius_meters
        , areas."default_delivery_radius_meters"
        , logs.delivery_radius_meters < areas."default_delivery_radius_meters" AS is_reduction
    FROM delivery_radius_log logs
    LEFT JOIN dim.delivery_areas areas ON logs.delivery_area_id = areas."delivery_area_id"
    ORDER BY logs.delivery_area_id, logs.event_started_timestamp
)
SELECT delivery_area_id
    , previous_event_started_timestamp
    , event_started_timestamp
    , next_event_started_timestamp 
    , EXTRACT('epoch' FROM (next_event_started_timestamp - event_started_timestamp))/3600 AS delta_hours
    , delivery_radius_meters
    , default_delivery_radius_meters
    , is_reduction
FROM fct_delivery_radius_events

'''

cursor.execute(sql_fct_delivery_radius_events) 
# cursor.fetchall() 

## Generate a Summary table of delivery 

In [398]:
# Develop the Summary of Delivery Radius Reduction
sql_sum_delivery_radius_reduction = '''
WITH base_hours AS (
    SELECT generate_series(date '2022-01-01', date '2022-12-31', '1 hour') AS base_hour
), base_hours_with_delivery_areas AS (
	SELECT base_hours.base_hour
		, delivery_areas.delivery_area_id
	FROM base_hours
	CROSS JOIN dim.delivery_areas
), delivery_radius_events AS (
    SELECT *
        , DATE_TRUNC('hour', DATE_ADD(base_hour, INTERVAL '1 HOUR')) AS next_base_hour
    FROM (
        SELECT delivery_area_id
            , DATE_TRUNC('hour', event_started_timestamp) AS base_hour
			, previous_event_started_timestamp
            , event_started_timestamp
            , next_event_started_timestamp
            , delta_hours
        FROM fct.delivery_radius_events
    ) 
), delivery_radius_events_processed AS (
    SELECT delivery_area_id
    , base_hour
    , next_base_hour
	, previous_event_started_timestamp
    , event_started_timestamp
    , next_event_started_timestamp
    -- Scenario 1: event starts & ends within the same hour
    , EXTRACT('epoch' FROM (next_event_started_timestamp - event_started_timestamp))/3600 AS delta_hours_1
    -- Scenario 2: event starts in the current base hour, then ends in another base hour later
    , EXTRACT('epoch' FROM (next_base_hour - event_started_timestamp))/3600 AS delta_hours_2
    -- Scenario 3: event from in a previous base hour, then ends in the current base hour
    , EXTRACT('epoch' FROM (LEAST(next_event_started_timestamp, next_base_hour) - event_started_timestamp))/3600 AS delta_hours_3
    -- Scenario 4: event from in a previous base hour, then ends in the future base hour (i.e. full hour closure)
    , 1 AS delta_hours_4
FROM delivery_radius_events
), delivery_radius_events_processed_further AS(
	SELECT delivery_area_id
		, base_hour
		, next_base_hour
		, previous_event_started_timestamp
		, event_started_timestamp
		, next_event_started_timestamp
		, (
			CASE
				-- Scenario 1: event starts & ends within the same hour
				WHEN base_hour = DATE_TRUNC('hour', event_started_timestamp)
						AND base_hour = DATE_TRUNC('hour', next_event_started_timestamp)
					THEN 1
				-- Scenario 2: event starts in the current base hour, then ends in another base hour later
				WHEN base_hour = DATE_TRUNC('hour', event_started_timestamp)
						AND next_event_started_timestamp > next_base_hour
					THEN 2
				-- Scenario 3: event from in a previous base hour, then ends in the current base hour
				WHEN previous_event_started_timestamp < base_hour
						AND base_hour = DATE_TRUNC('hour', event_started_timestamp)
					THEN 3
				-- Scenario 4 cannot be handled here, to be taken care later
				ELSE 0
			END
			 ) AS Scenario
		, (
			CASE
				-- Scenario 1: event starts & ends within the same hour
				WHEN base_hour = DATE_TRUNC('hour', event_started_timestamp)
						AND base_hour = DATE_TRUNC('hour', next_event_started_timestamp)
					THEN delta_hours_1
				-- Scenario 2: event starts in the current base hour, then ends in another base hour later
				WHEN base_hour = DATE_TRUNC('hour', event_started_timestamp)
						AND next_event_started_timestamp > next_base_hour
					THEN delta_hours_2
				-- Scenario 3: event from in a previous base hour, then ends in the current base hour
				WHEN previous_event_started_timestamp < base_hour
						AND base_hour = DATE_TRUNC('hour', event_started_timestamp)
					THEN delta_hours_3
				-- Scenario 4 cannot be handled here, to be taken care later
				ELSE 0
			END
			 ) AS delta_hours
		, delta_hours_1
		, delta_hours_2
		, delta_hours_3
	FROM delivery_radius_events_processed
), delivery_radius_events_processed_further_agg_part1 AS (
	-- Scenario 1-3
	SELECT delivery_area_id
		, base_hour
		, SUM(delta_hours) AS delta_hours
	FROM delivery_radius_events_processed_further
	GROUP BY 1,2
	ORDER BY delivery_area_id, base_hour
), delivery_radius_events_processed_further_agg_part2 AS (
	-- Scenario 4
	SELECT base.delivery_area_id
		, base.base_hour
		, 1 AS delta_hours
	FROM base_hours_with_delivery_areas base
	INNER JOIN delivery_radius_events_processed_further events ON base.delivery_area_id = events.delivery_area_id
		AND base.base_hour BETWEEN events.event_started_timestamp AND events.next_event_started_timestamp
		AND events.previous_event_started_timestamp < base.base_hour
		AND events.next_event_started_timestamp > DATE_ADD(base.base_hour, INTERVAL '1 hour')
	ORDER BY base.delivery_area_id, base.base_hour
), delivery_radius_events_union_all_agg AS (
	SELECT delivery_area_id
		, base_hour
		, SUM(delta_hours) AS delta_hours
	FROM (
		SELECT *
		FROM delivery_radius_events_processed_further_agg_part1

		UNION ALL

		SELECT *
		FROM delivery_radius_events_processed_further_agg_part2
	)
	GROUP BY 1,2
	ORDER BY delivery_area_id, base_hour
)
SELECT base.delivery_area_id
    , base.base_hour
    , COALESCE(agg.delta_hours, 0) AS delta_hours
FROM base_hours_with_delivery_areas base
LEFT JOIN delivery_radius_events_union_all_agg agg ON base.base_hour = agg.base_hour
    AND base.delivery_area_id = agg.delivery_area_id
ORDER BY base.delivery_area_id, base.base_hour
'''

df_sum_delivery_radius_reduction = pd.read_sql(sql_sum_delivery_radius_reduction, con = db)
df_sum_delivery_radius_reduction.head(50)

df_sum_delivery_radius_reduction.to_sql(
    name = 'delivery_radius_reduction',
    schema = 'sum',
    con=db,
    if_exists = 'replace',
    index = False
)


Unnamed: 0,delivery_area_id,base_hour,delta_hours
0,5cc1b60b034adf90cd8f14dd,2022-12-28 03:00:00+01:00,0.0
1,5cc1b60b034adf90cd8f14dd,2022-12-26 15:00:00+01:00,0.0
2,5cc1b60b034adf90cd8f14dd,2022-12-26 16:00:00+01:00,0.0
3,5cc1b60b034adf90cd8f14dd,2022-12-26 17:00:00+01:00,0.0
4,5cc1b60b034adf90cd8f14dd,2022-12-26 18:00:00+01:00,0.0
5,5cc1b60b034adf90cd8f14dd,2022-12-30 09:00:00+01:00,0.0
6,5cc1b60b034adf90cd8f14dd,2022-12-30 08:00:00+01:00,0.0
7,5cc1b60b034adf90cd8f14dd,2022-12-26 19:00:00+01:00,0.0
8,5cc1b60b034adf90cd8f14dd,2022-12-30 07:00:00+01:00,0.0
9,5cc1b60b034adf90cd8f14dd,2022-12-26 20:00:00+01:00,0.0
