# Setup Code

In [37]:
# Importing all packages used
import pandas as pd

## Importing delivery_radius_log

In [38]:
# Importing the CSV file delivery_radius_log as a dataframe first
df_delivery_radius_log = pd.read_csv('data/delivery_radius_log.csv')
df_delivery_radius_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1316 entries, 0 to 1315
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   DELIVERY_AREA_ID         1316 non-null   object
 1   DELIVERY_RADIUS_METERS   1316 non-null   int64 
 2   EVENT_STARTED_TIMESTAMP  1316 non-null   object
dtypes: int64(1), object(2)
memory usage: 31.0+ KB


In [47]:
df_delivery_radius_log.head()

Unnamed: 0,DELIVERY_AREA_ID,DELIVERY_RADIUS_METERS,EVENT_STARTED_TIMESTAMP
0,5db02e5d401d690c836b9ead,3000,2022-06-14T08:26:20.923854Z
1,5db02e5d401d690c836b9ead,7000,2022-06-14T08:49:01.186365Z
2,5db02e5d401d690c836b9ead,3000,2022-06-18T07:43:57.662294Z
3,5db02e5d401d690c836b9ead,7000,2022-06-18T08:00:45.227506Z
4,5d78a7e552dfabd5251dab7b,4000,2022-06-18T08:05:29.093983Z


## Importing purchases 

In [40]:
# Importing the CSV file delivery_radius_log as a dataframe first
df_purchases = pd.read_csv('data/purchases.csv')
df_purchases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177895 entries, 0 to 177894
Data columns (total 6 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   PURCHASE_ID                            177895 non-null  object 
 1   TIME_RECEIVED                          177895 non-null  object 
 2   TIME_DELIVERED                         177895 non-null  object 
 3   END_AMOUNT_WITH_VAT_EUR                177895 non-null  float64
 4   DROPOFF_DISTANCE_STRAIGHT_LINE_METRES  177895 non-null  int64  
 5   DELIVERY_AREA_ID                       177895 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 8.1+ MB


In [41]:
df_purchases.head()

Unnamed: 0,PURCHASE_ID,TIME_RECEIVED,TIME_DELIVERED,END_AMOUNT_WITH_VAT_EUR,DROPOFF_DISTANCE_STRAIGHT_LINE_METRES,DELIVERY_AREA_ID
0,5f85beff7762a1539ad6faf1,2022-10-13T14:51:43.048Z,2022-10-13T15:18:35.265Z,17.87,735,5d78a7e552dfabd5251dab7b
1,5f85c08dddf0c9826389f3cd,2022-10-13T14:58:21.078Z,2022-10-13T15:28:09.194Z,17.75,436,5cc1b60b034adf90cd8f14dd
2,5f85bc2cf49ddea98955ce5f,2022-10-13T14:39:40.153Z,2022-10-13T15:05:15.058Z,25.8,867,5cc1b60b034adf90cd8f14dd
3,5f855dbf5a93deaf2be5b872,2022-10-13T07:56:47.003Z,2022-10-13T09:05:14.37Z,15.7,252,5db02e5d401d690c836b9ead
4,5f85be8a8876393ee141ed82,2022-10-13T14:49:46.693Z,2022-10-13T15:14:31.299Z,18.8,857,5db02e5d401d690c836b9ead


# Setting up a PostgreSQL DB  in localhost

In [42]:
import psycopg2 
from sqlalchemy import create_engine 


In [43]:
# Generating a connection STRING for using a test service account `dbuser`
from sqlalchemy import URL

url_object = URL.create(
    "postgresql+psycopg2",
    username="dbuser",
    password="1",  # plain (unescaped) text
    host="localhost",
    database="postgres",
)

url_object

postgresql+psycopg2://dbuser:***@localhost/postgres

In [44]:
db = create_engine(url_object) 
conn = db.connect() 
conn1 = psycopg2.connect( 
  database="postgres", 
  user='dbuser',  
  password='1',  
  host='localhost',  
  port= '5432'
) 
  
conn1.autocommit = True
cursor = conn1.cursor() 
  
# drop table if it already exists 
cursor.execute('drop table if exists ods.test_table') 
  
sql = '''
CREATE TABLE ods.test_table AS 
SELECT 1 AS test_column_name
;
'''
  
cursor.execute(sql) 

In [45]:
sql_test = '''
SELECT *
FROM ods.test_table
;
'''

print('running test SELECT statement...\n')
cursor.execute(sql_test) 
results = cursor.fetchall() 
print('result:\n')
print(results)

running test SELECT statement...

result:

[(1,)]


In [46]:
df_check = pd.read_sql_query(sql_test, conn1)
df_check.head()

  df_check = pd.read_sql_query(sql_test, conn1)


Unnamed: 0,test_column_name
0,1


# Creating ODS tables in the local PostgreSQL server

In [60]:
df_delivery_radius_log.to_sql(
    name = 'delivery_radius_log',
    schema = 'ods',
    con=db,
    if_exists = 'replace'
)

# asserting if the ingested table in there
sql_delivery_radius_log = '''
SELECT *
FROM ods.delivery_radius_log
LIMIT 5
;
'''

cursor.execute(sql_delivery_radius_log) 
cursor.fetchall() 

[(0, '5db02e5d401d690c836b9ead', 3000, '2022-06-14T08:26:20.923854Z'),
 (1, '5db02e5d401d690c836b9ead', 7000, '2022-06-14T08:49:01.186365Z'),
 (2, '5db02e5d401d690c836b9ead', 3000, '2022-06-18T07:43:57.662294Z'),
 (3, '5db02e5d401d690c836b9ead', 7000, '2022-06-18T08:00:45.227506Z'),
 (4, '5d78a7e552dfabd5251dab7b', 4000, '2022-06-18T08:05:29.093983Z')]

In [61]:
df_purchases.to_sql(
    name = 'purchases',
    schema = 'ods',
    con=db,
    if_exists = 'replace'
)

# asserting if the ingested table in there
sql_purchases = '''
SELECT *
FROM ods.purchases
LIMIT 5
;
'''

cursor.execute(sql_purchases) 
cursor.fetchall() 

[(0,
  '5f85beff7762a1539ad6faf1',
  '2022-10-13T14:51:43.048Z',
  '2022-10-13T15:18:35.265Z',
  17.87,
  735,
  '5d78a7e552dfabd5251dab7b'),
 (1,
  '5f85c08dddf0c9826389f3cd',
  '2022-10-13T14:58:21.078Z',
  '2022-10-13T15:28:09.194Z',
  17.75,
  436,
  '5cc1b60b034adf90cd8f14dd'),
 (2,
  '5f85bc2cf49ddea98955ce5f',
  '2022-10-13T14:39:40.153Z',
  '2022-10-13T15:05:15.058Z',
  25.8,
  867,
  '5cc1b60b034adf90cd8f14dd'),
 (3,
  '5f855dbf5a93deaf2be5b872',
  '2022-10-13T07:56:47.003Z',
  '2022-10-13T09:05:14.37Z',
  15.7,
  252,
  '5db02e5d401d690c836b9ead'),
 (4,
  '5f85be8a8876393ee141ed82',
  '2022-10-13T14:49:46.693Z',
  '2022-10-13T15:14:31.299Z',
  18.8,
  857,
  '5db02e5d401d690c836b9ead')]

In [64]:
sql = '''
SELECT *
FROM ods.purchases
'''

df_purchases = pd.read_sql(sql, con = db)
df_purchases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177895 entries, 0 to 177894
Data columns (total 7 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   index                                  177895 non-null  int64  
 1   PURCHASE_ID                            177895 non-null  object 
 2   TIME_RECEIVED                          177895 non-null  object 
 3   TIME_DELIVERED                         177895 non-null  object 
 4   END_AMOUNT_WITH_VAT_EUR                177895 non-null  float64
 5   DROPOFF_DISTANCE_STRAIGHT_LINE_METRES  177895 non-null  int64  
 6   DELIVERY_AREA_ID                       177895 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 9.5+ MB


In [65]:
df_purchases.head()

Unnamed: 0,index,PURCHASE_ID,TIME_RECEIVED,TIME_DELIVERED,END_AMOUNT_WITH_VAT_EUR,DROPOFF_DISTANCE_STRAIGHT_LINE_METRES,DELIVERY_AREA_ID
0,0,5f85beff7762a1539ad6faf1,2022-10-13T14:51:43.048Z,2022-10-13T15:18:35.265Z,17.87,735,5d78a7e552dfabd5251dab7b
1,1,5f85c08dddf0c9826389f3cd,2022-10-13T14:58:21.078Z,2022-10-13T15:28:09.194Z,17.75,436,5cc1b60b034adf90cd8f14dd
2,2,5f85bc2cf49ddea98955ce5f,2022-10-13T14:39:40.153Z,2022-10-13T15:05:15.058Z,25.8,867,5cc1b60b034adf90cd8f14dd
3,3,5f855dbf5a93deaf2be5b872,2022-10-13T07:56:47.003Z,2022-10-13T09:05:14.37Z,15.7,252,5db02e5d401d690c836b9ead
4,4,5f85be8a8876393ee141ed82,2022-10-13T14:49:46.693Z,2022-10-13T15:14:31.299Z,18.8,857,5db02e5d401d690c836b9ead


# Task 1
In the first task you’ll work with the delivery radius log dataset. Given this delivery radius change log, we would like you to detect at any given time what is a temporary reduction (or increase) of the delivery radius and what is the "default" (more permanent) delivery radius. For this exercise, you can assume that the default radius at any given time is a radius that has lasted for at least 24 hours uninterrupted.

We would like you to produce a dataset(s) and answer the following:
* What are all the default delivery radiuses for the delivery areas during the timeframe
provided? Keep in mind that each area can have multiple default radiuses in the given
dataset.
* How many hours of radius reductions with respect to the the default radiuses have we
had during the timeframe provided for each delivery area?

Please give answers in numerical values to the above questions.

In [134]:
df_delivery_radius_log.head()

Unnamed: 0,index,DELIVERY_AREA_ID,DELIVERY_RADIUS_METERS,EVENT_STARTED_TIMESTAMP
0,0,5db02e5d401d690c836b9ead,3000,2022-06-14T08:26:20.923854Z
1,1,5db02e5d401d690c836b9ead,7000,2022-06-14T08:49:01.186365Z
2,2,5db02e5d401d690c836b9ead,3000,2022-06-18T07:43:57.662294Z
3,3,5db02e5d401d690c836b9ead,7000,2022-06-18T08:00:45.227506Z
4,4,5d78a7e552dfabd5251dab7b,4000,2022-06-18T08:05:29.093983Z


In [143]:
sql_fct_delivery_radius_log = '''
SELECT *
FROM ods.delivery_radius_log
LIMIT 5
;
'''

df_fct_delivery_radius_log = pd.read_sql(sql_fct_delivery_radius_log, con = db)

df_fct_delivery_radius_log.head()

Unnamed: 0,index,DELIVERY_AREA_ID,DELIVERY_RADIUS_METERS,EVENT_STARTED_TIMESTAMP
0,0,5db02e5d401d690c836b9ead,3000,2022-06-14T08:26:20.923854Z
1,1,5db02e5d401d690c836b9ead,7000,2022-06-14T08:49:01.186365Z
2,2,5db02e5d401d690c836b9ead,3000,2022-06-18T07:43:57.662294Z
3,3,5db02e5d401d690c836b9ead,7000,2022-06-18T08:00:45.227506Z
4,4,5d78a7e552dfabd5251dab7b,4000,2022-06-18T08:05:29.093983Z


In [144]:
# asserting if the ingested table in there
sql_fct_delivery_radius_log = '''
WITH delivery_radius_log AS (
    SELECT *
        , LAG("DELIVERY_RADIUS_METERS") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_delivery_radius_meters
        , LAG("EVENT_STARTED_TIMESTAMP") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_event_started_timestamp 
    FROM ods.delivery_radius_log
    -- WHERE "DELIVERY_AREA_ID" = '5cc1b60b034adf90cd8f14dd'
), fct_delivery_radius_log AS (
    SELECT *
      , "DELIVERY_RADIUS_METERS" < previous_delivery_radius_meters AS is_reduction
      , EXTRACT('HOUR' FROM ("EVENT_STARTED_TIMESTAMP"::TIMESTAMP - previous_event_started_timestamp::TIMESTAMP)) AS delta_hours
    FROM delivery_radius_log
)
SELECT *
FROM fct_delivery_radius_log
-- WHERE delta_hours >=23
-- ORDER BY delta_hours DESC
;
'''

df_fct_delivery_radius_log = pd.read_sql(sql_fct_delivery_radius_log,
                                         con = db)

df_fct_delivery_radius_log.head(5000)

Unnamed: 0,index,DELIVERY_AREA_ID,DELIVERY_RADIUS_METERS,EVENT_STARTED_TIMESTAMP,previous_delivery_radius_meters,previous_event_started_timestamp,is_reduction,delta_hours
0,694,5cc1b60b034adf90cd8f14dd,3500,2021-12-01T12:12:41.947087Z,,,,
1,695,5cc1b60b034adf90cd8f14dd,6500,2021-12-01T12:30:09.40586Z,3500.0,2021-12-01T12:12:41.947087Z,False,0.0
2,696,5cc1b60b034adf90cd8f14dd,3500,2021-12-02T13:16:21.329693Z,6500.0,2021-12-01T12:30:09.40586Z,True,0.0
3,697,5cc1b60b034adf90cd8f14dd,6500,2021-12-02T13:27:00.815321Z,3500.0,2021-12-02T13:16:21.329693Z,False,0.0
4,707,5cc1b60b034adf90cd8f14dd,3500,2021-12-05T15:52:54.673552Z,6500.0,2021-12-02T13:27:00.815321Z,True,2.0
...,...,...,...,...,...,...,...,...
1311,1308,5db02e5d401d690c836b9ead,7000,2023-01-03T13:14:36.552923Z,3000.0,2023-01-03T12:41:15.581536Z,False,0.0
1312,1309,5db02e5d401d690c836b9ead,3500,2023-01-03T13:32:08.64009Z,7000.0,2023-01-03T13:14:36.552923Z,True,0.0
1313,1310,5db02e5d401d690c836b9ead,7000,2023-01-03T13:51:42.840053Z,3500.0,2023-01-03T13:32:08.64009Z,False,0.0
1314,1314,5db02e5d401d690c836b9ead,3500,2023-01-03T14:28:54.650661Z,7000.0,2023-01-03T13:51:42.840053Z,True,0.0


# Default Delivery Radius for all delivery area

In [148]:
# asserting if the ingested table in there
sql_fct_delivery_radius_log = '''
WITH delivery_radius_log AS (
    SELECT *
        , LAG("DELIVERY_RADIUS_METERS") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_delivery_radius_meters
        , LAG("EVENT_STARTED_TIMESTAMP") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_event_started_timestamp 
    FROM ods.delivery_radius_log
    -- WHERE "DELIVERY_AREA_ID" = '5cc1b60b034adf90cd8f14dd'
), fct_delivery_radius_log AS (
    SELECT *
      , "DELIVERY_RADIUS_METERS" < previous_delivery_radius_meters AS is_reduction
      , EXTRACT('HOUR' FROM ("EVENT_STARTED_TIMESTAMP"::TIMESTAMP - previous_event_started_timestamp::TIMESTAMP)) AS delta_hours
    FROM delivery_radius_log
)
SELECT "DELIVERY_AREA_ID" AS delivery_area_id
    , "DELIVERY_RADIUS_METERS" AS default_delivery_radius_meters
    , "EVENT_STARTED_TIMESTAMP" AS latest_event_started_timestamp
FROM (
    SELECT *
        , ROW_NUMBER() OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY delta_hours DESC) AS _row_number
    FROM fct_delivery_radius_log
)
WHERE _row_number = 1
;
'''

df_fct_delivery_radius_log = pd.read_sql(sql_fct_delivery_radius_log,
                                         con = db)

df_fct_delivery_radius_log.head()

Unnamed: 0,delivery_area_id,default_delivery_radius_meters,latest_event_started_timestamp
0,5cc1b60b034adf90cd8f14dd,3500,2021-12-01T12:12:41.947087Z
1,5d78a7e552dfabd5251dab7b,4000,2021-12-02T14:23:43.714277Z
2,5db02e5d401d690c836b9ead,4000,2021-12-03T17:25:04.855491Z


In [214]:
# Save the dimension table in the DB for re-using
sql_delivery_radius_log = '''
WITH delivery_radius_log AS (
    SELECT *
        , LAG("DELIVERY_RADIUS_METERS") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_delivery_radius_meters
        , LAG("EVENT_STARTED_TIMESTAMP") OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY "EVENT_STARTED_TIMESTAMP") AS previous_event_started_timestamp 
    FROM ods.delivery_radius_log
    -- WHERE "DELIVERY_AREA_ID" = '5cc1b60b034adf90cd8f14dd'
), fct_delivery_radius_log AS (
    SELECT *
      , "DELIVERY_RADIUS_METERS" < previous_delivery_radius_meters AS is_reduction
      , EXTRACT('epoch' FROM ("EVENT_STARTED_TIMESTAMP"::TIMESTAMP - previous_event_started_timestamp::TIMESTAMP))/3600 AS delta_hours
    FROM delivery_radius_log
)
SELECT "DELIVERY_AREA_ID" AS delivery_area_id
    , "DELIVERY_RADIUS_METERS" AS default_delivery_radius_meters
    , "EVENT_STARTED_TIMESTAMP" AS latest_event_started_timestamp
FROM (
    SELECT *
        , ROW_NUMBER() OVER (PARTITION BY "DELIVERY_AREA_ID" ORDER BY delta_hours DESC) AS _row_number
    FROM fct_delivery_radius_log
)
WHERE _row_number = 1
;
'''

df_delivery_radius_log = pd.read_sql(sql_delivery_radius_log, con = db)

df_delivery_radius_log.head()


df_delivery_radius_log.to_sql(
    name = 'delivery_areas',
    schema = 'dim',
    con=db,
    if_exists = 'replace'
)


cursor.execute(sql_delivery_radius_log) 
cursor.fetchall() 

# Assert if the table creation is successful
sql_fct_delivery_radius_log = '''
SELECT *
FROM dim.delivery_areas
;
'''

df_fct_delivery_radius_log = pd.read_sql(sql_fct_delivery_radius_log, con = db)

df_fct_delivery_radius_log.head()


Unnamed: 0,index,delivery_area_id,default_delivery_radius_meters,latest_event_started_timestamp
0,0,5cc1b60b034adf90cd8f14dd,3500,2021-12-01T12:12:41.947087Z
1,1,5d78a7e552dfabd5251dab7b,4000,2021-12-02T14:23:43.714277Z
2,2,5db02e5d401d690c836b9ead,4000,2021-12-03T17:25:04.855491Z


## Hours of radius reductions with respect to the the default radiuses per each delivery area without the given timeframe

In [221]:
# asserting if the ingested table in there
sql_sum_delivery_area_radius_reduction_durations = '''
WITH delivery_radius_log AS (
    SELECT logs.*
        , areas."default_delivery_radius_meters"
        , LAG(logs."EVENT_STARTED_TIMESTAMP") OVER (PARTITION BY logs."DELIVERY_AREA_ID" ORDER BY logs."EVENT_STARTED_TIMESTAMP") AS previous_event_started_timestamp 
        , logs."DELIVERY_RADIUS_METERS" < areas."default_delivery_radius_meters" AS is_reduction
    FROM ods.delivery_radius_log AS logs
    LEFT JOIN dim.delivery_areas areas ON logs."DELIVERY_AREA_ID" = areas."delivery_area_id"
), fct_delivery_radius_log AS (
    SELECT *
        , EXTRACT('epoch' FROM ("EVENT_STARTED_TIMESTAMP"::TIMESTAMP - previous_event_started_timestamp::TIMESTAMP))/3600 AS delta_hours
    FROM delivery_radius_log
)
/*
SELECT *
FROM fct_delivery_radius_log
WHERE "DELIVERY_AREA_ID" = '5cc1b60b034adf90cd8f14dd'
    AND is_reduction
*/


SELECT "DELIVERY_AREA_ID"
    , "default_delivery_radius_meters"
    , SUM(CASE WHEN is_reduction THEN delta_hours END) AS reduction_hours
FROM fct_delivery_radius_log
GROUP BY 1,2
;
'''

df_sum_delivery_area_radius_reduction_durations = pd.read_sql(sql_sum_delivery_area_radius_reduction_durations,
                                         con = db)

df_sum_delivery_area_radius_reduction_durations.head(500)

Unnamed: 0,DELIVERY_AREA_ID,default_delivery_radius_meters,reduction_hours
0,5d78a7e552dfabd5251dab7b,4000,1721.26467
1,5db02e5d401d690c836b9ead,4000,9176.436851
2,5cc1b60b034adf90cd8f14dd,3500,617.643406


In [232]:
# Create the summary table in DB
df_sum_delivery_area_radius_reduction_durations.to_sql(
    name = 'delivery_area_radius_reduction_durations',
    schema = 'sum',
    con=db,
    if_exists = 'replace'
)

cursor.execute(sql_sum_delivery_area_radius_reduction_durations) 
cursor.fetchall() 

# Assert if the table creation is successful
sql_sum_delivery_area_radius_reduction_durations = '''
SELECT *
FROM sum.delivery_area_radius_reduction_durations
;
'''

df_sum_delivery_area_radius_reduction_durations = pd.read_sql(sql_sum_delivery_area_radius_reduction_durations, con = db)

df_sum_delivery_area_radius_reduction_durations.head()


DuplicateColumnError: A column with name 'level_0' is already present in table 'delivery_area_radius_reduction_durations_3'.