In [1]:
import pypyodbc as odbc
import pandas as pd

DRIVER_NAME = 'SQL SERVER'
SERVER_NAME = 'DESKTOP-I0V76P2\SQLEXPRESS' 
DATABASE_NAME = 'data_mart'

connection_string = f"""
    DRIVER={{{DRIVER_NAME}}};
    SERVER={SERVER_NAME};
    DATABASE={DATABASE_NAME};
    Trust_Connection=yes;
    # uid=;
    # pwd=;
"""

def execute_query_to_df(query):
    """Executes a SQL query and returns the results as a pandas DataFrame."""

    conn = odbc.connect(connection_string)
    cursor = conn.cursor()
    cursor.execute(query)
    rows = cursor.fetchall()
    df = pd.DataFrame(rows, columns=cursor.description)
    df.columns = [col[0] for col in df.columns]
    cursor.close()
    conn.close()
    return df

def execute_update(query):
    """Executes a SQL query that doesn't return results."""

    conn = odbc.connect(connection_string)
    cursor = conn.cursor()
    cursor.execute(query)
    conn.commit()  # Commit the changes to the database
    cursor.close()
    conn.close()

1. Data Cleansing Steps
In a single query, perform the following operations and generate a new table in the data_mart schema named clean_weekly_sales:

* Convert the week_date to a DATE format

* Add a week_number as the second column for each week_date value, for example any value from the 1st of January to 7th of January will be 1, 8th to 14th will be 2 etc

* Add a month_number with the calendar month for each week_date value as the 3rd column

* Add a calendar_year column as the 4th column containing either 2018, 2019 or 2020 values

* Add a new column called age_band after the original segment column using the following mapping on the number inside the segment value

segment	age_band

1	Young Adults

2	Middle Aged

3 or 4	Retirees


* Add a new demographic column using the following mapping for the first letter in the segment values:

segment	demographic

C	Couples

F	Families

* Ensure all null string values with an "unknown" string value in the original segment column as well as the new age_band and demographic columns

* Generate a new avg_transaction column as the sales value divided by transactions rounded to 2 decimal places for each record

In [90]:
query = """
WITH cte AS(
    SELECT 
        *,
        CONVERT(date,
            CONCAT(
                    CASE
                        WHEN RIGHT(LEFT(week_date,2),1) = '/' 
                        THEN '0' + LEFT(week_date,1) 
                        ELSE LEFT(week_date,2) END,
                    '/',
                    CASE
                        WHEN LEFT(RIGHT(week_date,5),1) = '/'
                        THEN '0' + LEFT(RIGHT(week_date,4),1)
                        ELSE LEFT(RIGHT(week_date,5),2) END,
                    '/',
                    '20' + RIGHT(week_date,2)), 103) AS week_date_clean,
        CASE 
            WHEN segment IS NULL THEN 'unknown'
            WHEN segment = 'null' THEN 'unknown'
            ELSE segment END AS segment2,
        CASE
            WHEN segment IS NULL THEN 'unknown'
            WHEN segment = 'null' THEN 'unknown'
            WHEN RIGHT(segment,1) = '1' THEN 'Young Adults'
            WHEN RIGHT(segment,1) = '2' THEN 'Middle Aged'
            ELSE 'Retirees' END AS age_band,
        CASE
            WHEN segment IS NULL THEN 'unknown'
            WHEN segment = 'null' THEN 'unknown'
            WHEN LEFT(segment,1) = 'C' THEN 'Couples'
            WHEN LEFT(segment,1) = 'F' THEN 'Families' END AS demographic,
        CAST(sales / transactions AS DECIMAL(10,2)) AS avg_transaction
        FROM weekly_sales
    )

SELECT
    week_date_clean AS date,
    DATEPART(week, week_date_clean) - 
        DATEPART(week, DATEADD(year, DATEDIFF(year, 0, week_date_clean), 0)) + 1 AS week_number,
    DATEPART(month, week_date_clean) AS month_number,
    DATEPART(year, week_date_clean) AS calendar_year,
    segment2,
    age_band,
    demographic,
    region,
    platform, 
    segment, 
    customer_type, 
    transactions, 
    sales, 
    avg_transaction
INTO clean_weekly_sales
FROM cte
"""
execute_update(query)

### 2. Data Exploration
What day of the week is used for each week_date value?

What range of week numbers are missing from the dataset?

How many total transactions were there for each year in the dataset?

What is the total sales for each region for each month?

What is the total count of transactions for each platform

What is the percentage of sales for Retail vs Shopify for each month?

What is the percentage of sales by demographic for each year in the dataset?

Which age_band and demographic values contribute the most to Retail sales?

Can we use the avg_transaction column to find the average transaction size for each year for Retail vs Shopify? If not - how would you calculate it instead?

In [8]:
# What day of the week is used for each week_date value?

query = """
SELECT 
    FORMAT(date, 'dddd') AS weekday,
    COUNT(date) AS count_of_orders
FROM clean_weekly_sales
GROUP BY FORMAT(date, 'dddd')
"""

execute_query_to_df(query)

Unnamed: 0,weekday,count_of_orders
0,Monday,17117


In [46]:
# What range of week numbers are missing from the dataset?

query = """
WITH cte_num AS(
    SELECT
        1 AS week_num
    
    UNION ALL
    
    SELECT
        week_num + 1
    FROM cte_num
        WHERE week_num + 1 <= 52
        )
        
SELECT 
    cte_num.week_num
FROM cte_num
    LEFT OUTER JOIN clean_weekly_sales AS ws
        ON cte_num.week_num = ws.week_number
GROUP BY cte_num.week_num 
HAVING COUNT(ws.week_number) = 0
ORDER BY cte_num.week_num
"""

execute_query_to_df(query).head()

Unnamed: 0,week_num
0,1
1,2
2,3
3,4
4,5


In [25]:
# How many total transactions were there for each year in the dataset?

query = """
SELECT
    calendar_year,
    SUM(transactions) AS total_transactions
FROM clean_weekly_sales
GROUP BY calendar_year
"""

execute_query_to_df(query)


Unnamed: 0,calendar_year,total_transactions
0,2019,365639285
1,2020,375813651
2,2018,346406460


In [45]:
# What is the total sales for each region for each month?

query = """
SELECT
    region,
    month_number,
    FORMAT(
        SUM(CAST(sales AS FLOAT)),
        'N0') AS total_sales
FROM clean_weekly_sales
GROUP BY region, month_number
ORDER BY region, month_number
"""

execute_query_to_df(query).head()

Unnamed: 0,region,month_number,total_sales
0,AFRICA,3,567767480
1,AFRICA,4,1911783504
2,AFRICA,5,1647244738
3,AFRICA,6,1767559760
4,AFRICA,7,1960219710


In [44]:
# What is the total sales for each region for each month?

query = """
WITH AggregatedSales AS (
    SELECT
        region,
        month_number,
        SUM(CAST(sales AS FLOAT)) AS total_sales
    FROM clean_weekly_sales
    GROUP BY region, month_number
)

SELECT
    month_number,
    FORMAT([AFRICA], 'N0') AS AFRICA,
    FORMAT([ASIA], 'N0') AS ASIA,
    FORMAT([CANADA], 'N0') AS CANADA,
    FORMAT([EUROPE], 'N0') AS EUROPE,
    FORMAT([OCEANIA], 'N0') AS OCEANIA,
    FORMAT([SOUTH AMERICA], 'N0') AS [SOUTH AMERICA],
    FORMAT([USA], 'N0') AS USA
FROM 
    AggregatedSales
PIVOT (
    SUM(total_sales)
    FOR region IN ([AFRICA], [ASIA], [CANADA], [EUROPE], [OCEANIA], [SOUTH AMERICA], [USA])
) AS P
ORDER BY month_number;

"""

execute_query_to_df(query)

Unnamed: 0,month_number,africa,asia,canada,europe,oceania,south america,usa
0,3,567767480,529770793,144634329,35337093,783282888,71023109,225353043
1,4,1911783504,1804628707,484552594,127334255,2599767620,238451531,759786323
2,5,1647244738,1526285399,412378365,109338389,2215657304,201391809,655967121
3,6,1767559760,1619482889,443846698,122813826,2371884744,218247455,703878990
4,7,1960219710,1768844756,477134947,136757466,2563459400,235582776,760331754
5,8,1809596890,1663320609,447073019,122102995,2432313652,221166052,712002790
6,9,276320987,252836807,69067959,18877433,372465518,34175583,110532368


In [49]:
# What is the total count of transactions for each platform

query = """
SELECT 
    platform,
    SUM(transactions) AS sum_of_transactions
FROM clean_weekly_sales
    GROUP BY platform
"""

execute_query_to_df(query)

Unnamed: 0,platform,sum_of_transactions
0,Shopify,5925169
1,Retail,1081934227


In [91]:
# What is the percentage of sales for Retail vs Shopify for each month?

query = """
WITH sales_categorized_by_platform AS(
    SELECT 
        FORMAT(date, 'yyyy-MM') AS date,
        CASE 
            WHEN platform = 'Retail' THEN 1 ELSE 0 END AS retail_count,
        CASE 
            WHEN platform = 'Shopify' THEN 1 ELSE 0 END AS shopify_count,
        CAST(sales AS float) AS sales
    FROM clean_weekly_sales
    ),

    aggregated_sales_by_platform AS (   
    SELECT
        date,
        
        CAST(
            SUM(retail_count * sales) / 
            SUM(sales) * 100 
        AS DECIMAL(10,2)) AS retail_pct,
        
        CAST(
            SUM(shopify_count * sales) /
            SUM(sales) * 100 
        AS DECIMAL(10,2)) AS shopify_pct
        
    FROM sales_categorized_by_platform
    GROUP BY date
)

SELECT 
    *
FROM aggregated_sales_by_platform
"""

execute_query_to_df(query)

Unnamed: 0,date,retail_pct,shopify_pct
0,2020-05,96.71,3.29
1,2019-09,97.09,2.91
2,2018-06,97.76,2.24
3,2019-06,97.42,2.58
4,2019-05,97.52,2.48
5,2019-03,97.71,2.29
6,2020-04,96.96,3.04
7,2018-05,97.73,2.27
8,2019-07,97.35,2.65
9,2018-04,97.93,2.07


In [108]:
# What is the percentage of sales for Retail vs Shopify for each month?

query = """
    SELECT 
        FORMAT(date, 'yyyy-MM') AS date,
        CAST(
            SUM( 
                CASE 
                    WHEN platform = 'Retail' 
                    THEN CAST(sales AS FLOAT) ELSE 0 END) / 
                        SUM(CAST(sales AS float)) * 100 AS DECIMAL(10,2)) AS retail_count,
        CAST(
            SUM(
                CASE 
                    WHEN platform = 'Shopify' 
                    THEN CAST(sales AS FLOAT) ELSE 0 END) / 
                        SUM(CAST(sales AS float)) * 100 AS DECIMAL(10,2)) AS shopify_count
    FROM clean_weekly_sales
    GROUP BY FORMAT(date, 'yyyy-MM')
"""

execute_query_to_df(query)

Unnamed: 0,date,retail_count,shopify_count
0,2020-05,96.71,3.29
1,2019-09,97.09,2.91
2,2018-06,97.76,2.24
3,2019-06,97.42,2.58
4,2019-05,97.52,2.48
5,2019-03,97.71,2.29
6,2020-04,96.96,3.04
7,2018-05,97.73,2.27
8,2019-07,97.35,2.65
9,2018-04,97.93,2.07


In [130]:
# What is the percentage of sales by demographic for each year in the dataset?

query = """
WITH d AS (
    SELECT 
        FORMAT(date, 'yyyy') AS year,
        demographic,
        SUM(CAST(sales AS FLOAT)) AS sales
    FROM clean_weekly_sales
    GROUP BY FORMAT(date, 'yyyy'), demographic
        )
        
    SELECT
        demographic,
        FORMAT([2018], 'N0') AS [2018], 
        FORMAT([2019], 'N0') AS [2019], 
        FORMAT([2020], 'N0') AS [2020]
    FROM d
    PIVOT(
        SUM(sales)
        FOR year 
            IN ([2018], [2019], [2020])
    ) AS p;
"""

execute_query_to_df(query)

Unnamed: 0,demographic,2018,2019,2020
0,Couples,3402388688,3749251935,4049566928
1,Families,4125558033,4463918344,4614338065
2,unknown,5369434106,5532862221,5436315907


In [166]:
# Which age_band and demographic values contribute the most to Retail sales?

query = """
    SELECT TOP (1)
        age_band,
        CAST(SUM(CAST(sales AS FLOAT)) AS DECIMAL(20,0)) AS total_sales
    FROM clean_weekly_sales
    WHERE age_band <> 'unknown'
    GROUP BY age_band
    ORDER BY total_sales DESC
"""

execute_query_to_df(query)

Unnamed: 0,age_band,total_sales
0,Retirees,13281572202


In [165]:
# Which age_band and demographic values contribute the most to Retail sales?

query = """
    SELECT TOP (1)
        demographic,
        CAST(SUM(CAST(sales AS FLOAT)) AS DECIMAL(20,0)) AS total_sales
    FROM clean_weekly_sales
    WHERE demographic <> 'unknown'
    GROUP BY demographic
    ORDER BY total_sales DESC
"""

execute_query_to_df(query)

Unnamed: 0,demographic,total_sales
0,Families,13203814442


In [164]:
# Which age_band and demographic values contribute the most to Retail sales?

query = """
    SELECT TOP (1)
        age_band,
        demographic,
        CAST(SUM(CAST(sales AS FLOAT)) AS DECIMAL(20,0)) AS total_sales
    FROM clean_weekly_sales
    WHERE demographic <> 'unknown' OR age_band <> 'unknown'
    GROUP BY age_band, demographic
    ORDER BY total_sales DESC
"""

execute_query_to_df(query)

Unnamed: 0,age_band,demographic,total_sales
0,Retirees,Families,6750457132


In [177]:
# Can we use the avg_transaction column to find the average transaction size for each 
# year for Retail vs Shopify? If not - how would you calculate it instead?

query = """
WITH shopify_avg_transaction AS(
    SELECT
        CAST(AVG(avg_transaction) AS DECIMAL(10,2)) AS shopify_avg_transaction
    FROM clean_weekly_sales
    WHERE platform = 'Shopify'

    ),
    
retail_avg_transaction AS(
    SELECT
        CAST(AVG(avg_transaction) AS DECIMAL(10,2)) AS retail_avg_transaction
    FROM clean_weekly_sales
    WHERE platform = 'Retail'
    )
    
SELECT 
*,
shopify_avg_transaction - retail_avg_transaction AS transaction_difference
FROM shopify_avg_transaction
    CROSS JOIN retail_avg_transaction

"""

execute_query_to_df(query)

Unnamed: 0,shopify_avg_transaction,retail_avg_transaction,transaction_difference
0,180.23,41.84,138.39


3. Before & After Analysis

This technique is usually used when we inspect an important event and want to inspect the impact before and after a certain point in time.

Taking the week_date value of 2020-06-15 as the baseline week where the Data Mart sustainable packaging changes came into effect.

We would include all week_date values for 2020-06-15 as the start of the period after the change and the previous week_date values would be before

Using this analysis approach - answer the following questions:

What is the total sales for the 4 weeks before and after 2020-06-15? What is the growth or reduction rate in actual values and percentage of sales?

What about the entire 12 weeks before and after?

How do the sale metrics for these 2 periods before and after compare with the previous years in 2018 and 2019?

In [188]:
# What is the total sales for the 4 weeks before and after 2020-06-15? 
# Works, but only because the dates are in the middle of the year, 
# if at the start or end, then it would fail.

query = """
WITH sus_status AS(
    SELECT
        CASE
            WHEN date < '20200615' THEN 'before' ELSE 'after' END AS sustainable_status,
        CAST(sales AS FLOAT) AS sales
    FROM clean_weekly_sales
    WHERE (week_number  BETWEEN     (SELECT 
                                        MAX(week_number) 
                                    FROM clean_weekly_sales
                                    WHERE date = '20200615') - 4
                        AND         (SELECT 
                                        MAX(week_number) 
                                    FROM clean_weekly_sales
                                    WHERE date = '20200615') + 3)
        AND YEAR(date) = '2020' 
    )
    
SELECT 
    sustainable_status, 
    CAST(SUM(sales) AS DECIMAL(20,2)) AS sales
FROM sus_status 
GROUP BY sustainable_status
"""

execute_query_to_df(query)

Unnamed: 0,sustainable_status,sales
0,after,2318994169.0
1,before,2345878357.0


In [227]:
# What is the total sales for the 4 weeks before and after 2020-06-15? 
# This works whether the date sustainable packaging was introduced at the start
# or end of the year.

query = """
WITH sus_status AS(
    SELECT
        date,
        DENSE_RANK() 
            OVER(
                ORDER BY date) AS rownum,
        CASE
            WHEN date < '20200615' THEN 'before' ELSE 'after' END AS sustainable_status,
        CAST(sales AS FLOAT) AS sales
    FROM clean_weekly_sales
    )
    
SELECT 
    sustainable_status,
    CAST(SUM(CAST(sales AS FLOAT)) AS DECIMAL(20,0)) AS sales
FROM sus_status
WHERE   rownum >=     (SELECT         
                        MAX(rownum)
                    FROM sus_status
                    WHERE date = '20200615'
                    ) - 4
AND     rownum <=   (SELECT         
                        MAX(rownum)
                    FROM sus_status
                    WHERE date = '20200615'
                    ) + 3
GROUP BY sustainable_status
"""

execute_query_to_df(query)

Unnamed: 0,sustainable_status,sales
0,before,2345878357
1,after,2318994169


In [256]:
# What is the total sales for the 4 weeks before and after 2020-06-15? 
# What is the growth or reduction rate in actual values and percentage of sales?
# Full answer, but easier to do it with date function

query = """
WITH sus_status AS(
    SELECT
        date,
        DENSE_RANK() 
            OVER(
                ORDER BY date) AS rownum,
        CASE
            WHEN date < '20200615' THEN 'before' ELSE 'after' END AS sustainable_status,
        CAST(sales AS FLOAT) AS sales
    FROM clean_weekly_sales
    ),

sales_by_status AS( 
    SELECT 
        'actual' AS status,
        sustainable_status,
        CAST(SUM(CAST(sales AS FLOAT)) AS DECIMAL(20,0)) AS sales
    FROM sus_status
    WHERE   rownum >=     (SELECT         
                            MAX(rownum)
                        FROM sus_status
                        WHERE date = '20200615'
                        ) - 4
    AND     rownum <=   (SELECT         
                            MAX(rownum)
                        FROM sus_status
                        WHERE date = '20200615'
                        ) + 3
    GROUP BY sustainable_status
    ),
  
actual AS(   
    SELECT
        status,
        CAST(before AS DECIMAL(20,2)) AS before,
        CAST(after AS DECIMAL(20,2)) AS after
    FROM sales_by_status
        PIVOT(
            SUM(sales)
            FOR sustainable_status 
                IN (before, after)
        ) AS p
        ),

act_pct_union AS(
    SELECT * FROM actual

    UNION ALL

    SELECT 
        'percent' AS status,
        CAST(before / (before + after) * 100 AS DECIMAL(10,2)) AS before,
        CAST(after / (before + after) * 100 AS DECIMAL(10,2)) AS after
    FROM actual
)

SELECT *, after - before AS growth FROM act_pct_union
"""

execute_query_to_df(query)

Unnamed: 0,status,before,after,growth
0,actual,2345878357.0,2318994169.0,-26884188.0
1,percent,50.29,49.71,-0.58


In [303]:
# Same question as above but using DATEADD function in the WHERE clause

query = """
WITH sales_by_status AS(
        SELECT 
            'before' AS status, 
            CAST(SUM(CAST(sales AS FLOAT)) / 4 AS DECIMAL(20,0)) AS sales 
        FROM clean_weekly_sales 
        WHERE   date >= DATEADD(ww, -4, '20200615') 
            AND date < '20200615'
        
    UNION

        SELECT 
            'after' AS status, 
            CAST(SUM(CAST(sales AS FLOAT)) / 4 AS DECIMAL(20,0)) AS sales 
        FROM clean_weekly_sales 
        WHERE   date <= DATEADD(ww, 3, '20200615') 
            AND date >= '20200615'
    ),
    
sales_pivot AS (    
    SELECT
        'actual' AS status,
        CAST(before AS DECIMAL(20,2)) AS before,
        CAST(after AS DECIMAL(20,2)) AS after
    FROM sales_by_status
        PIVOT(
            SUM(sales)
            FOR status
                IN ( before, after )        
        ) AS p
    ),

sales_pivot_union AS(    
    SELECT * FROM sales_pivot   

    UNION

    SELECT 
        'percent' AS status,
        CAST(before / (after + before) * 100 AS DECIMAL(20,2)) AS before,
        CAST(after / (after + before) * 100  AS DECIMAL(20,2)) AS after
    FROM sales_pivot 
    )
 
SELECT 
    *,
    after - before AS growth
FROM sales_pivot_union
"""

execute_query_to_df(query)

Unnamed: 0,status,before,after,growth
0,actual,586469589.0,579748542.0,-6721047.0
1,percent,50.29,49.71,-0.58


In [257]:
# What about the entire 12 weeks before and after?
# Same as above, but adjusted week numbers, easier to do with date functions

query = """
WITH sus_status AS(
    SELECT
        date,
        DENSE_RANK() 
            OVER(
                ORDER BY date) AS rownum,
        CASE
            WHEN date < '20200615' THEN 'before' ELSE 'after' END AS sustainable_status,
        CAST(sales AS FLOAT) AS sales
    FROM clean_weekly_sales
    ),

sales_by_status AS( 
    SELECT 
        'actual' AS status,
        sustainable_status,
        CAST(SUM(CAST(sales AS FLOAT)) AS DECIMAL(20,0)) AS sales
    FROM sus_status
    WHERE   rownum >=     (SELECT         
                            MAX(rownum)
                        FROM sus_status
                        WHERE date = '20200615'
                        ) - 12
    AND     rownum <=   (SELECT         
                            MAX(rownum)
                        FROM sus_status
                        WHERE date = '20200615'
                        ) + 11
    GROUP BY sustainable_status
    ),
  
actual AS(   
    SELECT
        status,
        CAST(before AS DECIMAL(20,2)) AS before,
        CAST(after AS DECIMAL(20,2)) AS after
    FROM sales_by_status
        PIVOT(
            SUM(sales)
            FOR sustainable_status 
                IN (before, after)
        ) AS p
        ),

act_pct_union AS(
    SELECT * FROM actual

    UNION ALL

    SELECT 
        'percent' AS status,
        CAST(before / (before + after) * 100 AS DECIMAL(10,2)) AS before,
        CAST(after / (before + after) * 100 AS DECIMAL(10,2)) AS after
    FROM actual
)

SELECT *, after - before AS growth FROM act_pct_union
"""

execute_query_to_df(query)

Unnamed: 0,status,before,after,growth
0,actual,7126273147.0,6973947753.0,-152325394.0
1,percent,50.54,49.46,-1.08


In [273]:
# How do the sale metrics for these 2 periods before and after compare with the previous years in 2018 and 2019?

query = """
    SELECT 
        '2018' AS period, 
        CAST(SUM(CAST(sales AS FLOAT)) / 52 AS DECIMAL(20,0)) AS sales 
    FROM clean_weekly_sales 
    WHERE calendar_year = 2018

UNION

    SELECT 
        '2019' AS period, 
        CAST(SUM(CAST(sales AS FLOAT)) / 52 AS DECIMAL(20,0)) AS sales 
    FROM clean_weekly_sales 
    WHERE calendar_year = 2019

UNION

    SELECT 
        '4 weeks before' AS period, 
        CAST(SUM(CAST(sales AS FLOAT)) / 4 AS DECIMAL(20,0)) AS sales 
    FROM clean_weekly_sales 
    WHERE   date >= DATEADD(ww, -4, '20200615') 
        AND date < '20200615'
    
UNION

    SELECT 
        '4 weeks after' AS period, 
        CAST(SUM(CAST(sales AS FLOAT)) / 4 AS DECIMAL(20,0)) AS sales 
    FROM clean_weekly_sales 
    WHERE   date <= DATEADD(ww, 3, '20200615') 
        AND date >= '20200615'
"""

execute_query_to_df(query)

Unnamed: 0,period,sales
0,2018,248026554
1,2019,264346779
2,4 weeks after,579748542
3,4 weeks before,586469589


Which areas of the business have the highest negative impact in sales metrics performance in 2020 for the 12 week before and after period?

region
platform
age_band
demographic
customer_type
Do you have any further recommendations for Danny’s team at Data Mart or any interesting insights based off this analysis?

In [317]:
# Finding the top 5 combinations of region, platform, age_band, customert_type

query = """
WITH before AS(
    SELECT 
        region,
        demographic,
        customer_type,
        SUM(CAST(sales AS FLOAT)) AS sales_before
    FROM clean_weekly_sales 
    WHERE   date >= DATEADD(ww, -4, '20200615') 
        AND date < '20200615'
    GROUP BY region, demographic, customer_type
    ),
    
after AS(
    SELECT 
        region,
        demographic,
        customer_type,
        SUM(CAST(sales AS FLOAT)) AS sales_after
    FROM clean_weekly_sales 
    WHERE   date <= DATEADD(ww, 3, '20200615') 
        AND date >= '20200615'
    GROUP BY region, demographic, customer_type
    )

SELECT TOP(5) 
    b.region,
    b.demographic,
    b.customer_type,
    b.sales_before,
    a.sales_after,
    b.sales_before - a.sales_after AS change
FROM before AS b
    INNER JOIN after AS a
        ON b.region = a.region
        AND b.demographic = a.demographic
        AND b.customer_type = a.customer_type
ORDER BY change ASC
"""

execute_query_to_df(query)

Unnamed: 0,region,demographic,customer_type,sales_before,sales_after,change
0,AFRICA,unknown,Guest,182945895.0,184127468.0,-1181573.0
1,AFRICA,Couples,Existing,132609323.0,133748514.0,-1139191.0
2,AFRICA,Couples,New,36836723.0,37729445.0,-892722.0
3,EUROPE,unknown,Guest,13991430.0,14626468.0,-635038.0
4,AFRICA,Families,New,20949797.0,21439492.0,-489695.0


In [359]:
query = """
WITH region_before AS(
    SELECT 
        region,
        SUM(CAST(sales AS FLOAT)) AS sales_before
    FROM clean_weekly_sales 
    WHERE   date >= DATEADD(ww, -4, '20200615') 
        AND date < '20200615'
    GROUP BY region
    ),
    
region_after AS(
    SELECT 
        region,
        SUM(CAST(sales AS FLOAT)) AS sales_after
    FROM clean_weekly_sales 
    WHERE   date <= DATEADD(ww, 3, '20200615') 
        AND date >= '20200615'
    GROUP BY region
    ),
    
demographic_before AS(
    SELECT 
        demographic,
        SUM(CAST(sales AS FLOAT)) AS sales_before
    FROM clean_weekly_sales 
    WHERE   date >= DATEADD(ww, -4, '20200615') 
        AND date < '20200615'
    GROUP BY demographic
    ),
    
demographic_after AS(
    SELECT 
        demographic,
        SUM(CAST(sales AS FLOAT)) AS sales_after
    FROM clean_weekly_sales 
    WHERE   date <= DATEADD(ww, 3, '20200615') 
        AND date >= '20200615'
    GROUP BY demographic
    ),

customer_type_before AS(
    SELECT 
        customer_type,
        SUM(CAST(sales AS FLOAT)) AS sales_before
    FROM clean_weekly_sales 
    WHERE   date >= DATEADD(ww, -4, '20200615') 
        AND date < '20200615'
    GROUP BY customer_type
    ),
    
customer_type_after AS(
    SELECT 
        customer_type,
        SUM(CAST(sales AS FLOAT)) AS sales_after
    FROM clean_weekly_sales 
    WHERE   date <= DATEADD(ww, 3, '20200615') 
        AND date >= '20200615'
    GROUP BY customer_type
    ),

platform_before AS(
    SELECT 
        platform,
        SUM(CAST(sales AS FLOAT)) AS sales_before
    FROM clean_weekly_sales 
    WHERE   date >= DATEADD(ww, -4, '20200615') 
        AND date < '20200615'
    GROUP BY platform
    ),
    
platform_after AS(
    SELECT 
        platform,
        SUM(CAST(sales AS FLOAT)) AS sales_after
    FROM clean_weekly_sales 
    WHERE   date <= DATEADD(ww, 3, '20200615') 
        AND date >= '20200615'
    GROUP BY platform
    ),

union_all AS(
    SELECT
        'region' AS type,
        b.region AS value,
        b.sales_before,
        a.sales_after,
        b.sales_before - a.sales_after AS act_change,
        ((a.sales_after - b.sales_before) / b.sales_before) * 100 AS pct_change
    FROM region_before AS b
        INNER JOIN region_after AS a
            ON b.region = a.region

    UNION ALL

    SELECT
        'demographic' AS type,
        b.demographic AS value,
        b.sales_before,
        a.sales_after,
        b.sales_before - a.sales_after AS act_change,
        ((a.sales_after - b.sales_before) / b.sales_before) * 100 AS pct_change
    FROM demographic_before AS b
        INNER JOIN demographic_after AS a
            ON b.demographic = a.demographic
            
    UNION ALL

    SELECT
        'customer_type' AS type,
        b.customer_type AS value,
        b.sales_before,
        a.sales_after,
        b.sales_before - a.sales_after AS act_change,
        ((a.sales_after - b.sales_before) / b.sales_before) * 100 AS pct_change
    FROM customer_type_before AS b
        INNER JOIN customer_type_after AS a
            ON b.customer_type = a.customer_type
    
    UNION ALL
    
    SELECT
        'platform' AS type,
        b.platform AS value,
        b.sales_before,
        a.sales_after,
        b.sales_before - a.sales_after AS act_change,
        ((a.sales_after - b.sales_before) / b.sales_before) * 100 AS pct_change
    FROM platform_before AS b
        INNER JOIN platform_after AS a
            ON b.platform = a.platform
        ),

ranked_union AS(     
    SELECT 
        *,
        RANK() 
            OVER(
                PARTITION BY type
                ORDER BY pct_change ASC) AS ranknum    
    FROM union_all
    )
    
SELECT 
    type, 
    value, 
    CAST(sales_before/1000000 AS DECIMAL(10,2)) AS sales_before, 
    CAST(sales_after/1000000 AS DECIMAL(10,2)) AS sales_after, 
    CAST((sales_before + sales_after)/1000000 AS DECIMAL(10,2)) AS sales_total,
    CAST(act_change/1000000 AS DECIMAL(10,2)) * -1 AS act_change, 
    CAST(pct_change AS DECIMAL(10,2)) AS pct_change
FROM ranked_union 
WHERE ranknum = 1
"""

execute_query_to_df(query)

Unnamed: 0,type,value,sales_before,sales_after,sales_total,act_change,pct_change
0,customer_type,Existing,1214.04,1198.29,2412.33,-15.75,-1.3
1,demographic,Families,771.08,758.28,1529.36,-12.81,-1.66
2,platform,Shopify,76.19,73.84,150.03,-2.35,-3.08
3,region,ASIA,537.79,525.45,1063.24,-12.34,-2.29
