In this notebook, I performed ETL and attempted to answer business questions related to the eCommerce data, specifically the transactions table. I connected the notebook to the database using psycopg2 and used my own queries (which can also be found in ../sql-queries/business-insights.sql) to derive insights. The queries range from simple to more complex, covering various aspects of the business.

In [1]:
from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError

import psycopg2
from psycopg2 import OperationalError

import pandas as pd

In [2]:
def create_conn():
    conn = None
    try:
        conn = psycopg2.connect(
            database="db-ecommerce",
            user="postgres",
            password="superadmin",
            host="localhost",
            port="5432",
        )
        print("Connection to PostgreSQL DB successful")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
    return conn

engine = create_conn()

Connection to PostgreSQL DB successful


In [3]:
def query_data_from_db(engine, query):
    """
    Perform data transformation from a database using a given SQL query
    Args:
        connection: A connection object
        query (str): SQL query
    Returns:
        df: A dataframe containing the transformed data
    """
    df = None
    try:
        df = pd.read_sql_query(query, con=engine)
        print("Data fetched successfully.")
    except Exception as e:
        print(f"Error occurred during data fetching: {e}")
    return df

In [4]:
#  Query to find the top 10 products that generate the highest revenue. 
#  This helps to understand which products contribute most to the company's income.

query = """
    SELECT  
        sku_product, 
        product_name_v2, 
        SUM(product_revenue) AS total_revenue
    FROM 
        transaction_records
    WHERE 
        sku_product IS NOT NULL
    GROUP BY 
        sku_product, product_name_v2
    ORDER BY
        total_revenue DESC
    LIMIT 10;
"""

result_df = query_data_from_db(engine, query)

result_df

Data fetched successfully.


  df = pd.read_sql_query(query, con=engine)


Unnamed: 0,sku_product,product_name_v2,total_revenue
0,GGOEGEVB070499,Google Bluetooth Headphones,2007600000.0
1,GGOEGETB023799,Google Power Bank,1685700000.0
2,GGOEGAUB058315,Google Women's Performance Polo Grey/Black,1071142000.0
3,GGOEGEVR014999,UFO Bluetooth Water Resistant Speaker,1050450000.0
4,GGOEGAUB058313,Google Women's Performance Polo Grey/Black,819182500.0
5,GGOENEBJ081899,Nest® Learning Thermostat 3rd Gen - CA - Stain...,804000000.0
6,GGOEGBRA037499,Waterproof Backpack,800400000.0
7,GGOEGBPB081999,UpCycled Bike Saddle Bag,800030800.0
8,GGOEGEVA022399,Micro Wireless Earbud,799953900.0
9,GGOEGAUB058316,Google Women's Performance Polo Grey/Black,693202500.0


In [5]:
# -- Query to find the top 10 cities that generate the highest number of transactions. 
# -- This gives an overview of the geographical distribution of the company's sales.

query = """
    SELECT 
        geo_network_city, 
        COUNT(*) AS number_of_transactions
    FROM
        transaction_records
    WHERE 
        geo_network_city IS NOT NULL AND geo_network_city <> 'not available in demo dataset'
    GROUP BY 
        geo_network_city
    ORDER BY 
        number_of_transactions DESC 
    LIMIT 10;
"""

result_df = query_data_from_db(engine, query)

result_df

Data fetched successfully.


  df = pd.read_sql_query(query, con=engine)


Unnamed: 0,geo_network_city,number_of_transactions
0,Toronto,84
1,(not set),51
2,New York,51
3,Mountain View,32
4,San Francisco,23
5,Sunnyvale,21
6,Maracaibo,18
7,Chicago,17
8,Sao Paulo,15
9,Salem,14


In [6]:
# -- Query to find the top 10 countries that generate the highest number of transactions. 
# -- This could inform international marketing strategies.

query = """
	SELECT 
		geo_network_country, 
		COUNT(*) AS number_of_transactions
	FROM
		transaction_records
	WHERE 
		geo_network_country IS NOT NULL AND geo_network_country <> 'not available in demo dataset'
	GROUP BY 
		geo_network_country
	ORDER BY 
		number_of_transactions DESC 
	LIMIT 10;
"""

result_df = query_data_from_db(engine, query)

result_df

Data fetched successfully.


  df = pd.read_sql_query(query, con=engine)


Unnamed: 0,geo_network_country,number_of_transactions
0,United States,387
1,Canada,204
2,Venezuela,78
3,Japan,40
4,China,39
5,Mexico,23
6,Brazil,22
7,Taiwan,21
8,India,19
9,Ukraine,14


In [7]:
# -- Query to find the average time a user spends on the site. 
# -- This is a useful indicator of user engagement.

query = """
	SELECT 
		ROUND(AVG(time_spent), 2) AS average_time_spend_seconds,
		ROUND(AVG(time_spent) / 60, 2) AS average_time_spend_minutes
	FROM
		transaction_records;
"""

result_df = query_data_from_db(engine, query)

result_df

Data fetched successfully.


  df = pd.read_sql_query(query, con=engine)


Unnamed: 0,average_time_spend_seconds,average_time_spend_minutes
0,1890.35,31.51


In [8]:
# -- Query to find the top 10 products with the highest revenue per transaction. 
# -- This could help identify high-value products.

query = """
	SELECT
		sku_product,
		product_name_v2,
		SUM(product_revenue) / COUNT(transaction_id) AS revenue_per_transaction
	FROM
		transaction_records 
	WHERE 
		sku_product IS NOT NULL
	GROUP BY
		sku_product,
		product_name_v2
	ORDER BY 
		revenue_per_transaction DESC 
	LIMIT 10;
"""

result_df = query_data_from_db(engine, query)

result_df

Data fetched successfully.


  df = pd.read_sql_query(query, con=engine)


Unnamed: 0,sku_product,product_name_v2,revenue_per_transaction
0,GGOEGEVB070499,Google Bluetooth Headphones,2007600000.0
1,GGOEGETB023799,Google Power Bank,1685700000.0
2,GGOEGAUB058315,Google Women's Performance Polo Grey/Black,1071143000.0
3,GGOEGEVR014999,UFO Bluetooth Water Resistant Speaker,1050450000.0
4,GGOEGAUB058313,Google Women's Performance Polo Grey/Black,819182500.0
5,GGOENEBJ081899,Nest® Learning Thermostat 3rd Gen - CA - Stain...,804000000.0
6,GGOEGBRA037499,Waterproof Backpack,800400000.0
7,GGOEGBPB081999,UpCycled Bike Saddle Bag,800030800.0
8,GGOEGEVA022399,Micro Wireless Earbud,799953900.0
9,GGOEGAUB058316,Google Women's Performance Polo Grey/Black,693202500.0


In [16]:
# -- More complex query that uses window functions to find the top 3 products (by revenue) for each city. 
# -- This can provide insights into regional product preferences.

query = """
	SELECT 
		city_product_rank.geo_network_city,
		city_product_rank.sku_product,
		city_product_rank.total_revenue,
		city_product_rank.product_rank
	FROM
		(
			SELECT
				geo_network_city,
				sku_product,
				SUM(product_revenue) AS total_revenue,
				RANK() OVER (
					PARTITION BY geo_network_city
					ORDER BY SUM(product_revenue) DESC
				) AS product_rank
			FROM 
				transaction_records
			WHERE 
				geo_network_city IS NOT NULL AND geo_network_city NOT IN ('not available in demo dataset', '(not set)')
			GROUP BY
				geo_network_city,
				sku_product
		) AS city_product_rank
	WHERE 
		city_product_rank.product_rank <= 3;
"""

result_df = query_data_from_db(engine, query)

result_df

Data fetched successfully.


  df = pd.read_sql_query(query, con=engine)


Unnamed: 0,geo_network_city,sku_product,total_revenue,product_rank
0,Ahmedabad,GGOEGOBG023599,3865000.0,1
1,Ahmedabad,GGOEYFKQ020699,2865000.0,2
2,Ahmedabad,GGOEGOAB021699,2865000.0,2
3,Amsterdam,GGOEGAWR061050,26190000.0,1
4,Ann Arbor,GGOEADHJ015599,20690000.0,1
...,...,...,...,...
153,Washington,GGOEAAXJ066228,15256666.0,2
154,Yokohama,GGOEGAXC065228,17990000.0,1
155,Yokohama,GGOEGAXJ065528,12890000.0,2
156,Zurich,GGOEYAQB073216,44880908.0,1


In [17]:
engine.close()