In [1]:
import pypyodbc as odbc
import pandas as pd

DRIVER_NAME = 'SQL SERVER'
SERVER_NAME = 'DESKTOP-I0V76P2\SQLEXPRESS' 
DATABASE_NAME = 'data_bank'

connection_string = f"""
    DRIVER={{{DRIVER_NAME}}};
    SERVER={SERVER_NAME};
    DATABASE={DATABASE_NAME};
    Trust_Connection=yes;
    # uid=;
    # pwd=;
"""

def execute_query_to_df(query):
    """Executes a SQL query and returns the results as a pandas DataFrame."""

    conn = odbc.connect(connection_string)
    cursor = conn.cursor()
    cursor.execute(query)
    rows = cursor.fetchall()
    df = pd.DataFrame(rows, columns=cursor.description)
    df.columns = [col[0] for col in df.columns]
    cursor.close()
    conn.close()
    return df

def execute_update(query):
    """Executes a SQL query that doesn't return results."""

    conn = odbc.connect(connection_string)
    cursor = conn.cursor()
    cursor.execute(query)
    conn.commit()  # Commit the changes to the database
    cursor.close()
    conn.close()

In [3]:
query = """
SELECT TABLE_NAME
FROM INFORMATION_SCHEMA.TABLES;
"""

execute_query_to_df(query)

Unnamed: 0,table_name
0,regions
1,customer_nodes
2,customer_transactions


A. Customer Nodes Exploration

How many unique nodes are there on the Data Bank system?

What is the number of nodes per region?

How many customers are allocated to each region?

How many days on average are customers reallocated to a different node?

What is the median, 80th and 95th percentile for this same reallocation days metric for each region?


In [5]:
# 1. How many unique nodes are there on the Data Bank system?

query = """
SELECT COUNT(DISTINCT node_id) FROM customer_nodes
"""

execute_query_to_df(query)

Unnamed: 0,Unnamed: 1
0,5


In [12]:
# 2. What is the number of nodes per region??

query = """
    SELECT 
        region_id, 
        COUNT(DISTINCT node_id) AS count_nodes
    FROM customer_nodes 
    GROUP BY region_id
"""

execute_query_to_df(query)

Unnamed: 0,region_id,count_nodes
0,1,5
1,2,5
2,3,5
3,4,5
4,5,5


In [18]:
# 3. How many customers are allocated to each region?

query = """
    SELECT 
        r.region_name,
        COUNT(cn.customer_id) AS customer_count
    FROM customer_nodes AS cn
        INNER JOIN regions AS r
            ON cn.region_id = r.region_id
    GROUP BY r.region_name
"""

execute_query_to_df(query)

Unnamed: 0,region_name,customer_count
0,Africa,714
1,America,735
2,Asia,665
3,Australia,770
4,Europe,616


In [23]:
query = """
    UPDATE customer_nodes
        SET
            end_date = CASE WHEN end_date = '1900-01-01' THEN NULL ELSE end_date END

"""

execute_update(query)

In [26]:
# 4. How many days on average are customers reallocated to a different node?

query = """
    SELECT
        AVG(DATEDIFF(day, start_date, end_date))
    FROM customer_nodes
    WHERE end_date IS NOT NULL
"""

execute_query_to_df(query)

Unnamed: 0,Unnamed: 1
0,14


In [30]:


query = """
    SELECT
        r.region_name,
        AVG(DATEDIFF(day, start_date, end_date)) AS avg_reallocation_days,
        
    FROM customer_nodes AS cn
        INNER JOIN regions AS r
            ON r.region_id = cn.region_id
    WHERE end_date IS NOT NULL
    GROUP BY r.region_name
"""

execute_query_to_df(query)

Unnamed: 0,region_name,avg_reallocation_days
0,Africa,14
1,America,14
2,Asia,14
3,Australia,14
4,Europe,14


In [59]:
# 5. What is the median, 80th and 95th percentile for this same reallocation 
# days metric for each region?

query = """
WITH cte AS(
    SELECT
        r.region_name,
        DATEDIFF(day, start_date, end_date) AS reallocation_days
    FROM customer_nodes AS cn
        INNER JOIN regions AS r
            ON r.region_id = cn.region_id
    WHERE end_date IS NOT NULL
    ),
    
cte2 AS(
    SELECT
        region_name,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY reallocation_days) OVER (PARTITION BY region_name) AS Median,
        PERCENTILE_CONT(0.8) WITHIN GROUP (ORDER BY reallocation_days) OVER (PARTITION BY region_name) AS Percentile80,
        PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY reallocation_days) OVER (PARTITION BY region_name) AS Percentile95
    FROM cte
    )
    
SELECT 
region_name, 
    AVG(Median) AS median, 
    AVG(Percentile80) As Percentile80, 
    AVG(Percentile95) As Percentile95 
FROM cte2 
GROUP BY region_name
"""

execute_query_to_df(query)

Unnamed: 0,region_name,median,percentile80,percentile95
0,Africa,15.0,24.0,28.0
1,America,15.0,23.0,28.0
2,Asia,15.0,23.0,28.0
3,Australia,15.0,23.0,28.0
4,Europe,15.0,24.0,28.0


What is the unique count and total amount for each transaction type?

What is the average total historical deposit counts and amounts for all customers?

For each month - how many Data Bank customers make more than 1 deposit and either 1 purchase or 1 withdrawal in a single month?

What is the closing balance for each customer at the end of the month?

What is the percentage of customers who increase their closing balance by more than 5%?


In [57]:
# What is the unique count and total amount for each transaction type?

query = """
SELECT 
    txn_type AS transaction_type, 
    COUNT(txn_type) AS count_transactions, 
    SUM(txn_amount) AS total_amount
FROM customer_transactions
GROUP BY txn_type
"""

execute_query_to_df(query)

Unnamed: 0,transaction_type,count_transactions,total_amount
0,purchase,1617,806537
1,withdrawal,1580,793003
2,deposit,2671,1359168


In [64]:
# What is the average total historical deposit counts and amounts for all customers?

query = """
SELECT 
    txn_type AS transaction_type, 
    COUNT(txn_type) / COUNT(DISTINCT customer_id) AS avg_count_ransactions, 
    AVG(txn_amount) AS avg_amt
FROM customer_transactions
WHERE txn_type = 'deposit'
GROUP BY txn_type
"""

execute_query_to_df(query)

Unnamed: 0,transaction_type,avg_count_ransactions,avg_amt
0,deposit,5,508


In [83]:
# For each month - how many Data Bank customers make more than 1 deposit
# and either 1 purchase or 1 withdrawal in a single month?

query = """
    WITH cte AS(
        SELECT 
            customer_id,
            DATEPART(month, txn_date) AS month,
            txn_type,
            CASE WHEN txn_type = 'deposit' THEN 1 END AS deposit,
            CASE WHEN txn_type = 'purchase' THEN 1 END AS purchase,
            CASE WHEN txn_type = 'withdrawal' THEN 1 END AS withdrawal
        FROM customer_transactions
    ),
    
    cte2 AS (
    SELECT 
        customer_id, 
        month,
        CASE 
            WHEN SUM(deposit) > 1 AND (SUM(purchase) >= 1 OR SUM(withdrawal) > = 1)
                THEN 1 ELSE 0 END AS test
    FROM cte
    GROUP BY customer_id, month
    )
    
SELECT 
    month, 
    SUM(test) AS ct_test 
FROM cte2 GROUP BY month
"""

execute_query_to_df(query)
 

Unnamed: 0,month,ct_test
0,1,168
1,2,181
2,3,192
3,4,70


In [111]:
# What is the closing balance for each customer at the end of the month?

query = """
    WITH deposit_cte AS (
    SELECT 
        customer_id, 
        DATEPART(month, txn_date) AS month,
        SUM(txn_amount) AS sum_deposit
    FROM customer_transactions
    WHERE txn_type = 'deposit'
    GROUP BY customer_id, DATEPART(month, txn_date)
    ),

    spend_cte AS (
        SELECT 
        customer_id, 
        DATEPART(month, txn_date) AS month,
        SUM(txn_amount) AS sum_spend
    FROM customer_transactions
    WHERE txn_type <> 'deposit'
    GROUP BY customer_id, DATEPART(month, txn_date)
    )

SELECT
    d.customer_id,
    d.month,
    SUM(d.sum_deposit - s.sum_spend) OVER(
        PARTITION BY d.customer_id
        ORDER BY d.month
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS balance        
FROM deposit_cte AS d
    LEFT OUTER JOIN spend_cte AS s
        ON d.customer_id = s.customer_id 
        AND d.month = s.month
"""

execute_query_to_df(query)

Unnamed: 0,customer_id,month,balance
0,1,1,
1,1,3,-952.0
2,2,1,
3,2,3,
4,3,1,
...,...,...,...
1421,499,2,1415.0
1422,499,3,599.0
1423,500,1,1594.0
1424,500,2,2981.0


In [149]:
# What is the percentage of customers who increase their closing balance by more than 5%?

query = """
    WITH deposit_cte AS (
    SELECT 
        customer_id, 
        DATEPART(month, txn_date) AS month,
        SUM(txn_amount) AS sum_deposit
    FROM customer_transactions
    WHERE txn_type = 'deposit'
    GROUP BY customer_id, DATEPART(month, txn_date)
    ),

    spend_cte AS (
        SELECT 
        customer_id, 
        DATEPART(month, txn_date) AS month,
        SUM(txn_amount) AS sum_spend
    FROM customer_transactions
    WHERE txn_type <> 'deposit'
    GROUP BY customer_id, DATEPART(month, txn_date)
    ),

    balance_cte AS (
    SELECT
        d.customer_id,
        d.month,
        SUM(d.sum_deposit - s.sum_spend) OVER(
            PARTITION BY d.customer_id
            ORDER BY d.month
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS balance        
    FROM deposit_cte AS d
        LEFT OUTER JOIN spend_cte AS s
            ON d.customer_id = s.customer_id 
            AND d.month = s.month
    ),

    increase_cte AS (
        SELECT
            month,
            CASE 
                WHEN (CASE WHEN balance IS NULL OR balance = 0 THEN 0
            ELSE LAG(balance) OVER(PARTITION BY customer_id ORDER BY month) / CAST(balance AS FLOAT)
            END) > 1.05
                THEN 1 ELSE 0 END AS increase
        FROM balance_cte
    )
    
SELECT
    month,
    CAST(((SELECT COUNT(increase) FROM increase_cte WHERE month = i.month AND increase = 1) * 1.00 /
    (SELECT COUNT(increase) FROM increase_cte WHERE month = i.month) * 100) AS DECIMAL(10,2)) AS increase_above_5pct
FROM increase_cte AS i
GROUP BY month
"""

execute_query_to_df(query)

Unnamed: 0,month,increase_above_5pct
0,1,0.0
1,2,9.14
2,3,15.86
3,4,18.14


### C. Data Allocation Challenge

To test out a few different hypotheses - the Data Bank team wants to run an experiment where different groups of customers would be allocated data using 3 different options:

Option 1: data is allocated based off the amount of money at the end of the previous month
Option 2: data is allocated on the average amount of money kept in the account in the previous 30 days
Option 3: data is updated real-time
For this multi-part challenge question - you have been requested to generate the following data elements to help the Data Bank team estimate how much data will need to be provisioned for each option:

running customer balance column that includes the impact each transaction
customer balance at the end of each month
minimum, average and maximum values of the running balance for each customer
Using all of the data available - how much data would have been required for each option on a monthly basis?

### D. Extra Challenge
Data Bank wants to try another option which is a bit more difficult to implement - they want to calculate data growth using an interest calculation, just like in a traditional savings account you might have with a bank.

If the annual interest rate is set at 6% and the Data Bank team wants to reward its customers by increasing their data allocation based off the interest calculated on a daily basis at the end of each day, how much data would be required for this option on a monthly basis?

Special notes:

Data Bank wants an initial calculation which does not allow for compounding interest, however they may also be interested in a daily compounding interest calculation so you can try to perform this calculation if you have the stamina!