In [None]:
%load_ext sql
%env DATABASE_URL = postgresql://marcio_gabriel:123456@localhost:5432/data_engineering

In [None]:
%sql DROP TABLE IF EXISTS users

In [None]:
%%sql
-- Plain, non-partitioned users table keyed by an auto-generated user_id
CREATE TABLE users (
    user_id              SERIAL,
    user_first_name      VARCHAR(30)  NOT NULL,
    user_last_name       VARCHAR(30)  NOT NULL,
    user_email_id        VARCHAR(50)  NOT NULL,
    user_email_validated BOOLEAN      DEFAULT FALSE,
    user_password        VARCHAR(200),
    user_role            VARCHAR(1)   NOT NULL DEFAULT 'U',  -- role code, U or A
    is_active            BOOLEAN      DEFAULT FALSE,
    created_dt           DATE         DEFAULT CURRENT_DATE,
    last_updated_ts      TIMESTAMP    DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (user_id)
);

In [None]:
%sql DROP TABLE IF EXISTS users_part

In [None]:
%%sql
-- List-partitioned variant of the users table, routed by user_role
CREATE TABLE users_part (
    user_id              SERIAL,
    user_first_name      VARCHAR(30)  NOT NULL,
    user_last_name       VARCHAR(30)  NOT NULL,
    user_email_id        VARCHAR(50)  NOT NULL,
    user_email_validated BOOLEAN      DEFAULT FALSE,
    user_password        VARCHAR(200),
    user_role            VARCHAR(1)   NOT NULL DEFAULT 'U',  -- role code, U or A
    is_active            BOOLEAN      DEFAULT FALSE,
    created_dt           DATE         DEFAULT CURRENT_DATE,
    last_updated_ts      TIMESTAMP    DEFAULT CURRENT_TIMESTAMP,
    -- a primary key on a partitioned table has to contain the partition key
    PRIMARY KEY (user_role, user_id)
)
PARTITION BY LIST (user_role);

In [None]:
%%sql
CREATE INDEX users_part_email_id_idx
ON users_part (user_email_id);

The INSERT statement below will fail: `users_part` was created as a partitioned table, but no partitions have been added to it yet.

In [None]:
%%sql
-- Expected to fail because users_part has no partition that can accept these rows yet
INSERT INTO users_part
    (user_first_name, user_last_name, user_email_id)
VALUES
    ('Scott',  'Tiger', 'scott@tiger.com'),
    ('Donal',  'Duck',  'donald@duck.com'),
    ('Mickey', 'Mouse', 'mickey@mouse.com');

# MANAGING PARTITIONS - LIST

In [None]:
%%sql
-- Catch-all partition for every user_role value that has no dedicated partition
CREATE TABLE users_part_default
    PARTITION OF users_part DEFAULT;

In [None]:
%%sql
INSERT INTO users_part
    (user_first_name, user_last_name, user_email_id, user_role)
VALUES
    ('Scott',  'Tiger', 'scott@tiger.com',  'U'),
    ('Donal',  'Duck',  'donald@duck.com',  'U'),
    ('Mickey', 'Mouse', 'mickey@mouse.com', 'U');

In [None]:
%sql SELECT * FROM users_part;

In [None]:
%%sql
-- Dedicated partition that holds only the rows whose user_role is A
CREATE TABLE users_part_a
    PARTITION OF users_part
    FOR VALUES IN ('A');

In [None]:
%%sql
-- Changing the partition key value makes the row eligible for users_part_a (row movement needs PostgreSQL 11 or later)
UPDATE users_part
   SET user_role = 'A'
 WHERE user_email_id = 'scott@tiger.com';

In [None]:
%sql SELECT * FROM users_part;

# PARTITIONING RANGE

In [None]:
%sql DROP TABLE IF EXISTS users_part_range_part;

In [None]:
%%sql
-- Range-partitioned variant of the users table, routed by created_dt
CREATE TABLE users_part_range_part (
    user_id              SERIAL,
    user_first_name      VARCHAR(30)  NOT NULL,
    user_last_name       VARCHAR(30)  NOT NULL,
    user_email_id        VARCHAR(50)  NOT NULL,
    user_email_validated BOOLEAN      DEFAULT FALSE,
    user_password        VARCHAR(200),
    user_role            VARCHAR(1)   NOT NULL DEFAULT 'U',  -- role code, U or A
    is_active            BOOLEAN      DEFAULT FALSE,
    created_dt           DATE         DEFAULT CURRENT_DATE,
    last_updated_ts      TIMESTAMP    DEFAULT CURRENT_TIMESTAMP,
    -- a primary key on a partitioned table has to contain the partition key
    PRIMARY KEY (created_dt, user_id)
)
PARTITION BY RANGE (created_dt);

In [None]:
%%sql
-- Catch-all partition for created_dt values outside every explicit range
CREATE TABLE users_range_part_default
    PARTITION OF users_part_range_part DEFAULT;

In [None]:
%%sql
-- Remove any leftover copy first so this cell can be re-run after the partition has been detached
DROP TABLE IF EXISTS users_range_part_2016;

-- Range upper bounds are exclusive, so TO must be the first day of 2017 for 2016-12-31 to be covered
CREATE TABLE users_range_part_2016
    PARTITION OF users_part_range_part
    FOR VALUES FROM ('2016-01-01') TO ('2017-01-01');

In [None]:
%%sql
-- Detach the 2016 partition so that it continues to exist as a standalone table
ALTER TABLE users_part_range_part
DETACH PARTITION users_range_part_2016;

In [None]:
import pandas as pd
from pandas.tseries.offsets import MonthBegin, MonthEnd

# Preview, for each month of Q1 2016, the partition suffix (yyyymm) together
# with the first and last day of that month, printed as suffix:first:last.
# 'MS' (month start) replaces the '1M' alias, which pandas 2.2 deprecated.
months = pd.date_range(start='2016-01-01', end='2016-03-31', freq='MS')
for month in months:
    begin_date = month              # 'MS' already yields the first day of the month
    end_date = month + MonthEnd(0)  # rolls forward to the last day of the same month
    print(
        month.strftime('%Y%m'),
        begin_date.strftime('%Y-%m-%d'),
        end_date.strftime('%Y-%m-%d'),
        sep=':',
    )

In [None]:
! pip install psycopg2

In [None]:
import os

import pandas as pd
import psycopg2
from pandas.tseries.offsets import MonthBegin

# One partition per calendar month from 2016-01 through 2022-12.
# 'MS' yields the first day of every month; it replaces the '1M' alias that
# pandas 2.2 deprecated.
months = pd.date_range(start='2016-01-01', end='2022-12-31', freq='MS')

table_name = 'users_part_range_part'
query = '''
    CREATE TABLE {table_name}_{yyyymm}
    PARTITION OF {table_name}
    FOR VALUES FROM ('{begin_date}') TO ('{end_date}')
'''

# Reuse the connection URL configured in the first cell instead of repeating
# the credentials here; libpq accepts postgresql:// URIs as the DSN.
connection = psycopg2.connect(os.environ['DATABASE_URL'])
try:
    with connection.cursor() as cursor:
        for month in months:
            begin_date = month.strftime('%Y-%m-%d')
            # Range upper bounds are exclusive: ending the partition on the
            # first day of the NEXT month keeps the last day of this month
            # inside it instead of letting it fall into the default partition.
            end_date = (month + MonthBegin(1)).strftime('%Y-%m-%d')
            print(f'Adding partition for {begin_date} (inclusive) to {end_date} (exclusive)')
            cursor.execute(
                query.format(
                    table_name=table_name,
                    yyyymm=month.strftime('%Y%m'),
                    begin_date=begin_date,
                    end_date=end_date,
                )
            )
    # All partitions are created in one transaction: either every month is
    # added or, on any failure, none of them is.
    connection.commit()
finally:
    # Closing without a commit discards the open transaction, so an error
    # above never leaves a half-built set of partitions or a leaked session.
    connection.close()

# HASH PARTITIONING

In [None]:
%sql DROP TABLE IF EXISTS users_hash_part

In [None]:
%%sql
-- Hash-partitioned variant of the users table, spread across partitions by user_id
CREATE TABLE users_hash_part (
    user_id SERIAL,
    user_first_name VARCHAR(30) NOT NULL,
    user_last_name VARCHAR(30) NOT NULL,
    user_email_id VARCHAR(50) NOT NULL,
    user_email_validated BOOLEAN DEFAULT FALSE,
    user_password VARCHAR(200),
    user_role VARCHAR(1) NOT NULL DEFAULT 'U', -- role code, U or A
    is_active BOOLEAN DEFAULT FALSE,
    created_dt DATE DEFAULT CURRENT_DATE,
    last_updated_ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- a primary key on a partitioned table has to contain the partition key
    PRIMARY KEY (user_id)
) PARTITION BY HASH (user_id);

In [None]:
%%sql
-- Partition 0 of 8, takes rows whose hashed user_id leaves remainder 0 when divided by 8
CREATE TABLE users_hash_part_0_of_8
    PARTITION OF users_hash_part
    FOR VALUES WITH (MODULUS 8, REMAINDER 0);

In [None]:
%%sql
-- Partition 1 of 8, same modulus with remainder 1
CREATE TABLE users_hash_part_1_of_8
    PARTITION OF users_hash_part
    FOR VALUES WITH (MODULUS 8, REMAINDER 1);

In [None]:
%%sql
-- Partition 2 of 8, same modulus with remainder 2
CREATE TABLE users_hash_part_2_of_8
    PARTITION OF users_hash_part
    FOR VALUES WITH (MODULUS 8, REMAINDER 2);

In [None]:
%%sql
-- Partition 3 of 8, same modulus with remainder 3
CREATE TABLE users_hash_part_3_of_8
    PARTITION OF users_hash_part
    FOR VALUES WITH (MODULUS 8, REMAINDER 3);

In [None]:
%%sql
-- Partition 4 of 8, same modulus with remainder 4
CREATE TABLE users_hash_part_4_of_8
    PARTITION OF users_hash_part
    FOR VALUES WITH (MODULUS 8, REMAINDER 4);

In [None]:
%%sql
-- Partition 5 of 8, same modulus with remainder 5
CREATE TABLE users_hash_part_5_of_8
    PARTITION OF users_hash_part
    FOR VALUES WITH (MODULUS 8, REMAINDER 5);

In [None]:
%%sql
-- Partition 6 of 8, same modulus with remainder 6
CREATE TABLE users_hash_part_6_of_8
    PARTITION OF users_hash_part
    FOR VALUES WITH (MODULUS 8, REMAINDER 6);

In [None]:
%%sql
-- Partition 7 of 8, same modulus with remainder 7
CREATE TABLE users_hash_part_7_of_8
    PARTITION OF users_hash_part
    FOR VALUES WITH (MODULUS 8, REMAINDER 7);

In [None]:
%%sql
-- Sample rows with explicit creation dates, user_id comes from the SERIAL default
INSERT INTO users_hash_part
    (user_first_name, user_last_name, user_email_id, created_dt)
VALUES
    ('Scott',  'Tiger', 'scott@tiger.com',  '2018-10-01'),
    ('Donal',  'Duck',  'donald@duck.com',  '2019-02-10'),
    ('Mickey', 'Mouse', 'mickey@mouse.com', '2017-06-22');

In [None]:
%sql SELECT * FROM users_hash_part;

# SUB PARTITIONING

In [None]:
%sql DROP TABLE IF EXISTS user_qtly

In [None]:
%%sql
-- Users table prepared for sub-partitioning, by created_year here and by created_mnth in its partitions
CREATE TABLE user_qtly (
    user_id SERIAL,
    user_first_name VARCHAR(30) NOT NULL,
    user_last_name VARCHAR(30) NOT NULL,
    user_email_id VARCHAR(50) NOT NULL,
    user_email_validated BOOLEAN DEFAULT FALSE,
    user_password VARCHAR(200),
    user_role VARCHAR(1) NOT NULL DEFAULT 'U', -- role code, U or A
    is_active BOOLEAN DEFAULT FALSE,
    created_dt DATE DEFAULT CURRENT_DATE,
    created_year INT,
    created_mnth INT,
    last_updated_ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- both partitioning columns are part of the key because the year partitions are sub-partitioned by created_mnth
    PRIMARY KEY (created_year, created_mnth, user_id)
) PARTITION BY LIST (created_year);

In [None]:
%%sql
-- Year-level partition for 2016 that is itself partitioned again by created_mnth
CREATE TABLE user_qtly_2016
    PARTITION OF user_qtly
    FOR VALUES IN (2016)
    PARTITION BY LIST (created_mnth);

In [None]:
%%sql
-- First quarter of 2016 holds months 1 to 3
CREATE TABLE user_qtly_2016q1
    PARTITION OF user_qtly_2016
    FOR VALUES IN (1, 2, 3);

In [None]:
%%sql
-- Second quarter of 2016 holds months 4 to 6
CREATE TABLE user_qtly_2016q2
    PARTITION OF user_qtly_2016
    FOR VALUES IN (4, 5, 6);

In [None]:
%%sql
-- Third quarter of 2016 holds months 7 to 9
CREATE TABLE user_qtly_2016q3
    PARTITION OF user_qtly_2016
    FOR VALUES IN (7, 8, 9);

In [None]:
%%sql
-- Fourth quarter of 2016 holds months 10 to 12
CREATE TABLE user_qtly_2016q4
    PARTITION OF user_qtly_2016
    FOR VALUES IN (10, 11, 12);

# EXERCISES

## Exercise 1
Create table orders_part with the same columns as orders.

Partition the table by month using range partitioning on order_date.

Add 14 partitions - 13 based upon the data and 1 default. Here is the naming convention.

Default - orders_part_default

Partition for 2014 January - orders_part_201401



In [None]:
%%sql
-- Look at the ten most recent orders before designing the partitions
SELECT *
  FROM orders
 ORDER BY order_date DESC
 LIMIT 10;


In [None]:
%%sql
-- Column metadata of orders, used as the template for orders_part
SELECT *
  FROM information_schema.columns
 WHERE table_name = 'orders';

In [None]:
%%sql
-- Column metadata of customers, the table referenced by order_customer_id
SELECT *
  FROM information_schema.columns
 WHERE table_name = 'customers';

In [None]:
%%sql
-- Start from a clean slate so this cell can be re-run (dropping the parent also drops its partitions)
DROP TABLE IF EXISTS orders_part;

-- Range-partitioned copy of orders, routed by order_date
CREATE TABLE orders_part (
    order_id SERIAL,
    order_date DATE DEFAULT CURRENT_DATE,
    order_customer_id INT,
    order_status VARCHAR(45),
    -- a primary key on a partitioned table has to contain the partition key
    PRIMARY KEY (order_date, order_id)
) PARTITION BY RANGE (order_date);

In [None]:
%%sql
-- Tie every order in orders_part to an existing customer row
ALTER TABLE orders_part
    ADD CONSTRAINT fk_customer_order
    FOREIGN KEY (order_customer_id)
    REFERENCES customers (customer_id);

In [None]:
%%sql
DROP TABLE IF EXISTS orders_part_default;

In [None]:
%%sql
-- Catch-all partition for order dates outside every monthly range
CREATE TABLE orders_part_default
    PARTITION OF orders_part DEFAULT;

In [None]:
import os

import pandas as pd
import psycopg2
from pandas.tseries.offsets import MonthBegin

# One partition per calendar month from 2013-01 through 2015-12.
# 'MS' yields the first day of every month; it replaces the '1M' alias that
# pandas 2.2 deprecated.
# NOTE(review): the exercise text asks for 13 monthly partitions, while this
# range creates 36 - confirm the actual date span of orders and narrow it.
months = pd.date_range(start='2013-01-01', end='2015-12-31', freq='MS')

table_name = 'orders_part'
query = '''
    CREATE TABLE {table_name}_{yyyymm}
    PARTITION OF {table_name}
    FOR VALUES FROM ('{begin_date}') TO ('{end_date}')
'''

# Reuse the connection URL configured in the first cell instead of repeating
# the credentials here; libpq accepts postgresql:// URIs as the DSN.
connection = psycopg2.connect(os.environ['DATABASE_URL'])
try:
    with connection.cursor() as cursor:
        for month in months:
            begin_date = month.strftime('%Y-%m-%d')
            # Range upper bounds are exclusive: ending the partition on the
            # first day of the NEXT month keeps orders placed on the last day
            # of this month out of the default partition.
            end_date = (month + MonthBegin(1)).strftime('%Y-%m-%d')
            print(f'Adding partition for {begin_date} (inclusive) to {end_date} (exclusive)')
            cursor.execute(
                query.format(
                    table_name=table_name,
                    yyyymm=month.strftime('%Y%m'),
                    begin_date=begin_date,
                    end_date=end_date,
                )
            )
    # All partitions are created in one transaction: either every month is
    # added or, on any failure, none of them is.
    connection.commit()
finally:
    # Closing without a commit discards the open transaction, so an error
    # above never leaves a half-built set of partitions or a leaked session.
    connection.close()

Let us load and validate data in the partitioned table.

Load the data from orders into orders_part.

Get count on orders_part as well as all the 14 partitions. You should get 0 for default partition and all the records should be distributed using the other 13 partitions.

In [None]:
%%sql
-- Copy every order across, naming the columns so the load does not depend on their physical order in orders
INSERT INTO orders_part
    (order_id, order_date, order_customer_id, order_status)
SELECT order_id,
       order_date,
       order_customer_id,
       order_status
  FROM orders;

In [None]:
%%sql
-- Row count per partition, a partition without rows (ideally orders_part_default) is simply absent from the result
SELECT tableoid::regclass::text AS partition_name,
       count(*)                 AS order_count
  FROM orders_part
 GROUP BY tableoid
 ORDER BY partition_name;