In [2]:
# %% Imports and Configuration
import pandas as pd
from sqlalchemy import create_engine, text, inspect
from IPython.display import display, Markdown

# PostgreSQL Configuration
PG_CONFIG = {
    'host': 'localhost',
    'port': 5432,
    'dbname': 'olap',
    'user': 'postgres',
    'password': 'aa'
}

# Create SQLAlchemy engine
conn_str = f"postgresql://{PG_CONFIG['user']}:{PG_CONFIG['password']}@{PG_CONFIG['host']}:{PG_CONFIG['port']}/{PG_CONFIG['dbname']}"
engine = create_engine(conn_str)

print('‚úÖ Connected to PostgreSQL')
print(f"üóÑÔ∏è  Database: {PG_CONFIG['dbname']} @ {PG_CONFIG['host']}:{PG_CONFIG['port']}")

‚úÖ Connected to PostgreSQL
üóÑÔ∏è  Database: olap @ localhost:5432


## 1. Database Overview

In [3]:
# %% Database Version and Connection Info
print('=' * 80)
print('üîß DATABASE SERVER INFO')
print('=' * 80)

version_query = "SELECT version();"
version = pd.read_sql(version_query, engine)['version'].iloc[0]
print(f'\nüìå PostgreSQL Version:')
print(version)

# Current database
db_info = pd.read_sql("SELECT current_database(), current_user, inet_server_addr(), inet_server_port();", engine)
print(f'\nüìå Connection Details:')
display(db_info)

üîß DATABASE SERVER INFO

üìå PostgreSQL Version:
PostgreSQL 16.11 (Ubuntu 16.11-0ubuntu0.24.04.1) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0, 64-bit

üìå Connection Details:


Unnamed: 0,current_database,current_user,inet_server_addr,inet_server_port
0,olap,postgres,127.0.0.1,5432


In [16]:
# %% Database Size
print('=' * 80)
print('üíæ DATABASE SIZE')
print('=' * 80)

size_query = """
SELECT 
    pg_database.datname AS database_name,
    pg_size_pretty(pg_database_size(pg_database.datname)) AS size
FROM pg_database
WHERE datname = current_database();
"""
db_size = pd.read_sql(size_query, engine)
display(db_size)

üíæ DATABASE SIZE


Unnamed: 0,database_name,size
0,olap,10 MB


## 2. Tables Overview

In [17]:
# %% List All Tables with Row Counts
print('=' * 80)
print('üìã ALL TABLES WITH ROW COUNTS')
print('=' * 80)

tables_query = """
SELECT 
    schemaname AS schema,
    tablename AS table_name,
    pg_size_pretty(pg_total_relation_size(schemaname || '.' || tablename)) AS total_size,
    pg_size_pretty(pg_relation_size(schemaname || '.' || tablename)) AS data_size,
    pg_size_pretty(pg_indexes_size(schemaname || '.' || tablename)) AS index_size
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname || '.' || tablename) DESC;
"""
tables_df = pd.read_sql(tables_query, engine)

# Add row counts
row_counts = []
for table in tables_df['table_name']:
    try:
        cnt = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table}", engine)['cnt'].iloc[0]
        row_counts.append(cnt)
    except:
        row_counts.append(0)

tables_df['row_count'] = row_counts
tables_df['row_count'] = tables_df['row_count'].apply(lambda x: f'{x:,}')

display(tables_df)

üìã ALL TABLES WITH ROW COUNTS


Unnamed: 0,schema,table_name,total_size,data_size,index_size,row_count
0,public,fact_sales,1672 kB,968 kB,672 kB,9800
1,public,dim_product,384 kB,208 kB,144 kB,1861
2,public,dim_customer,176 kB,56 kB,96 kB,793
3,public,dim_time,168 kB,88 kB,48 kB,1230
4,public,dim_geography,112 kB,56 kB,32 kB,628


## 3. Table Schemas

In [18]:
# %% Detailed Schema for Each Table
print('=' * 80)
print('üìê TABLE SCHEMAS')
print('=' * 80)

schema_query = """
SELECT 
    c.table_name,
    c.column_name,
    c.data_type,
    c.character_maximum_length,
    c.numeric_precision,
    c.is_nullable,
    c.column_default
FROM information_schema.columns c
WHERE c.table_schema = 'public'
ORDER BY c.table_name, c.ordinal_position;
"""

schema_df = pd.read_sql(schema_query, engine)

# Display grouped by table
for table in schema_df['table_name'].unique():
    print(f'\nüìã {table.upper()}')
    print('-' * 60)
    table_schema = schema_df[schema_df['table_name'] == table][['column_name', 'data_type', 'is_nullable', 'column_default']]
    display(table_schema)

üìê TABLE SCHEMAS

üìã DIM_CUSTOMER
------------------------------------------------------------


Unnamed: 0,column_name,data_type,is_nullable,column_default
0,customer_key,integer,NO,nextval('dim_customer_customer_key_seq'::regcl...
1,customer_id,character varying,NO,
2,customer_name,character varying,YES,
3,segment,character varying,YES,



üìã DIM_GEOGRAPHY
------------------------------------------------------------


Unnamed: 0,column_name,data_type,is_nullable,column_default
4,geo_key,integer,NO,nextval('dim_geography_geo_key_seq'::regclass)
5,region,character varying,YES,
6,state,character varying,YES,
7,city,character varying,YES,
8,country,character varying,YES,'United States'::character varying
9,postal_code,integer,YES,



üìã DIM_PRODUCT
------------------------------------------------------------


Unnamed: 0,column_name,data_type,is_nullable,column_default
10,product_key,integer,NO,nextval('dim_product_product_key_seq'::regclass)
11,product_id,character varying,YES,
12,category,character varying,YES,
13,sub_category,character varying,YES,
14,product_name,character varying,YES,



üìã DIM_TIME
------------------------------------------------------------


Unnamed: 0,column_name,data_type,is_nullable,column_default
15,time_key,integer,NO,nextval('dim_time_time_key_seq'::regclass)
16,date_full,date,NO,
17,year,smallint,NO,
18,quarter,smallint,NO,
19,month,smallint,NO,
20,week,smallint,NO,
21,day,smallint,NO,
22,day_name,character varying,YES,
23,quarter_name,character varying,YES,
24,month_name,character varying,YES,



üìã FACT_SALES
------------------------------------------------------------


Unnamed: 0,column_name,data_type,is_nullable,column_default
25,fact_key,integer,NO,nextval('fact_sales_fact_key_seq'::regclass)
26,time_key,integer,YES,
27,customer_key,integer,YES,
28,geo_key,integer,YES,
29,product_key,integer,YES,
30,order_id,character varying,YES,
31,ship_mode,character varying,YES,
32,sales,numeric,NO,
33,quantity,integer,YES,1
34,load_date,timestamp without time zone,YES,CURRENT_TIMESTAMP


## 4. Primary Keys and Foreign Keys

In [19]:
# %% Primary Keys
print('=' * 80)
print('üîë PRIMARY KEYS')
print('=' * 80)

pk_query = """
SELECT 
    tc.table_name,
    kcu.column_name,
    tc.constraint_name
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu 
    ON tc.constraint_name = kcu.constraint_name
    AND tc.table_schema = kcu.table_schema
WHERE tc.constraint_type = 'PRIMARY KEY'
    AND tc.table_schema = 'public'
ORDER BY tc.table_name;
"""

pk_df = pd.read_sql(pk_query, engine)
display(pk_df)

üîë PRIMARY KEYS


Unnamed: 0,table_name,column_name,constraint_name
0,dim_customer,customer_key,dim_customer_pkey
1,dim_geography,geo_key,dim_geography_pkey
2,dim_product,product_key,dim_product_pkey
3,dim_time,time_key,dim_time_pkey
4,fact_sales,fact_key,fact_sales_pkey


In [8]:
# %% Foreign Keys
print('\n' + '=' * 80)
print('üîó FOREIGN KEYS')
print('=' * 80)

fk_query = """
SELECT 
    tc.table_name AS from_table,
    kcu.column_name AS from_column,
    ccu.table_name AS to_table,
    ccu.column_name AS to_column,
    tc.constraint_name
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu 
    ON tc.constraint_name = kcu.constraint_name
    AND tc.table_schema = kcu.table_schema
JOIN information_schema.constraint_column_usage ccu 
    ON tc.constraint_name = ccu.constraint_name
WHERE tc.constraint_type = 'FOREIGN KEY'
    AND tc.table_schema = 'public'
ORDER BY tc.table_name;
"""

fk_df = pd.read_sql(fk_query, engine)
if len(fk_df) > 0:
    display(fk_df)
else:
    print('No foreign keys found.')


üîó FOREIGN KEYS


Unnamed: 0,from_table,from_column,to_table,to_column,constraint_name
0,fact_sales,time_key,dim_time,time_key,fact_sales_time_key_fkey
1,fact_sales,customer_key,dim_customer,customer_key,fact_sales_customer_key_fkey
2,fact_sales,geo_key,dim_geography,geo_key,fact_sales_geo_key_fkey
3,fact_sales,product_key,dim_product,product_key,fact_sales_product_key_fkey


## 5. Indexes

In [9]:
# %% All Indexes
print('=' * 80)
print('üìá INDEXES')
print('=' * 80)

idx_query = """
SELECT 
    tablename AS table_name,
    indexname AS index_name,
    indexdef AS index_definition
FROM pg_indexes
WHERE schemaname = 'public'
ORDER BY tablename, indexname;
"""

idx_df = pd.read_sql(idx_query, engine)
display(idx_df)

üìá INDEXES


Unnamed: 0,table_name,index_name,index_definition
0,dim_customer,dim_customer_customer_id_key,CREATE UNIQUE INDEX dim_customer_customer_id_k...
1,dim_customer,dim_customer_pkey,CREATE UNIQUE INDEX dim_customer_pkey ON publi...
2,dim_geography,dim_geography_pkey,CREATE UNIQUE INDEX dim_geography_pkey ON publ...
3,dim_product,dim_product_pkey,CREATE UNIQUE INDEX dim_product_pkey ON public...
4,dim_product,dim_product_product_id_key,CREATE UNIQUE INDEX dim_product_product_id_key...
5,dim_time,dim_time_pkey,CREATE UNIQUE INDEX dim_time_pkey ON public.di...
6,fact_sales,fact_sales_pkey,CREATE UNIQUE INDEX fact_sales_pkey ON public....
7,fact_sales,idx_fact_customer,CREATE INDEX idx_fact_customer ON public.fact_...
8,fact_sales,idx_fact_geo,CREATE INDEX idx_fact_geo ON public.fact_sales...
9,fact_sales,idx_fact_product,CREATE INDEX idx_fact_product ON public.fact_s...


## 6. Unique Constraints

In [10]:
# %% Unique Constraints
print('=' * 80)
print('üéØ UNIQUE CONSTRAINTS')
print('=' * 80)

unique_query = """
SELECT 
    tc.table_name,
    kcu.column_name,
    tc.constraint_name
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu 
    ON tc.constraint_name = kcu.constraint_name
WHERE tc.constraint_type = 'UNIQUE'
    AND tc.table_schema = 'public'
ORDER BY tc.table_name;
"""

unique_df = pd.read_sql(unique_query, engine)
if len(unique_df) > 0:
    display(unique_df)
else:
    print('No unique constraints found.')

üéØ UNIQUE CONSTRAINTS


Unnamed: 0,table_name,column_name,constraint_name
0,dim_customer,customer_id,dim_customer_customer_id_key
1,dim_product,product_id,dim_product_product_id_key


## 7. Data Sample Preview

In [11]:
# %% Preview Data from Each Table
print('=' * 80)
print('üëÅÔ∏è DATA PREVIEW (First 5 Rows per Table)')
print('=' * 80)

inspector = inspect(engine)
tables = inspector.get_table_names()

for table in tables:
    print(f'\nüìã {table.upper()}')
    print('-' * 60)
    try:
        preview = pd.read_sql(f"SELECT * FROM {table} LIMIT 5", engine)
        display(preview)
    except Exception as e:
        print(f'Error: {e}')

üëÅÔ∏è DATA PREVIEW (First 5 Rows per Table)

üìã DIM_TIME
------------------------------------------------------------


Unnamed: 0,time_key,date_full,year,quarter,month,week,day,day_name,quarter_name,month_name
0,1,2017-11-08,2017,4,11,45,8,Wednesday,Q4,November
1,2,2017-06-12,2017,2,6,24,12,Monday,Q2,June
2,3,2016-10-11,2016,4,10,41,11,Tuesday,Q4,October
3,4,2015-06-09,2015,2,6,24,9,Tuesday,Q2,June
4,5,2018-04-15,2018,2,4,15,15,Sunday,Q2,April



üìã FACT_SALES
------------------------------------------------------------


Unnamed: 0,fact_key,time_key,customer_key,geo_key,product_key,order_id,ship_mode,sales,quantity,load_date
0,1,1,1,1,1,CA-2017-152156,Second Class,261.96,1,2026-01-07 18:21:43.865914
1,2,1,1,1,2,CA-2017-152156,Second Class,731.94,1,2026-01-07 18:21:43.865914
2,3,2,2,2,3,CA-2017-138688,Second Class,14.62,1,2026-01-07 18:21:43.865914
3,4,3,3,3,4,US-2016-108966,Standard Class,957.58,1,2026-01-07 18:21:43.865914
4,5,3,3,3,5,US-2016-108966,Standard Class,22.37,1,2026-01-07 18:21:43.865914



üìã DIM_CUSTOMER
------------------------------------------------------------


Unnamed: 0,customer_key,customer_id,customer_name,segment
0,1,CG-12520,Claire Gute,Consumer
1,2,DV-13045,Darrin Van Huff,Corporate
2,3,SO-20335,Sean O'Donnell,Consumer
3,4,BH-11710,Brosina Hoffman,Consumer
4,5,AA-10480,Andrew Allen,Consumer



üìã DIM_GEOGRAPHY
------------------------------------------------------------


Unnamed: 0,geo_key,region,state,city,country,postal_code
0,1,South,Kentucky,Henderson,United States,42420
1,2,West,California,Los Angeles,United States,90036
2,3,South,Florida,Fort Lauderdale,United States,33311
3,4,West,California,Los Angeles,United States,90032
4,5,South,North Carolina,Concord,United States,28027



üìã DIM_PRODUCT
------------------------------------------------------------


Unnamed: 0,product_key,product_id,category,sub_category,product_name
0,1,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase
1,2,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,..."
2,3,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...
3,4,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table
4,5,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System


## 8. Star Schema Summary

In [12]:
# %% Star Schema Summary
print('=' * 80)
print('‚≠ê STAR SCHEMA SUMMARY')
print('=' * 80)

# Identify dimension and fact tables
dim_tables = [t for t in tables if t.startswith('dim_')]
fact_tables = [t for t in tables if t.startswith('fact_')]
other_tables = [t for t in tables if not t.startswith('dim_') and not t.startswith('fact_')]

print(f'\nüìä Dimension Tables ({len(dim_tables)}):')
for t in dim_tables:
    cnt = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {t}", engine)['cnt'].iloc[0]
    print(f'   ‚Ä¢ {t}: {cnt:,} rows')

print(f'\nüìà Fact Tables ({len(fact_tables)}):')
for t in fact_tables:
    cnt = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {t}", engine)['cnt'].iloc[0]
    print(f'   ‚Ä¢ {t}: {cnt:,} rows')

if other_tables:
    print(f'\nüìÅ Other Tables ({len(other_tables)}):')
    for t in other_tables:
        cnt = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {t}", engine)['cnt'].iloc[0]
        print(f'   ‚Ä¢ {t}: {cnt:,} rows')

‚≠ê STAR SCHEMA SUMMARY

üìä Dimension Tables (4):
   ‚Ä¢ dim_time: 1,230 rows
   ‚Ä¢ dim_customer: 793 rows
   ‚Ä¢ dim_geography: 628 rows
   ‚Ä¢ dim_product: 1,861 rows

üìà Fact Tables (1):
   ‚Ä¢ fact_sales: 9,800 rows


In [13]:
# %% Fact Table Foreign Key Analysis
print('\n' + '=' * 80)
print('üîç FACT TABLE FOREIGN KEY COVERAGE')
print('=' * 80)

if 'fact_sales' in tables:
    fk_analysis = """
    SELECT
        COUNT(*) AS total_rows,
        COUNT(time_key) AS with_time_key,
        COUNT(customer_key) AS with_customer_key,
        COUNT(geo_key) AS with_geo_key,
        COUNT(product_key) AS with_product_key,
        ROUND(100.0 * COUNT(time_key) / NULLIF(COUNT(*), 0), 2) AS time_pct,
        ROUND(100.0 * COUNT(customer_key) / NULLIF(COUNT(*), 0), 2) AS customer_pct,
        ROUND(100.0 * COUNT(geo_key) / NULLIF(COUNT(*), 0), 2) AS geo_pct,
        ROUND(100.0 * COUNT(product_key) / NULLIF(COUNT(*), 0), 2) AS product_pct
    FROM fact_sales;
    """
    fk_cov = pd.read_sql(fk_analysis, engine)
    display(fk_cov.T)
else:
    print('fact_sales table not found.')


üîç FACT TABLE FOREIGN KEY COVERAGE


Unnamed: 0,0
total_rows,9800.0
with_time_key,9800.0
with_customer_key,9800.0
with_geo_key,9800.0
with_product_key,9800.0
time_pct,100.0
customer_pct,100.0
geo_pct,100.0
product_pct,100.0


## 9. Database Statistics

In [14]:
# %% Table Statistics
print('=' * 80)
print('üìä TABLE STATISTICS')
print('=' * 80)

stats_query = """
SELECT 
    relname AS table_name,
    n_live_tup AS live_rows,
    n_dead_tup AS dead_rows,
    last_vacuum,
    last_autovacuum,
    last_analyze,
    last_autoanalyze
FROM pg_stat_user_tables
ORDER BY n_live_tup DESC;
"""

stats_df = pd.read_sql(stats_query, engine)
display(stats_df)

üìä TABLE STATISTICS


Unnamed: 0,table_name,live_rows,dead_rows,last_vacuum,last_autovacuum,last_analyze,last_autoanalyze
0,fact_sales,9800,0,,2026-01-07 17:22:28.372324+00:00,,2026-01-07 17:22:28.410409+00:00
1,dim_product,1861,0,,2026-01-07 17:22:28.417184+00:00,,2026-01-07 17:22:28.427022+00:00
2,dim_time,1230,0,,2026-01-07 17:22:28.361563+00:00,,2026-01-07 17:22:28.368779+00:00
3,dim_customer,793,0,,NaT,,2026-01-07 17:22:28.413928+00:00
4,dim_geography,628,0,,NaT,,2026-01-07 17:22:28.416212+00:00


In [15]:
# %% Final Summary
print('\n' + '=' * 80)
print('‚úÖ DATABASE INFO COMPLETE')
print('=' * 80)

total_tables = len(tables)
total_rows = sum([pd.read_sql(f"SELECT COUNT(*) as cnt FROM {t}", engine)['cnt'].iloc[0] for t in tables])

print(f'\nüìä Total Tables: {total_tables}')
print(f'üìä Total Rows: {total_rows:,}')
print(f'üìä Dimension Tables: {len(dim_tables)}')
print(f'üìä Fact Tables: {len(fact_tables)}')


‚úÖ DATABASE INFO COMPLETE

üìä Total Tables: 5
üìä Total Rows: 14,312
üìä Dimension Tables: 4
üìä Fact Tables: 1

üìä Total Tables: 5
üìä Total Rows: 14,312
üìä Dimension Tables: 4
üìä Fact Tables: 1
