restructuring

kmoppel · kmoppel · commit a898afa5c811 · 2017-08-14T14:03:20.000+03:00
diff --git a/00_create_database.sql b/00_create_database.sql
@@ -1,4 +1,21 @@
+DROP DATABASE IF EXISTS pg_features_demo;
+
+/*
+-- NB! Dropping a database will fail if there are sessions connected to it. In such cases the connected sessions can, for example, be "killed" with:
+SELECT count(*) AS sessions_killed FROM (SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = 'pg_features_demo') a;
+*/
+
 CREATE DATABASE pg_features_demo;
 
+/* Background info:
+
+One Postgres cluster (called also a host or an instance, distinguished by a unique network IP+port pair) can have X amount of databases 
+that only share user accounts (roles).
+
+In case of multiple environments (enterprise scenario) it is highly recommended to include the environment name in the DB name, to
+minimize chances of executing things on the wrong DB - e.g. dev_app1_db, prod_app1_db.
+
+*/
+
 -- assuming "psql" as execution environment here
 \c pg_features_demo
diff --git a/01_create_role.sql b/01_create_role.sql
@@ -1,3 +1,9 @@
+-- Create a role (a.k.a: user, login, group) that is allowed to log in.
+-- NB! By default roles are allowed to connect to all databases of a cluster. This does not mean that they can automatically access tables though.
+-- But if this is not wanted, per DB connections can be set up with "REVOKE/GRANT CONNECT ON DATABASE"
 CREATE ROLE demorole WITH LOGIN;
 
+-- Will later be set up to only allow selecting data
 CREATE ROLE demorole_ro WITH LOGIN;
+
+-- TODO Super, password, mention other auth methods
diff --git a/02_create_schema.sql b/02_create_schema.sql
@@ -1,8 +1,25 @@
-CREATE SCHEMA IF NOT EXISTS demo AUTHORIZATION demorole;
+/*
+Schemas in Postgres are basically "namespaces", allowing tables with same names within one DB, if schema names differ.
+By default there is always one "public" schema pre-created with every database, but for bigger applications it is usually 
+preferable to create more schemas to logically group business functionality: e.g. customer_data, product_data.
+ 
+Hierarchy of objects is thus such:   Cluster -> Database -> Schema -> Table
+*/
+CREATE SCHEMA IF NOT EXISTS banking_demo AUTHORIZATION demorole;
 
--- "usage" for schemas allows looking at structures within schemas
--- "public" means all users
-GRANT USAGE ON SCHEMA demo TO public;
+/*
+When creating schemas, one usually should also define the corresponding access privileges.
+The "USAGE" privilege on a schema is the privilege that allows looking at the structures within that schema.
+Here we also introduce the "public" role, an internal synonym for "all users" - thus if you grant some rights to
+"public", every user has them.
+*/
+GRANT USAGE ON SCHEMA banking_demo TO public;
 
-ALTER DEFAULT PRIVILEGES IN SCHEMA demo
+
+/*
+Postgres has the feature of DEFAULT PRIVILEGES that can be used to easily maintain very granular privilege systems.
+Here for example we declare that for all tables we create in our banking_demo schema, system will automatically grant 
+read rights for the "demorole_ro" role.
+*/
+ALTER DEFAULT PRIVILEGES IN SCHEMA banking_demo
 	GRANT SELECT ON TABLES TO demorole_ro;
diff --git a/02_search_path.sql b/02_search_path.sql
@@ -0,0 +1,22 @@
+/* 
+
+Important concept tied to schemas is "search_path". Basically it's a priority list of schemas, 
+used to look for objects when the schema name is not specified by user. Can be set by all users in their session,
+but when using multiple schemas it usually makes sense to set it DB-wide to decrease amount of typing.
+Schemas can also be used to do sort of "API versioning" when using views or stored procedures.
+
+*/
+
+-- default search path
+SELECT current_setting('search_path') AS default_search_path;
+
+-- currently we need to type for example something like that:
+-- SELECT count(*) from banking_demo.tableX
+
+-- becomes the default for new sessions in this database (already open sessions keep their current search_path)
+ALTER DATABASE pg_features_demo SET search_path TO banking_demo, public;
+-- for this session also
+SET search_path TO banking_demo, public;
+
+-- after that we can type:
+-- SELECT count(*) from tableX
diff --git a/03_make_public_schema_secure.sql b/03_make_public_schema_secure.sql
@@ -1,6 +1,9 @@
--- "usage" for schemas allows looking at structures within schemas
--- avoid usage of "public" schema or better revoke "public" access for sensitive environments
+/*
+"USAGE" for schemas allows looking at structures within schemas.
+Thus for sensitive environments it is recommended to avoid creating objects in the "public" schema (namespace) or to
+revoke "public" (all users) access like below.
+*/
 REVOKE USAGE ON SCHEMA public FROM public;
 
--- whitelist needed users
+-- Whitelist needed users later
 GRANT USAGE ON SCHEMA public TO demorole;
diff --git a/04_alter_role.sql b/04_alter_role.sql
@@ -1,4 +1,6 @@
--- effective with next login
+-- Settings can be defined globally, per user, or per user/db pair (as done here). The most commonly used ones are search_path and statement_timeout.
+-- Changes are effective with next login. The schema is "banking_demo" (see 02_create_schema.sql), so that is what search_path must name.
+
-ALTER ROLE demorole IN DATABASE pg_features_demo SET search_path TO demo, public;
-ALTER ROLE demorole_ro IN DATABASE pg_features_demo SET search_path TO demo, public;
+ALTER ROLE demorole IN DATABASE pg_features_demo SET search_path TO banking_demo, public;
+ALTER ROLE demorole_ro IN DATABASE pg_features_demo SET search_path TO banking_demo, public;
 ALTER ROLE demorole_ro IN DATABASE pg_features_demo SET statement_timeout TO '5s';
diff --git a/05_create_table.sql b/05_create_table.sql
@@ -1,26 +1,70 @@
 -- assume "application" role
+-- it's a good practice to have all objects owned by one "application" role, so that changes can be done using the same role,
+-- without requiring the somewhat dangerous "superuser".
 SET ROLE TO demorole;
 
-CREATE TABLE t_demo (
-	id serial PRIMARY KEY,
-	data jsonb,
-	department text NOT NULL,
-	created_by text NOT NULL DEFAULT current_user,
-	created_on timestamptz NOT NULL DEFAULT now(),
-	last_modified_on timestamptz
+
+-- CREATE TABLEs for our super-simplified banking schema. For those more familiar with Postgres you may notice the schema is 
+-- very similar to the one used by default Postgres benchmarking tool "pgbench"
+
+CREATE TABLE banking_demo.branch (               -- top of the hierarchy: referenced by teller, account and transaction_history
+    branch_id   integer NOT NULL PRIMARY KEY,     -- a bare "id" is not recommended as a name; the more explicit the better for important columns
+    balance     integer NOT NULL DEFAULT 0
+);
+
+CREATE TABLE banking_demo.teller (               -- every teller row carries the id of its branch
+    teller_id   integer NOT NULL PRIMARY KEY,
+    branch_id   integer NOT NULL,
+    balance     integer NOT NULL DEFAULT 0
+);
+
+CREATE TABLE banking_demo.account (              -- every account row carries both a branch id and a teller id
+    account_id  integer NOT NULL PRIMARY KEY,
+    branch_id   integer NOT NULL,
+    teller_id   integer NOT NULL,
+    balance     integer NOT NULL DEFAULT 0
+);
+
+CREATE TABLE banking_demo.transaction_history (  -- teller, branch, account, delta and creation time; no primary key
+    teller_id   integer NOT NULL,
+    branch_id   integer NOT NULL,
+    account_id  integer NOT NULL,
+    delta       integer NOT NULL,
+    created_on  timestamptz NOT NULL DEFAULT now()
 );
 
-COMMENT ON TABLE t_demo IS 'a simple table';
-COMMENT ON COLUMN t_demo.data IS 'JSONB is designed for NoSQL';
 
-INSERT INTO t_demo (data, department)
-  VALUES ('{"user_id": 1, "order_items": [{"item_id":3, "code": "EAS123"}]}', 'sales');
 
--- index top level keys for a simple NoSQL use case.
-CREATE INDEX CONCURRENTLY ON t_demo USING gin (data);
+-- generate 1 branch, 10 tellers for that branch and 10K accounts per teller (100K accounts in total); all balances keep their default of 0
+
+INSERT INTO banking_demo.branch (branch_id)
+    VALUES (1);
+
+INSERT INTO banking_demo.teller (teller_id, branch_id)
+    SELECT teller_no, 1 FROM generate_series(1, 10) AS teller_no;
+
+INSERT INTO banking_demo.account (account_id, teller_id, branch_id)
+    SELECT i, i % 10 + 1, 1 FROM generate_series(1, 100000) AS i;  -- integer bound (1e5 would be numeric); i % 10 + 1 spreads the accounts over tellers 1..10
+
+
+-- Indexes and foreign keys are added only now, after the initial data load
+-- (declaring them before inserting data would be more correct, but the inserts would then be slower)
+
+CREATE INDEX ON banking_demo.account USING btree (teller_id);
+CREATE INDEX ON banking_demo.account USING btree (branch_id);
+CREATE INDEX ON banking_demo.transaction_history USING btree (account_id);
+CREATE INDEX ON banking_demo.transaction_history USING btree (teller_id);
+CREATE INDEX ON banking_demo.transaction_history USING btree (created_on);
+
+ALTER TABLE banking_demo.teller ADD FOREIGN KEY (branch_id) REFERENCES banking_demo.branch (branch_id);
+ALTER TABLE banking_demo.account ADD FOREIGN KEY (branch_id) REFERENCES banking_demo.branch (branch_id);
+ALTER TABLE banking_demo.account ADD FOREIGN KEY (teller_id) REFERENCES banking_demo.teller (teller_id);
+ALTER TABLE banking_demo.transaction_history ADD FOREIGN KEY (branch_id) REFERENCES banking_demo.branch (branch_id);
+ALTER TABLE banking_demo.transaction_history ADD FOREIGN KEY (teller_id) REFERENCES banking_demo.teller (teller_id);
+ALTER TABLE banking_demo.transaction_history ADD FOREIGN KEY (account_id) REFERENCES banking_demo.account (account_id);
 
--- index everything
-CREATE INDEX ON t_demo USING gin (data jsonb_path_ops);
 
--- Prepare for frequent changes, increase FILLFACTOR
-ALTER TABLE t_demo SET (fillfactor=80);
+-- Also, after adding/changing a lot of rows that will be used immediately, it is beneficial to explicitly force gathering
+-- of column statistics with ANALYZE
+ANALYZE banking_demo.teller;
+ANALYZE banking_demo.account;
diff --git a/06_create_table_like.sql b/06_create_table_like.sql
diff --git a/06_create_table_options.sql b/06_create_table_options.sql
@@ -0,0 +1,25 @@
+/*
+Other ways of creating tables are:
+    1) using LIKE to use an existing table as a template and selecting (or leaving out) some constraints/checks/indexes
+    2) create table as select ...
+*/
+
+-- create an empty copy of the account table's structure, leaving out its indexes (t_demo is only created later, in 09_data_type_showcase.sql)
+CREATE TABLE temp (LIKE banking_demo.account EXCLUDING INDEXES);
+
+-- could also do:
+-- create table account_log as select * from banking_demo.account where false;
+
+
+/*
+Other types of tables are:
+    1) temporary tables
+    2) "unlogged" tables
+*/
+
+-- temporary tables are not persistent and are visible only in the session that created them
+CREATE TEMP TABLE t (LIKE banking_demo.account);
+
+-- unlogged tables are not WAL-logged (emptied after a crash) and thus a lot faster to work with
+CREATE UNLOGGED TABLE t_data_staging (LIKE banking_demo.account);
+
diff --git a/07_alter_table.sql b/07_alter_table.sql
@@ -0,0 +1,6 @@
+/*
+Another significant performance tweaking option for tables is the FILLFACTOR parameter.
+It tells Postgres to fill up table pages only to the specified percentage, so that future row updates
+have a chance to be performed "in line". Some "terms and conditions" apply but for certain use cases huge boosts are possible.
+*/
+ALTER TABLE banking_demo.account SET (fillfactor = 80);  -- leave 20% of every newly filled page free for updated rows
diff --git a/07_create_ulogged_table.sql b/07_create_ulogged_table.sql
diff --git a/09_data_type_showcase.sql b/09_data_type_showcase.sql
@@ -0,0 +1,28 @@
+
+
+
+
+-- Showcase table for a few Postgres data types (serial, jsonb, text, timestamptz) and for GIN indexes on jsonb data
+CREATE TABLE t_demo (
+	id serial PRIMARY KEY,
+	data jsonb,
+	department text NOT NULL,
+	created_by text NOT NULL DEFAULT current_user,
+	created_on timestamptz NOT NULL DEFAULT now(),
+	last_modified_on timestamptz
+);
+
+COMMENT ON TABLE t_demo IS 'a simple table';
+COMMENT ON COLUMN t_demo.data IS 'JSONB is designed for NoSQL';
+
+INSERT INTO t_demo (data, department)
+  VALUES ('{"user_id": 1, "order_items": [{"item_id":3, "code": "EAS123"}]}', 'sales');
+
+-- GIN index with the default operator class, for a simple NoSQL use case (e.g. looking up top level keys).
+CREATE INDEX CONCURRENTLY ON t_demo USING gin (data);
+
+-- second GIN index on the same column, this time using the jsonb_path_ops operator class
+CREATE INDEX ON t_demo USING gin (data jsonb_path_ops);
+
+-- Prepare for frequent changes: lower FILLFACTOR to 80 (the table default is 100) so that pages keep some free space
+ALTER TABLE t_demo SET (fillfactor=80);
diff --git a/20_stored_procedures.sql b/20_stored_procedures.sql
@@ -1,7 +1,36 @@
+-- in PL/pgSQL there's no distinction between a function (restricted to returning a single scalar value in Oracle PL/SQL) and a procedure
+-- (true up to PostgreSQL 10; version 11 added CREATE PROCEDURE) - all stored procedural code are 'functions' that can return anything (scalars, multiple values, rows of multiple values or table types) or nothing.
+-- f1_returns_text() below is the minimal case: it takes no arguments and returns the constant text 'demo'.
 CREATE OR REPLACE FUNCTION f1_returns_text() RETURNS text as
 $SQL$
 BEGIN
 	RETURN 'demo';
 END;
 $SQL$
 LANGUAGE plpgsql;
+
+
+-- stored procedures support error handling and subtransactions via BEGIN/EXCEPTION/END block
+-- merge_db(key, data): update-or-insert retry loop; presumably expects a table db(a, b) with a unique key on "a" - that table is not created in these scripts (TODO confirm)
+CREATE OR REPLACE FUNCTION merge_db(key INT, data TEXT) RETURNS VOID AS
+$$
+BEGIN
+    LOOP
+        -- first try to update the key
+        UPDATE db SET b = data WHERE a = key;
+        IF found THEN    -- FOUND is set by the UPDATE above: true when at least one row was changed
+            RETURN;
+        END IF;
+        -- not there, so try to insert the key
+        -- if someone else inserts the same key concurrently,
+        -- we could get a unique-key failure
+        BEGIN            -- nested block, so that a failing INSERT can be caught by the EXCEPTION clause below
+            INSERT INTO db(a,b) VALUES (key, data);
+            RETURN;
+        EXCEPTION WHEN unique_violation THEN
+            -- Do nothing, and loop to try the UPDATE again.
+        END;
+    END LOOP;
+END;
+$$
+LANGUAGE plpgsql;
diff --git a/60_transaction_management.sql b/60_transaction_management.sql
diff --git a/70_basic_analytics.sql b/70_basic_analytics.sql
diff --git a/80_advanced_indexing.sql b/80_advanced_indexing.sql
@@ -0,0 +1,5 @@
+testmain
+testunpriv
+testunpriv2
+hstore_tbl
+hstore_tbl_fail
diff --git a/python_connectivity.py b/python_connectivity.py
@@ -0,0 +1,10 @@
+# Using python and psycopg2 driver ('sudo apt install python-psycopg2' or 'sudo pip install psycopg2') to work with Postgres databases
+
+import psycopg2
+import psycopg2.extras
+
+conn = psycopg2.connect(host='localhost', dbname='pg_features_demo')
+conn.autocommit = True
+cur = conn.cursor(psycopg2.extras.RealDictCursor)
+cur.execute('select current_date as today')
+print cur.fetchone()['today']
diff --git a/todo.txt b/todo.txt
@@ -6,4 +6,8 @@
  - regex
  - FILTER
  - indexing GiST, BRIN...
- 
+ * CTE, wr, rec
+* Transactional DDL
+* Review of data types
+* Extensions
+* COPY