# A summary of basic SQL commands for PostgreSQL

No data is linked to this notebook yet, so do not run the cells!

## 1. Basic select column from a table

In [None]:
-- return the name column for every row of the people table
SELECT name
FROM people;

## 2. Select multiple columns

In [None]:
-- select two columns
-- (the column is called birthdate in the other queries on the people table)
SELECT name, birthdate
FROM people;

-- select all columns
SELECT *
FROM people;

-- select all columns, but limit the number of rows returned
SELECT *
FROM people
LIMIT 10;

## 3. Select Distinct

select all the unique values from a column

In [None]:
-- each language appears only once in the result
SELECT DISTINCT language
FROM films;

## 4. Count

count the number of rows in one or more columns

In [None]:
-- COUNT(*) counts every row of the table
SELECT COUNT(*)
FROM people;

## 5. filtering: WHERE

In [None]:
-- text comparison: string literals must be wrapped in single quotes
SELECT title
FROM films
WHERE title = 'Metropolis';

-- <> means "not equal"
SELECT title
FROM films
WHERE title <> 'Metropolis';

-- numerical comparison
SELECT *
FROM films
WHERE budget > 1000;

## 6. Multiple condition: WHERE AND / WHERE OR

In [None]:
-- AND: both conditions must hold (released after 1994 and before 2000)
SELECT title
FROM films
WHERE release_year > 1994
  AND release_year < 2000;

-- OR: at least one condition must hold
-- NOTE: every non-NULL release_year satisfies this particular pair of conditions,
-- so this query only filters out rows where release_year is NULL
SELECT title
FROM films
WHERE release_year > 1994
   OR release_year < 2000;

## 7. checking range: BETWEEN (instead of WHERE AND)

In [None]:
-- BETWEEN is inclusive on both ends (1994 and 2000 are both matched)
SELECT title
FROM films
WHERE release_year BETWEEN 1994 AND 2000;

## 8. WHERE IN: specify multiple values in a WHERE clause (instead of repeating WHERE ... OR many times)

In [None]:
-- IN matches any value from the list
SELECT name
FROM kids
WHERE age IN (2, 4, 6, 8, 10);

-- same as
SELECT name
FROM kids
WHERE age = 2
OR age = 4
OR age = 6
OR age = 8
OR age = 10;

## 9. NULL and IS NULL and IS NOT NULL

In SQL, NULL represents a missing or unknown value.

In [None]:
-- count the number of missing birthdates in the people table
SELECT COUNT(*)
FROM people
WHERE birthdate IS NULL;

-- names of all people whose birthdates are not missing
SELECT name
FROM people
WHERE birthdate IS NOT NULL;

## 10. LIKE and NOT LIKE

Used in a WHERE clause to search for a pattern.    
`%` matches zero, one, or many characters in the text    
`_` matches exactly one character    

In [None]:
-- % wildcard: matches 'Data', 'DataC', 'DataCamp', 'DataMind'
SELECT name
FROM companies
WHERE name LIKE 'Data%';

-- _ wildcard: matches 'DataCamp', 'DataComp'
SELECT name
FROM companies
WHERE name LIKE 'DataC_mp';

## 11. Aggregate functions

In [None]:
-- average value of a column
SELECT AVG(budget)
FROM films;

-- largest value of a column
SELECT MAX(budget)
FROM films;

-- smallest value of a column (replaces a duplicated MAX query)
SELECT MIN(budget)
FROM films;

-- sum of all values of a column
SELECT SUM(budget)
FROM films;

## 12. Aliasing: AS

change output col name

In [None]:
-- AS gives each aggregated output column a readable name
SELECT
    MAX(budget)   AS max_budget,
    MAX(duration) AS max_duration
FROM films;

## 13. ORDER BY

used to sort results in ascending or descending order according to the values of one or more columns.   

By default ORDER BY will sort in ascending order. If you want to sort the results in descending order, you can use the DESC keyword. For example,

In [None]:
-- sort by a single column, newest release first
SELECT title
FROM films
ORDER BY release_year DESC;

-- sort by multiple columns: the order of the columns matters.
-- Rows are sorted by birthdate first (oldest to newest),
-- then rows with equal birthdates are sorted by name in alphabetical order.
SELECT birthdate, name
FROM people
ORDER BY birthdate, name;

## 14. GROUP BY

GROUP BY allows you to group a result by one or more columns.    
Commonly, GROUP BY is used with aggregate functions like COUNT() or MAX(). Note that GROUP BY always goes after the FROM clause!

In [None]:
-- number of employees per sex
SELECT sex, count(*)
FROM employees
GROUP BY sex;

-- same, sorted by group size; ORDER BY always comes last.
-- "count" is the default output column name PostgreSQL gives to count(*).
SELECT sex, count(*)
FROM employees
GROUP BY sex
ORDER BY count DESC;

## 15. HAVING

In SQL, aggregate functions can't be used in WHERE clauses.    
This means that if you want to filter based on the result of an aggregate function, you need another way! That's where the HAVING clause comes in.   

In [None]:
-- NOT valid: aggregate functions cannot be used in WHERE
-- (this statement is shown as a counter-example and raises an error if run)
SELECT release_year
FROM films
GROUP BY release_year
WHERE COUNT(title) > 10;

-- valid: filter on the aggregate with HAVING
SELECT release_year
FROM films
GROUP BY release_year
HAVING COUNT(title) > 10;

## 16. Query information_schema with SELECT

information_schema is a meta-database that holds information about your current database. information_schema has multiple tables you can query with the known SELECT * FROM syntax:   

tables: information about all tables in your current database    
columns: information about all columns in all of the tables in your current database    

The 'public' schema holds information about user-defined tables and databases.    

In [None]:
-- list the names of all tables in the 'public' schema
-- (information_schema.tables holds one row per table)
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public';

-- list the columns and their data types for one table
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = 'university_professors' AND table_schema = 'public';

## 17. Create new tables with CREATE TABLE

In [None]:
-- sample template
CREATE TABLE table_name (
    column_a data_type,
    column_b data_type,
    column_c data_type
);

-- e.g.
CREATE TABLE weather (
    clouds text,
    temperature numeric,
    weather_station char(5)
);

## 18. ADD a COLUMN with ALTER TABLE

In [None]:
-- sample syntax
ALTER TABLE table_name
ADD COLUMN column_name data_type;

## 19. INSERT DISTINCT records INTO new tables

In [None]:
-- copy only the distinct records from the source table into the new (target) table
-- NOTE(review): source column assumed to be `organization` (singular) — confirm against the schema
INSERT INTO organizations                 -- new table, i.e. the target table
SELECT DISTINCT organization,             -- columns selected from the source table
       organization_sector
FROM university_professors;               -- source table

-- normal INSERT INTO usage
-- string literals take single quotes; double quotes denote identifiers in PostgreSQL
INSERT INTO table_name (column_a, column_b)
VALUES ('value_a', 'value_b');

## 20. RENAME a COLUMN

In [None]:
-- rename a column in place; its data is left untouched
ALTER TABLE table_name RENAME COLUMN old_name TO new_name;

## 21. DROP a COLUMN

In [None]:
-- remove a column (and its data) from the table
ALTER TABLE table_name DROP COLUMN column_name;

## 22. Integrity constraints

1) Attribute constraints: e.g. data types on columns   
2) Key constraints: e.g. primary keys   
3) Referential integrity constraints:  enforced through foreign keys, A record in table A cannot point to a record in table B that does not exist.    

Referential integrity is violated if:    
if a record in table B that is referenced from a record in table A is deleted.      
if a record in table A referencing a non-existing record from table B is inserted.     


Constraints give the data structure, help with consistency and data quality    


Data types define the "domain" of a column (what form its values may take) and therefore which operations are possible:   
text: character strings of any length    
varchar[(x)]: a maximum of x characters    
char[(x)]: a fixed-length string of x characters    
boolean: only TRUE, FALSE and NULL (unknown) are allowed    
date, time and timestamp     
numeric: arbitrary-precision numbers (e.g. numeric(3,2) can store 5.54)    
integer or bigint

## 23. type CAST

In [None]:
-- some_column is originally of type text; the CAST only applies to this query's
-- result and does not change the stored column type.
-- (table_name is a placeholder; the bare word "table" is a reserved keyword)
SELECT CAST(some_column AS integer)
FROM table_name;

## 24. ALTER types after table creation

In [None]:
-- change the declared type of an existing column
ALTER TABLE students
    ALTER COLUMN name TYPE varchar(128);

## 25. USING

In [None]:
-- USING tells PostgreSQL how to compute the new value during the type change.
-- Here ROUND() is applied before the conversion, so 5.54 becomes 6, not 5.
ALTER TABLE students
ALTER COLUMN average_grade
TYPE integer
USING ROUND(average_grade);


-- Because you want to reserve only x characters for column_name,
-- you have to retain a SUBSTRING of every value, i.e. the first x characters of it,
-- and throw away the rest. This way, the values will fit the varchar(x) requirement.
-- (x is a placeholder for the desired length)
ALTER TABLE table_name
ALTER COLUMN column_name
TYPE varchar(x)
USING SUBSTRING(column_name FROM 1 FOR x);

## 26 not-null constraint

Disallows NULL values in a certain column.   
This must hold for the current state and for any future state of the table.    


In [None]:
-- create a table with a not-null constraint on ssn
CREATE TABLE students (
    ssn integer NOT NULL,
    home_phone integer
);

-- add a not-null constraint to a column of an existing table
ALTER TABLE students
ALTER COLUMN home_phone
SET NOT NULL;

-- remove a not-null constraint from a column of an existing table
ALTER TABLE students
ALTER COLUMN ssn
DROP NOT NULL;

## 27. the unique constraints

disallow duplicate values in a column   


In [None]:
-- create a table with a unique constraint
CREATE TABLE students (
    ssn integer UNIQUE
);

-- add a unique constraint to an existing table;
-- the constraint has to be given a name (here: some_name)
ALTER TABLE students
ADD CONSTRAINT some_name UNIQUE (column_name);


## 28. Keys and super keys

Keys are attributes that uniquely identify a record in a table.   
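Superkeys: any set of attributes that uniquely identifies a record; a superkey may still contain attributes that could be removed without losing uniqueness. Normally the combination of all attributes in a table is unique, hence the full set of columns is a superkey.    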
If no more attribute can be removed, and each record can still be uniquely identified, it is called a minimal superkey or **key**.    

Identify a key:   
1) Count the distinct records for all possible combinations of columns. If the resulting number x equals the number of all rows in the table for a combination, you have discovered a superkey.   

2) Then remove one column after another until you can no longer remove columns without seeing the number x decrease. If that is the case, you have discovered a (candidate) key.   

**Primary Keys:**     
(Almost) Every database table should have a primary key.    
Primary keys need to be defined on columns that don't accept NULL or duplicate values.    
Primary key constraints must be time-invariant (they must hold now and in the future)    

**Surrogate Keys:**    
They are keys that are not based on the existing data columns, but on a column that exists just for the sake of having a primary key.    


**Foreign Keys:**    
A foreign key points to the primary key of another table.    
The domain and datatype must be the same as the primary key.    
Each value of foreign key must exist in primary key of the other table (referential integrity)     
Foreign keys are not actual keys: duplicates and NULLs are allowed.     

1 to N relationship:    
implemented using 1 foreign key in the table that has at most one foreign entity associated.     

N to M relationship:    
create a table with 2 foreign keys that point to both connected entity (points the two primary key of the 2 connected tables.)


In [None]:
-- example of specifying a primary key on a single column
CREATE TABLE products (
    product_no integer PRIMARY KEY,
    name text,
    price numeric
);

-- multiple columns as primary key
CREATE TABLE example (
    a integer,
    b integer,
    c integer,
    PRIMARY KEY (a, c)
);

-- add a primary key constraint to an existing table;
-- as with the unique constraint, the key has to be given a name
ALTER TABLE table_name
ADD CONSTRAINT some_name PRIMARY KEY (column_name);

-- e.g.
-- make id a primary key
ALTER TABLE organizations
ADD CONSTRAINT organization_pk PRIMARY KEY (id);

In [None]:
-- add a surrogate key using the serial data type;
-- all existing rows get numbered automatically
ALTER TABLE cars
ADD COLUMN id serial PRIMARY KEY;

-- add a new record to cars, which has 3 data columns plus id;
-- id is not listed here: it is auto-incremented on insert
INSERT INTO cars
VALUES ('Volkswagen', 'Blitz', 'black');

In [None]:
-- create a surrogate key by combining 2 existing columns into a new one

-- step 1: add a new column with the varchar data type
ALTER TABLE table_name
ADD COLUMN column_c varchar(256);

-- step 2: fill that column with the concatenation of the two existing columns
UPDATE table_name
SET column_c = CONCAT(column_a, column_b);

-- step 3: turn that column into a primary key
ALTER TABLE table_name
ADD CONSTRAINT pk PRIMARY KEY (column_c);

In [None]:
-- modelling a 1 to N relationship
-- specifying foreign keys
CREATE TABLE manufacturers (
    name varchar(255) PRIMARY KEY
);

INSERT INTO manufacturers
VALUES ('Ford'), ('VW'), ('GM');

-- the foreign key column must have the same data type as the primary key
-- it references (varchar(255)), otherwise the constraint cannot be created
CREATE TABLE cars (
    model varchar(255) PRIMARY KEY,
    manufacturer_name varchar(255) REFERENCES manufacturers (name)
);

INSERT INTO cars
VALUES ('Ranger', 'Ford'), ('Beetle', 'VW');

-- only cars with valid and existing manufacturers can enter this table


-- specifying a foreign key on an existing table
ALTER TABLE a
ADD CONSTRAINT a_fkey FOREIGN KEY (b_id) REFERENCES b (id);

In [None]:
-- modelling an N to M relationship
-- create a table with an additional attribute called function (specific to this example)
-- this table connects professors and organizations N:M
-- no primary key is defined! (the three columns together can uniquely identify an entry)
-- the referenced table is named organizations, as in the primary key example
CREATE TABLE affiliations (
    professor_id integer REFERENCES professors (id),
    organization_id varchar(256) REFERENCES organizations (id),
    function varchar(256)
);

In [None]:
-- update columns of a table based on values in another table.
-- For each row in table_a, find the corresponding row in table_b where
-- condition1, condition2, etc. are met.
-- Set the value of column_to_update to the value of column_to_update_from
-- (from that corresponding row).
-- condition1 / condition2 are placeholders; chain further conditions with AND as needed.
UPDATE table_a
SET column_to_update = table_b.column_to_update_from
FROM table_b
WHERE condition1 AND condition2;


In [None]:
-- Dealing with referential integrity violations:
-- tell the database what to do if an entry in the referenced table is deleted.
-- The two CREATE TABLE statements below are alternative definitions of the same
-- table; run only one of them.

-- ON DELETE NO ACTION is applied by default:
-- if you try to delete a referenced record in table b, the system throws an error.
CREATE TABLE a (
    id integer PRIMARY KEY,
    column_a varchar(64),
    b_id integer REFERENCES b (id) ON DELETE NO ACTION
);

-- ON DELETE CASCADE allows the deletion of a record from table b and then
-- automatically deletes all referencing records in table a.
CREATE TABLE a (
    id integer PRIMARY KEY,
    column_a varchar(64),
    b_id integer REFERENCES b (id) ON DELETE CASCADE
);

-- other options (written in place of NO ACTION / CASCADE above):
--   ON DELETE RESTRICT     almost the same as NO ACTION, throws an error
--   ON DELETE SET NULL     sets the value of this foreign key to NULL
--   ON DELETE SET DEFAULT  sets the foreign key to its default value if the referenced
--                          record is deleted (a default must be defined first)

## 29. Inner Join (with INNER JOIN)

Assuming we have two table: left_table and right_table (each table with id(key) and val(value) column, and multiple rows)     

With inner joins we look for matches in the right_table corresponding to all entries in the key field in the left_table.      
Also introducing table alias:    
For tables you also use AS to add the alias immediately after the table name with a space

In [None]:
-- example of inner join
-- both tables have country and continent columns, so these need the p1.
-- prefix (alias of the first table, defined below)
SELECT p1.country, p1.continent,
       prime_minister, president
FROM prime_ministers AS p1
INNER JOIN presidents AS p2
ON p1.country = p2.country;

In [None]:
-- multiple inner joins: each additional table gets its own INNER JOIN ... ON
SELECT *
FROM left_table
  INNER JOIN right_table
    ON left_table.id = right_table.id
  INNER JOIN another_table
    ON left_table.id = another_table.id;

## 30. Inner Join (with USING)

If the id from both table are the same, instead of:      
ON p1.country = p2.country      
we can use:      
USING (country)      


In [None]:
-- example of inner join, with USING
-- both tables have country and continent columns, so these need the p1.
-- prefix (alias of the first table, defined below)
SELECT p1.country, p1.continent,
       prime_minister, president
FROM prime_ministers AS p1
INNER JOIN presidents AS p2
USING (country);

## 31. Self Join

Self-joins are used to compare values in a field to other values of the same field from within the same table.     


In [None]:
-- join prime_ministers to itself to pair up countries on the same continent
SELECT p1.country AS country1, p2.country AS country2, p1.continent
FROM prime_ministers AS p1
INNER JOIN prime_ministers AS p2
ON p1.continent = p2.continent
   AND p1.country <> p2.country;  -- the 2nd condition eliminates pairing a country with itself

## 32. CASE (with WHEN, THEN)

works with numerical values

In [None]:
-- CASE creates a new column named indep_year_group holding one of the labels below;
-- the first matching WHEN wins, so the second branch covers 1900 to 1930
SELECT name, continent, indep_year,
    CASE WHEN indep_year < 1900 THEN 'before 1900'
         WHEN indep_year <= 1930 THEN 'between 1900 and 1930'
         ELSE 'after 1930' END
    AS indep_year_group
FROM states
ORDER BY indep_year_group;

In [None]:
-- bonus: create a new table named countries_plus using INTO
SELECT name, continent, code, surface_area,
    CASE WHEN surface_area > 2000000
            THEN 'large'
       WHEN surface_area > 350000
            THEN 'medium'
       ELSE 'small' END
       AS geosize_group
INTO countries_plus
FROM countries;

## 33. Outer Join : LEFT JOIN, RIGHT JOIN, FULL JOIN

Left Join:     
keeps all the records in the left_table, then marks the values as missing in the right_table for those that don't have a match. Unmatched records in the right_table are ignored.       
If one record in the left_table (i.e. key) matches with multiple records in the right_table (i.e. with their key in the right_table), then there will be multiple record in the joined table (each corresponding to a record in right_table with the same record id in the left_table)       

Right Join:     
Same as left join just that it does the reverse.       

Full Join:     
Essentially combines left join and right join.    
It will bring in all records from both the left_table and right_table and keep track of the missing values accordingly.     
If select all cols from the two tables, it will create 4 cols (left_id right_id left_val right_val with corresponding missing values)       

The order of the tables matters when doing a full join: the output will look a bit different.     








In [None]:
-- left join example: every prime_ministers row is kept;
-- president is NULL where no matching country exists in presidents
SELECT p1.country, prime_minister, president
FROM prime_ministers AS p1
LEFT JOIN presidents AS p2
ON p1.country = p2.country;



In [None]:
-- right join: every right_table row is kept;
-- L_val is NULL where no matching id exists in left_table
SELECT right_table.id AS R_id,
       left_table.val AS L_val,
       right_table.val AS R_val
FROM left_table
RIGHT JOIN right_table
ON left_table.id = right_table.id;

In [None]:
-- FULL join syntax: rows from both tables are kept, with NULLs on the unmatched side
SELECT left_table.id AS L_id,
       right_table.id AS R_id,
       left_table.val AS L_val,
       right_table.val AS R_val
FROM left_table
FULL JOIN right_table
USING (id);

## 34. Cross Join


Create all possible combination of two tables (i.e. all combination on the left_id and right_id).     



In [None]:
-- example of cross join: every selected prime minister is paired with every president
SELECT prime_minister, president
FROM prime_ministers AS p1
CROSS JOIN presidents AS p2
WHERE p1.continent IN ('North America', 'Oceania');

## 35. Set theory: union, union all, intersect, except

They only bind fields on top of one another in the two tables (not like join)     
**NOTE** INTERSECT looks for RECORDS in common, NOT an individual key field like a join does when matching     
The same data type is required, since the stacked values are returned in a single field (column)     
UNION ALL: similar to UNION, but keeps duplicates (records in the intersection are counted twice).     
EXCEPT: returns only those records that are in one table but not in the other.     


In [None]:
-- union
-- prime ministers and monarchs stacked into one leader column
-- (the column is named monarch, as in the EXCEPT example)
SELECT prime_minister AS leader, country
FROM prime_ministers
UNION
SELECT monarch, country
FROM monarchs
ORDER BY country;

In [None]:
-- intersect on one field: countries that appear in both tables
SELECT country
FROM prime_ministers
INTERSECT
SELECT country
FROM presidents;

In [None]:
-- intersect on two fields
-- a record is returned only if both the country and the leader match,
-- i.e. a country whose prime minister and president have the same name.
-- Only two columns are returned: country and leader.
SELECT country, prime_minister AS leader
FROM prime_ministers
INTERSECT
SELECT country, president
FROM presidents;

In [None]:
-- except
-- monarchs that aren't prime ministers
SELECT monarch, country
FROM monarchs
EXCEPT
SELECT prime_minister, country
FROM prime_ministers;

## 36. Semi-join and Anti-join (subqueries)

Use a right table to determine which records to keep in the left table.     
In a way similar to a WHERE clause dependent on the values of a second table     

Semi-join:     
chooses records in the first table where a condition is met in a second table.    

Anti-join:     
choose records in the first table where a condition is not met in the second table.    

In [None]:
-- semi-join

-- building block 1: a WHERE clause on the second table
SELECT name
FROM states
WHERE indep_year < 1800;

-- building block 2: display certain fields of the first table
SELECT president, country, continent
FROM presidents;

-- combine:
-- use the first query as the condition for the second one
SELECT president, country, continent
FROM presidents
WHERE country IN
    (SELECT name
    FROM states
    WHERE indep_year < 1800);


In [None]:
-- anti-join: keep the presidents whose country is NOT returned by the subquery
SELECT president, country, continent
FROM presidents
WHERE country NOT IN
    (SELECT name
    FROM states
    WHERE indep_year < 1800);


## 37. Nested query (subquery): WHERE and SELECT

The most common type of subquery is one inside a WHERE statement     


In [None]:
-- e.g. Asian countries with a below-average fert_rate
-- subquery in a WHERE clause: the average is computed over all rows of states
SELECT name, fert_rate
FROM states
WHERE continent = 'Asia'
AND fert_rate <
    (SELECT AVG(fert_rate)
    FROM states);

In [None]:
-- subquery in a SELECT clause
-- e.g. count the number of countries listed in the states table for each
-- continent in the prime_ministers table.
-- A subquery in SELECT is given an alias so that its output column has a name.
SELECT DISTINCT continent,
    (SELECT COUNT(*)
    FROM states
    WHERE prime_ministers.continent = states.continent) AS contries_num
FROM prime_ministers;

## 38. Subquery: FROM



In [None]:
-- build-up:
-- maximum percentage of women in parliament for each continent listed in the states table
SELECT continent, MAX(women_parli_perc) AS max_perc
FROM states
GROUP BY continent
ORDER BY continent;

-- if we are only interested in the continents present in the monarchs table:
-- the subquery acts as a temporary table in FROM and therefore needs an alias
SELECT DISTINCT monarchs.continent, subquery.max_perc
FROM monarchs,
    (SELECT continent, MAX(women_parli_perc) AS max_perc
    FROM states
    GROUP BY continent) AS subquery
WHERE monarchs.continent = subquery.continent
ORDER BY continent;