<img src="https://imgs.xkcd.com/comics/exploits_of_a_mom.png" />

In [1]:
%load_ext sql
%sql postgres://jovyan:si330studentuser@localhost:5432/si330
import psycopg2
import sqlalchemy
import pandas as pd
engine = sqlalchemy.create_engine('postgres://jovyan:si330studentuser@localhost:5432/si330')
%sql drop table if exists person cascade
pd.read_csv("datasets/person.csv").to_sql("person",engine)
%sql drop table if exists sales cascade
pd.read_csv("datasets/sales.csv").to_sql("sales",engine)

 * postgres://jovyan:***@localhost:5432/si330
Done.
 * postgres://jovyan:***@localhost:5432/si330
Done.


# A brief reminder of the power of SQL
* There is no ordering in SQL
* Everything is a table (relation), and we can thus pass around select statements as if they were tables
* This allows us to make subselects, and doing so can be very efficient
* We can restrict the data we want back as far as the columns and we can use where clauses to restrict rows on criteria
* We can run aggregation functions across sets of data, and we can group data by column values if we want to

# SQL Pointers
* Practice shaping the data to your needs, not fighting to force it to be your result
* Think about writing complex SQL queries as a choreography of manipulations and not a single monolithic "correct" statement

<img src="https://media1.tenor.com/images/2bea107f3e0d43b0610ab11324ba2b6d/tenor.gif?itemid=4849726" />

# Joins
* It is common to bring together datasets through SQL joins
* Unfortunatly, there are many different syntaxes for joining in SQL, which can impede learning
* Just like in pandas, joining in SQL (which we called merging in pandas) is based on 2 tables only, a left and a right
* A quick reminder on joining in general

<img src="https://i.stack.imgur.com/hMKKt.jpg" />

In [2]:
# to join we need a common column between the two tables
# usually this column is unique in one table (like a person 
# person in our person table)
%sql select * from person limit 1

 * postgres://jovyan:***@localhost:5432/si330
1 rows affected.


index,first_name,last_name,street_address,city,postcode,id
0,Amanda,Miller,4302 John Skyway Apt. 650,West Victor,55311,4910


In [3]:
# and usually this column is not unique in the other table (like
# the person identifier in the sales table)
%sql select * from sales limit 1

 * postgres://jovyan:***@localhost:5432/si330
1 rows affected.


index,credit_card_number,credit_card_provider,id,price,purchase_date,isbn10
0,2286937402408045,Discover,8360,7681,2020-03-02,0-7276-4998-1


In [4]:
# whoever designed this data did a poor job, since it's unclear
# where we expect to link data. We can strengthen this in a few ways,
# including key constraints. Or a better naming convention, like
%sql alter table sales rename column id to person_id;

 * postgres://jovyan:***@localhost:5432/si330
Done.


[]

In [4]:
%sql select * from sales limit 1

 * postgres://jovyan:***@localhost:5432/si330
1 rows affected.


index,credit_card_number,credit_card_provider,id,price,purchase_date,isbn10
0,2286937402408045,Discover,8360,7681,2020-03-02,0-7276-4998-1


 * postgres://jovyan:***@localhost:5432/si330
(psycopg2.errors.UndefinedColumn) column sales.person_id does not exist
LINE 1: SELECT * FROM sales, person WHERE sales.person_id = person.i...
                                          ^

[SQL: SELECT * FROM sales, person WHERE sales.person_id = person.id limit 5;]
(Background on this error at: http://sqlalche.me/e/13/f405)


In [5]:
# in this instance, we might want to answer the a question like "Give me all
# of the sales and people information for people who bought things". This is
# an inner join, since we want all data but on strict condition checking.
%sql select * from sales s, person p where s.person_id=p.id limit 5;

 * postgres://jovyan:***@localhost:5432/si330
(psycopg2.errors.UndefinedColumn) column s.person_id does not exist
LINE 1: select * from sales s, person p where s.person_id=p.id limit...
                                              ^

[SQL: select * from sales s, person p where s.person_id=p.id limit 5;]
(Background on this error at: http://sqlalche.me/e/13/f405)


In [6]:
# we have can limit these columns to just include those from the two named tables
# using glob functions, or specific names
%sql select p.*, s.price, s.purchase_date from sales s, person p where s.person_id=p.id limit 5;

 * postgres://jovyan:***@localhost:5432/si330
(psycopg2.errors.UndefinedColumn) column s.person_id does not exist
LINE 1: ...ice, s.purchase_date from sales s, person p where s.person_i...
                                                             ^

[SQL: select p.*, s.price, s.purchase_date from sales s, person p where s.person_id=p.id limit 5;]
(Background on this error at: http://sqlalche.me/e/13/f405)


In [9]:
# ok, the inner join is easy, and that's the syntax I use all the time
# but we can do the same inner join using the SQL join syntax, and this give us the ability
# to do other kinds of joins as well
# inner join is default
# the join syntax is

# SELECT FROM table1 JOIN table2 ON table1.column=table2.column
%sql select * from person p join sales s on p.id=s.person_id limit 2;

#inner join
#%sql select * from person p INNER JOIN sales s on p.id=s.person_id limit 2;

#full (outer) join
#%sql select * from person p FULL JOIN sales s on p.id=s.person_id limit 2;

#left join
#%sql select * from person p LEFT JOIN sales s on p.id=s.person_id limit 2;

#right join
#%sql select * from person p RIGHT JOIN sales s on p.id=s.person_id limit 2;

 * postgres://jovyan:***@localhost:5432/si330
2 rows affected.


index,first_name,last_name,street_address,city,postcode,id,index_1,credit_card_number,credit_card_provider,person_id,price,purchase_date,isbn10
981,Caleb,Perry,6298 Ruben Shoal,New Evanfurt,14301,8360,0,2286937402408045,Discover,8360,7681,2020-03-02,0-7276-4998-1
480,Geoffrey,Ray,28512 Hunt Wells,East Reginaldberg,15931,2416,1,30009424238819,JCB 15 digit,2416,1820,2020-04-13,0-617-66614-8


* to do the other kinds of joins, we just change JOIN to one of
  * INNER JOIN (this is a synonym for what we already have)
  * LEFT JOIN (give us everything in the left as well as things that might match in the right, or null if no match)
  * RIGHT JOIN (give us everything in the right as well as things that might match in the left, or null if no match)
  * FULL JOIN (give us everything everywhere with nulls as appropriate)

In [10]:
# First, let's do a sanity check, how many people are in the system but have not 
# bought anything
%sql select count(*) from person p where p.id not in (select person_id from sales);

 * postgres://jovyan:***@localhost:5432/si330
1 rows affected.


count
200


In [11]:
# and how many people are in the sales but are not in the system?
%sql select count(*) from sales s where s.person_id not in (select id from person);

 * postgres://jovyan:***@localhost:5432/si330
1 rows affected.


count
0


In [None]:
# let's save it!

In [12]:
%%sql
create table datastore as (
    select p.first_name, p.last_name, s.price
    from person p join sales s on p.id=s.person_id
)

 * postgres://jovyan:***@localhost:5432/si330
1000 rows affected.


[]

In [13]:
%sql select * from datastore limit 10;

 * postgres://jovyan:***@localhost:5432/si330
10 rows affected.


first_name,last_name,price
Caleb,Perry,7681
Geoffrey,Ray,1820
Misty,Smith,2782
Cindy,Lewis,844
Corey,Michael,4740
Alejandro,Jones,2332
Christopher,Caldwell,5996
Michael,Thompson,7010
Mariah,Lucas,9009
Kyle,Mcfarland,8473


In [None]:
# So our sales table is, with respect to person identifier, a subset of our person table

In [14]:
# To demonstrate the other kinds of joins we need to remove some data from persons so that
# the one table isn't a complete superset of the other. This is just for demonstration purposes
%sql delete from person where id %2=0

 * postgres://jovyan:***@localhost:5432/si330
524 rows affected.


[]

In [15]:
%sql select count(*) from person

 * postgres://jovyan:***@localhost:5432/si330
1 rows affected.


count
476


In [18]:
# Ok, let's demonstrate the inner join again
%sql select p.first_name, s.price from person p join sales s on p.id=s.person_id limit 10;

 * postgres://jovyan:***@localhost:5432/si330
10 rows affected.


first_name,price
Alejandro,2332
Christopher,5996
Mariah,9009
Deborah,6870
David,6233
David,5526
Dustin,4832
Cindy,586
Michael,8981
Todd,8048


In [20]:
# now if I left join it, I could have more records. These would be people who are in our table
# but are not in our sales table. And we should see some null values on the right hand side (sales)
%sql select p.first_name, s.price from person p LEFT JOIN sales s on p.id=s.person_id limit 10;

 * postgres://jovyan:***@localhost:5432/si330
10 rows affected.


first_name,price
Alejandro,2332
Christopher,5996
Mariah,9009
Deborah,6870
David,6233
David,5526
Dustin,4832
Cindy,586
Michael,8981
Todd,8048


In [21]:
# now if I right join it, I could have more records. These would be sales who are in our sales table
# but are not in our people table. And we should see some null values on the left hand side (person)
%sql select p.first_name, s.price from person p RIGHT JOIN sales s on p.id=s.person_id limit 10;

 * postgres://jovyan:***@localhost:5432/si330
10 rows affected.


first_name,price
,7681
,1820
,2782
,844
,4740
Alejandro,2332
Christopher,5996
,7010
Mariah,9009
,8473


In [22]:
# now a full join will bring me the most rows back, because it will bring me back all people and
# all sales regardless of whether they exist in each table, but it will align them if possible (and
# null padd if not)
%sql select p.first_name, s.price from person p FULL JOIN sales s on p.id=s.person_id limit 10;

 * postgres://jovyan:***@localhost:5432/si330
10 rows affected.


first_name,price
,7681
,1820
,2782
,844
,4740
Alejandro,2332
Christopher,5996
,7010
Mariah,9009
,8473


### Summary
* joining is fundamental in SQL
* I've covered it here in a nutshell, you can already use your pandas powers to do this!
* You can use joining to weed out values (many ways to skin a cat with SQL!)

In [23]:
%%sql
-- how would we get the top 25 people we have made money from?
select per.first_name, per.last_name, per.street_address, top25.total_sales, top25.number_bought from person per
join (
    select count(*) as number_bought, sum(s.price) as total_sales, p.id 
    from person p join sales s on p.id=s.person_id 
    group by p.id
    order by total_sales desc
    limit 25
) top25 on per.id=top25.id;

 * postgres://jovyan:***@localhost:5432/si330
25 rows affected.


first_name,last_name,street_address,total_sales,number_bought
Heather,Huber,56064 Colleen Mall Suite 443,12446,2
Howard,Brown,60122 Miller Street Apt. 853,12641,2
Justin,Rivera,593 Lindsay Extensions Apt. 104,18215,2
Holly,Meyers,542 Jefferson Mountains,15825,3
Levi,Singh,075 Cruz Courts Suite 471,23520,4
Geoffrey,Pace,12785 Lisa Creek Suite 589,12775,2
Daniel,Hernandez,0895 Benjamin Highway Suite 263,19401,2
Michael,Dominguez,17506 Phillips Flats Suite 866,13491,2
Michael,Hernandez,850 Bishop Oval Suite 991,13244,4
Matthew,Chapman,65222 Jared Groves Suite 225,13719,2
