In [2]:
import sqlite3
import pandas as pd

In [3]:
# Never truncate columns when a DataFrame is rendered.
pd.set_option("display.max_columns", None)
# Row truncation stays at the pandas default; uncomment to show every row as well.
#pd.set_option("display.max_rows", None)

In [4]:
# Connection to the parch-and-posey.db SQLite file; every query cell below reuses it.
DB_FILE = 'parch-and-posey.db'
conn = sqlite3.connect(DB_FILE)

In [5]:
# List the tables in the database by querying SQLite's schema catalog
# (sqlite_master). Leaves two names for the next cell:
#   columns - column names of the result, in order
#   data    - list of row tuples
cursor = conn.cursor()
try:
    # String literals take single quotes in SQL. Double quotes denote
    # identifiers; SQLite only falls back to treating "table" as a string as a
    # legacy quirk, and that fallback can be disabled in some builds.
    cursor.execute('''
select * from sqlite_master where type = 'table';
''')
    # Each entry of cursor.description is a 7-tuple whose first item is the column name.
    columns = [col[0] for col in cursor.description]
    data = cursor.fetchall()
finally:
    # Release the cursor even when the query or fetch fails.
    cursor.close()

In [6]:
# Show the fetched sqlite_master rows as a table, labelled with the cursor's column names.
pd.DataFrame(data=data, columns=columns)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,web_events,web_events,2,"CREATE TABLE web_events (\tid integer,\taccoun..."
1,table,sales_reps,sales_reps,7,"CREATE TABLE sales_reps (\tid integer,\tname b..."
2,table,region,region,222,"CREATE TABLE region (\tid integer,\tname bpchar)"
3,table,orders,orders,223,"CREATE TABLE orders (\tid integer,\taccount_id..."
4,table,accounts,accounts,583,"CREATE TABLE accounts (\tid integer,\tname bpc..."


# UNION

UNION Use Case

The UNION operator is used to combine the result sets of 2 or more SELECT statements. It removes duplicate rows from the combined result, so each distinct row appears only once no matter how many of the SELECT statements return it.
Each SELECT statement within the UNION must have the same number of fields in the result sets with similar data types.
Typically, the use case for leveraging the UNION command in SQL is when a user wants to pull together distinct values of specified columns that are spread across multiple tables. For example, a chef wants to pull together the ingredients and respective aisle across three separate meals that are maintained in different tables.

Details of UNION

There must be the same number of expressions in both SELECT statements.
The corresponding expressions must have the same data type in the SELECT statements. For example: expression1 must be the same data type in both the first and second SELECT statement.

Expert Tip

UNION removes duplicate rows.
UNION ALL does not remove duplicate rows.

Resources

The resource here on SQL UNIONs is helpful in understanding syntax and examples.

https://www.techonthenet.com/sql/union.php

In [9]:
# UNION of web_events with itself: identical rows are collapsed, so the result
# (see the row index in the output) has no more rows than a single copy.
union_sql = '''
SELECT * FROM web_events w1
UNION
SELECT * FROM web_events w2
;
'''
pd.read_sql_query(union_sql, conn)

Unnamed: 0,id,account_id,occurred_at,channel
0,1,1001,2015-10-06 17:13:58,direct
1,2,1001,2015-11-05 03:08:26,direct
2,3,1001,2015-12-04 03:57:24,direct
3,4,1001,2016-01-02 00:55:03,direct
4,5,1001,2016-02-01 19:02:33,direct
...,...,...,...,...
9068,9069,4491,2016-10-04 15:43:29,facebook
9069,9070,4491,2016-10-04 23:42:41,twitter
9070,9071,4491,2016-11-06 07:23:45,organic
9071,9072,4491,2016-12-18 03:21:31,organic


In [11]:
# Tag each branch with a constant "Input" column (1 or 2). The tag makes every
# row of the first branch differ from its twin in the second, so UNION finds
# no duplicates to drop and both copies of each row survive.
# The alias is an identifier, so it is written in double quotes; single quotes
# denote a string literal and are only tolerated here by SQLite leniency.
pd.read_sql_query(sql = '''
SELECT *, 1 AS "Input" FROM web_events w1
UNION
SELECT *, 2 AS "Input" FROM web_events w2
;
''', con=conn)

Unnamed: 0,id,account_id,occurred_at,channel,Input
0,1,1001,2015-10-06 17:13:58,direct,1
1,1,1001,2015-10-06 17:13:58,direct,2
2,2,1001,2015-11-05 03:08:26,direct,1
3,2,1001,2015-11-05 03:08:26,direct,2
4,3,1001,2015-12-04 03:57:24,direct,1
...,...,...,...,...,...
18141,9071,4491,2016-11-06 07:23:45,organic,2
18142,9072,4491,2016-12-18 03:21:31,organic,1
18143,9072,4491,2016-12-18 03:21:31,organic,2
18144,9073,4501,2016-05-30 00:46:53,organic,1


SQL's two strict rules for appending data:

Both tables must have the same number of columns.
Those columns must have the same data types in the same order as the first table.
A common misconception is that column names have to be the same. In fact, column names don't need to match to append two tables, but you will find that they typically do.

In [12]:
# UNION ALL keeps duplicates: appending web_events to itself returns every row
# twice (compare the row index in the output with the plain UNION result).
union_all_sql = '''
SELECT * FROM web_events w1
UNION ALL
SELECT * FROM web_events w2
;
'''
pd.read_sql_query(union_all_sql, conn)

Unnamed: 0,id,account_id,occurred_at,channel
0,1,1001,2015-10-06 17:13:58,direct
1,2,1001,2015-11-05 03:08:26,direct
2,3,1001,2015-12-04 03:57:24,direct
3,4,1001,2016-01-02 00:55:03,direct
4,5,1001,2016-02-01 19:02:33,direct
...,...,...,...,...
18141,9069,4491,2016-10-04 15:43:29,facebook
18142,9070,4491,2016-10-04 23:42:41,twitter
18143,9071,4491,2016-11-06 07:23:45,organic
18144,9072,4491,2016-12-18 03:21:31,organic


In [23]:
# Each branch can be filtered with its own WHERE before the results are
# appended: 'direct' events from the first branch, 'facebook' events from the second.
filtered_union_sql = '''
SELECT * FROM web_events w1
WHERE w1.channel = 'direct'
UNION ALL
SELECT * FROM web_events w2
WHERE w2.channel = 'facebook'
;
'''
pd.read_sql_query(filtered_union_sql, conn)

Unnamed: 0,id,account_id,occurred_at,channel
0,1,1001,2015-10-06 17:13:58,direct
1,2,1001,2015-11-05 03:08:26,direct
2,3,1001,2015-12-04 03:57:24,direct
3,4,1001,2016-01-02 00:55:03,direct
4,5,1001,2016-02-01 19:02:33,direct
...,...,...,...,...
6260,9061,4491,2016-03-10 19:42:34,facebook
6261,9063,4491,2016-04-04 15:57:45,facebook
6262,9066,4491,2016-07-18 21:31:10,facebook
6263,9067,4491,2016-08-14 15:41:44,facebook


In [24]:
# Use the UNION ALL result as a subquery in FROM and count events per channel.
# GROUP BY 1 / ORDER BY 2 refer to the first and second output columns
# (channel and count). The table is appended to itself, so each count is doubled.
subquery_count_sql = '''
SELECT channel, COUNT (*) count
FROM (SELECT * FROM web_events
UNION ALL
SELECT * FROM web_events)
GROUP BY 1
ORDER BY 2 DESC;
'''
pd.read_sql_query(subquery_count_sql, conn)

Unnamed: 0,channel,count
0,direct,10596
1,facebook,1934
2,organic,1904
3,adwords,1812
4,banner,952
5,twitter,948


In [28]:
# CTE (common table expression): the same per-channel count as the subquery
# version, with the UNION ALL result named t1 in a WITH clause.
cte_count_sql = '''
WITH t1 AS (SELECT * FROM web_events
UNION ALL
SELECT * FROM web_events)

SELECT channel, COUNT (*) count
FROM t1
GROUP BY 1
ORDER BY 2 DESC;
'''
pd.read_sql_query(cte_count_sql, conn)

Unnamed: 0,channel,count
0,direct,10596
1,facebook,1934
2,organic,1904
3,adwords,1812
4,banner,952
5,twitter,948


In [29]:
# BEWARE: do not give a CTE the name of the table it reads from.
# Here the CTE is called web_events and its body also selects from web_events;
# SQLite rejects the query with "circular reference: web_events".
# The failure is the point of this cell, so it is caught and printed instead of
# being left to abort a Restart & Run All.
try:
    pd.read_sql_query(sql = '''
WITH web_events AS (SELECT * FROM web_events
UNION ALL
SELECT * FROM web_events)

SELECT channel, COUNT (*) count
FROM web_events
GROUP BY 1
ORDER BY 2 DESC;
''', con=conn)
except pd.io.sql.DatabaseError as err:
    # pandas wraps the underlying database error in its own DatabaseError.
    print(f"DatabaseError: {err}")

DatabaseError: Execution failed on sql '
WITH web_events AS (SELECT * FROM web_events
UNION ALL
SELECT * FROM web_events)

SELECT channel, COUNT (*) count
FROM web_events
GROUP BY 1
ORDER BY 2 DESC;
': circular reference: web_events

In [31]:
# BEWARE: aliasing the inner references (w1, w2) does not help.
# The CTE is still named web_events and its body still selects from
# web_events, so SQLite reports "circular reference: web_events" again.
# Give the CTE a different name (e.g. t1) instead.
# The failure is the point of this cell, so it is caught and printed instead of
# being left to abort a Restart & Run All.
try:
    pd.read_sql_query(sql = '''
WITH web_events AS (SELECT * FROM web_events w1
UNION ALL
SELECT * FROM web_events w2)

SELECT channel, COUNT (*) count
FROM web_events
GROUP BY 1
ORDER BY 2 DESC;
''', con=conn)
except pd.io.sql.DatabaseError as err:
    # pandas wraps the underlying database error in its own DatabaseError.
    print(f"DatabaseError: {err}")

DatabaseError: Execution failed on sql '
WITH web_events AS (SELECT * FROM web_events w1
UNION ALL
SELECT * FROM web_events w2)

SELECT channel, COUNT (*) count
FROM web_events
GROUP BY 1
ORDER BY 2 DESC;
': circular reference: web_events