In [1]:
import sqlite3
import pandas as pd

In [2]:
pd.options.display.max_columns = None
#pd.options.display.max_rows = None

In [3]:
conn = sqlite3.connect('parch-and-posey.db')

In [4]:
cursor = conn.cursor()
cursor.execute('''
select * from sqlite_master where type = "table";
''')
columns = [col[0] for col in cursor.description]
data = cursor.fetchall()
cursor.close()

In [5]:
pd.DataFrame(data, columns=columns)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,web_events,web_events,2,"CREATE TABLE web_events (\tid integer,\taccoun..."
1,table,sales_reps,sales_reps,92,"CREATE TABLE sales_reps (\tid integer,\tname b..."
2,table,region,region,93,"CREATE TABLE region (\tid integer,\tname bpchar)"
3,table,orders,orders,94,"CREATE TABLE orders (\tid integer,\taccount_id..."
4,table,accounts,accounts,221,"CREATE TABLE accounts (\tid integer,\tname bpc..."


Task 1:

Use DISTINCT to test if there are any accounts associated with more than one region.

In [45]:
# strange approach

pd.read_sql_query(sql='''
SELECT MIN(r.name), MAX(r.name)
FROM accounts a
JOIN sales_reps s
ON a.sales_rep_id = s.id
JOIN region r
ON s.region_id = r.id
GROUP BY a.name
HAVING MIN(r.name) != MAX(r.name);
''', con=conn)

Unnamed: 0,MIN(r.name),MAX(r.name)


In [38]:
# count approach

pd.read_sql_query(sql='''
SELECT a.name, COUNT (*) count
FROM accounts a
JOIN sales_reps s
ON a.sales_rep_id = s.id
JOIN region r
ON s.region_id = r.id
GROUP BY a.name
ORDER BY count DESC;
''', con=conn)

Unnamed: 0,name,count
0,eBay,1
1,Yum Brands,1
2,Xerox,1
3,Xcel Energy,1
4,World Fuel Services,1
...,...,...
346,AIG,1
347,AES,1
348,AECOM,1
349,ADP,1


In [39]:
# compare number of results of all combinations to distinct

pd.read_sql_query(sql='''
SELECT a.name, r.name
FROM accounts a
JOIN sales_reps s
ON a.sales_rep_id = s.id
JOIN region r
ON s.region_id = r.id;
''', con=conn)

Unnamed: 0,name,name.1
0,Walmart,Northeast
1,Exxon Mobil,Northeast
2,Apple,Northeast
3,Berkshire Hathaway,Northeast
4,McKesson,Northeast
...,...,...
346,KKR,West
347,Oneok,West
348,Newmont Mining,West
349,PPL,West


In [40]:
# compare number of results of all combinations to distinct

pd.read_sql_query(sql='''
SELECT DISTINCT id, name FROM accounts;
''', con=conn)

Unnamed: 0,id,name
0,1001,Walmart
1,1011,Exxon Mobil
2,1021,Apple
3,1031,Berkshire Hathaway
4,1041,McKesson
...,...,...
346,4461,KKR
347,4471,Oneok
348,4481,Newmont Mining
349,4491,PPL


Solution:

The below two queries have the same number of resulting rows (351), so we know that every account is associated with only one region. If each account was associated with more than one region, the first query should have returned more rows than the second query.

SELECT a.id as "account id", r.id as "region id", 
a.name as "account name", r.name as "region name"
FROM accounts a
JOIN sales_reps s
ON s.id = a.sales_rep_id
JOIN region r
ON r.id = s.region_id;
and

SELECT DISTINCT id, name
FROM accounts;

Task 2:

Have any sales reps worked on more than one account?

In [48]:
# strange approach

pd.read_sql_query(sql='''
SELECT MIN(a.name), MAX(a.name)
FROM accounts a
JOIN sales_reps s
ON a.sales_rep_id = s.id
GROUP BY s.name
HAVING MIN(a.name) != MAX(a.name);
''', con=conn)

Unnamed: 0,MIN(a.name),MAX(a.name)
0,Comcast,FedEx
1,Applied Materials,W.W. Grainger
2,AmerisourceBergen,Cisco Systems
3,CBS,Viacom
4,Charter Communications,Voya Financial
5,AES,Supervalu
6,Bank of America Corp.,Northwestern Mutual
7,Danaher,Northrop Grumman
8,Altria Group,Whirlpool
9,Abbott Laboratories,Tech Data


In [49]:
# count approach

pd.read_sql_query(sql='''
SELECT s.name, COUNT(*) count
FROM accounts a
JOIN sales_reps s
ON a.sales_rep_id = s.id
GROUP BY s.name
ORDER BY count DESC;
''', con=conn)

Unnamed: 0,name,count
0,Georgianna Chisholm,15
1,Vernita Plump,11
2,Micha Woodford,11
3,Maryanna Fiorentino,11
4,Maren Musto,11
5,Earlie Schleusner,11
6,Dorotha Seawell,11
7,Calvin Ollison,11
8,Saran Ram,10
9,Moon Torian,10


In [44]:
# compare number of results of all combinations to distinct

pd.read_sql_query(sql='''
SELECT *
FROM accounts a
JOIN sales_reps s
ON a.sales_rep_id = s.id;
''', con=conn)

Unnamed: 0,id,name,website,lat,long,primary_poc,sales_rep_id,id.1,name.1,region_id
0,1001,Walmart,www.walmart.com,40.238496,-75.103297,Tamara Tuma,321500,321500,Samuel Racine,1
1,1011,Exxon Mobil,www.exxonmobil.com,41.169156,-73.849374,Sung Shields,321510,321510,Eugena Esser,1
2,1021,Apple,www.apple.com,42.290495,-76.084009,Jodee Lupo,321520,321520,Michel Averette,1
3,1031,Berkshire Hathaway,www.berkshirehathaway.com,40.949021,-75.763898,Serafina Banda,321530,321530,Renetta Carew,1
4,1041,McKesson,www.mckesson.com,42.217093,-75.284998,Angeles Crusoe,321540,321540,Cara Clarke,1
...,...,...,...,...,...,...,...,...,...,...
346,4461,KKR,www.kkr.com,45.545353,-122.655247,Buffy Azure,321970,321970,Georgianna Chisholm,4
347,4471,Oneok,www.oneok.com,45.513513,-122.681500,Esta Engelhardt,321960,321960,Maryanna Fiorentino,4
348,4481,Newmont Mining,www.newmont.com,45.494117,-122.669460,Khadijah Riemann,321970,321970,Georgianna Chisholm,4
349,4491,PPL,www.pplweb.com,45.491720,-122.671880,Deanne Hertlein,321960,321960,Maryanna Fiorentino,4


In [43]:
# compare number of results of all combinations to distinct

pd.read_sql_query(sql='''
SELECT DISTINCT id, name
FROM sales_reps;
''', con=conn)

Unnamed: 0,id,name
0,321500,Samuel Racine
1,321510,Eugena Esser
2,321520,Michel Averette
3,321530,Renetta Carew
4,321540,Cara Clarke
5,321550,Lavera Oles
6,321560,Elba Felder
7,321570,Shawanda Selke
8,321580,Sibyl Lauria
9,321590,Necole Victory


Solution:

Actually all of the sales reps have worked on more than one account. The fewest number of accounts any sales rep works on is 3. There are 50 sales reps, and they all have more than one account. Using DISTINCT in the second query assures that all of the sales reps are accounted for in the first query.

SELECT s.id, s.name, COUNT(*) num_accounts
FROM accounts a
JOIN sales_reps s
ON s.id = a.sales_rep_id
GROUP BY s.id, s.name
ORDER BY num_accounts;
and

SELECT DISTINCT id, name
FROM sales_reps;