## Summarizing Data with SQL

In [65]:
import pandas as pd
import sqlite3
import sqlalchemy
# from sqlalchemy import create_engine, text

In [66]:
conn = sqlite3.connect('ladder.db')

### Summary Statistics

###### 32) How many rows are in the `pets` table?

In [9]:
pd.read_sql("select count(*) from pets;", conn)

Unnamed: 0,count(*)
0,13


###### 33) How many female pets are in the `pets` table?

In [20]:
pd.read_sql("select count(*) from pets where sex = 'F';", conn)

Unnamed: 0,count(*)
0,7


###### 34) How many female cats are in the `pets` table?

In [24]:
pd.read_sql("select count(*) from pets where sex = 'F' and species = 'cat';", conn)

Unnamed: 0,count(*)
0,4


###### 35) What's the mean age of pets in the `pets` table?

In [27]:
pd.read_sql("select AVG(age) from pets;", conn)

Unnamed: 0,AVG(age)
0,5.230769


###### 36) What's the mean age of dogs in the `pets` table?

In [28]:
pd.read_sql("select AVG(age) from pets where species = 'dog';", conn)

Unnamed: 0,AVG(age)
0,6.5


###### 37) What's the mean age of male dogs in the `pets` table?

In [31]:
pd.read_sql("select round(AVG(age), 2) from pets where species = 'dog' and sex = 'M';", conn)

Unnamed: 0,"round(AVG(age), 2)"
0,8.33


###### 38) What's the count, mean, minimum, and maximum of pet ages in the `pets` table?

In [38]:
query = """
select 
count(*) as total,
AVG(age) as mean_age,
MAX(age) as max_age,
MIN(age) as min_age
from pets
"""
pd.read_sql(query, conn)

Unnamed: 0,total,mean_age,max_age,min_age
0,13,5.230769,10,1


###### 39) Repeat the previous problem with the following stipulations:
    * Round the average to one decimal place.
    * Give each column a human-readable column name

In [42]:
query = """
select 
count(*) as 'Number of Pets',
round(AVG(age), 1) as 'Average Age',
MAX(age) as 'Maximum Age',
MIN(age) as 'Minimum Age'
from pets;
"""
pd.read_sql(query, conn)

Unnamed: 0,Number of Pets,Average Age,Maximum Age,Minimum Age
0,13,5.2,10,1


###### 40) How many rows in `employees_null` have missing salaries?

In [44]:
query = """
select 
count(*)
from employees_null
where salary is NULL
"""
pd.read_sql(query, conn)

Unnamed: 0,count(*)
0,10


###### 41) How many salespeople in `employees_null` having _nonmissing_ salaries?

In [46]:
query = """
select 
count(*)
from employees_null
where salary is NULL and job = 'Sales' 
"""
pd.read_sql(query, conn)

Unnamed: 0,count(*)
0,8


###### 42) What's the mean salary of employees who joined the company after 2010? Go back to the usual `employees` table for this one.

In [70]:
query = """
select
AVG(salary) as avg_salary
from employees
where startdate >= 2010
"""
pd.read_sql(query, conn)

Unnamed: 0,avg_salary
0,79814.6


###### 43) What's the mean salary of employees in Swiss Francs?
* _Hint:_ Swiss Francs are abbreviated "CHF" and 1 USD = 0.97 CHF.

In [76]:
query = """
select
AVG(salary)*0.97 as avg_salary_chf
from employees
"""
pd.read_sql(query, conn)

Unnamed: 0,avg_salary_chf
0,75727.5605


###### 44) Create a query that computes the mean salary in USD as well as CHF. Give the columns human-readable names (for example "Mean Salary in USD"). Also, format them with comma delimiters and currency symbols.
 * _NOTE:_ Comma-delimiting numbers is only available for integers in SQLite, so rounding (down) to the nearest dollar or franc will be done for us.
    * _NOTE2:_ The symbols for francs is simply `Fr.` or `fr.`. So an example output will look like `100,000 Fr.`.

In [99]:
query = """
select
printf('$%,.2d', saraly_USD) as "Mean Salary in USD",
printf('Fr %,.2d', salary_Fr) as "Mean Salary in Fr"

from(select
AVG(salary) as saraly_USD,
AVG(salary) * 0.97 as salary_Fr
from employees)

"""
pd.read_sql(query, conn)

Unnamed: 0,Mean Salary in USD,Mean Salary in Fr
0,"$78,069","Fr 75,727"


## Aggregating Statistics with GROUP BY

###### 45) What is the average age of `pets` by species?

In [123]:
query = """
select
AVG(age) as avg_age
from pets
group by species;
"""
pd.read_sql(query, conn)

Unnamed: 0,avg_age
0,4.333333
1,6.5
2,3.0


###### 46) Repeat the previous problem but make sure the species label is also displayed! Assume this behavior is always being asked of you any time you use `GROUP BY`.

In [124]:
query = """
select
species,
AVG(age) as avg_age
from pets
group by species;
"""
pd.read_sql(query, conn)

Unnamed: 0,species,avg_age
0,cat,4.333333
1,dog,6.5
2,lobster,3.0


###### 47) What is the count, mean, minimum, and maximum age by species in `pets`?

In [125]:
query = """
select
species,
count(*) as total_pets,
AVG(age) as avg_age,
MIN(age) as min_age,
MAX(age) as max_age
from pets
group by species;
"""
pd.read_sql(query, conn)

Unnamed: 0,species,total_pets,avg_age,min_age,max_age
0,cat,6,4.333333,2,7
1,dog,6,6.5,1,10
2,lobster,1,3.0,3,3


###### 48) Show the mean salaries of each job title in `employees`.

In [131]:
query = """
select 
job,
round(AVG(salary), 2) as 'Mean salary'
from employees
group by job;
"""
pd.read_sql(query, conn)

Unnamed: 0,job,Mean salary
0,Administrator,71986.14
1,IT,71381.0
2,Operations,74055.25
3,Sales,80778.04


###### 49) Show the mean salaries in New Zealand dollars of each job title in `employees`.
    * _NOTE:_ 1 USD = 1.65 NZD.

In [142]:
query = """
select 
job,
round(AVG(salary)) *1.65 as 'Mean salary in NZD'
from employees
group by job;
"""
pd.read_sql(query, conn)

Unnamed: 0,job,Mean salary in NZD
0,Administrator,118776.9
1,IT,117778.65
2,Operations,122190.75
3,Sales,133283.7


###### 50) Show the mean, min, and max salaries of each job title in `employees`, as well as the numbers of employees in each category.

In [143]:
query = """
select
job,
count(*) as 'Number of employees',
round(AVG(salary)) as 'Mean salary',
MIN(salary) as 'Minimumu salary',
MAX(salary) as 'Maximum salary'
from employees
group by job;
"""
pd.read_sql(query, conn)

Unnamed: 0,job,Number of employees,Mean salary,Minimumu salary,Maximum salary
0,Administrator,14,71986.0,41151,120492
1,IT,10,71381.0,37397,115729
2,Operations,8,74055.0,41797,108989
3,Sales,68,80778.0,31333,124474


###### 51) Show the mean salaries of each job title in `employees` sorted descending by salary.

In [144]:
query = """
select
job,
round(AVG(salary)) as 'Mean salary'
from employees
group by job
order by 'Mean salary' desc;
"""
pd.read_sql(query, conn)

Unnamed: 0,job,Mean salary
0,Sales,80778.0
1,Operations,74055.0
2,IT,71381.0
3,Administrator,71986.0


###### 52) What are the top 5 most common first names among `employees`?

In [148]:
query = """
select
firstname,
count(firstname) as count 
from employees
group by firstname
order by count desc
limit 5
"""
pd.read_sql(query, conn)

Unnamed: 0,firstname,count
0,Thomas,3
1,Robert,3
2,Michael,3
3,Lisa,3
4,William,2


###### 53) Show all first names which have exactly 2 occurrences in `employees`.

In [173]:
query = """
select
firstname,
count(firstname) as count 
from employees
group by firstname
order by count = 2 desc
limit 5
"""
pd.read_sql(query, conn)

Unnamed: 0,firstname,count
0,William,2
1,Shannon,2
2,Mark,2
3,Leslie,2
4,Joseph,2


###### 54) Take a look at the `transactions` table to get a idea of what it contains. Note that a transaction may span multiple rows if different items are purchased as part of the same order. The employee who made the order is also given by their ID.

In [177]:
query = """
select *
from transactions
limit 2
"""
pd.read_sql(query, conn)

Unnamed: 0,order_id,customer,unit_price,quantity,orderdate,employee_id
0,0,Bautista Group,20.5,12,2018-10-27,81
1,0,Bautista Group,24.0,11,2018-10-27,81


###### 55) Show the top 5 largest orders (and their respective customer) in terms of the numbers of items purchased in that order.

In [184]:
query = """
select
unit_price,
quantity
from transactions
order by quantity desc
limit 5
"""
pd.read_sql(query, conn)

Unnamed: 0,unit_price,quantity
0,17.25,27
1,22.75,25
2,10.25,24
3,3.5,24
4,3.75,24


###### 56) Show the total cost of each transaction.
    * _Hint:_ The `unit_price` column is the price of _one_ item. The customer may have purchased multiple.

In [193]:
query = """
select 
unit_price * quantity as total_price
from transactions
order by total_price 
"""
pd.read_sql(query, conn)

Unnamed: 0,total_price
0,0.00
1,3.25
2,4.00
3,4.25
4,5.50
...,...
50076,519.75
50077,534.75
50078,552.00
50079,568.75


###### 57) Show the top 5 transactions in terms of total cost.

In [197]:
query = """
select
unit_price * quantity as total_price
from transactions
order by total_price desc
limit 5
"""
pd.read_sql(query, conn)

Unnamed: 0,total_price
0,569.25
1,568.75
2,552.0
3,534.75
4,519.75


###### 58) Show the top 5 customers in terms of total revenue (ie, which customers have we done the most business with in terms of money?)

In [198]:
query = """
select
customer,
unit_price * quantity as total_price
from transactions
order by total_price desc
limit 5
"""
pd.read_sql(query, conn)

Unnamed: 0,customer,total_price
0,Barnett-Keller,569.25
1,Norman-Briggs,568.75
2,Smith-Thomas,552.0
3,"Moore, Odonnell and Adams",534.75
4,Martinez and Sons,519.75


###### 59) Show the top 5 employees in terms of revenue generated (ie, which employees made the most in sales?)

In [199]:
query = """
select
employee_id,
customer,
unit_price * quantity as total_price
from transactions
order by total_price desc
limit 5
"""
pd.read_sql(query, conn)

Unnamed: 0,employee_id,customer,total_price
0,93,Barnett-Keller,569.25
1,51,Norman-Briggs,568.75
2,20,Smith-Thomas,552.0
3,57,"Moore, Odonnell and Adams",534.75
4,1,Martinez and Sons,519.75


###### 60) Which customer worked with the largest number of employees?
    * _Hint:_ This is a tough one! Check out the `DISTINCT` keyword.

In [239]:
query = """
select 
count(employee_id),
employee_id,
customer
from transactions
group by customer
order by count(employee_id) desc
limit 1

"""
pd.read_sql(query, conn)

Unnamed: 0,count(employee_id),employee_id,customer
0,627,91,Kelly-Wright


###### 61) Show all customers who've done more than $80,000 worth of business with us.

In [263]:
query = """
select
distinct customer,
sum(unit_price * quantity) as total_business_USD
from transactions
group by customer
having total_business_USD > 80000

"""
pd.read_sql(query, conn)

Unnamed: 0,customer,total_business_USD
0,Ewing-Black,83294.25
1,Kelly-Wright,89645.25
2,Norman-Briggs,80331.5
3,Sanders PLC,84383.0
4,"Taylor, Patel and Harvey",81818.25
5,Thompson-Fowler,80152.25
6,Tucker Ltd,85485.0
7,"Vega, Rivera and Elliott",81595.0
