# SQL Demos
Notebook by Chris Pyles, data from Lahman Baseball Dataset

In [None]:
import sqlite3

import pandas as pd
from sqlalchemy import create_engine

# Open the local Lahman SQLite file; the query cells below all go through `db`.
db = sqlite3.connect(database="./lahman2016.sqlite")

look at structure of database

In [None]:
# sqlite_master is SQLite's schema catalog; keep only the rows describing tables.
query = """
SELECT * FROM sqlite_master WHERE type = 'table'
"""
pd.read_sql(sql=query, con=db)

look at years of dataset

In [None]:
# One row per distinct season found in the salaries table.
query = """
SELECT DISTINCT yearid FROM salaries
"""
# Flip rows and columns so the years are laid out horizontally for display.
pd.read_sql(sql=query, con=db).transpose()

average salary of Hall of Famers in each year, limited to the 5 years with the highest average salaries

In [None]:
# Join salaries to inducted Hall of Fame rows, average the salary per season,
# and keep the five seasons with the largest averages.
query = """
SELECT s.yearid, AVG(salary) AS avg_salary
FROM salaries s
JOIN halloffame h
ON s.playerid = h.playerid
WHERE h.inducted = 'Y'
GROUP BY s.yearid
ORDER BY avg_salary DESC
LIMIT 5
"""
pd.read_sql(sql=query, con=db)

calculate slugging percentage of players with more than 50 at-bats:

$$\Large
SLG = \frac{H + 2B + 2 \cdot 3B + 3 \cdot HR}{AB}
$$

(this is a modified formula because `H` is all hits, not just singles)

In [None]:
# Drop any previous definition first so this cell can be re-run safely.
db.execute("DROP VIEW IF EXISTS slg")

# Build a view of slugging percentage per player-season.
#
# The doubles and triples terms must be written as quoted *identifiers*
# ("2B", "3B") because the names start with a digit. Single quotes would turn
# them into string literals, which SQLite coerces to the constants 2 and 3 in
# arithmetic, so the result would ignore the actual doubles/triples counts.
#
# CAST(H AS FLOAT) forces real-valued (not integer) division, and the
# AB > 50 filter both applies the at-bat threshold and rules out division
# by zero.
db.execute("""
CREATE VIEW slg(playerid, yearid, slg) AS
SELECT playerid, yearid, (CAST(H AS FLOAT) + "2B" + 2 * "3B" + 3 * HR) / AB
FROM batting
WHERE AB > 50;
""")

# Preview the first few rows of the view.
query = """
SELECT * FROM slg;
"""
pd.read_sql(query, db).head()

schools whose players have highest salaries (top 10, names only)

In [None]:
# Rank schools by the average salary of the players who attended them and
# return the names of the top 10.
#
# Steps, innermost first:
#   player_salary - each player's average salary over all salary rows
#   school_salary - per school, the average of those player averages
#   outer query   - attach the school's full name and keep the 10 highest
#
# Each derived table gets its own alias so no alias is reused at two nesting
# levels.
#
# NOTE(review): if collegeplaying holds several rows for the same
# player/school pair, that player is counted once per row in the school
# average - confirm whether that weighting is intended.
query = """
SELECT sch.name_full
FROM schools sch
JOIN (
    SELECT cp.schoolid, AVG(player_salary.salary) AS salary
    FROM collegeplaying cp
    JOIN (
        SELECT playerid, AVG(salary) AS salary
        FROM salaries
        GROUP BY playerid
    ) AS player_salary
    ON cp.playerid = player_salary.playerid
    GROUP BY cp.schoolid
) AS school_salary
ON sch.schoolid = school_salary.schoolid
ORDER BY school_salary.salary DESC
LIMIT 10
"""
# The query is already limited to 10 rows; show all of them (.head() would
# silently cut the result down to 5).
pd.read_sql(query, db)

what is the YoY percent change in average salary?

In [None]:
# Year-over-year change in the league-wide average salary.
#
# y1 is the earlier season and y2 the following one (y1.yearid = y2.yearid - 1).
# The inner join drops the seasons that have no neighbour, so no explicit
# first/last-year filter is needed. Filtering the minimum year out of y1 and
# the maximum year out of y2 would discard the first and last valid
# year-over-year pairs.
#
# The change is expressed as a fraction of the earlier season's average
# (0.10 means +10%). AVG() yields a floating-point value, so the division is
# not truncated.
query = """
SELECT y2.yearid, (y2.salary - y1.salary) / y1.salary AS yoy_change
FROM (
    SELECT yearid, AVG(salary) AS salary
    FROM salaries
    GROUP BY yearid
) AS y1
JOIN (
    SELECT yearid, AVG(salary) AS salary
    FROM salaries
    GROUP BY yearid
) AS y2
ON y1.yearid = y2.yearid - 1
ORDER BY y2.yearid
"""
pd.read_sql(query, db).head()