Division

In [None]:
-- Average revenue per employee for each sector, smallest average first.
SELECT
    sector,
    -- NULLIF maps a zero employee count to NULL so the division cannot raise
    -- a division-by-zero error; AVG ignores the resulting NULL values.
    -- The ::numeric cast binds to the divisor before the division happens
    -- (presumably to avoid integer division; column types are not visible here).
    AVG(revenues / NULLIF(employees, 0)::numeric) AS avg_rev_employee
FROM fortune500
GROUP BY sector
-- Sort on the alias defined in the SELECT list
ORDER BY avg_rev_employee;

Explore with division

In [None]:
-- Recompute the unanswered share from its two components and show it next to
-- the stored unanswered_pct column so the two values can be compared.
SELECT
    unanswered_count / CAST(question_count AS numeric) AS computed_pct,
    unanswered_pct
FROM stackoverflow
-- Excluding rows with a zero question_count keeps the divisor non-zero
WHERE question_count <> 0
-- Only a ten-row sample is returned; no ordering is imposed
LIMIT 10;

Summarize numeric columns

In [None]:
-- One-row summary of fortune500 profits: smallest value, mean, largest value,
-- and standard deviation. The aliases repeat the default output column names.
SELECT
    MIN(profits)    AS min,
    AVG(profits)    AS avg,
    MAX(profits)    AS max,
    STDDEV(profits) AS stddev
FROM fortune500;

In [None]:
-- Per-sector summary of profits (min, mean, max, standard deviation),
-- listed from the lowest mean profit to the highest.
SELECT
    f.sector,
    MIN(f.profits)    AS min,
    AVG(f.profits)    AS avg,
    MAX(f.profits)    AS max,
    STDDEV(f.profits) AS stddev
FROM fortune500 AS f
-- One output row per sector
GROUP BY f.sector
-- "avg" is the alias given to the mean above
ORDER BY avg;

Summarize group statistics

In [None]:
-- Step 1: for every tag, find its largest question_count.
WITH max_results AS (
    SELECT MAX(question_count) AS maxval
    FROM stackoverflow
    GROUP BY tag
)
-- Step 2: summarize those per-tag maxima in a single row.
SELECT
    MIN(maxval),
    MAX(maxval),
    AVG(maxval),
    STDDEV(maxval)
FROM max_results;

Truncate

In [None]:
-- Histogram of company sizes: employees is truncated with a scale of -5
-- (i.e. to the hundred-thousands place) and the rows in each bin are counted.
SELECT
    binned.employee_bin,
    COUNT(*)
FROM (
    -- Attach the truncated employee count to every row
    SELECT TRUNC(employees, -5) AS employee_bin
    FROM fortune500
) AS binned
GROUP BY binned.employee_bin
ORDER BY binned.employee_bin;

In [None]:
-- Finer histogram for the smaller companies: only rows with fewer than
-- 100000 employees are kept, and employees is truncated with a scale of -4
-- (i.e. to the ten-thousands place) before counting.
SELECT
    binned.employee_bin,
    COUNT(*)
FROM (
    -- Filter first, then attach the truncated employee count to each row
    SELECT TRUNC(employees, -4) AS employee_bin
    FROM fortune500
    WHERE employees < 100000
) AS binned
GROUP BY binned.employee_bin
ORDER BY binned.employee_bin;

Generate series

In [None]:
-- Range of question_count for the 'dropbox' tag; the bin boundaries used in
-- the following cells are chosen to cover this range.
SELECT
    MIN(so.question_count),
    MAX(so.question_count)
FROM stackoverflow AS so
WHERE so.tag = 'dropbox';

In [None]:
-- Bin boundaries: lower runs from 2200 to 3050 in steps of 50 and each
-- upper bound sits 50 above its lower bound (2250 through 3100).
SELECT
    series.lower,
    series.lower + 50 AS upper
FROM generate_series(2200, 3050, 50) AS series(lower);

In [None]:
-- Histogram of question_count for the 'dropbox' tag over fixed-width bins.
WITH bins AS (
    -- Half-open bins [lower, upper) of width 50 covering 2200 through 3100
    SELECT
        series.lower,
        series.lower + 50 AS upper
    FROM generate_series(2200, 3050, 50) AS series(lower)
),
dropbox AS (
    -- question_count values recorded for the dropbox tag only
    SELECT so.question_count
    FROM stackoverflow AS so
    WHERE so.tag = 'dropbox'
)
SELECT
    b.lower,
    b.upper,
    -- Counting the joined column (not *) yields 0 for bins with no match
    COUNT(d.question_count)
FROM bins AS b
-- LEFT JOIN keeps every bin, including those no value falls into
LEFT JOIN dropbox AS d
    ON d.question_count >= b.lower
    AND d.question_count < b.upper
GROUP BY b.lower, b.upper
ORDER BY b.lower;

Correlation

In [None]:
-- Correlation of revenues with three other fortune500 measures,
-- returned as one row with one column per pairing.
SELECT
    CORR(f.revenues, f.profits) AS rev_profits,  -- revenues vs. profits
    CORR(f.revenues, f.assets)  AS rev_assets,   -- revenues vs. assets
    CORR(f.revenues, f.equity)  AS rev_equity    -- revenues vs. equity
FROM fortune500 AS f;

Mean and Median

In [None]:
-- Mean and median of assets for each sector, ordered by the mean.
SELECT
    f.sector,
    AVG(f.assets) AS mean,
    -- The discrete 50th percentile serves as the median here
    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY f.assets) AS median
FROM fortune500 AS f
GROUP BY f.sector
ORDER BY mean;

Create a temp table

In [None]:
-- Start clean so the cell can be re-run without a "table exists" error
DROP TABLE IF EXISTS profit80;

-- profit80: one row per sector holding the discrete 80th percentile of profits
CREATE TEMPORARY TABLE profit80 AS
SELECT
    f.sector,
    PERCENTILE_DISC(0.8) WITHIN GROUP (ORDER BY f.profits) AS pct80
FROM fortune500 AS f
GROUP BY f.sector;

-- Inspect the full contents of the new table
SELECT
    sector,
    pct80
FROM profit80;

In [None]:
-- Rebuild the per-sector lookup table used by the query below
DROP TABLE IF EXISTS profit80;

-- profit80: one row per sector holding the discrete 80th percentile of profits
CREATE TEMP TABLE profit80 AS
SELECT
       sector,
       percentile_disc(0.8) WITHIN GROUP (ORDER BY profits) AS pct80
FROM fortune500
GROUP BY sector;

-- Companies whose profits exceed their sector's 80th percentile, together
-- with the ratio of their profits to that threshold.
SELECT
       fortune500.title,
       fortune500.sector,
       fortune500.profits,
       fortune500.profits / profit80.pct80 AS ratio
FROM fortune500
-- INNER JOIN states what the query already did: the WHERE clause compares
-- against pct80, so rows without a profit80 match (NULL pct80) were never
-- returned by the previous LEFT JOIN either.
INNER JOIN profit80
       ON fortune500.sector = profit80.sector
-- Keep only the rows above the sector threshold
WHERE fortune500.profits > profit80.pct80;

Create a temp table to simplify a query

In [None]:
-- Start clean so the cell can be re-run without a "table exists" error
DROP TABLE IF EXISTS startdates;

-- startdates: the earliest date present in stackoverflow for every tag
CREATE TEMPORARY TABLE startdates AS
SELECT
    so.tag,
    MIN(so.date) AS mindate
FROM stackoverflow AS so
GROUP BY so.tag;

-- Inspect the full contents of the new table
SELECT
    tag,
    mindate
FROM startdates;

In [None]:
-- Start clean so the cell can be re-run without a "table exists" error
DROP TABLE IF EXISTS startdates;

-- startdates: the earliest date present in stackoverflow for every tag
CREATE TEMP TABLE startdates AS
SELECT tag, MIN(date) AS mindate
FROM stackoverflow
GROUP BY tag;

-- For each tag, compare question_count on its first recorded date with
-- question_count on 2018-09-25 and report the difference.
SELECT
    sd.tag,
    sd.mindate,
    first_day.question_count AS min_date_question_count,
    last_day.question_count AS max_date_question_count,
    -- Later count minus earlier count
    last_day.question_count - first_day.question_count AS change
FROM startdates AS sd
-- First self-join: the stackoverflow row for the tag on its earliest date
INNER JOIN stackoverflow AS first_day
    ON first_day.tag = sd.tag
    AND first_day.date = sd.mindate
-- Second self-join: the stackoverflow row for the same tag on the fixed end date
-- NOTE(review): '2018-09-25' is hard-coded as the "max" day; presumably it is
-- the latest date in the data, which should be confirmed
INNER JOIN stackoverflow AS last_day
    ON last_day.tag = sd.tag
    AND last_day.date = '2018-09-25';

Insert into a temp table

In [None]:
-- Start clean so the cell can be re-run without a "table exists" error
DROP TABLE IF EXISTS correlations;

-- correlations: seeded with a single row labelled 'profits' that holds the
-- correlation of profits with each of the three measures (itself included).
CREATE TEMPORARY TABLE correlations AS
SELECT
    CAST('profits' AS varchar)          AS measure,
    CORR(f.profits, f.profits)          AS profits,
    CORR(f.profits, f.profits_change)   AS profits_change,
    CORR(f.profits, f.revenues_change)  AS revenues_change
FROM fortune500 AS f;

In [None]:
-- Start clean so the cell can be re-run without a "table exists" error
DROP TABLE IF EXISTS correlations;

-- correlations: one row per measure, one column per measure it is correlated
-- with. The CREATE ... AS statement defines the columns and the first row.
CREATE TEMP TABLE correlations AS
  SELECT
    'profits'::varchar AS measure,
    CORR(profits, profits) AS profits,
    CORR(profits, profits_change) AS profits_change,
    CORR(profits, revenues_change) AS revenues_change
  FROM fortune500;

-- Add the row for profits_change. The explicit column list ties each value
-- to its target column by name instead of relying on the table's column order.
INSERT INTO correlations (measure, profits, profits_change, revenues_change)
SELECT
  'profits_change'::varchar AS measure,
  CORR(profits_change, profits) AS profits,
  CORR(profits_change, profits_change) AS profits_change,
  CORR(profits_change, revenues_change) AS revenues_change
FROM fortune500;

-- Add the row for revenues_change, following the same pattern
INSERT INTO correlations (measure, profits, profits_change, revenues_change)
SELECT
  'revenues_change'::varchar AS measure,
  CORR(revenues_change, profits) AS profits,
  CORR(revenues_change, profits_change) AS profits_change,
  CORR(revenues_change, revenues_change) AS revenues_change
FROM fortune500;

In [None]:
-- Start clean so the cell can be re-run without a "table exists" error
DROP TABLE IF EXISTS correlations;

-- correlations: one row per measure, one column per measure it is correlated
-- with. The CREATE ... AS statement defines the columns and the first row.
CREATE TEMP TABLE correlations AS
SELECT
  'profits'::varchar AS measure,
  CORR(profits, profits) AS profits,
  CORR(profits, profits_change) AS profits_change,
  CORR(profits, revenues_change) AS revenues_change
FROM fortune500;

-- Row for profits_change. The explicit column list ties each value to its
-- target column by name instead of relying on the table's column order.
INSERT INTO correlations (measure, profits, profits_change, revenues_change)
SELECT
  'profits_change'::varchar AS measure,
  CORR(profits_change, profits) AS profits,
  CORR(profits_change, profits_change) AS profits_change,
  CORR(profits_change, revenues_change) AS revenues_change
FROM fortune500;

-- Row for revenues_change, following the same pattern
INSERT INTO correlations (measure, profits, profits_change, revenues_change)
SELECT
  'revenues_change'::varchar AS measure,
  CORR(revenues_change, profits) AS profits,
  CORR(revenues_change, profits_change) AS profits_change,
  CORR(revenues_change, revenues_change) AS revenues_change
FROM fortune500;

-- Display the correlation matrix with every coefficient rounded to two
-- decimals; each value is cast to numeric before ROUND(value, 2) is applied.
SELECT
  measure,
  ROUND(profits::numeric, 2) AS profits,
  ROUND(profits_change::numeric, 2) AS profits_change,
  ROUND(revenues_change::numeric, 2) AS revenues_change
FROM correlations;