<a href="https://colab.research.google.com/github/datacamp/data-analysis-in-sql-live-training/blob/master/notebooks/PostgreSQL_live_session_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<p align="center">
<img src="https://github.com/datacamp/data-analysis-in-sql-live-session/blob/master/assets/datacamp.svg?raw=True" alt = "DataCamp icon" width="50%">
</p>
<br><br>

## **PostgreSQL Live Training Template**

_Enter a brief description of your session, here's an example below:_

Welcome to this hands-on training where we will immerse yourself in data analysis in SQL. You'll learn how to extract data, and analyze it using SQL.
You will have the ability to:

* Select mulitple columns in SQL.
* Compute aggregations and summarize your data.
* Join tables to get more insights.

## **The Dataset**

_Enter a brief description of your dataset and its columns, here's an example below:_


The dataset to be used in this webinar is a CSV file named `actor.csv`, which contains data on actors:

- `actor_id`: The unique identifier for an actor
- `first_name`: Actor first name
- `last_name`: Actor last name
- `last_update`: Last showing

## **Setting up PostgreSQL**

In [0]:
#@title **This block of code will install PosgreSQL**
%%capture
!wget -qO- https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add -
!echo "deb http://apt.postgresql.org/pub/repos/apt/ bionic-pgdg main" >/etc/apt/sources.list.d/pgdg.list
!apt -qq update
!apt -yq install postgresql-12 postgresql-client-12
!service postgresql start
# make calling psql shorter
!sudo -u postgres psql -c "CREATE USER root WITH SUPERUSER"  
!psql postgres -c "CREATE DATABASE root"  # now just !psql -c "..."
# load SQL extensions
%load_ext sql
%config SqlMagic.feedback=False 
%config SqlMagic.autopandas=True
%sql postgresql+psycopg2://@/postgres

In [0]:
#@title **This will download your data to local environment**
!wget -q https://github.com/datacamp/data-analysis-in-sql-live-training/raw/master/data/user_data.csv

In [5]:
#@title **This will create your table**
%%sql
-- Make sure to amend you table name, column names and types
DROP TABLE IF EXISTS user_sessions;
CREATE TABLE user_sessions(
 session_date date,
 user_id int,
 time_spent_in_mins int
);

COPY user_sessions
-- Make sure to point to correct file and delimiter 
FROM '/content/user_data.csv' DELIMITER ',' CSV HEADER;

 * postgresql+psycopg2://@/postgres


In [6]:
%%sql
-- SELECT first 10 rows

SELECT *
FROM user_sessions 
LIMIT 10;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,session_date,user_id,time_spent_in_mins
0,2020-01-01,1,127
1,2020-01-01,2,147
2,2020-01-01,5,106
3,2020-01-01,6,179
4,2020-01-01,7,143
5,2020-01-01,8,179
6,2020-01-01,10,94
7,2020-01-01,11,76
8,2020-01-01,12,166
9,2020-01-01,14,135


In [15]:
%%sql

SELECT DISTINCT
  DATE_TRUNC('month', session_date) :: DATE AS session_month,
  session_date
FROM user_sessions
ORDER BY session_date ASC;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,session_month,session_date
0,2020-01-01,2020-01-01
1,2020-01-01,2020-01-02
2,2020-01-01,2020-01-03
3,2020-01-01,2020-01-04
4,2020-01-01,2020-01-05
...,...,...
147,2020-05-01,2020-05-27
148,2020-05-01,2020-05-28
149,2020-05-01,2020-05-29
150,2020-05-01,2020-05-30


In [12]:
%%sql

SELECT
  DATE_TRUNC('month', session_date) :: DATE AS session_month,
  COUNT(DISTINCT user_id) AS active_users,
  ROUND(SUM(time_spent_in_mins) / 60, 0) AS hrs_spent
FROM user_sessions
GROUP BY session_month
ORDER BY session_month ASC;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,session_month,active_users,hrs_spent
0,2020-01-01,1472,47241
1,2020-02-01,1621,48826
2,2020-03-01,1723,54883
3,2020-04-01,1879,58647
4,2020-05-01,1995,64295


In [17]:
%%sql

WITH monthly_data AS (
  SELECT
    DATE_TRUNC('month', session_date) :: DATE AS session_month,
    COUNT(DISTINCT user_id) AS active_users,
    ROUND(SUM(time_spent_in_mins) / 60, 0) AS hrs_spent
  FROM user_sessions
  GROUP BY session_month
)

SELECT *
FROM monthly_data
ORDER BY session_month ASC;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,session_month,active_users,hrs_spent
0,2020-01-01,1472,47241
1,2020-02-01,1621,48826
2,2020-03-01,1723,54883
3,2020-04-01,1879,58647
4,2020-05-01,1995,64295


In [23]:
%%sql

WITH monthly_data AS (
  SELECT
    DATE_TRUNC('month', session_date) :: DATE AS session_month,
    COUNT(DISTINCT user_id) AS active_users,
    ROUND(SUM(time_spent_in_mins) / 60, 0) AS hrs_spent
  FROM user_sessions
  GROUP BY session_month
)

SELECT
  session_month,
  active_users,
  LAG(active_users, 1) OVER (ORDER BY session_month ASC) AS previous_users
FROM monthly_data
ORDER BY session_month ASC;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,session_month,active_users,previous_users
0,2020-01-01,1472,
1,2020-02-01,1621,1472.0
2,2020-03-01,1723,1621.0
3,2020-04-01,1879,1723.0
4,2020-05-01,1995,1879.0


In [26]:
%%sql

WITH monthly_data AS (
  SELECT
    DATE_TRUNC('month', session_date) :: DATE AS session_month,
    COUNT(DISTINCT user_id) AS active_users
  FROM user_sessions
  GROUP BY session_month
),

 previous_month_data AS (
SELECT
  session_month,
  active_users,
  COALESCE(
      LAG(active_users, 1) OVER (ORDER BY session_month ASC),
      0
  ) AS prev_month_users
FROM monthly_data)

SELECT *
FROM previous_month_data
ORDER BY session_month ASC;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,session_month,active_users,coalesce
0,2020-01-01,1472,0
1,2020-02-01,1621,1472
2,2020-03-01,1723,1621
3,2020-04-01,1879,1723
4,2020-05-01,1995,1879


In [30]:
%%sql

WITH monthly_data AS (
  SELECT
    DATE_TRUNC('month', session_date) :: DATE AS session_month,
    COUNT(DISTINCT user_id) AS active_users
  FROM user_sessions
  GROUP BY session_month
),

 previous_month_data AS (
SELECT
  session_month,
  active_users,
  COALESCE(
      LAG(active_users, 1) OVER (ORDER BY session_month ASC), 1
  ) AS prev_month_users
FROM monthly_data)

SELECT
  session_month,
  ROUND(
      (active_users - prev_month_users) :: NUMERIC / prev_month_users, 3
  ) AS growth
FROM previous_month_data
ORDER BY session_month ASC;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,session_month,round
0,2020-01-01,1471.0
1,2020-02-01,0.101
2,2020-03-01,0.063
3,2020-04-01,0.091
4,2020-05-01,0.062


In [32]:
%%sql

SELECT
  DATE_TRUNC('month', session_date) :: DATE AS session_month,
  user_id,
  SUM(time_spent_in_mins) AS mins_spent
FROM user_sessions
GROUP BY session_month, user_id
ORDER BY session_month ASC, user_id ASC;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,session_month,user_id,mins_spent
0,2020-01-01,0,1133
1,2020-01-01,1,852
2,2020-01-01,2,1483
3,2020-01-01,3,1762
4,2020-01-01,4,1873
...,...,...,...
8685,2020-05-01,2056,1029
8686,2020-05-01,2057,617
8687,2020-05-01,2058,3366
8688,2020-05-01,2059,2500


In [37]:
%%sql

WITH user_monthly_data AS (
  SELECT
    DATE_TRUNC('month', session_date) :: DATE AS session_month,
    user_id,
    SUM(time_spent_in_mins) AS mins_spent
  FROM user_sessions
  GROUP BY session_month, user_id)

SELECT
 prev_month.user_id,
 prev_month.session_month,
 prev_month.mins_spent,
 cur_month.session_month,
 cur_month.mins_spent
FROM user_monthly_data AS prev_month
LEFT JOIN user_monthly_data AS cur_month
ON prev_month.user_id = cur_month.user_id
AND prev_month.session_month = (cur_month.session_month - interval '1' month)
LIMIT 50;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,user_id,session_month,mins_spent,session_month.1,mins_spent.1
0,756,2020-02-01,2407,2020-03-01,158.0
1,972,2020-01-01,2693,2020-02-01,1599.0
2,719,2020-05-01,208,,
3,1488,2020-03-01,802,,
4,1749,2020-04-01,472,2020-05-01,3808.0
5,1091,2020-02-01,2633,2020-03-01,3853.0
6,1268,2020-05-01,3048,,
7,1509,2020-04-01,3480,2020-05-01,1432.0
8,1489,2020-02-01,662,2020-03-01,2687.0
9,485,2020-05-01,3469,,
