In [None]:
import datetime
import sqlite3
from contextlib import closing

import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar

Helpful references:
* https://cs.calvin.edu/courses/data/202/ka37/topics/sql.html
* https://www.sqlite.org/lang_select.html
* https://www.sqlite.org/lang_corefunc.html

In [None]:
# Database URI handed as the connection argument to the to_sql / read_sql_query
# calls in this notebook; it names the SQLite file rides.db.
db_connection = "sqlite:///rides.db"

# Preprocess the data and create the SQL database
Run the code in this section unchanged.

In [None]:
# Load only the trip columns this notebook uses from the zipped 2011 CSV,
# then print a summary of the resulting frame.
rides = pd.read_csv(
    "2011-capitalbikeshare-tripdata.zip",
    usecols=[
        "Duration",
        "Start date",
        "End date",
        "Start station number",
        "End station number",
        "Member type",
    ],
)
rides.info()

In [None]:
# rename columns
def _to_snake_case(column_name):
    """Lower-case a column label and replace its spaces with underscores."""
    return column_name.lower().replace(' ', '_')


# First normalise every label, then give the two date columns clearer names.
rides = rides.rename(columns=_to_snake_case).rename(
    columns={
        'start_date': 'start_timestamp',
        'end_date': 'end_timestamp',
    }
)

In [None]:
# Peek at the first five rows to check the renamed columns.
rides.head(n=5)

In [None]:
# Write the rides frame to the "rides" table, replacing any existing copy
# and leaving the DataFrame index out of the table.
rides.to_sql(
    name="rides",
    con=db_connection,
    if_exists="replace",
    index=False,
)

In [None]:
# Run this code unchanged.
# US federal holiday dates between 2011-01-01 and 2015-12-31, one row per
# holiday, each flagged with is_holiday=True.
federal_holiday_dates = USFederalHolidayCalendar().holidays(
    start=datetime.date(2011, 1, 1),
    end=datetime.date(2015, 12, 31),
).date
holidays = pd.DataFrame({'date': federal_holiday_dates, 'is_holiday': True})
holidays.head()

In [None]:
# Write the holidays frame to the "holidays" table, replacing any existing
# copy and leaving the DataFrame index out of the table.
holidays.to_sql(
    name="holidays",
    con=db_connection,
    if_exists="replace",
    index=False,
)

# `head`

In [None]:
# Pandas way
rides.head(5)

In [None]:
# SQL way: sort by start time and keep the first five rows.
pd.read_sql_query(
    sql="""
    SELECT
      *
      FROM rides
      ORDER BY start_timestamp ASC
      LIMIT 5
""",
    con=db_connection,
)

## Exercise 1
Get the last 10 rides of the year.

**Note**: ordering options are "ASC" and "DESC".

In [None]:
# Pandas way: order by start time, latest first, then keep the top 10 rows.
(
    rides
    .sort_values(by='start_timestamp', ascending=False)
    .head(n=10)
)

In [None]:
# SQL way
# your code here


## Exercise 2
Get only the 'start_timestamp' and 'member_type' columns for the first 5 rides.

Note: you'll need to replace the `*`.

In [None]:
# Pandas way: pick the two columns, then take the first five rows.
rides.loc[:, ['start_timestamp', 'member_type']].head(n=5)

In [None]:
# SQL way
# your code here


# `query` / `filter`

`WHERE` lets you filter the result so that it includes only the rows that satisfy a condition.

In [None]:
# pandas way: boolean mask for the station, three columns, first five rows.
(
    rides
    .loc[
        rides['start_station_number'] == 31620,
        ['start_station_number', 'start_timestamp', 'member_type'],
    ]
    .head(n=5)
)

In [None]:
# SQL way: WHERE keeps only rides that started at station 31620.
pd.read_sql_query(
    sql="""
    SELECT
      start_station_number, start_timestamp, member_type
      FROM rides
      WHERE start_station_number = 31620
      ORDER BY start_timestamp ASC
      LIMIT 5
""",
    con=db_connection,
)

## Exercise 3
Get the first 5 rides by Members.

Note:
* You'll need to `'quote'` strings, just like in Python, but the quotes should be *single* quotes (`'`), **not** double quotes (`"`).
* Unlike Python, the equality operator is just `=`, not `==`.

In [None]:
# SQL way
# your code here


You can also get a single value by using an aggregation function, like `count`:

In [None]:
# COUNT(*) collapses the whole rides table into a single row count.
pd.read_sql_query(
    sql="""
    SELECT
      COUNT(*)
      FROM rides
""",
    con=db_connection,
)

## Exercise 3.2
Get the total number of rides from Members.

In [None]:
# SQL way
# your code here

# `grouping`

SQL gets powerful (and complicated) when you start grouping.

The basic template is: `SELECT` *things* `FROM` *table* `GROUP BY` *grouping expression*

*things* can include [**aggregation functions**](https://www.sqlite.org/lang_aggfunc.html) like `count`.

In [None]:
# Pandas way: tally the rides for each member type.
rides.loc[:, 'member_type'].value_counts()

In [None]:
# Number of rides per member type: one output row for each distinct member_type.
pd.read_sql_query(
    sql="""
    SELECT
      member_type, COUNT(*)
      FROM rides
      GROUP BY member_type
""",
    con=db_connection,
)

In [None]:
# Get the number of departures from each station, showing the five
# lowest-numbered stations.
# The ORDER BY is what makes LIMIT meaningful: without it SQL leaves the row
# order undefined, so which five groups come back would not be guaranteed.
pd.read_sql_query("""
    SELECT
      start_station_number, COUNT(*)
      FROM rides
      GROUP BY start_station_number
      ORDER BY start_station_number ASC
      LIMIT 5
""", db_connection)

It's often helpful to rename the results of aggregation functions. The `AS` keyword can rename any result column.

In [None]:
# Get the number of departures from each station, naming the count column
# with AS and showing the five lowest-numbered stations.
# The ORDER BY is what makes LIMIT meaningful: without it SQL leaves the row
# order undefined, so which five groups come back would not be guaranteed.
pd.read_sql_query("""
    SELECT
      start_station_number, COUNT(*) AS num_departures
      FROM rides
      GROUP BY start_station_number
      ORDER BY start_station_number ASC
      LIMIT 5
""", db_connection)

## Exercise 4
Get the number of departures from each station, but only for Members.

In [None]:
# Pandas way: keep Member rides, count rows per start station, and show
# the first five stations.
(
    rides
    .loc[rides['member_type'] == 'Member']
    .groupby('start_station_number')
    .size()
    .to_frame(name='num_departures')
    .head(n=5)
)

In [None]:
# SQL way
# your code here

## Exercise 5
Break down those counts by member type. You'll need to:
* Include a column for `member_type`
* add `member_type` to the grouping expression

The result should look like:

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>start_station_number</th>
      <th>member_type</th>
      <th>num_departures</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>31000</td>
      <td>Casual</td>
      <td>302</td>
    </tr>
    <tr>
      <th>1</th>
      <td>31000</td>
      <td>Member</td>
      <td>869</td>
    </tr>
    <tr>
      <th>2</th>
      <td>31001</td>
      <td>Casual</td>
      <td>826</td>
    </tr>
    <tr>
      <th>3</th>
      <td>31001</td>
      <td>Member</td>
      <td>2666</td>
    </tr>
    <tr>
      <th>4</th>
      <td>31002</td>
      <td>Casual</td>
      <td>894</td>
    </tr>
  </tbody>
</table>

In [None]:
# your code here


## Exercise 6
Compute the number of rides per day.

Note: You'll need to use `date(start_timestamp)`, which is one of sqlite's [built-in date/time functions](https://www.sqlite.org/lang_datefunc.html)

In [None]:
# your code here

# `join`

As before, let's mark which rides are on federal holidays. To do that, we'll use the `holidays` table. Notice how it's structured:

In [None]:
# Read the whole holidays table back, report its size, and preview it.
df = pd.read_sql_query(
    sql="""
    SELECT
      *
      FROM holidays
""",
    con=db_connection,
)
print(len(df), "rows")
df.head(n=5)

We'll need a `rides_by_date` table... (don't worry about this.)

In [None]:
# (Re)build rides_by_date: one row per calendar day with that day's ride count.
#
# The statements run over a plain stdlib sqlite3 connection instead of
# `pd.io.sql.execute(sql, db_connection)`: recent pandas releases raise
# TypeError when that helper is given a URI string and deprecate the helper.
_SQLITE_URI_PREFIX = "sqlite:///"
if not db_connection.startswith(_SQLITE_URI_PREFIX):
    raise ValueError(f"expected a {_SQLITE_URI_PREFIX} URI, got {db_connection!r}")
# What follows the prefix is the database file path (here: rides.db).
_sqlite_path = db_connection[len(_SQLITE_URI_PREFIX):]

with closing(sqlite3.connect(_sqlite_path)) as _conn:
    with _conn:  # commit on success, roll back if a statement fails
        # Drop first so re-running the cell does not fail on an existing table.
        _conn.execute("""DROP TABLE IF EXISTS rides_by_date""")
        _conn.execute("""CREATE TABLE rides_by_date AS SELECT
      date(start_timestamp) as start_day, COUNT(*) as num_rides
      FROM rides
      GROUP BY start_day""")

Consider the example below. Think about why only 9 rows come out. Then:

* Change `JOIN` to `LEFT JOIN`; how many rows do you get then? Does that make sense?
* What values does `is_holiday` take on? Try replacing `is_holiday` by `IFNULL(is_holiday, 0)` (sqlite specific) or `COALESCE(is_holiday, 0)` (standard SQL). Does that achieve the result you hope for?
* Add an `AS is_holiday` clause after the `COALESCE(...)` expression so that the result column gets a more useful name than the raw expression text.

(You don't need to include textual answers, just change the code as directed.)

In [None]:
# Daily ride counts, each flagged with whether the day is a federal holiday.
# - LEFT JOIN keeps every row of rides_by_date, including days that have no
#   matching row in holidays.
# - COALESCE turns the NULL that unmatched days get into 0.
# - AS is_holiday names the result column; without it the column is labelled
#   with the raw COALESCE(...) expression text.
# - ORDER BY start_day fixes the row order, so head(31) shows the earliest
#   31 days rather than an unspecified set of rows.
df = pd.read_sql_query("""
    SELECT
      start_day, num_rides, COALESCE(is_holiday, 0) AS is_holiday
      FROM rides_by_date
      LEFT JOIN holidays ON rides_by_date.start_day = holidays.date
      ORDER BY start_day ASC
""", db_connection)
print(len(df), "rows")
df.head(n=31)